├── .editorconfig
├── .github
├── ISSUE_TEMPLATE.md
└── workflows
│ └── nodejs.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── docker-entrypoint.sh
├── docs
├── .editorconfig
├── .vuepress
│ ├── components
│ │ └── bilibili-player.vue
│ └── config.js
├── README.md
├── advance
│ ├── cli.md
│ └── patch.md
├── courses
│ ├── cnmooc.md
│ ├── icourse163.md
│ ├── icourses.md
│ ├── livedu.md
│ ├── open_163.md
│ ├── study_163.md
│ ├── study_mooc.md
│ └── xuetangx.md
├── guide
│ ├── basic.md
│ ├── faq.md
│ ├── getting-started.md
│ ├── known-issues.md
│ └── notice.md
└── images
│ ├── get_cookies.png
│ └── icourse163_01.png
├── mooc.py
├── moocs
├── __init__.py
├── cnmooc.py
├── icourse163.py
├── icourses.py
├── icourses_share.py
├── livedu.py
├── open_163.py
├── study_163.py
├── study_mooc.py
├── utils.py
├── xuetangx.py
└── xuetangx_next.py
├── package.json
├── requirements.txt
├── scripts
└── deploy.sh
└── utils
├── aria2.py
└── crawler.py
/.editorconfig:
--------------------------------------------------------------------------------
1 | # EditorConfig
2 | # https://editorconfig.org/
3 |
4 | root = true
5 |
6 | [*]
7 | indent_style = space
8 | indent_size = 2
9 | end_of_line = lf
10 | charset = utf-8
11 | trim_trailing_whitespace = true
12 | insert_final_newline = true
13 |
14 | [*.py]
15 | indent_size = 4
16 |
17 | [*.md]
18 | indent_size = 3
19 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## (请在这里填写错误简述)
2 |
3 | 网站:中国大学MOOC(网易云课堂 MOOC、学堂在线)
4 |
5 | 课程地址:(请在这里填写课程地址)
6 |
7 | 问题描述:(请在这里填写问题描述)
8 |
--------------------------------------------------------------------------------
/.github/workflows/nodejs.yml:
--------------------------------------------------------------------------------
1 | name: Node CI
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 |
8 | jobs:
9 | build-and-deploy:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@master
13 | - name: git-lfs
14 | run: |
15 | git lfs install
16 | git lfs pull
17 | - uses: actions/setup-node@master
18 | - name: deploy
19 | run: |
20 | npm install yarn
21 | yarn
22 | yarn deploy $ACCESS_TOKEN
23 | env:
24 | ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }}
25 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # IPython
78 | profile_default/
79 | ipython_config.py
80 |
81 | # pyenv
82 | .python-version
83 |
84 | # celery beat schedule file
85 | celerybeat-schedule
86 |
87 | # SageMath parsed files
88 | *.sage.py
89 |
90 | # Environments
91 | .env
92 | .venv
93 | env/
94 | venv/
95 | ENV/
96 | env.bak/
97 | venv.bak/
98 |
99 | # Spyder project settings
100 | .spyderproject
101 | .spyproject
102 |
103 | # Rope project settings
104 | .ropeproject
105 |
106 | # mkdocs documentation
107 | /site
108 |
109 | # mypy
110 | .mypy_cache/
111 | .dmypy.json
112 | dmypy.json
113 |
114 | # Pyre type checker
115 | .pyre/
116 |
117 | ### Node ###
118 | # Logs
119 | logs
120 | *.log
121 | npm-debug.log*
122 | yarn-debug.log*
123 | yarn-error.log*
124 | lerna-debug.log*
125 |
126 | # Diagnostic reports (https://nodejs.org/api/report.html)
127 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
128 |
129 | # Runtime data
130 | pids
131 | *.pid
132 | *.seed
133 | *.pid.lock
134 |
135 | # Directory for instrumented libs generated by jscoverage/JSCover
136 | lib-cov
137 |
138 | # Coverage directory used by tools like istanbul
139 | coverage
140 | *.lcov
141 |
142 | # nyc test coverage
143 | .nyc_output
144 |
145 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
146 | .grunt
147 |
148 | # Bower dependency directory (https://bower.io/)
149 | bower_components
150 |
151 | # node-waf configuration
152 | .lock-wscript
153 |
154 | # Compiled binary addons (https://nodejs.org/api/addons.html)
155 | build/Release
156 |
157 | # Dependency directories
158 | node_modules/
159 | jspm_packages/
160 |
161 | # TypeScript v1 declaration files
162 | typings/
163 |
164 | # TypeScript cache
165 | *.tsbuildinfo
166 |
167 | # Optional npm cache directory
168 | .npm
169 |
170 | # Optional eslint cache
171 | .eslintcache
172 |
173 | # Optional REPL history
174 | .node_repl_history
175 |
176 | # Output of 'npm pack'
177 | *.tgz
178 |
179 | # Yarn Integrity file
180 | .yarn-integrity
181 |
182 | # dotenv environment variables file
183 | .env
184 | .env.test
185 |
186 | # parcel-bundler cache (https://parceljs.org/)
187 | .cache
188 |
189 | # next.js build output
190 | .next
191 |
192 | # nuxt.js build output
193 | .nuxt
194 |
195 | # vuepress build output
196 | .vuepress/dist
197 |
198 | # Serverless directories
199 | .serverless/
200 |
201 | # FuseBox cache
202 | .fusebox/
203 |
204 | # DynamoDB Local files
205 | .dynamodb/
206 |
207 | # End of https://www.gitignore.io/api/node
208 |
209 | # Node.js
210 | yarn.lock
211 | package.json
212 | .huskyrc
213 | .editorconfig
214 | commitlint.config.js
215 |
216 | # draft
217 | draft/
218 |
219 | # IDEs/editors
220 | .vscode/
221 | .idea/
222 |
223 | # Yarn
224 | yarn.lock
225 |
226 | # course crawler
227 | __pycache__/
228 | *.pyc
229 | /* - */
230 | /*.json
231 |
232 | # Others
233 | .ipynb_checkpoints
234 | .idea
235 | .DS_Store
236 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:alpine
2 |
3 | WORKDIR /app
4 |
5 |
6 | RUN apk add --update --no-cache --virtual build_images g++ gcc libxslt-dev git && \
7 | git clone https://github.com/Foair/course-crawler.git /app && \
8 | pip install requests BeautifulSoup4 lxml -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com && \
9 | apk del build_images && \
10 | rm -rf /app/README.md /app/LICENSE
11 |
12 | COPY ./docker-entrypoint.sh /app
13 |
14 | RUN chmod 777 ./docker-entrypoint.sh
15 |
16 | ENTRYPOINT ["./docker-entrypoint.sh"]
17 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Foair
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Course Crawler
2 |
3 | 
4 |
5 | 一个基于 Python 3 的 MOOC 课程下载工具,可以获取多个慕课网站的课件,方便离线观看
6 |
7 | ### 支持列表
8 |
9 | - [中国大学MOOC](https://www.icourse163.org/)
10 | - [网易云课堂](http://study.163.com/)
11 | - [普通课程](http://study.163.com/)
12 | - [MOOC 课程](http://mooc.study.163.com/)
13 | - [网易公开课](https://open.163.com/)
14 | - [好大学在线](https://www.cnmooc.org/)
15 | - [爱课程](http://www.icourses.cn/)
16 | - [视频公开课](http://www.icourses.cn/cuoc/)
17 | - [资源共享课](http://www.icourses.cn/mooc/)
18 | - [学堂在线](http://www.xuetangx.com/)
19 | - [北京高校优质课程研究会](http://www.livedu.com.cn/)
20 |
21 | 详细信息和用法请见 [https://www.sigure.xyz/course-crawler/](https://www.sigure.xyz/course-crawler/)。
22 |
23 | ### 声明
24 |
25 | 仅限个人学习和研究使用,切勿用于其他用途。强烈建议到 MOOC 网站进行学习,本程序只是提供一个备选方案。
26 |
27 | 本程序主体功能只是下载课件和附件,无任何手段获得付费课程,也没有以任何方式向任何人收取费用。
28 |
29 | 如果将程序用于商业用途或其他非法用途,一切后果由用户自负。
30 |
31 | 如果您发现有侵犯到您的合法权益,请与我联系删除相关代码,同时我对无意冒犯到您致以深深的歉意。
32 |
33 | ### 许可协议
34 |
35 | 请遵照 MIT 许可使用该程序。
36 |
--------------------------------------------------------------------------------
/docker-entrypoint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | python mooc.py "$@" -d "/video"
4 |
--------------------------------------------------------------------------------
/docs/.editorconfig:
--------------------------------------------------------------------------------
1 | # EditorConfig
2 | # https://editorconfig.org/
3 |
4 | root = true
5 |
6 | [*]
7 | indent_style = space
8 | indent_size = 2
9 | end_of_line = lf
10 | charset = utf-8
11 | trim_trailing_whitespace = true
12 | insert_final_newline = true
13 |
14 | [*.py]
15 | indent_size = 4
16 |
17 | [*.sh]
18 | indent_size = 4
19 |
20 | [*.md]
21 | indent_size = 3
22 |
--------------------------------------------------------------------------------
/docs/.vuepress/components/bilibili-player.vue:
--------------------------------------------------------------------------------
1 |
2 |
3 |
12 |
13 |
14 |
15 |
34 |
35 |
50 |
--------------------------------------------------------------------------------
/docs/.vuepress/config.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | title: "Course Crawler",
3 | description: "基于 Python 3 的 MOOC 课程下载工具",
4 | base: "/course-crawler/",
5 |
6 | // 插件
7 | plugins: [
8 | // 页面滚动时自动激活侧边栏链接
9 | "@vuepress/active-header-links"
10 | ],
11 |
12 | // 主题配置
13 | themeConfig: {
14 | nav: [
15 | { text: "指南", link: "/" },
16 | { text: "分类", link: "/courses/icourse163" },
17 | { text: "进阶", link: "/advance/cli" }
18 | ],
19 | sidebarDepth: 1,
20 | sidebar: {
21 | "/advance/": ["cli", "patch"],
22 | "/courses/": [
23 | "icourse163",
24 | "study_163",
25 | "study_mooc",
26 | "open_163",
27 | "icourses",
28 | "xuetangx",
29 | "cnmooc",
30 | "livedu"
31 | ],
32 | "/": [
33 | "",
34 | "guide/getting-started",
35 | "guide/basic",
36 | "guide/faq",
37 | "guide/known-issues",
38 | "guide/notice"
39 | ]
40 | },
41 |
42 | // algolia: {
43 | // apiKey: "20560f10044e76d7f16908746c3adeb1",
44 | // indexName: "siguremo_course-crawler"
45 | // },
46 |
47 | lastUpdated: "Last Updated", // string | boolean
48 |
49 | // 假定是 GitHub. 同时也可以是一个完整的 GitLab URL
50 | repo: "SigureMo/course-crawler",
51 | // 自定义仓库链接文字。默认从 `themeConfig.repo` 中自动推断为
52 | // "GitHub"/"GitLab"/"Bitbucket" 其中之一,或是 "Source"。
53 | repoLabel: "GitHub",
54 |
55 | // 以下为可选的编辑链接选项
56 |
57 | // 假如你的文档仓库和项目本身不在一个仓库:
58 | docsRepo: "SigureMo/course-crawler",
59 | // 假如文档不是放在仓库的根目录下:
60 | docsDir: "docs/",
61 | // 假如文档放在一个特定的分支下:
62 | // docsBranch: "docs",
63 | // 默认是 false, 设置为 true 来启用
64 | editLinks: true,
65 | // 默认为 "Edit this page"
66 | editLinkText: "在GitHub上编辑此页!",
67 | // Service Worker 的配置
68 | serviceWorker: {
69 | updatePopup: true
70 | }
71 | }
72 | };
73 |
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # 介绍
2 |
3 | 
4 |
5 | 一个基于 Python 3 的 MOOC 课程内容获取工具,方便离线观看。
6 |
7 | [下载最新程序](https://github.com/SigureMo/course-crawler/archive/master.zip) 或 [前往 GitHub](https://github.com/SigureMo/course-crawler)
8 |
9 | ## 支持列表
10 |
11 | - [中国大学 MOOC](https://www.icourse163.org/) 是国内优质的中文 MOOC 学习平台,由爱课程网携手·网易云课堂·打造。**大多数的名校都有一定数量课程**,如北京大学、浙江大学、哈尔滨工业大学等
12 | - [网易云课堂](http://study.163.com/)
13 | - [普通课程](http://study.163.com/) 涵盖方面较广,更注重于**职场、生活技能**,很多需要付费
14 | - [MOOC 课程](http://mooc.study.163.com/) 有一部分中国大学 MOOC 的内容,此外还有一些微专业内容,但是很多需要付费,推荐 [顶尖中文大学计算机专业课程体系](https://study.163.com/curricula/cs.htm) 与 [深度学习工程师微专业](https://mooc.study.163.com/smartSpec/detail/1001319001.htm)
15 | - [网易公开课](https://open.163.com/) 是网易推出的“全球名校视频公开课项目”,收录了哈佛大学等**世界级名校**的公开课课程以及可汗学院,TED 等教育性组织的精彩视频,内容较经典,但是也相对比较陈旧
16 | - [好大学在线](https://www.cnmooc.org/) 是上海交通大学拥有的中国顶尖慕课平台。主要是 **上海交通大学** 等大学或机构的课程
17 | - [爱课程](http://www.icourses.cn/) 的资源比较多,但总体相对陈旧
18 | - [视频公开课](http://www.icourses.cn/cuoc/)
19 | - [资源共享课](http://www.icourses.cn/mooc/)
20 | - [学堂在线](http://www.xuetangx.com/) 是清华大学发起的精品中文慕课平台。主要是 **清华大学** 的课程
21 | - [北京高校优质课程研究会](http://www.livedu.com.cn/) 是北京市教委组织的**北京各高校**课程平台
22 |
--------------------------------------------------------------------------------
/docs/advance/cli.md:
--------------------------------------------------------------------------------
1 | # 命令行参数
2 |
3 |
4 |
5 | ## 显示帮助信息
6 |
7 | > `-h` `--help` 用于显示帮助信息。
8 |
9 | 输入 `python mooc.py -h` 或 `python mooc.py --help`。
10 |
11 | ## 指定下载目录
12 |
13 | > `-d <dir>` `--dir=<dir>` 用于指定下载目录为 `<dir>`。
14 |
15 | 课程文件夹将创建在 `<dir>` 中。默认创建在当前目录,即 `-d ""`。
16 |
17 | 示例
18 |
19 | ```bash
20 | python mooc.py -d "G:\MOOCs" https://www.icourse163.org/course/TONGJI-53004
21 | ```
22 |
23 | ::: tip
24 | `<dir>` 不能以 \ 结尾;当 `<dir>` 存在空格的时候,必须使用 `"` 将路径包裹起来。
25 | :::
26 |
27 | ## 重新录入 Cookies
28 |
29 | > `-c` `--restore-cookies` 用于在程序运行时录入新的 Cookies,以覆盖旧的 Cookies
30 |
31 | 由于 Cookies 经常存在过期的情况,手动去删除会很麻烦,这时只需要运行时加上这样一个参数就可以将旧的 Cookies 覆盖掉
32 |
33 | ## 指定视频清晰度
34 |
35 | > `-r <quality>` `--quality <quality>` 用于指定视频清晰度为 `<quality>`
36 |
37 | `<quality>` 可选列表为 `shd` `hd` `sd` ,分别对应超高清、高清、标清,默认为超高清
38 |
39 | 示例
40 |
41 | ```bash
42 | python mooc.py -r hd https://www.icourse163.org/course/TONGJI-53004
43 | ```
44 |
45 | ::: tip
46 | 在支持清晰度调节的课程中,如果指定的清晰度不存在,则先自动降低清晰度,若仍无匹配的清晰度,则再升高清晰度,比如指定为 hd ,则会以 hd sd shd 序列对清晰度进行匹配
47 | :::
48 |
49 | ## 强制覆盖已下载文件
50 |
51 | > `-w`, `--overwrite` 用于启用强制覆盖已经下载过的文件
52 |
53 | 示例
54 |
55 | ```bash
56 | python mooc.py https://www.icourse163.org/course/TONGJI-53004 -w
57 | ```
58 |
59 | ## aria2 的调用
60 |
61 | 为了方便后续视频的下载,增加了直接调用 `aria2` 进行下载的支持
62 |
63 | ::: tip aria2 相关下载:
64 |
65 | - [aria2](https://github.com/aria2/aria2/releases)
66 | - [aria2 webui](https://github.com/ziahamza/webui-aria2/archive/master.zip)
67 | - [AriaNg(一个比较好看的 webui)](https://github.com/mayswind/AriaNg/releases)
68 |
69 | :::
70 |
71 | > `--aria2` 用于启用 `aria2` 直接下载视频
72 |
73 | 当配置好 aria2 路径后,在课件解析完成时程序不退出,直接调用 `aria2` 下载视频
74 |
75 | ::: tip
76 |
77 | 请事先确保 `aria2c` 已经是可执行程序,即已经添加到环境变量
78 |
79 | :::
80 |
81 | 示例
82 |
83 | ```bash
84 | python mooc.py --aria2 https://www.icourse163.org/course/TONGJI-53004
85 | ```
86 |
87 | ## 播放列表设置
88 |
89 | 由于不同播放器对播放列表格式的要求并不相同,通过修改参数可以获得更通用的播放列表
90 |
91 | ::: tip 一些推荐的播放器
92 |
93 | - Windows
94 | - PotPlayer
95 | - Linux
96 | - SMPlayer
97 | - MacOS
98 | - IINA
99 |
100 | :::
101 |
102 | ### 播放列表类型
103 |
104 | > `--playlist-type=<type>` 用于指定播放列表类型
105 |
106 | 可选列表 `dpl` `m3u` `no` ,默认为 `dpl` ,若指定 `no` 则不生成播放列表
107 |
108 | ::: tip
109 |
110 | 默认生成的 `Playlist.dpl` 仅仅对 PotPlayer 有效,如果无法使用 PotPlayer (比如 Linux 下),请生成更通用的 `m3u` 格式
111 |
112 | :::
113 |
114 | 示例
115 |
116 | ```bash
117 | python mooc.py --playlist-type=m3u https://www.icourse163.org/course/TONGJI-53004
118 | ```
119 |
120 | ### 播放列表路径类型
121 |
122 | > `--abs-path` 用于指定播放列表内的路径为绝对路径
123 |
124 | ::: tip
125 |
126 | 有些播放器并不支持相对路径的播放列表,如果你的播放器无法打开该文件,请尝试生成绝对路径的播放列表
127 |
128 | :::
129 |
130 | 示例
131 |
132 | ```bash
133 | python mooc.py --playlist-type=m3u --abs-path https://www.icourse163.org/course/TONGJI-53004
134 | ```
135 |
136 | ::: warning
137 |
138 | 绝对路径的播放列表会在课程文件夹移动后失效,如果开启该选项,请不要在课程下载后进行移动
139 |
140 | :::
141 |
142 | ## 不下载 ...
143 |
144 | ### 不下载文档
145 |
146 | > `--no-doc` 用于阻止下载 PDF、Word、PowerPoint 等文档。
147 |
148 | 默认会下载所有文档。
149 |
150 | 当指定了这个选项之后,不会下载任何文档(包括 PPT 和书籍等)。
151 |
152 | 示例
153 |
154 | ```bash
155 | python mooc.py https://www.icourse163.org/course/TONGJI-53004 --no-doc
156 | ```
157 |
158 | ### 不下载字幕
159 |
160 | > `--no-sub` 用于阻止下载字幕。
161 |
162 | ### 不下载富文本
163 |
164 | > `--no-text` 用于阻止下载富文本。
165 |
166 | ### 不下载附件
167 |
168 | > `--no-file` 用于阻止下载附件。
169 |
170 | ### 不下载播放列表
171 |
172 | > `--playlist-type=no` 用于阻止下载播放列表。详情见 [播放列表类型](#播放列表类型)
173 |
174 | ## 修正视频/文档名
175 |
176 | > `--inter` 用于修改文件名。
177 |
178 | 会调出文件编辑器,编辑好视频的名字之后保存。默认没有启用。
179 |
180 | ::: tip
181 | 请严格按照原来文本长度进行设置,否则可能会发生没有标题的情况。
182 | :::
183 |
--------------------------------------------------------------------------------
/docs/advance/patch.md:
--------------------------------------------------------------------------------
1 | # 修改默认值
2 |
3 |
4 |
5 | ## 修改默认获取目录
6 |
7 | 如果不想每次都指定获取目录的话,可以修改 `mooc.py`,找到如下行:
8 |
9 | ```python
10 | parser.add_argument('-d', default=r'G:\MOOCs', help='下载目录')
11 | ```
12 |
13 | 将 `G:\MOOCs` 替换为想要的文件夹即可。
14 |
15 | ## 默认启用某个选项
16 |
17 | 修改 `mooc.py`,将选项所在 `store_false` 或 `store_true` 切换一下就行了。
18 |
19 | 示例
20 |
21 | 如果我想默认不下载 PDF,那么将 `--no-pdf` 所在的那一行的 `store_false` 改了就行了,改成这样
22 |
23 | ```python
24 | parser.add_argument('--no-pdf', action='store_true', help='不下载 PDF 文档')
25 | ```
26 |
27 | 这样默认就不会下载 PDF,而如果在命令中使用了 `--no-pdf` 就会下载 PDF 了。
28 |
--------------------------------------------------------------------------------
/docs/courses/cnmooc.md:
--------------------------------------------------------------------------------
1 | # 好大学在线
2 |
3 | ## 简介
4 |
5 | [好大学在线](https://www.cnmooc.org/) 是上海交通大学拥有的中国顶尖慕课平台。主要是 **上海交通大学** 等大学或机构的课程。
6 |
7 | ## 地址格式
8 |
9 | 课程的地址必须类似以下这种格式
10 |
11 | ```
12 | https://www.cnmooc.org/portal/course/4386/9729.mooc
13 | ```
14 |
15 | ## 碎碎念
16 |
17 | 要想获得课程必须保证以下两个条件均满足:
18 |
19 | - 已经在客户端或 Web 端手动加入课程;
20 | - 当前课程已经在开课时间内。
21 |
22 | 同·中国大学 MOOC·一样,可以通过切换「开课班级」参加以前的课程。
23 |
24 | 如果当前课程还未开课,可以切换到以前的班次,并加入,这样就可以获得视频等资源。
25 |
--------------------------------------------------------------------------------
/docs/courses/icourse163.md:
--------------------------------------------------------------------------------
1 | # 中国大学 MOOC
2 |
3 | ## 简介
4 |
5 | [中国大学 MOOC](https://www.icourse163.org/) 是国内优质的中文 MOOC 学习平台,由爱课程网携手·网易云课堂·打造。**大多数的名校都有一定数量课程**,如北京大学、浙江大学、哈尔滨工业大学等
6 |
7 | ## 地址格式
8 |
9 | 课程的地址必须类似以下两种格式
10 |
11 | ```
12 | https://www.icourse163.org/course/TONGJI-53004
13 | https://www.icourse163.org/course/TONGJI-53004?tid=1001770008
14 | ```
15 |
16 | ::: tip
17 |
18 | - 上面的 `course` 替换为 `learn` 也是支持的
19 | - `SPOC` 课程也是支持的,比如 `https://www.icourse163.org/spoc/course/WHUT-1002745006?tid=1002931006`
20 | :::
21 |
22 | ## 开课次数
23 |
24 | 课程的地址包含了两部分信息,以 `https://www.icourse163.org/course/TONGJI-53004?tid=1001770008` 为例,`53004` 是课程号,唯一标志了同济大学开设的高等数学(一)这门课程,而 `1001770008` 代表了某学期的该课程的课程号,如果地址中不出现 `?tid=xxx` 字段,则默认为最新一次开课,所以我们可以通过控制最后的 `tid` 以达到下载不同学期的课件,而不同学期的地址我们可以在课程主页获取
25 |
26 | 
27 |
28 | 切换开课学期后便可在浏览器地址栏看到对应的学期课程地址
29 |
30 | ## 身份验证
31 |
32 | 中 M 的视频接口很不稳定,在这一年内进行了多次的变更,当前有两种内置的方案
33 |
34 | - 一种是在程序要求输入 Cookies 的时候直接回车注入空的 Cookies 以调用旧接口,但不保证该接口以后会不会删掉
35 | - 另一种输入完整的 Cookies ,这样会调用新的接口,但是最近(19 年 10 月),该接口只会返回新视频的 m3u8 播放列表,如果遇到该问题,请使用旧接口进行下载,问题详细描述见 [issue37](https://github.com/Foair/course-crawler/issues/37),如果该方案也无法解决,请临时使用 [mooc-dl](https://github.com/SigureMo/mooc-dl) 或者自行寻求其他解决方案
36 |
37 | ## 碎碎念
38 |
39 | 「老师已关闭该学期,无法查看」暂时无所畏惧。
40 |
41 | 找不到开课页面的话,可以先进入课程的公告页面,然后点击课程名。
42 |
43 | 如果你下载的是最新学期的课程,请**确定最新学期已经开课**,未开课的学期是无法下载的,不过你可以尝试下载前几个学期的课程。
44 |
--------------------------------------------------------------------------------
/docs/courses/icourses.md:
--------------------------------------------------------------------------------
1 | # 爱课程
2 |
3 | ## 简介
4 |
5 | [爱课程](http://www.icourses.cn/) 的资源比较多,但总体相对陈旧
6 |
7 | ## 地址格式
8 |
9 | - 「资源共享课」
10 |
11 | ```
12 | http://www.icourses.cn/sCourse/course_6076.html
13 | http://www.icourses.cn/web/sword/portal/shareDetails?cId=6076#/course/chapter
14 | ```
15 |
16 | - 「视频公开课」
17 |
18 | ```
19 | http://www.icourses.cn/web/sword/portal/videoDetail?courseId=1013d845-1344-1000-b974-22f745f72788#/?resId=10195dd1-1344-1000-bbd7-22f745f72788
20 | ```
21 |
22 | ::: tip
23 | 只要是以如下地址开始都可以,不用在意是在哪一个视频。
24 |
25 | ```
26 | http://www.icourses.cn/web/sword/portal/videoDetail
27 | ```
28 |
29 | :::
30 |
--------------------------------------------------------------------------------
/docs/courses/livedu.md:
--------------------------------------------------------------------------------
1 | # 北京高校优质课程研究会
2 |
3 | ## 简介
4 |
5 | [北京高校优质课程研究会](http://www.livedu.com.cn/) 是北京市教委组织的**北京各高校**课程平台
6 |
7 | ## 地址格式
8 |
9 | 课程的地址必须类似以下这种格式
10 |
11 | ```
12 | http://www.livedu.com.cn/ispace4.0/moocxjkc/toKcView.do?kcid=253
13 | ```
14 |
15 | ## 碎碎念
16 |
17 | 下载前请确定你已经完成选课,否则也是无法解析的
18 |
19 | 另外,由于是从 HTML 中解析数据,速度极慢
20 |
--------------------------------------------------------------------------------
/docs/courses/open_163.md:
--------------------------------------------------------------------------------
1 | # 网易公开课
2 |
3 | ## 简介
4 |
5 | [网易公开课](https://open.163.com/) 是网易推出的“全球名校视频公开课项目”,收录了哈佛大学等**世界级名校**的公开课课程以及可汗学院,TED 等教育性组织的精彩视频,内容较经典,但是也相对比较陈旧
6 |
7 | ## 地址格式
8 |
9 | 课程的地址必须类似以下两种格式
10 |
11 | ```
12 | http://open.163.com/special/opencourse/cs50.html
13 | http://open.163.com/movie/2010/3/U/R/M6U6LS8CV_M6U6MHDUR.html
14 | ```
15 |
16 | ## 碎碎念
17 |
18 | 网易公开课也是不需要 Cookies 的
19 |
--------------------------------------------------------------------------------
/docs/courses/study_163.md:
--------------------------------------------------------------------------------
1 | # 网易云课堂
2 |
3 | ## 简介
4 |
5 | [网易云课堂](http://study.163.com/) 涵盖方面较广,更注重于**职场、生活技能**,很多需要付费
6 |
7 | ## 地址格式
8 |
9 | 课程的地址必须类似以下三种格式
10 |
11 | ```
12 | https://study.163.com/course/courseLearn.htm?courseId=1004570029#/learn/video?lessonId=1052094278&courseId=1004570029
13 | https://study.163.com/course/courseMain.htm?courseId=1004570029
14 | https://study.163.com/course/introduction/1004570029.htm
15 | ```
16 |
17 | ## 碎碎念
18 |
19 | 网易云课堂免费课程当前并不需要身份认证
20 |
21 | 当然,没有身份认证的话也是**不可能支持下载付费视频的**,暂时也不打算做相关支持
22 |
23 | 本文档仅针对网易云课堂普通课程,普通课程与 MOOC 课程相差很大, MOOC 课程更类似于中国大学 MOOC ,如需查看其文档,请移步 [网易云课堂 MOOC](study_mooc.md)
24 |
--------------------------------------------------------------------------------
/docs/courses/study_mooc.md:
--------------------------------------------------------------------------------
1 | # 网易云课堂 MOOC
2 |
3 | ## 简介
4 |
5 | [网易云课堂 MOOC 课程](http://mooc.study.163.com/) 有一部分中国大学 MOOC 的内容,此外还有一些微专业内容,但是很多需要付费,推荐 [顶尖中文大学计算机专业课程体系](https://study.163.com/curricula/cs.htm) 与 [深度学习工程师微专业](https://mooc.study.163.com/smartSpec/detail/1001319001.htm)
6 |
7 | ## 地址格式
8 |
9 | 课程的地址必须类似以下两种格式
10 |
11 | ```
12 | http://mooc.study.163.com/course/2001281002#/info
13 | http://mooc.study.163.com/course/2001281002
14 | ```
15 |
16 | ::: tip
17 |
18 | - 上面的 `course` 替换为 `learn` 也是支持的
19 | :::
20 |
21 | ## 碎碎念
22 |
23 | 与[中国大学 MOOC](./icourse163.md) 大体上相同,但它对身份的验证比较苛刻,你**本身无法访问到的内容程序也是无法帮你获取的,也就是说它并不能帮你获取你未参加的已关闭学期的内容**
24 |
25 | Cookies 极易失效,可在运行时添加参数 `-c` 注入新的 Cookies
26 |
--------------------------------------------------------------------------------
/docs/courses/xuetangx.md:
--------------------------------------------------------------------------------
1 | # 学堂在线
2 |
3 | ## 简介
4 |
5 | [学堂在线](http://www.xuetangx.com/) 是清华大学发起的精品中文慕课平台。主要是 **清华大学** 的课程
6 |
7 | ## 地址格式
8 |
9 | 课程的地址必须类似以下这种格式
10 |
11 | ```
12 | https://next.xuetangx.com/course/HNU08071000999/1076493
13 | ```
14 |
15 | ## 碎碎念
16 |
17 | 学堂在线于 19 年 10 月左右进行了大更新,域名改为了 `next.xuetangx.com` ,如果你还能找到类似下面这种 `www.xuetangx.com` 下的旧版本课程的话,现在也是支持下载的
18 |
19 | ```
20 | http://www.xuetangx.com/courses/course-v1:TsinghuaX+00740043_2x_2015_T2+sp/about
21 | ```
22 |
--------------------------------------------------------------------------------
/docs/guide/basic.md:
--------------------------------------------------------------------------------
1 | # 深入了解
2 |
3 | ## 课程目录结构
4 |
5 | ```
6 |
7 | |-- Outline.txt
8 | |-- Playlist.dpl
9 | |-- Files/
10 | |-- PDFs/
11 | |-- Texts/
12 | `-- Videos/
13 | |-- Rename.bat
14 | `-- Videos.txt
15 | ```
16 |
17 | ### 课程大纲
18 |
19 | `Outline.txt` 是课程的大纲,它的内容类似
20 |
21 | ```
22 | 6.1 空间直角坐标系及向量 {1}
23 | 6.1.1 空间直角坐标系的基本概念 {1.1}
24 | 6.1.1 空间直角坐标系的基本概念(视频) {1.1.1}#
25 | 6.1.1 空间直角坐标系的基本概念(PPT) {1.1.1}+
26 | 6.1.1 空间直角坐标系的基本概念(PPT) 空间直角坐标系的基本概念.rar {1.1.1}!
27 | ...
28 | ```
29 |
30 | 每个级别依次增加 2 个空格的缩进,`{}` 之间的是程序生成的编号,用来唯一标识一个资源(比如视频、富文本等等)。
31 |
32 | `{1.1.1}` 说明该视频文件以 `1.1.1` 开头,可以在 `Videos/` 中找到。如此可以方便地找到视频。
33 |
34 | 有些后面可能有奇怪的符号,比如 `{1.1.1}+`的后面有个 `+`。下面是符号的说明:
35 |
36 | - #: 视频,可以下载到 `Videos/`
37 | - \*:课件,一般是 PDF 文件,位于 `PDFs/`
38 | - +:富文本,一般是 HTML 文件,位于 `Texts/`
39 | - !:附件,位于 `Files/`
40 | - &:字幕,位于 `Videos/`
41 |
42 | ### 视频地址
43 |
44 | `Videos.txt` 是视频的链接,它的内容类似
45 |
46 | ```
47 | http://v.stu.126.net/mooc-video/nos/mp4/2017/02/21/1005820377_aa6e1b0d92314cdfaf6dcad3351b3533_shd.mp4?ak=99ed7479ee303d1b1361b0ee5a4abcee11069a7277fd2bfbd983de77f6586b3ab4d3781458cdbd61bf0041fae59dee85cb91769ba5850a28845217d0bc9bfb580015e48ffc49c659b128bfe612dda086d65894b8ef217f1626539e3c9eb40879c29b730d22bdcadb1b4f67996129275fa4c38c6336120510aea1ae1790819de86e0fa3e09eeabea1b068b3d9b9b6597acf0c219eb000a69c12ce9d568813365b3e099fcdb77c69ca7cd6141d92c122af
48 | http://v.stu.126.net/mooc-video/nos/mp4/2017/02/21/1005822368_a91783c5f05a49e29960d24f1dc06f15_shd.mp4?ak=99ed7479ee303d1b1361b0ee5a4abcee11069a7277fd2bfbd983de77f6586b3a33090c48273cc5e338f1d269a2b016013857294759d07b499e26c45d788128b30015e48ffc49c659b128bfe612dda086d65894b8ef217f1626539e3c9eb40879c29b730d22bdcadb1b4f67996129275fa4c38c6336120510aea1ae1790819de86e0fa3e09eeabea1b068b3d9b9b6597acf0c219eb000a69c12ce9d568813365b3e099fcdb77c69ca7cd6141d92c122af
49 | ...
50 | ```
51 |
52 | 复制到下载工具下载,比如 [aria2](https://github.com/aria2/aria2/releases)、迅雷 等,也可以直接在浏览器中打开。
53 |
54 | ### 视频文件名
55 |
56 | `Rename.bat` (或 `Rename.sh`)用于将视频重命名,它的内容类似
57 |
58 | ```
59 | CHCP 65001
60 |
61 | REN "1005820377_aa6e1b0d92314cdfaf6dcad3351b3533_shd.mp4" "1.1.1 空间直角坐标系的基本概念(视频).mp4"
62 | REN "1005822368_a91783c5f05a49e29960d24f1dc06f15_shd.mp4" "1.2.1 向量的坐标表示(视频).mp4"
63 | REN "1005817378_500b5301360f49c18c6f8d3406959cf5_shd.mp4" "1.3.1 向量的模、方向余弦、投影(视频).mp4"
64 | REN "1005821395_ff485bb1e65145ec90bf04a259eb6b0e_shd.mp4" "2.1.1 向量的数量积(视频).mp4"
65 | REN "1005821396_9180e5908bc847548a8db625af9b1ad7_shd.mp4" "2.2.1 向量的数量积(续)(视频).mp4"
66 | REN "1005817386_18d7ede415ec4cb5befa71a9d790ce0f_shd.mp4" "2.3.1 向量的向量积(视频).mp4"
67 | REN "1005822373_8bf3846066e045cda306bd7d27e38786_shd.mp4" "2.4.1 向量的向量积(续)(视频).mp4"
68 | REN "1005899086_7780acc4ac074ed89b6301e41349a2c1_shd.mp4" "3.1.1 平面方程(视频).mp4"
69 | ...
70 | ```
71 |
72 | 下载下来的视频文件名是一团糟的,比如
73 |
74 | ```
75 | 1005820377_aa6e1b0d92314cdfaf6dcad3351b3533_shd.mp4
76 | ```
77 |
78 | 运行该文件,视频的文件名就清晰整齐了,也会按照章节次序排列。
79 |
80 | ::: tip
81 |
82 | - `Windows` 下,当视频和这个文件在同一个文件夹时直接**双击**该文件即可运行
83 | - `*nix` 需要在终端运行 `sh Rename.sh`
84 |
85 | :::
86 |
87 | ### 播放列表
88 |
89 | 打开 `Playlist.dpl` 即可播放 `Videos/` 中的视频。
90 |
91 | 由于文件系统的限制,特殊字符比如 `"` `/` `\` 都不允许出现在文件名中,所以文件名中的特殊字符是被删除的。假如原视频的标题是「有 3/4 的概率会下雨」,就会变成 `有 34 的概率会下雨`,就很奇怪吧。而播放的列表就可以解决这个问题,在播放列表中会显示 `有 3/4 的概率会下雨`。
92 |
93 | ## 说明
94 |
95 | 学堂在线暂时只有 `Books`,没有 `PDFs`,因为如果提供 `PPT` 的话,在讲义那一栏就有链接可以下载。
96 |
--------------------------------------------------------------------------------
/docs/guide/faq.md:
--------------------------------------------------------------------------------
1 | # FAQ
2 |
3 | ::: danger Q1:
4 |
5 | 我的登录信息输错了(失效了),怎么重新填写?
6 |
7 | :::
8 |
9 | ::: tip A1:
10 |
11 | 重新启动程序,启动时添加参数 `-c` 并输入新的 Cookies
12 |
13 | :::
14 |
15 | ---
16 |
17 | ::: danger Q2:
18 |
19 | 我遇到了一个课程无法成功获取,最快捷的反馈方式是?
20 |
21 | :::
22 |
23 | ::: tip A2:
24 |
25 | 依次进行如下检查:
26 |
27 | - Cookie 是否失效,如果失效请使用参数 `-c` 并重新输入
28 | - 当前账号是否加入了该课程,并对该课程**有访问权限**(比如该学期是否是开启状态,课程是否是付费才能观看)
29 | - [Github issues](https://github.com/Foair/course-crawler/issues) 中是否有相似问题与解决方案
30 |
31 | 如果仍然无法解决,请在 Github 提出 [issue](https://github.com/Foair/course-crawler/issues/new) ,或者[邮件联系我](mailto:sigure_mo@163.com),我会尽快处理
32 |
33 | :::
34 |
35 | ---
36 |
37 | ::: danger Q3:
38 |
39 | 我想看原版文档
40 |
41 | :::
42 |
43 | ::: tip A3:
44 |
45 | 请前往 [Foair 的文档](https://mooc.xoy.io/) 查看
46 |
47 | :::
48 |
49 | ---
50 |
51 | ::: danger Q4:
52 |
53 | 如何参与文档的修改?
54 |
55 | :::
56 |
57 | ::: tip A4:
58 |
59 | 点击文档左下角的“在 GitHub 上编辑此页” 即可~
60 |
61 | :::
62 |
--------------------------------------------------------------------------------
/docs/guide/getting-started.md:
--------------------------------------------------------------------------------
1 | # 快速开始
2 |
3 |
4 |
5 | ## 准备工作
6 |
7 | 在下载之前,你需要保证你已经安装 `python3.5` 及其以上版本,并且安装完成依赖
8 |
9 | 需要的依赖如下
10 |
11 | - `requests`
12 | - `BeautifulSoup4`
13 | - `lxml`
14 | - `pycryptodome`
15 |
16 | ```bash
17 | pip install requests BeautifulSoup4 lxml pycryptodome
18 | ```
19 |
20 | ## 下载程序源码
21 |
22 | 前往项目主页下载程序,或者直接点击[这里](https://github.com/SigureMo/course-crawler/archive/master.zip),之后解压
23 |
24 | 当然,已经安装 `git` 的同学可以直接 `clone`
25 |
26 | ```bash
27 | git clone https://github.com/SigureMo/course-crawler.git
28 | ```
29 |
30 | ## 运行程序
31 |
32 | 在刚刚下载的项目根目录下打开命令行(“终端”、“命令提示符”、“PowerShell”都行,`Win10` 在项目根目录按住 `shift` 右键就有相应的选项,后面统称命令行)
33 |
34 | 在命令行中输入 `python mooc.py <url>` ,即可将课程课件下载到当前文件夹
35 |
36 | 比如,中国大学 MOOC 课程 `《高等数学(一)》 - 同济大学`
37 |
38 | ```bash
39 | python mooc.py https://www.icourse163.org/course/TONGJI-53004
40 | ```
41 |
42 | ::: tip
43 | 这里的 `<url>` 为课程主页的地址,网址的具体要求及课程下载的额外要求详见[分类](../courses/icourse163.md)
44 | :::
45 |
46 | ## 身份验证
47 |
48 | 很多课程并不是直接就能下载的,需要验证下你的身份,这大多都可以通过输入 Cookies 解决
49 |
50 | 当你下载的课程需要输入 Cookies 时,用浏览器打开课程主页,然后按下 `F12` 打开开发者工具
51 |
52 | 切换到 `Network` 选项卡,刷新页面,在左侧选择第一个抓到的包,在右侧 `Headers` 中找到 `cookie` (也可能是 `Cookie`),复制粘贴到程序命令行中
53 |
54 | 
55 |
56 | ::: tip
57 |
58 | 如果你和我一样懒的话,可以直接三击 cookies 快速将整个 cookies 及前面的 `cookie:` 一起选中,直接复制粘贴到程序中,也是可以的,反正我是懒得从左上滑到右下啦,所以特意做了这个小“优化”~
59 |
60 | :::
61 |
62 | ## 等待 ...
63 |
64 | 等待程序运行,程序首先会从课程主页获取课件列表及解析所需相关信息,之后逐个课件进行解析下载
65 |
66 | ## 下载视频
67 |
68 | 特别地,由于视频资源相对来说花费时间较多,所以视频资源并不是在解析时直接进行下载,而是解析出 `url` 至 `<dir>/Videos/Videos.txt` ,之后需要你自行使用下载工具进行下载(比如 `aria2` ,或者迅雷等)
69 |
70 | 下载后将视频移动到 `<dir>/Videos/` 内,之后双击 `Rename.bat` 即可修正视频名
71 |
72 | ::: tip
73 |
74 | - 这里的 `<dir>` 指课程根目录
75 | - Linux 下的使用以及 `Rename` 文件详情请见[视频文件名](basic.html#视频文件名)
76 |
77 | :::
78 |
79 | ## 视频的播放
80 |
81 | 使用 PotPlayer 打开 `Playlist.dpl` 即可播放视频
82 |
83 | ::: tip
84 |
85 | 如果你并不想使用 PotPlayer ,请修改[播放列表设置](../advance/cli.html#播放列表设置)
86 |
87 | :::
88 |
--------------------------------------------------------------------------------
/docs/guide/known-issues.md:
--------------------------------------------------------------------------------
1 | # 已知问题
2 |
3 | ::: warning Q1:
4 |
5 | 可能会出现被远程主机强制关闭一个连接。
6 |
7 | :::
8 |
9 | ::: tip A1:
10 |
11 | 解决方法:等待一段时间然后重新尝试。
12 |
13 | :::
14 |
15 | ---
16 |
17 | ::: warning Q2:
18 |
19 | 网易云课堂(MOOC) 的 Cookie 很容易失效。
20 |
21 | :::
22 |
23 | ::: tip A2:
24 |
25 | 解决方法:更加频繁地修改 Cookie。
26 |
27 | :::
28 |
29 | ---
30 |
31 | ::: warning Q3:
32 |
33 | Windows 下不能自动删除 `process.out`。
34 |
35 | :::
36 |
37 | ::: tip A3:
38 |
39 | 解决方法:手动删除 :joy:。
40 |
41 | :::
42 |
--------------------------------------------------------------------------------
/docs/guide/notice.md:
--------------------------------------------------------------------------------
1 | # 告示板
2 |
## Course Crawler
4 |
5 | 仅限个人学习和研究使用,切勿用于其他用途。强烈建议到 MOOC 网站进行学习,本程序只是提供一个备选方案。
6 |
7 | 本程序主体功能只是下载课件和附件,无任何手段获得付费课程,也没有以任何方式向任何人收取费用。
8 |
9 | 如果将程序用于商业用途或其他非法用途,一切后果由用户自负。
10 |
11 | 如果您发现有侵犯到您的合法权益,请与我联系删除相关程序,同时我对无意冒犯到您致以深深的歉意。
12 |
13 | 许可协议:MIT
14 |
15 | ## 本文档
16 |
17 | 许可协议:CC0
18 |
19 | ## 与原作联系
20 |
21 | SigureMo/course-crawler 基于 Foair 的 Course Crawler ,修复部分 bug ,并且增加部分新功能,本文档亦然
22 |
23 | 本程序的所有“完善”工作均离不开 Foair 原有的框架,在我刚刚接触到这个项目的时候,Foair 给了我莫大的鼓励与支持,这对我之后的 Coding 风格产生了极大的影响
24 |
25 | ## 推广
26 |
27 | - [bilili-dl](https://github.com/SigureMo/bilili-dl) B 站视频下载器,支持普通视频以及番剧的下载,B 站也是有很多不错的课程的
28 | - [mooc-dl](https://github.com/SigureMo/mooc-dl) 中国大学 MOOC 爬虫,使用手机端接口,可作为本项目 icourse163 的备用接口
29 |
30 | ## 感谢
31 |
32 | - vuepress [https://github.com/vuejs/vuepress](https://github.com/vuejs/vuepress)
33 | - [Foair/course-crawler](https://github.com/Foair/course-crawler)
34 | - [https://mooc.xoy.io/](https://mooc.xoy.io/)
35 |
36 | 以及你们的支持,有你们, Course Crawler 才能更加完善~
37 |
--------------------------------------------------------------------------------
/docs/images/get_cookies.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SigureMo/course-crawler/5828d61ff69ddc344c573ec06e198f137aa9164b/docs/images/get_cookies.png
--------------------------------------------------------------------------------
/docs/images/icourse163_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SigureMo/course-crawler/5828d61ff69ddc344c573ec06e198f137aa9164b/docs/images/icourse163_01.png
--------------------------------------------------------------------------------
/mooc.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """MOOC 课程下载"""
3 |
4 | import os
5 | import sys
6 | import re
7 | import argparse
8 |
9 | from moocs.utils import aria2_download, store_cookies
10 |
11 |
def main():
    """Parse command-line arguments and dispatch the download to the matching MOOC module."""

    parser = argparse.ArgumentParser(description='Course Crawler')
    parser.add_argument('url', help='课程地址')
    parser.add_argument('-c', '--restore-cookies', action='store_true',
                        help='执行任务的时候重新输入 cookies')
    parser.add_argument('-d', '--dir', default=r'', help='下载目录')
    # choices + type=str.lower make argparse reject an invalid quality with a
    # clear error message instead of letting resolutions.index() raise an
    # opaque ValueError further below.
    parser.add_argument('-r', '--quality', default='shd', type=str.lower,
                        choices=['shd', 'hd', 'sd'], help='视频清晰度')
    parser.add_argument('-w', '--overwrite',
                        action='store_true', help='强制覆盖重新下载')
    parser.add_argument('--inter', action='store_true', help='交互式修改文件名')
    parser.add_argument('--no-doc', action='store_false',
                        help='不下载 PDF、Word 等文档')
    parser.add_argument('--no-sub', action='store_false', help='不下载字幕')
    parser.add_argument('--no-file', action='store_false', help='不下载附件')
    parser.add_argument('--no-text', action='store_false', help='不下载富文本')
    parser.add_argument("--playlist-type", default="dpl",
                        choices=["dpl", "m3u", "no"], help="播放列表类型,支持 dpl 和 m3u,输入 no 不生成播放列表")
    parser.add_argument("--abs-path", action='store_true',
                        help="播放列表路径使用绝对路径,默认为相对路径")
    parser.add_argument('--aria2', action='store_true', help='自动调用aria2下载视频')

    args = parser.parse_args()
    resolutions = ['shd', 'hd', 'sd']
    playlist_path_type = 'AP' if args.abs_path else 'RP'

    # NOTE: the --no-* flags use store_false, so e.g. args.no_doc is True
    # unless --no-doc was given; the config keys therefore mean "download it".
    config = {'doc': args.no_doc, 'sub': args.no_sub, 'file': args.no_file, 'text': args.no_text,
              'rename': args.inter, 'dir': args.dir, 'resolution': resolutions.index(args.quality),
              'overwrite': args.overwrite, 'playlist_type': args.playlist_type, 'playlist_path_type': playlist_path_type,
              'aria2': args.aria2}

    # Dispatch on the course URL. Literal dots in domains are escaped so '.'
    # is not treated as a regex wildcard.
    if re.match(r'https?://www\.icourse163\.org/(spoc/)?(course|learn)/', args.url):
        from moocs import icourse163 as mooc
    elif re.match(r'https?://www\.xuetangx\.com/courses/.+/about', args.url):
        from moocs import xuetangx as mooc
    elif re.match(r'https?://next\.xuetangx\.com/course/.+', args.url):
        from moocs import xuetangx_next as mooc
    elif re.match(r'https?://mooc\.study\.163\.com/(course|learn)/', args.url):
        from moocs import study_mooc as mooc
    elif re.match(r'https?://study\.163\.com/course/', args.url):
        from moocs import study_163 as mooc
    elif re.match(r'https?://open\.163\.com/(special|movie)/', args.url):
        from moocs import open_163 as mooc
    elif re.match(r'https?://www\.cnmooc\.org/portal/course/', args.url):
        from moocs import cnmooc as mooc
    elif re.match(r'https?://www\.icourses\.cn/web/sword/portal/videoDetail', args.url):
        from moocs import icourses as mooc
    elif re.match(r'https?://www\.icourses\.cn/sCourse/course_\d+\.html', args.url) or \
            re.match(r'https?://www\.icourses\.cn/web/sword/portal/shareDetails\?cId=', args.url):
        from moocs import icourses_share as mooc
    elif re.match(r'https?://www\.livedu\.com\.cn/ispace4\.0/moocxjkc/toKcView\.do\?kcid=', args.url):
        from moocs import livedu as mooc
    else:
        print('课程地址有误!')
        sys.exit(1)

    # Sites that require authentication load (or re-enter) cookies first.
    if mooc.need_cookies:
        cookies = store_cookies(mooc.name, restore=args.restore_cookies)
    else:
        cookies = None

    mooc.start(args.url, config, cookies)

    # Hand the (url, file name) pairs collected during parsing over to aria2.
    if config['aria2']:
        workdir = mooc.exports["workdir"]
        workdir.change('Videos')
        videos = mooc.exports["videos"]
        aria2_download(videos, workdir.path, overwrite=config["overwrite"])
82 |
83 |
84 | if __name__ == '__main__':
85 | main()
86 |
--------------------------------------------------------------------------------
/moocs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SigureMo/course-crawler/5828d61ff69ddc344c573ec06e198f137aa9164b/moocs/__init__.py
--------------------------------------------------------------------------------
/moocs/cnmooc.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """好大学在线"""
3 |
4 | from bs4 import BeautifulSoup
5 |
6 | from moocs.utils import *
7 | from utils.crawler import Crawler
8 |
9 | name = "cnmooc"
10 | need_cookies = True
11 | CANDY = Crawler()
12 | CONFIG = {}
13 | FILES = {}
14 | VIDEOS = []
15 | exports = {}
16 | __all__ = ["name", "need_cookies", "start", "exports"]
17 |
18 |
def get_summary(url):
    """Fetch the course home page and build the course directory name."""

    page = CANDY.get(url).text
    soup = BeautifulSoup(page, 'lxml')

    course_title = soup.find(class_='view-title substr').get_text(strip=True)
    school = soup.find(class_='person-attach substr').get_text(strip=True)

    dir_name = course_dir(course_title, school)
    print(dir_name)
    return dir_name
30 |
31 |
def get_resource(course_nav):
    """Walk the unit-navigation page; return (video_list, document_list)."""

    counter = Counter()
    outline = Outline()
    video_list = []
    document_list = []

    res = CANDY.get(course_nav).text
    soup = BeautifulSoup(res, 'lxml')
    nav = soup.find(id='unitNavigation')
    chapters = nav.find_all(class_='view-chapter')
    for chapter in chapters:
        chapter_name = chapter.find(
            class_='chapter-text substr').get_text(strip=True)
        counter.add(0)
        outline.write(chapter_name, counter, 0)

        lectures = chapter.find_all(class_='view-lecture')
        for lecture in lectures:
            actions = lecture.find(class_='lecture-title')
            lecture_name = actions.get_text(strip=True)
            counter.add(1)
            outline.write(lecture_name, counter, 1)
            # unitid = actions.a['unitid']
            # print(unitid)
            group = actions.div.find_all('a')
            # The icon class distinguishes the resource type:
            # 'icon-play' -> video, 'icon-doc' -> downloadable document.
            videos = list(
                filter(lambda action: 'icon-play' in action.i['class'][0], group))
            docs = list(
                filter(lambda action: 'icon-doc' in action.i['class'][0], group))
            for video in videos:
                counter.add(2)
                outline.write(video['title'], counter, 2, sign='#')
                # A lecture with several videos gets a numeric suffix so the
                # generated file names stay unique.
                if len(videos) == 1:
                    extra_num = ''
                else:
                    extra_num = '-%s' % str(counter)[-1:]
                video_list.append(
                    Video(counter, lecture_name + extra_num, video['itemid']))
            counter.reset()
            for doc in docs:
                counter.add(2)
                outline.write(doc['title'], counter, 2, sign='*')
                document_list.append(
                    Document(counter, lecture_name, doc['itemid']))
    return video_list, document_list
82 |
83 |
def parse_resource(video):
    """Resolve a video's real URL (and subtitles) and queue it for download."""

    # The playback page yields the node id...
    res = CANDY.post('https://www.cnmooc.org/study/play.mooc',
                     data={'itemId': video.meta, 'itemType': '10', 'testPaperId': ''}).text
    soup = BeautifulSoup(res, 'lxml')
    node_id = soup.find(id='nodeId')['value']

    # ...which the detail endpoint turns into the actual media metadata.
    res = CANDY.post('https://www.cnmooc.org/item/detail.mooc',
                     data={'nodeId': node_id, 'itemId': video.meta}).json()
    if WORK_DIR.need_download(video.file_name+".mp4", CONFIG["overwrite"]):
        url = res['node']['flvUrl']
        FILES['videos'].write_string(url)
        FILES['renamer'].write(url.split('/')[-1], video.file_name)
        VIDEOS.append((url, video.file_name+".mp4"))

    if CONFIG['sub']:
        # nodeExts carries one subtitle track per language; the language code
        # is appended to the name only when there is more than one track.
        exts = res['node']['nodeExts']
        for ext in exts:
            file_name = '%s%s.srt' % (video.file_name, '' if len(
                exts) == 1 else '_' + ext['languageCode'])
            if WORK_DIR.need_download(file_name, CONFIG["overwrite"]):
                CANDY.download_bin('https://static.cnmooc.org' +
                                   ext['node']['rsUrl'], WORK_DIR.file(file_name))
108 |
109 |
def get_doc(doc_list):
    """Download lecture documents (slides etc.) into the Docs directory."""

    WORK_DIR.change('Docs')
    for doc in doc_list:
        post_data = {'itemId': doc.meta, 'itemType': '20', 'testPaperId': ''}
        res = CANDY.post(
            'https://www.cnmooc.org/study/play.mooc', data=post_data).text
        # The document URL is embedded in an isSlideShow("...") JS call.
        # BUG FIX: with the optional group (.+)? an empty isSlideShow("")
        # matched with group(1) == None, and the subsequent url.split()
        # raised an uncaught TypeError; guard both "no match" and "empty".
        match = re.search(r'isSlideShow\("(.+)?"\);', res)
        if match is None or match.group(1) is None:
            continue
        url = match.group(1)
        ext = url.split('.')[-1]
        file_name = doc.file_name
        if WORK_DIR.need_download(file_name + '.' + ext, CONFIG["overwrite"]):
            CANDY.download_bin('https://static.cnmooc.org' + url,
                               WORK_DIR.file(file_name + '.' + ext))
126 |
127 |
def start(url, config, cookies=None):
    """Entry point called by mooc.py.

    url     -- course home page URL
    config  -- global download options (dir, overwrite, doc, rename, ...)
    cookies -- cookie dict for the authenticated session
    """

    global WORK_DIR
    CONFIG.update(config)

    CANDY.set_cookies(cookies)

    course_info = get_summary(url)
    WORK_DIR = WorkingDir(CONFIG['dir'], course_info)
    WORK_DIR.change('Videos')

    # Rename script and the plain list of video URLs for external downloaders.
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['videos'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # The unit-navigation page is keyed by the last path segment of the URL.
    course = 'https://www.cnmooc.org/portal/session/unitNavigation/'
    course_nav = course + url.split('/')[-1]
    resource = get_resource(course_nav)

    rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False

    playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
    if playlist:
        parse_res_list(resource[0], rename, playlist.write, parse_resource)
    else:
        parse_res_list(resource[0], rename, parse_resource)

    if CONFIG['doc']:
        get_doc(resource[1])

    # Shared with mooc.py for the optional aria2 download stage.
    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS
    })
163 |
--------------------------------------------------------------------------------
/moocs/icourse163.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """中国大学MOOC"""
3 |
4 | import json
5 | import time
6 | import sys
7 |
8 | from moocs.utils import *
9 | from utils.crawler import Crawler
10 |
11 | name = "icourse163"
12 | need_cookies = True
13 | CANDY = Crawler()
14 | CONFIG = {}
15 | FILES = {}
16 | VIDEOS = []
17 | exports = {}
18 | __all__ = ["name", "need_cookies", "start", "exports"]
19 |
20 |
def get_summary(url):
    """Fetch the course home page; return (term_id, directory name)."""

    home_url = url.replace('learn/', 'course/')
    page = CANDY.get(home_url).text

    term_id = re.search(r'termId : "(\d+)"', page).group(1)
    names = re.findall(r'name:"(.+)"', page)

    dir_name = course_dir(*names[:2])
    print(dir_name)

    CONFIG['term_id'] = term_id
    return term_id, dir_name
35 |
36 |
def parse_resource(resource):
    """Resolve one resource (Video / Document / Rich text) and download it.

    The DWR endpoint returns a JS blob from which URLs are regex-extracted.
    """

    post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190',
                 'httpSessionId': '5531d06316b34b9486a6891710115ebc', 'c0-scriptName': 'CourseBean',
                 'c0-methodName': 'getLessonUnitLearnVo', 'c0-id': '0', 'c0-param0': 'number:' + resource.meta[0],
                 'c0-param1': 'number:' + resource.meta[1], 'c0-param2': 'number:0',
                 'c0-param3': 'number:' + resource.meta[2], 'batchId': str(int(time.time()) * 1000)}
    res = CANDY.post('https://www.icourse163.org/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr',
                     data=post_data).text

    file_name = resource.file_name
    if resource.type == 'Video':
        if CONFIG["hasToken"]:
            # Token flow: exchange the CSRF token for a video signature, then
            # query the VOD API for the quality/format variants.
            video_token = CANDY.post('https://www.icourse163.org/web/j/resourceRpcBean.getResourceToken.rpc?csrfKey='+CONFIG['token'], data={
                'bizId': resource.meta[2],
                'bizType': 1,
                'contentType': 1,
            }).json()['result']['videoSignDto']['signature']
            data = CANDY.post('https://vod.study.163.com/eds/api/v1/vod/video', data={
                'videoId': resource.meta[0],
                'signature': video_token,
                'clientType': '1'
            }).json()

            # Quality codes: 3 = super HD, 2 = HD, 1 = SD; start from the
            # requested quality and fall back to lower ones.
            resolutions = [3, 2, 1]
            url, ext = '', ''
            for sp in resolutions[CONFIG['resolution']:]:
                # TODO: 增加视频格式选择
                for video in data['result']['videos']:
                    if video['quality'] == sp and video['format'] == 'mp4':
                        url = video['videoUrl']
                        ext = '.mp4'
                        break
                else:
                    continue
                break
            assert ext, "近期中国大学 MOOC 接口变动,请临时使用 https://github.com/SigureMo/mooc-dl"

            if WORK_DIR.need_download(file_name + ext, CONFIG["overwrite"]):
                FILES['renamer'].write(
                    re.search(r'(\w+\.mp4)', url).group(1), file_name, ext)
                FILES['video'].write_string(url)
                VIDEOS.append((url, file_name+ext))
                resource.ext = ext
        else:
            # Legacy flow: URLs are embedded directly in the DWR response.
            resolutions = ['Shd', 'Hd', 'Sd']
            url, ext = '', ''
            for sp in resolutions[CONFIG['resolution']:]:
                # TODO: 增加视频格式选择
                # video_info = re.search(r'%sUrl="(?P<url>.*?(?P<ext>\.((m3u8)|(mp4)|(flv))).*?)"' % sp, res)
                # BUG FIX: restored the named groups <ext>/<url>; without them
                # the pattern is an invalid regex and .group('url', 'ext')
                # could never work.
                video_info = re.search(r'(?P<ext>mp4)%sUrl="(?P<url>.*?\.(?P=ext).*?)"' % sp, res)
                if video_info:
                    url, ext = video_info.group('url', 'ext')
                    ext = '.' + ext
                    break
            assert ext, "近期中国大学 MOOC 接口变动,请临时使用 https://github.com/SigureMo/mooc-dl"

            url = url.replace('v.stu.126.net', 'jdvodrvfb210d.vod.126.net')
            # Probe the CDN host; fall back to an alternate host on failure.
            if CANDY.head(url, allow_redirects=True, timeout=20).status_code != 200:
                url = url.replace('mooc-video', 'jdvodrvfb210d')
            if WORK_DIR.need_download(file_name + ext, CONFIG["overwrite"]):
                FILES['renamer'].write(re.search(r'(\w+\.((m3u8)|(mp4)|(flv)))', url).group(1), file_name, ext)
                FILES['video'].write_string(url)
                VIDEOS.append((url, file_name+ext))
                resource.ext = ext

        if not CONFIG['sub']:
            return
        # Subtitles: the language code is appended to the file name only when
        # more than one track exists.
        subtitles = re.findall(r'name="(.+)";.*url="(.*?)"', res)
        for subtitle in subtitles:
            if len(subtitles) == 1:
                sub_name = file_name + '.srt'
            else:
                subtitle_lang = subtitle[0].encode(
                    'utf_8').decode('unicode_escape')
                sub_name = file_name + '_' + subtitle_lang + '.srt'
            if not WORK_DIR.need_download(sub_name, CONFIG["overwrite"]):
                continue
            CANDY.download_bin(subtitle[1], WORK_DIR.file(sub_name))

    elif resource.type == 'Document':
        if not WORK_DIR.need_download(file_name + '.pdf', CONFIG["overwrite"]):
            return
        pdf_url = re.search(r'textOrigUrl:"(.*?)"', res).group(1)
        CANDY.download_bin(pdf_url, WORK_DIR.file(file_name + '.pdf'))

    elif resource.type == 'Rich':
        if not WORK_DIR.need_download(file_name + '.html', CONFIG["overwrite"]):
            return
        text = re.search(r'htmlContent:"(.*)",id',
                         res.encode('utf_8').decode('unicode_escape'), re.S).group(1)
        with open(WORK_DIR.file(file_name + '.html'), 'w', encoding='utf_8') as file:
            file.write(text)
131 |
132 |
def get_resource(term_id):
    """Parse the term's DWR dump and download/queue every resource.

    Chapters, lessons and per-lesson resources (videos, PDFs, rich texts and
    their attachments) are regex-extracted from the unicode-unescaped
    DWR response.
    """

    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []
    rich_text_list = []

    post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190', 'c0-scriptName': 'CourseBean',
                 'c0-methodName': 'getMocTermDto', 'c0-id': '0', 'c0-param0': 'number:' + term_id,
                 'c0-param1': 'number:0', 'c0-param2': 'boolean:true', 'batchId': str(int(time.time()) * 1000)}
    res = CANDY.post('https://www.icourse163.org/dwr/call/plaincall/CourseBean.getMocTermDto.dwr',
                     data=post_data).text.encode('utf_8').decode('unicode_escape')

    chapters = re.findall(r'homeworks=\w+;.+id=(\d+).+name="([\s\S]+?)";', res)
    for chapter in chapters:
        counter.add(0)
        outline.write(chapter[1], counter, 0)

        lessons = re.findall(
            r'chapterId=' + chapter[0] + r'.+contentId=null.+contentType=1.+id=(\d+).+name="([\s\S]+?)"', res)
        for lesson in lessons:
            counter.add(1)
            outline.write(lesson[1], counter, 1)

            # contentType=1: videos
            videos = re.findall(r'contentId=(\d+).+contentType=(1).+id=(\d+).+lessonId=' +
                                lesson[0] + r'.+name="([\s\S]+?)"', res)
            for video in videos:
                counter.add(2)
                outline.write(video[3], counter, 2, sign='#')
                video_list.append(Video(counter, video[3], video))
            counter.reset()

            # contentType=3: PDF documents
            pdfs = re.findall(r'contentId=(\d+).+contentType=(3).+id=(\d+).+lessonId=' +
                              lesson[0] + r'.+name="([\s\S]+?)"', res)
            for pdf in pdfs:
                counter.add(2)
                outline.write(pdf[3], counter, 2, sign='*')
                if CONFIG['doc']:
                    pdf_list.append(Document(counter, pdf[3], pdf))
            counter.reset()

            # contentType=4: rich texts (jsonContent may carry an attachment)
            # BUG FIX: the name pattern was "([\s\S]]+?)" -- the stray ']'
            # required every rich-text name to contain ']' and broke matching;
            # all sibling patterns use "([\s\S]+?)".
            rich_text = re.findall(r'contentId=(\d+).+contentType=(4).+id=(\d+).+jsonContent=(.+?);.+lessonId=' +
                                   lesson[0] + r'.+name="([\s\S]+?)"', res)
            for text in rich_text:
                counter.add(2)
                outline.write(text[4], counter, 2, sign='+')
                if CONFIG['text']:
                    rich_text_list.append(RichText(counter, text[4], text))
                if CONFIG['file']:
                    if text[3] != 'null' and text[3] != '""':
                        params = {'nosKey': re.search('nosKey":"(.+?)"', text[3]).group(1),
                                  'fileName': re.search('"fileName":"(.+?)"', text[3]).group(1)}
                        file_name = Resource.file_to_save(params['fileName'])
                        outline.write(file_name, counter, 2, sign='!')

                        WORK_DIR.change('Files')
                        file_name = '%s %s' % (counter, file_name)
                        if WORK_DIR.need_download(file_name, CONFIG["overwrite"]):
                            CANDY.download_bin('https://www.icourse163.org/course/attachment.htm',
                                               WORK_DIR.file(file_name), params=params)
            counter.reset()

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist is not None:
            parse_res_list(video_list, rename, parse_resource, playlist.write)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
    if rich_text_list:
        WORK_DIR.change('Texts')
        parse_res_list(rich_text_list, None, parse_resource)
212 |
213 |
def start(url, config, cookies=None):
    """Entry point called by mooc.py.

    cookies defaults to None for signature consistency with the other
    moocs modules (need_cookies is True, so mooc.py always supplies a dict).
    """

    global WORK_DIR
    CANDY.set_cookies(cookies)
    CONFIG.update(config)

    # The NTESSTUDYSI cookie doubles as the CSRF key for the token-based
    # VOD API; without it parse_resource() uses the legacy DWR parsing.
    if cookies and cookies.get('NTESSTUDYSI'):
        CONFIG['hasToken'] = True
        CONFIG['token'] = cookies.get('NTESSTUDYSI')
    else:
        CONFIG['hasToken'] = False

    term_id, dir_name = get_summary(url)
    WORK_DIR = WorkingDir(CONFIG['dir'], dir_name)
    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    get_resource(term_id)

    # Shared with mooc.py for the optional aria2 download stage.
    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS
    })
240 |
--------------------------------------------------------------------------------
/moocs/icourses.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """爱课程"""
3 |
4 | from moocs.utils import *
5 | from bs4 import BeautifulSoup
6 | import re
7 | import json
8 | from utils.crawler import Crawler
9 |
10 | name = "icourses"
11 | need_cookies = False
12 | CANDY = Crawler()
13 | CONFIG = {}
14 | FILES = {}
15 | VIDEOS = []
16 | exports = {}
17 | __all__ = ["name", "need_cookies", "start", "exports"]
18 |
19 |
def get_content(url):
    """Scrape the course page; return (directory name, resource JSON list)."""

    page = CANDY.get(url).text
    soup = BeautifulSoup(page, 'lxml')

    # The second-to-last <script> embeds the resource array as JS source.
    script = soup.find_all('script')[-2].string
    source_json = re.search(r'_sourceArrStr = (.*);', script)
    res_info = json.loads(source_json.group(1))

    school = soup.find(class_='teacher-infor-from').string
    title = soup.find(class_='coursetitle pull-left').a.string
    dir_name = course_dir(title, school)

    print(dir_name)
    return dir_name, res_info
33 |
34 |
def parse_res(js):
    """Build Video objects (and outline entries) from the parsed JSON list."""
    outline = Outline()
    # Zero-pad lesson numbers so file names sort correctly.
    width = len(str(len(js)))
    video_list = []
    for index, lesson in enumerate(js, start=1):
        number = str(index).zfill(width)
        title = lesson['title']
        url = lesson['fullLinkUrl']
        outline.write_string('%s {%s}#' % (title, number))
        video_list.append(Video(number, title, url))

    return video_list
51 |
52 |
def parse_video(video):
    """Record a video's URL into the download/rename lists (skip if present)."""

    target = video.file_name + ".mp4"
    if not WORK_DIR.need_download(target, CONFIG["overwrite"]):
        return
    FILES['videos'].write_string(video.meta)
    FILES['renamer'].write(video.meta.split('/')[-1], video.file_name)
    VIDEOS.append((video.meta, target))
60 |
61 |
def start(url, config, cookies=None):
    """Entry point called by mooc.py (cookies unused: need_cookies is False)."""

    global WORK_DIR
    CONFIG.update(config)

    # course_info: (directory name, resource JSON list)
    course_info = get_content(url)
    WORK_DIR = WorkingDir(CONFIG['dir'], course_info[0])

    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['videos'] = ClassicFile(WORK_DIR.file('Videos.txt'))
    playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
    if playlist:
        FILES['playlist'] = playlist

    video_list = parse_res(course_info[1])

    rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False

    if playlist:
        parse_res_list(video_list, rename,
                       FILES['playlist'].write, parse_video)
    else:
        parse_res_list(video_list, rename, parse_video)

    # Shared with mooc.py for the optional aria2 download stage.
    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS
    })
93 |
--------------------------------------------------------------------------------
/moocs/icourses_share.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """爱课程 资源共享课"""
3 | import re
4 |
5 | from bs4 import BeautifulSoup
6 |
7 | from moocs.utils import *
8 | from utils.crawler import Crawler
9 |
10 | name = "icourses_share"
11 | need_cookies = False
12 | CANDY = Crawler()
13 | CONFIG = {}
14 | FILES = {}
15 | VIDEOS = []
16 | exports = {}
17 | __all__ = ["name", "need_cookies", "start", "exports"]
18 |
19 |
def get_summary(url):
    """Extract the course id and build the course directory name.

    Accepts both the shareDetails?cId=... URL form and the
    sCourse/course_<id>.html form; the former is normalised to the latter
    before the page is fetched. Returns (course_id, dir_name).
    """
    # Match once and reuse instead of running the same regex twice.
    share_match = re.match(
        r'https?://www\.icourses\.cn/web/sword/portal/shareDetails\?cId=(\d+)', url)
    if share_match:
        course_id = share_match.group(1)
        url = 'http://www.icourses.cn/sCourse/course_{}.html'.format(course_id)
    else:
        course_id = re.match(
            r'https?://www\.icourses\.cn/sCourse/course_(\d+)\.html', url).group(1)
    res = CANDY.get(url)
    res.encoding = 'utf8'
    soup = BeautifulSoup(res.text, 'lxml')
    name = soup.find('div', class_='course-introduction-infor').find(
        'div', class_='course-title').p.string

    dir_name = course_dir(name, '爱课程资源共享课')

    print(dir_name)

    return course_id, dir_name
40 |
41 |
def parse_resource(resource):
    """Resolve a resource's URL and register/download it by type."""

    file_name = resource.file_name
    if resource.type == 'Video':
        # fullResUrl is the SD stream; fullResUrl2 (when present) is HD.
        video_urls = {}
        video_urls['sd'] = resource.meta['fullResUrl']
        if resource.meta.get('fullResUrl2'):
            video_urls['hd'] = resource.meta['fullResUrl2']

        # Walk from the requested quality downwards; 'sd' always exists,
        # so the loop is guaranteed to bind url.
        resolutions = ['shd', 'hd', 'sd']
        for sp in resolutions[CONFIG['resolution']:]:
            if video_urls.get(sp):
                url = video_urls[sp]
                break

        if WORK_DIR.need_download(file_name+".mp4", CONFIG["overwrite"]):
            FILES['renamer'].write(
                re.search(r'(\w+\.mp4)', url).group(1), file_name)
            FILES['video'].write_string(url)
            VIDEOS.append((url, file_name+".mp4"))
            #resource.ext = ext

        if not CONFIG['sub']:
            return
        # No subtitles have been observed on this platform so far.

    elif resource.type == 'Document':
        pdf_url = resource.meta['fullResUrl']
        if WORK_DIR.need_download(file_name+".pdf", CONFIG["overwrite"]):
            CANDY.download_bin(pdf_url, WORK_DIR.file(file_name + '.pdf'))
73 |
74 |
def get_resource(course_id):
    """Fetch the chapter/lesson structure and collect video & document resources.

    Also best-effort downloads the per-chapter introduction material
    (key points, teaching design, assessment, textbook contents).
    """

    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []

    res = CANDY.get(
        'http://www.icourses.cn/web/sword/portal/shareChapter?cid={}'.format(course_id))
    soup = BeautifulSoup(res.text, 'lxml')
    chapters = soup.find('ul', id='chapters').children
    for chapter in chapters:
        if chapter.name is None:  # skip bare text nodes between elements
            continue
        counter.add(0)
        chapter_id = chapter.attrs['data-id']
        chapter_name = chapter.find(
            'a', class_='chapter-title-text').string.replace('\n\t\t\t\t\t\t\t', ' ')
        outline.write(chapter_name, counter, 0)

        # Chapter introduction -- best effort: any of these links may be
        # absent, in which case the whole section is skipped.
        try:
            important = chapter.find(
                'a', attrs={'title': '重点难点'}).attrs['data-url']
            instructional_design = chapter.find(
                'a', attrs={'title': '教学设计'}).attrs['data-url']
            exam_id = chapter.find(
                'a', attrs={'title': '评价考核'}).attrs['data-id']
            exam_contents = CANDY.post(
                'http://www.icourses.cn/web//sword/common/getTextBody', data={'id': exam_id}).text
            textbook_id = chapter.find(
                'a', attrs={'title': '教材内容'}).attrs['data-id']
            textbook_contents = CANDY.post(
                'http://www.icourses.cn/web//sword/common/getTextBody', data={'id': textbook_id}).text
            WORK_DIR.change('Introduction')
            outline.write('重点难点', counter, 2, sign='*')
            # BUG FIX: '%' must format the file-name string itself, not the
            # path returned by WORK_DIR.file() -- the old code only worked by
            # accident (the '%s' survived into the path) and would break if
            # the directory contained '%' characters.
            CANDY.download_bin(important, WORK_DIR.file(
                '%s 重点难点.html' % counter))
            outline.write('教学设计', counter, 2, sign='*')
            CANDY.download_bin(instructional_design,
                               WORK_DIR.file('%s 教学设计.html' % counter))
            outline.write('评价考核', counter, 2, sign='+')
            with open(WORK_DIR.file('%s 评价考核.html' % counter), 'w', encoding='utf_8') as file:
                file.write(exam_contents)
            outline.write('教材内容', counter, 2, sign='+')
            with open(WORK_DIR.file('%s 教材内容.html' % counter), 'w', encoding='utf_8') as file:
                file.write(textbook_contents)
        except Exception:  # deliberately best-effort; lets Ctrl-C through
            pass

        lessons = chapter.find('ul', class_='chapter-body-l').contents
        for lesson in lessons:
            if len(lessons) == 1:
                # Single-lesson chapter: the chapter itself acts as the lesson.
                counter.add(1)
                lesson_id = chapter_id
                lesson_name = chapter_name
            else:
                if lesson.name is None:
                    continue
                counter.add(1)
                lesson_info = lesson.find(
                    'a', class_='chapter-body-content-text')
                lesson_id = lesson_info.attrs['data-secid']
                lesson_name = lesson_info.text.replace('\n', '')
            rej = CANDY.post(
                'http://www.icourses.cn/web//sword/portal/getRess', data={'sectionId': lesson_id}).json()

            outline.write(lesson_name, counter, 1)

            for resource in rej['model']['listRes']:
                if resource['mediaType'] == 'mp4':
                    counter.add(2)
                    outline.write(resource['title'], counter, 2, sign='#')
                    video_list.append(
                        Video(counter, resource['title'], resource))
            counter.reset()

            for resource in rej['model']['listRes']:
                if resource['mediaType'] in ['pdf', 'ppt']:
                    counter.add(2)
                    outline.write(resource['title'], counter, 2, sign='*')
                    if CONFIG['doc']:
                        pdf_list.append(
                            Document(counter, resource['title'], resource))
            counter.reset()

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist:
            parse_res_list(video_list, rename, playlist.write, parse_resource)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
174 |
175 |
def start(url, config, cookies=None):
    """Entry point called by mooc.py (cookies unused: need_cookies is False)."""

    # initialise global settings
    global WORK_DIR
    CONFIG.update(config)

    # course info: (course_id, directory name)
    course_id, dir_name = get_summary(url)

    # create the course directory tree
    WORK_DIR = WorkingDir(CONFIG['dir'], dir_name)
    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # collect all resources
    get_resource(course_id)

    # shared with mooc.py for the optional aria2 download stage
    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS,
    })
201 |
--------------------------------------------------------------------------------
/moocs/livedu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """北京高校优质课程研究会"""
3 |
4 | import time
5 |
6 | from bs4 import BeautifulSoup
7 |
8 | from moocs.utils import *
9 | from utils.crawler import Crawler
10 |
11 | name = "livedu"
12 | need_cookies = True
13 | CANDY = Crawler()
14 | CONFIG = {}
15 | FILES = {}
16 | VIDEOS = []
17 | exports = {}
18 | __all__ = ["name", "need_cookies", "start", "exports"]
19 |
20 |
def get_summary(url):
    """Fetch the study page and the course home page; return (course_id, dir name).

    Side effects: caches course_id, the parsed study-page soup and the
    chapter name list in CONFIG for later use by get_resource().
    """

    # BUG FIX: restored the named group that .group('course_id') relies on;
    # without it the pattern is an invalid regex.
    course_id = re.search(r'kcid=(?P<course_id>\d+)', url).group('course_id')
    data = {
        'kcid': course_id,
        'kcdm': course_id,
    }
    res = CANDY.post(CONFIG['study_page'], data=data)
    study_soup = BeautifulSoup(res.text, 'html.parser')
    name = study_soup.find(
        'dl', class_='content-a-title').find('dt').find('span').string

    home_text = CANDY.get(url).text
    home_soup = BeautifulSoup(home_text, 'html.parser')
    chapter_names = []
    # Two page layouts exist. Chapters are collected with insert(0, ...) so
    # the list ends up reversed, ready to be pop()ed in document order.
    if home_soup.find('div', class_='vice-main-kcap'):
        for chapter_lable in home_soup.find('div', class_='vice-main-kcap')\
                .find('ul')\
                .children:
            try:
                chapter_names.insert(
                    0, chapter_lable.find('div').find('span').string)
            except Exception:
                # text nodes between list items have no .find(); skip them
                pass
    else:
        for chapter_lable in home_soup.find('div', id='accordion')\
                .find_all('h3'):
            chapter_names.insert(0, chapter_lable.text)

    dir_name = course_dir(name, '北京高校优质课程研究会')

    print(dir_name)

    CONFIG['course_id'] = course_id
    CONFIG['study_soup'] = study_soup
    CONFIG['chapter_names'] = chapter_names
    return course_id, dir_name
59 |
60 |
def parse_resource(resource):
    """Dispatch one resource by type: queue video, download PDF, save HTML."""

    file_name = resource.file_name
    if resource.type == 'Video':
        # resource.meta holds the direct mp4 URL
        ext = '.mp4'
        if WORK_DIR.need_download(file_name+ext, CONFIG["overwrite"]):
            resource.ext = ext
            FILES['renamer'].write(
                re.search(r'(\w+\.mp4)', resource.meta).group(1), file_name, ext)
            FILES['video'].write_string(resource.meta)
            VIDEOS.append((resource.meta, file_name+ext))

    elif resource.type == 'Document':
        # resource.meta holds the document URL
        if not WORK_DIR.need_download(file_name+".pdf", CONFIG["overwrite"]):
            return
        CANDY.download_bin(resource.meta, WORK_DIR.file(file_name + '.pdf'))

    elif resource.type == 'Rich':
        # resource.meta holds ready-made HTML content
        if not WORK_DIR.need_download(file_name+".html", CONFIG["overwrite"]):
            return
        with open(WORK_DIR.file(file_name + '.html'), 'w', encoding='utf_8') as file:
            file.write(resource.meta)
84 |
85 |
def get_resource(course_id):
    """Walk chapters and lessons of the course and collect all resources.

    Uses the study-page soup cached in CONFIG by get_summary, writes the
    outline, then dispatches the video/pdf/test lists to parse_resource.
    """

    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []
    test_list = []

    study_soup = CONFIG['study_soup']
    chapter_names = CONFIG['chapter_names']
    study_div = study_soup.find('div', class_='ation-a-main')
    left_div = study_div.find('div', class_='xx-main-left')
    info_div = left_div.find('div', class_='xx-left-main')
    chapters = info_div.find_all('dl')
    for chapter in chapters:
        counter.add(0)
        # chapter_names was collected in reverse order, so pop() yields
        # chapters front-to-back.
        chapter_name = chapter_names.pop()
        outline.write(chapter_name, counter, 0)

        lessons = chapter.find_all('dd')
        for lesson in lessons:
            counter.add(1)
            lesson_info = lesson.find('a')
            # FIX: the named group had lost its name ("(?P.+)"), which makes
            # the pattern invalid at compile time.
            lesson_id = re.search(r"xsxx\('(?P<lesson_id>.+)'\)",
                                  lesson_info.attrs.get('onclick')).group('lesson_id')

            data = {
                'kcdm': course_id,
                'zjdm': lesson_id,
            }
            res = CANDY.post(CONFIG['study_page'], data=data)
            soup = BeautifulSoup(res.text, 'html.parser')
            study_div = soup.find('div', class_='ation-a-main')
            right_div = study_div.find('div', class_='xx-main-right')
            study_box = right_div.find('div', class_='xx-main-box')
            lesson_name = study_box.find('h4').contents[1]
            outline.write(lesson_name, counter, 1)
            resource_div = study_box.find('div', class_='study-L-text')

            # GET video url
            video_div = resource_div.find('div', id='videoBj_1')
            if video_div:
                video_url = video_div.find('input', id='sp').attrs.get('value')
                video_name = 'Video:{}'.format(lesson_name)
                outline.write(video_name, counter, 2, sign='#')
                video_list.append(Video(counter, video_name, video_url))

            # GET pdf url
            pdf_iframe = resource_div.find(
                'iframe', attrs={'name': 'pdfContainer'})
            if pdf_iframe:
                pdf_div = pdf_iframe.parent
                pdf_name = pdf_div.find('span').string.replace('.pdf', '')
                # FIX: same missing group name as above.
                pdf_url = re.search(
                    r'cclj=(?P<pdf_url>http.+\.pdf)', pdf_iframe.attrs.get('src')).group('pdf_url')
                outline.write(pdf_name, counter, 2, sign='*')
                if CONFIG['doc']:
                    pdf_list.append(Document(counter, pdf_name, pdf_url))

            # GET test text
            test_div = study_box.find('div', class_='zy-a-list')
            if test_div:
                test_name = 'Test:{}'.format(lesson_name)
                outline.write(test_name, counter, 2, sign='+')
                if CONFIG['text']:
                    test_list.append(
                        RichText(counter, test_name, str(test_div)))

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist:
            parse_res_list(video_list, rename, playlist.write, parse_resource)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
    if test_list:
        WORK_DIR.change('Texts')
        parse_res_list(test_list, None, parse_resource)
171 |
172 |
def start(url, config, cookies=None):
    """Entry point called by the dispatcher."""

    global WORK_DIR

    # Bootstrap the crawler and configuration.
    CANDY.set_cookies(cookies)
    CONFIG.update(config)
    CONFIG['study_page'] = 'http://www.livedu.com.cn/ispace4.0/moocxsxx/queryAllZjByKcdm.do'

    # Course metadata.
    course_id, dir_name = get_summary(url)

    # Create the course directory tree and the bookkeeping files.
    WORK_DIR = WorkingDir(CONFIG['dir'], dir_name)
    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # Collect resources.
    get_resource(course_id)

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS,
    })
200 |
--------------------------------------------------------------------------------
/moocs/open_163.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """网易公开课"""
3 |
4 | import time
5 |
6 | from bs4 import BeautifulSoup
7 | from Crypto.Cipher import AES
8 |
9 | from moocs.utils import *
10 | from utils.crawler import Crawler
11 |
12 | name = "open_163"
13 | need_cookies = False
14 | CANDY = Crawler()
15 | CONFIG = {}
16 | FILES = {}
17 | VIDEOS = []
18 | exports = {}
19 | __all__ = ["name", "need_cookies", "start", "exports"]
20 |
21 |
def get_summary(url):
    """Collect (link, name) pairs for every lecture plus the directory name.

    Handles both the course home page (open.163.com/special/...) and a
    lecture page (used when a course has no home page).

    Returns (links, dir_name) and caches links into CONFIG.
    """

    res = CANDY.get(url).text
    soup = BeautifulSoup(res, 'html.parser')
    links = []
    # FIX: escape the dots so the pattern matches the literal host name
    # instead of any character.
    if re.match(r'https?://open\.163\.com/special/', url):
        # Parse lecture links from the course home page.
        names = soup.find_all('div', class_='g-container')[1]
        organization = names.find('a').string.strip()
        course = names.find('span', class_='pos').string.strip()
        list1 = soup.find('table', id='list2')
        tds = list1.find_all('td', class_="u-ctitle")

        for td in tds:
            a = td.find('a')
            links.append((a.get('href'), a.string))

    else:
        # Parse lecture links from a lecture page (some courses have no
        # course home page).
        names = soup.find('p', class_='bread').find_all('a', class_='f-c9')
        organization = names[0].string.strip()
        course = names[1].string.strip()
        listrow = soup.find('div', class_='listrow')
        for item in listrow.find_all('div', class_='item'):
            p = item.find('p', class_='f-thide')
            if p.find('a'):
                a = p.find('a')
                links.append((a.get('href'), a.string))
            else:
                # The current lecture carries no link of its own.
                links.append((url, p.string.split(']')[-1]))

    dir_name = course_dir(course, organization)

    print(dir_name)

    CONFIG['links'] = links
    return links, dir_name
60 |
61 |
def parse_resource(resource):
    """Resolve a video's real URL (and subtitles) and record it for download.

    Fetches the per-video XML descriptor, decrypts the candidate URLs and
    picks the best available resolution/format according to CONFIG.
    """

    def open_decrypt(hex_string, t):
        """Decrypt a hex-encoded URL with the site's fixed AES-ECB key #t."""
        CRYKey = {1: b"4fxGZqoGmesXqg2o", 2: b"3fxVNqoPmesAqg2o"}
        aes = AES.new(CRYKey[t], AES.MODE_ECB)
        # \x06/\x08 are padding bytes left over after ECB decryption.
        return str(aes.decrypt(bytes.fromhex(hex_string)), encoding='gbk', errors="ignore").replace('\x08', '').replace('\x06', '')

    def update_hex_urls(node, hex_urls):
        """Collect {resolution: {format: hex_url}} entries from an XML node."""
        for child in node.children:
            sp = child.name
            if not hex_urls.get(sp):
                hex_urls[sp] = {}
            for hex_url_tag in child.children:
                hex_urls[sp][hex_url_tag.name] = hex_url_tag.string

    link = resource.meta
    file_name = resource.file_name
    video_info = link.replace('.html', '').split('/')[-1]
    # The last two characters of the id are used as path segments on the
    # media server (presumably sharding — inferred from the URL layout;
    # confirm against live data).
    xml_url = 'http://live.ws.126.net/movie/' + \
        video_info[-2] + '/' + video_info[-1] + '/2_' + video_info + '.xml'
    res = CANDY.get(xml_url)
    res.encoding = 'gbk'

    # Parse the XML descriptor.
    soup = BeautifulSoup(res.text, 'lxml')
    name = soup.find('title').string
    encrypt = int(soup.find('encrypt').string)
    hex_urls = {}
    update_hex_urls(soup.find('flvurl'), hex_urls)
    update_hex_urls(soup.find('flvurlorigin'), hex_urls)
    update_hex_urls(soup.find('playurl'), hex_urls)
    update_hex_urls(soup.find('playurl_origin'), hex_urls)
    subs = {}
    for sub in soup.find('subs'):
        subs[sub.find('name').string] = sub.find('url').string

    formats = ['mp4', 'flv']
    resolutions = ['shd', 'hd', 'sd']
    # Rotate so the preferred resolution comes first, then fall back.
    resolutions = resolutions[CONFIG['resolution']:] + \
        list(reversed(resolutions[:CONFIG['resolution']]))
    modes = ((sp, ext) for sp in resolutions for ext in formats)
    for sp, ext in modes:
        if hex_urls.get(sp):
            if hex_urls[sp].get(ext):
                hex_url = hex_urls[sp][ext]
                video_url = open_decrypt(hex_url, encrypt)
                ext = video_url.split('.')[-1]  # Fix the extension: some "mp4" entries decrypt to flv URLs.
                if ext in formats:
                    ext = '.' + ext
                    resource.ext = ext
                    break

    # NOTE(review): if no candidate URL matched above, video_url/ext are
    # unbound here and the next line raises NameError — confirm intent.
    if WORK_DIR.need_download(file_name+ext, CONFIG["overwrite"]):
        FILES['renamer'].write(re.search(r'(\w+\%s)' %
                                         ext, video_url).group(1), file_name, ext)
        FILES['video'].write_string(video_url)
        VIDEOS.append((video_url, file_name+ext))

    if not CONFIG['sub']:
        return
    for subtitle_lang, subtitle_url in subs.items():
        if len(subs) == 1:
            # A single subtitle gets no language suffix.
            sub_name = file_name + '.srt'
        else:
            sub_name = file_name + '_' + subtitle_lang + '.srt'
        if WORK_DIR.need_download(sub_name, CONFIG["overwrite"]):
            CANDY.download_bin(subtitle_url, WORK_DIR.file(sub_name))
132 |
133 |
def get_resource(links):
    """Build the video list from (link, name) pairs and dispatch parsing."""

    outline = Outline()
    counter = Counter(1)

    videos = []
    for link, title in links:
        counter.add(0)
        outline.write(title, counter, 0, sign='#')
        videos.append(Video(counter, title, link))

    if not videos:
        return

    rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
    WORK_DIR.change('Videos')
    playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
    if playlist:
        # The playlist entry is written after parsing so the corrected
        # extension is used.
        parse_res_list(videos, rename, parse_resource, playlist.write)
    else:
        parse_res_list(videos, rename, parse_resource)
155 |
156 |
def start(url, config, cookies=None):
    """Entry point called by the dispatcher (no login required here)."""

    global WORK_DIR

    CONFIG.update(config)

    # Course metadata.
    links, dir_name = get_summary(url)

    # Create the course directory tree and the bookkeeping files.
    WORK_DIR = WorkingDir(CONFIG['dir'], dir_name)
    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # Collect resources.
    get_resource(links)

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS,
    })
182 |
--------------------------------------------------------------------------------
/moocs/study_163.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """网易云课堂"""
3 |
4 | import time
5 | from urllib import parse
6 |
7 | import requests
8 |
9 | from moocs.utils import *
10 | from utils.crawler import Crawler
11 |
12 | name = "study_163"
13 | need_cookies = False
14 | CANDY = Crawler()
15 | CONFIG = {}
16 | FILES = {}
17 | VIDEOS = []
18 | exports = {}
19 | __all__ = ["name", "need_cookies", "start", "exports"]
20 |
21 |
def get_summary(url):
    """Extract the course id and name and derive the directory name.

    Returns (course_id, dir_name) and caches course_id into CONFIG.
    """

    res = requests.get(url).text

    # FIX: run the courseId search once instead of twice.
    match = re.search(r'courseId=(\d+)', url)
    if match:
        course_id = match.group(1)
    else:
        course_id = re.search(r'introduction/(\d+)\.htm', url).group(1)
    name = re.search(r'(.+) - 网易云课堂', res).group(1)

    dir_name = course_dir(name, '网易云课堂')

    print(dir_name)

    CONFIG['course_id'] = course_id
    return course_id, dir_name
39 |
40 |
def parse_resource(resource):
    """Resolve a resource's download URL via the DWR API and record/fetch it."""

    file_name = resource.file_name
    if resource.type == 'Video':
        # DWR remote call: LessonLearnBean.getVideoLearnInfo(lessonId, courseId).
        post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190',
                     'httpSessionId': 'b1a6d411df364e51833ac11570fc3f07', 'c0-scriptName': 'LessonLearnBean',
                     'c0-methodName': 'getVideoLearnInfo', 'c0-id': '0', 'c0-param0': 'string:' + resource.meta[1],
                     'c0-param1': 'string:' + CONFIG['course_id'],
                     'batchId': str(int(time.time() * 1000))}
        res = CANDY.post('https://study.163.com/dwr/call/plaincall/LessonLearnBean.getVideoLearnInfo.dwr',
                         data=post_data).text.encode('utf_8').decode('unicode_escape')
        video_info = re.search(
            r'signature="(\w+)";.+videoId=(\d+);[\s\S]+name:"(.+?)",', res).group(1, 2, 3)
        data = CANDY.post('https://vod.study.163.com/eds/api/v1/vod/video', data={
            'videoId': video_info[1],
            'signature': video_info[0],
            'clientType': '1'
        }).json()

        # Try qualities from the best allowed downwards (3 highest ... 1 lowest).
        resolutions = [3, 2, 1]
        for sp in resolutions[CONFIG['resolution']:]:
            # TODO: add video format selection
            for video in data['result']['videos']:
                if video['quality'] == sp and video['format'] == 'mp4':
                    url = video['videoUrl']
                    ext = '.mp4'
                    break
            else:
                continue
            break
        # NOTE(review): if no mp4 matched any quality, url/ext are unbound
        # here and the next line raises NameError — confirm intent.
        if WORK_DIR.need_download(file_name + ext, CONFIG["overwrite"]):
            FILES['renamer'].write(
                re.search(r'(\w+\.mp4)', url).group(1), file_name, ext)
            FILES['video'].write_string(url)
            VIDEOS.append((url, file_name+ext))
        resource.ext = ext

        if not CONFIG['sub']:
            return
        # No subtitle source found yet; the API likely exposes them at
        # data['result']['srtCaptions'].

    elif resource.type == 'Document':
        if not WORK_DIR.need_download(file_name+".pdf", CONFIG["overwrite"]):
            return
        # DWR remote call: LessonLearnBean.getTextLearnInfo(lessonId, courseId).
        post_data = {
            'callCount': '1',
            'scriptSessionId': '${scriptSessionId}190',
            'httpSessionId': 'c4927103a1c042ee95faed758d0db8f8',
            'c0-scriptName': 'LessonLearnBean',
            'c0-methodName': 'getTextLearnInfo',
            'c0-id': '0',
            'c0-param0': 'string:' + resource.meta[1],
            'c0-param1': 'string:' + CONFIG['course_id'],
            'batchId': str(int(time.time() * 1000)),
        }
        res = CANDY.post('https://study.163.com/dwr/call/plaincall/LessonLearnBean.getTextLearnInfo.dwr',
                         data=post_data).text.encode('utf_8').decode('unicode_escape')
        pdf_url = re.search(r'pdfUrl:"(http://.+?)",', res).group(1)
        CANDY.download_bin(pdf_url, WORK_DIR.file(file_name + '.pdf'))
    else:
        # Attached reference file: meta = (id, name, suffix, url).
        if not WORK_DIR.need_download(file_name+resource.meta[2], CONFIG["overwrite"]):
            return
        CANDY.download_bin(resource.meta[3], WORK_DIR.file(
            file_name + resource.meta[2]))
106 |
107 |
def get_resource(course_id):
    """Walk the course plan (chapters -> lessons) and collect all resources.

    Builds the video/pdf/attachment lists from the DWR plan dump, writes the
    outline, then dispatches each list to parse_resource.
    """

    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []
    file_list = []

    # DWR remote call: PlanNewBean.getPlanCourseDetail(courseId, 0, null).
    post_data = {
        'callCount': '1',
        'scriptSessionId': '${scriptSessionId}190',
        'httpSessionId': '89a04ce41c7d42759b0a62efe392e153',
        'c0-scriptName': 'PlanNewBean',
        'c0-methodName': 'getPlanCourseDetail',
        'c0-id': '0',
        'c0-param0': 'string:' + course_id,
        'c0-param1': 'number:0',
        'c0-param2': 'null:null',
        'batchId': str(int(time.time() * 1000)),
    }
    res = CANDY.post('https://study.163.com/dwr/call/plaincall/PlanNewBean.getPlanCourseDetail.dwr',
                     data=post_data).text.encode('utf_8').decode('unicode_escape')

    chapters = re.findall(r'courseId=\d+;.+id=(\d+);.+name="(.+)";', res)
    for chapter in chapters:
        counter.add(0)
        outline.write(chapter[1], counter, 0)

        lessons = re.findall(
            r'chapterId=%s;.+?hasReferences=(\w+);.+?id=(\d+).+?lessonName="(.*?)";.+?type=(\d+);' % chapter[0], res, re.DOTALL)
        for lesson in lessons:
            counter.add(1)
            outline.write(lesson[2], counter, 1)

            # Video lesson (type 2 or 50)
            if lesson[3] == '2' or lesson[3] == '50':
                counter.add(2)
                outline.write(lesson[2], counter, 2, sign='#')
                video_list.append(Video(counter, lesson[2], lesson))
                counter.reset()

            # PDF lesson (type 3)
            elif lesson[3] == '3':
                counter.add(2)
                outline.write(lesson[2], counter, 2, sign='*')
                pdf_list.append(Document(counter, lesson[2], lesson))

            # Attached reference files
            files = []
            # FIX: hasReferences is the literal string 'true'/'false' from the
            # server; compare it directly instead of eval()-ing a capitalized
            # copy of untrusted input.
            if lesson[0].lower() == 'true':
                post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190',
                             'httpSessionId': 'b1a6d411df364e51833ac11570fc3f07', 'c0-scriptName': 'LessonReferenceBean',
                             'c0-methodName': 'getLessonReferenceVoByLessonId', 'c0-id': '0', 'c0-param0': 'number:' + lesson[1],
                             'batchId': str(int(time.time() * 1000))}
                ref_info = CANDY.post('https://study.163.com/dwr/call/plaincall/LessonReferenceBean.getLessonReferenceVoByLessonId.dwr',
                                      data=post_data).text.encode('utf_8').decode('unicode_escape')
                refs = re.findall(
                    r'id=(\d+);.+name="(.+)";.+suffix="(\.\w+)";.+url="(.+?)";', ref_info)

                for ref in refs:
                    # File names arrive URL-encoded.
                    ref = (ref[0], parse.unquote(ref[1]), ref[2], ref[3])
                    files.append(ref)

            for file in files:
                counter.add(2)
                outline.write(file[1], counter, 2, sign='!')
                if CONFIG['file']:
                    file_list.append(Resource(counter, file[1], file))
                counter.reset()

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist:
            parse_res_list(video_list, rename, playlist.write, parse_resource)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
    if file_list:
        WORK_DIR.change('Files')
        parse_res_list(file_list, None, parse_resource)
194 |
195 |
def start(url, config, cookies=None):
    """Entry point called by the dispatcher."""

    global WORK_DIR

    CONFIG.update(config)

    # Course metadata.
    course_id, dir_name = get_summary(url)

    # Create the course directory tree and the bookkeeping files.
    WORK_DIR = WorkingDir(CONFIG['dir'], dir_name)
    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # Collect resources.
    get_resource(course_id)

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS,
    })
221 |
--------------------------------------------------------------------------------
/moocs/study_mooc.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """网易云课堂 MOOC"""
3 |
4 | import time
5 |
6 | from moocs.utils import *
7 | from utils.crawler import Crawler
8 |
9 | name = "study_mooc"
10 | need_cookies = True
11 | CANDY = Crawler()
12 | CONFIG = {}
13 | FILES = {}
14 | VIDEOS = []
15 | exports = {}
16 | __all__ = ["name", "need_cookies", "start", "exports"]
17 |
18 |
def get_summary(url):
    """Fetch the term id and course/school names from the course page.

    Returns (term_id, dir_name) and caches term_id into CONFIG.
    """

    # The learn/ URL redirects differently; always read the course/ page.
    page = CANDY.get(url.replace('learn/', 'course/')).text

    term_id = re.search(r'termId : "(\d+)"', page).group(1)
    titles = re.findall(r'name:"(.+)"', page)

    dir_name = course_dir(titles[0], titles[1])
    print(dir_name)

    CONFIG['term_id'] = term_id
    return term_id, dir_name
34 |
35 |
def get_announce(term_id):
    """Save all course announcements for the term to Announcements.html."""

    # DWR remote call: CourseBean.getAllAnnouncementByTerm(termId, 1).
    post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190',
                 'httpSessionId': 'dba4977be78d42a78a6e2c2dd2b9bb42', 'c0-scriptName': 'CourseBean',
                 'c0-methodName': 'getAllAnnouncementByTerm', 'c0-id': '0', 'c0-param0': 'number:' + term_id,
                 'c0-param1': 'number:1', 'batchId': str(int(time.time() * 1000))}
    res = CANDY.post('https://mooc.study.163.com/dwr/call/plaincall/CourseBean.getAllAnnouncementByTerm.dwr',
                     data=post_data).text
    announcements = re.findall(
        r'content="(.*?[^\\])".*title="(.*?[^\\])"', res)

    with open('Announcements.html', 'w', encoding='utf-8') as announce_file:
        for announcement in announcements:
            # Announcement body (undo DWR's unicode escaping).
            announce_content = announcement[0].encode(
                'utf-8').decode('unicode_escape')

            # Announcement title
            announce_title = announcement[1].encode(
                'utf-8').decode('unicode_escape')
            # FIX: the write statement contained a broken string literal; the
            # title is wrapped in a heading so it stands out in the HTML.
            # NOTE(review): the original markup was lost — confirm tag choice.
            announce_file.write('<h1>' + announce_title +
                                '</h1>\n' + announce_content + '\n')
59 |
60 |
def parse_resource(resource):
    """Resolve a resource's real URL via the DWR API and record/fetch it."""

    # DWR remote call: CourseBean.getLessonUnitLearnVo with the term id and
    # the three ids stored in resource.meta (contentId, id, lessonId —
    # presumably; inferred from how get_resource fills meta; confirm).
    post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190',
                 'httpSessionId': 'b8efd4c73fd1434896507b83de631f0f', 'c0-scriptName': 'CourseBean',
                 'c0-methodName': 'getLessonUnitLearnVo', 'c0-id': '0', 'c0-param0': 'number:' + CONFIG['term_id'],
                 'c0-param1': 'number:' + resource.meta[0], 'c0-param2': 'number:' + resource.meta[1],
                 'c0-param3': 'number:0', 'c0-param4': 'number:' + resource.meta[2],
                 'batchId': str(int(time.time() * 1000))}
    res = CANDY.post('https://mooc.study.163.com/dwr/call/plaincall/CourseBean.getLessonUnitLearnVo.dwr',
                     data=post_data).text

    file_name = resource.file_name
    if resource.type == 'Video':
        signature = re.search(r'signature="(.+?)"', res).group(1)
        data = CANDY.post('https://vod.study.163.com/eds/api/v1/vod/video', data={
            'videoId': resource.meta[0],
            'signature': signature,
            'clientType': '1'
        }).json()

        # Try qualities from the best allowed downwards (3 highest ... 1 lowest).
        resolutions = [3, 2, 1]
        for sp in resolutions[CONFIG['resolution']:]:
            # TODO: add video format selection
            for video in data['result']['videos']:
                if video['quality'] == sp and video['format'] == 'mp4':
                    url = video['videoUrl']
                    ext = '.mp4'
                    break
            else:
                continue
            break
        # NOTE(review): if no mp4 matched any quality, url/ext are unbound
        # here and the next line raises NameError — confirm intent.
        if WORK_DIR.need_download(file_name + ext, CONFIG["overwrite"]):
            FILES['renamer'].write(
                re.search(r'(\w+\.mp4)', url).group(1), file_name, ext)
            FILES['video'].write_string(url)
            VIDEOS.append((url, file_name+ext))
        resource.ext = ext

        if not CONFIG['sub']:
            return
        subtitles = re.findall(r'name="(.+)";.*url="(.*?)"', res)
        WORK_DIR.change('Videos')
        for subtitle in subtitles:
            if len(subtitles) == 1:
                # A single subtitle gets no language suffix.
                sub_name = file_name + '.srt'
            else:
                subtitle_lang = subtitle[0].encode(
                    'utf_8').decode('unicode_escape')
                sub_name = file_name + '_' + subtitle_lang + '.srt'
            if WORK_DIR.need_download(sub_name, CONFIG["overwrite"]):
                CANDY.download_bin(subtitle[1], WORK_DIR.file(sub_name))

    elif resource.type == 'Document':
        if not WORK_DIR.need_download(file_name + '.pdf', CONFIG["overwrite"]):
            return
        pdf_url = re.search(r'textOrigUrl:"(.*?)"', res).group(1)
        CANDY.download_bin(pdf_url, WORK_DIR.file(file_name + '.pdf'))

    elif resource.type == 'Rich':
        if not WORK_DIR.need_download(file_name + '.html', CONFIG["overwrite"]):
            return
        text = re.search(r'htmlContent:"(.*)",id',
                         res.encode('utf_8').decode('unicode_escape'), re.S).group(1)
        with open(WORK_DIR.file(file_name + '.html'), 'w', encoding='utf_8') as file:
            file.write(text)
127 |
128 |
def get_resource(term_id):
    """Walk the term outline and collect video/pdf/rich-text resources.

    Parses the DWR term dump with regexes (chapter -> lesson -> unit),
    writes the outline, downloads attachments inline, then dispatches the
    collected lists to parse_resource.
    """

    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []
    rich_text_list = []

    # DWR remote call: CourseBean.getLastLearnedMocTermDto(termId).
    post_data = {'callCount': '1', 'scriptSessionId': '${scriptSessionId}190',
                 'httpSessionId': 'b8efd4c73fd1434896507b83de631f0f', 'c0-scriptName': 'CourseBean',
                 'c0-methodName': 'getLastLearnedMocTermDto', 'c0-id': '0', 'c0-param0': 'number:' + term_id,
                 'batchId': str(int(time.time() * 1000))}
    res = CANDY.post('https://mooc.study.163.com/dwr/call/plaincall/CourseBean.getLastLearnedMocTermDto.dwr',
                     data=post_data).text.encode('utf_8').decode('unicode_escape')

    chapters = re.findall(r'homeworks=\w+;.+id=(\d+).+name="(.+)";', res)
    for chapter in chapters:
        counter.add(0)
        outline.write(chapter[1], counter, 0)

        lessons = re.findall(
            r'chapterId=' + chapter[0] + r'.+contentType=1.+id=(\d+).+name="(.+)".+test', res)
        for lesson in lessons:
            counter.add(1)
            outline.write(lesson[1], counter, 1)

            # Units with contentType=1 are videos.
            videos = re.findall(r'contentId=(\d+).+contentType=(1).+id=(\d+).+lessonId=' +
                                lesson[0] + r'.+name="(.+)"', res)
            for video in videos:
                counter.add(2)
                outline.write(video[3], counter, 2, sign='#')
                video_list.append(Video(counter, video[3], video))
            counter.reset()

            # Units with contentType=3 are PDF documents.
            pdfs = re.findall(r'contentId=(\d+).+contentType=(3).+id=(\d+).+lessonId=' +
                              lesson[0] + r'.+name="(.+)"', res)
            for pdf in pdfs:
                counter.add(2)
                outline.write(pdf[3], counter, 2, sign='*')
                if CONFIG['doc']:
                    pdf_list.append(Document(counter, pdf[3], pdf))
            counter.reset()

            # Units with contentType=4 are rich texts, possibly with an
            # attached file described by jsonContent.
            rich_text = re.findall(r'contentId=(\d+).+contentType=(4).+id=(\d+).+jsonContent=(.+);.+lessonId=' +
                                   lesson[0] + r'.+name="(.+)"', res)
            for text in rich_text:
                counter.add(2)
                outline.write(text[4], counter, 2, sign='+')
                if CONFIG['text']:
                    rich_text_list.append(RichText(counter, text[4], text))
                if CONFIG['file']:
                    if text[3] != 'null' and text[3] != '""':
                        params = {'nosKey': re.search('nosKey":"(.+?)"', text[3]).group(1),
                                  'fileName': re.search('"fileName":"(.+?)"', text[3]).group(1)}
                        file_name = Resource.file_to_save(params['fileName'])
                        outline.write(file_name, counter, 2, sign='!')

                        # Attachments are downloaded immediately rather than
                        # queued like the other resource kinds.
                        WORK_DIR.change('Files')
                        file_name = '%s %s' % (counter, file_name)
                        if WORK_DIR.need_download(file_name, CONFIG["overwrite"]):
                            CANDY.download_bin('https://www.icourse163.org/course/attachment.htm',
                                               WORK_DIR.file(file_name), params=params, cookies={'STUDY_SESS': None})
            counter.reset()

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist:
            parse_res_list(video_list, rename, playlist.write, parse_resource)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
    if rich_text_list:
        WORK_DIR.change('Texts')
        parse_res_list(rich_text_list, None, parse_resource)
209 |
210 |
def start(url, config, cookies=None):
    """Entry point called by the dispatcher.

    Sets up the crawler and configuration, creates the course directory,
    saves the course announcements, collects all resources and fills
    `exports` for the caller.
    """

    global WORK_DIR
    CANDY.set_cookies(cookies)
    CONFIG.update(config)

    # Course metadata (term_id, directory name).
    course_info = get_summary(url)

    # Create the course directory.
    WORK_DIR = WorkingDir(CONFIG['dir'], course_info[1])

    # FIX: removed a leftover debug print of the term id.
    # Course announcements.
    get_announce(course_info[0])

    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # Collect resources.
    get_resource(course_info[0])

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS
    })
241 |
--------------------------------------------------------------------------------
/moocs/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """核心程序组件"""
3 |
4 | import json
5 | import os
6 | import platform
7 | import re
8 | import subprocess
9 | import sys
10 | import time
11 |
12 | from utils.aria2 import Aria2, Aria2File
13 |
14 | SYS = platform.system()
15 |
16 |
class Resource(object):
    """Base class for all downloadable resources.

    A resource carries a hierarchical id (e.g. '2.3.2'), a display name
    cleaned of leading chapter/section numbering, arbitrary site-specific
    metadata, and an optional feature tag. Subclasses override `type`.

    Class attributes:
        regex_sort: matches leading chapter/section numbering to strip.
        regex_file: matches characters illegal in Windows file names.
        regex_spaces: matches runs of whitespace (collapsed to one space).
        type: resource kind label; 'Resource' for the base class.
    """

    regex_sort = re.compile(r'^[第一二三四五六七八九十\d]+[\s\d._\-章课节讲]*[.\s、\-]\s*\d*')
    regex_file = re.compile(r'[\\/:*?"<>|]')
    regex_spaces = re.compile(r'\s+')
    type = 'Resource'

    def __init__(self, identify, name, meta, feature=None):
        """Store id/meta and normalize the display name."""
        cleaned = Resource.regex_sort.sub('', name)
        cleaned = Resource.regex_spaces.sub(' ', cleaned).strip()
        self.id = str(identify)
        self.name = cleaned
        self.meta = meta
        self.feature = feature

    def __str__(self):
        """A resource prints as its display name."""
        return self.name

    @property
    def file_name(self):
        """Id-prefixed file name without extension, e.g. '2.3.2 name'."""
        return '{} {}'.format(self.id, Resource.regex_file.sub('', self.name))

    def operation(self, *funcs):
        """Apply each given callable to this resource, in order."""
        for func in funcs:
            func(self)

    @staticmethod
    def file_to_save(name):
        """Turn an arbitrary name into a safe file name (no id prefix)."""
        cleaned = Resource.regex_sort.sub('', name)
        cleaned = Resource.regex_spaces.sub(' ', cleaned).strip()
        return Resource.regex_file.sub('', cleaned)
70 |
71 |
class Video(Resource):
    """Video resource.

    Attributes:
        type: always 'Video'.
        ext: default file extension '.mp4'; may be overridden per instance.
    """

    type = 'Video'
    ext = '.mp4'
81 |
82 |
class Document(Resource):
    """Document resource (e.g. a PDF).

    Attributes:
        type: always 'Document'.
    """

    type = 'Document'
91 |
92 |
class RichText(Resource):
    """Rich-text resource (saved as HTML).

    Attributes:
        type: always 'Rich'.
    """

    type = 'Rich'
101 |
102 |
class Attachment(Resource):
    """Attachment resource (downloadable reference file).

    Attributes:
        type: always 'Attachment'.
    """

    type = 'Attachment'
111 |
112 |
class ClassicFile(object):
    """A UTF-8 text file opened for writing.

    Attributes:
        _f: the underlying file object.
        file: the file name or path it was opened with.
    """

    def __init__(self, file):
        """Open `file` for writing in UTF-8 and remember its name."""
        self.file = file
        self._f = open(file, 'w', encoding='utf_8')

    def __del__(self):
        """Close the file and drop both attributes."""
        self._f.close()
        del self._f
        del self.file

    def write_string(self, string):
        """Write one line (a trailing newline is appended automatically)."""
        self._f.write('{}\n'.format(string))
138 |
139 |
class Playlist(ClassicFile):
    """Base class for playlist files."""

    def __init__(self, file, path_type):
        """path_type: 'AP' for absolute paths, 'RP' for paths relative to the playlist file."""
        super().__init__(file)
        self.path_type = path_type

    def switch_path(self, path):
        """Convert a path according to the configured path type."""
        normalized = os.path.normpath(path)
        if self.path_type == 'AP':
            return os.path.abspath(normalized)
        if self.path_type == 'RP':
            return os.path.relpath(normalized, start=os.path.dirname(self.file))
        return normalized

    def write(self, video):
        """Record one Video object's target path in the playlist."""
        entry = os.path.join("Videos", video.file_name + video.ext)
        self.write_string(self.switch_path(entry))
162 |
163 |
class M3u(Playlist):
    """M3U playlist writer (creates Playlist.m3u in the current directory)."""

    def __init__(self, path_type='RP'):
        super().__init__('Playlist.m3u', path_type)
169 |
170 |
class Dpl(Playlist):
    """PotPlayer (DAUM) playlist writer.

    Attributes:
        _count: number of entries written so far.
    """

    def __init__(self, path_type='RP'):
        """Create Playlist.dpl and write the DAUM header."""
        super().__init__('Playlist.dpl', path_type)
        self.write_string('DAUMPLAYLIST\n')
        self._count = 0

    def write(self, video):
        """Append one Video entry (file line + title line) to the playlist."""
        self._count += 1
        index = self._count
        entry = self.switch_path(os.path.join("Videos", video.file_name + video.ext))
        self.write_string('{}*file*{}'.format(index, entry))
        # Title shows the parent id (the last id segment dropped) plus name.
        parent_id = '.'.join(video.id.split('.')[:-1])
        self.write_string('{}*title*{} {}\n'.format(index, parent_id, video.name))
192 |
193 |
class Subtitle(ClassicFile):
    """SRT subtitle file writer.

    Attributes:
        _count: number of cues written so far (SRT cues are 1-based).
    """

    def __init__(self, path):
        super().__init__(path)
        self._count = 0

    @staticmethod
    def time_format(seconds):
        """Format a float number of seconds as an SRT timestamp HH:MM:SS,mmm."""
        ms = int(1000 * (seconds - int(seconds)))
        seconds = int(seconds)
        minutes, sec = seconds // 60, seconds % 60
        hour, min = minutes // 60, minutes % 60
        # FIX: milliseconds must be zero-padded to three digits per the SRT
        # format; "{}" rendered 50 ms as ",50" instead of ",050".
        return "{:02}:{:02}:{:02},{:03}".format(hour, min, sec, ms)

    def write(self, content, from_time, to_time):
        """Append one numbered cue spanning from_time to to_time (seconds)."""
        self._count += 1
        self.write_string(str(self._count))
        self.write_string(
            "{} --> {}".format(self.time_format(from_time), self.time_format(to_time)))
        self.write_string(content + "\n")
216 |
217 |
class Renamer(ClassicFile):
    """Batch-rename script writer (.bat on Windows, .sh elsewhere)."""

    ext = 'bat' if SYS == 'Windows' else 'sh'

    def __init__(self, file):
        """Open the script file; on Windows switch the console to UTF-8."""
        super().__init__(file.format(ext=Renamer.ext))
        if SYS == 'Windows':
            self.write_string('CHCP 65001\n')

    def write(self, origin_name, file_name, ext='.mp4'):
        """Write one command renaming the original (URL) file name to
        the desired name plus extension."""
        template = 'REN "%s" "%s%s"' if SYS == 'Windows' else 'mv "%s" "%s%s"'
        self.write_string(template % (origin_name, file_name, ext))
239 |
240 |
class Outline(ClassicFile):
    """Course outline writer (Outline.txt).

    Attributes:
        res_type: maps a one-character sign to a resource-type label.
    """

    res_type = {'#': '【视频】', '!': '【附件】', '*': '【文档】',
                '+': '【富文本】', '&': '【字幕】', '': ''}

    def __init__(self):
        """Create the Outline.txt file."""
        super().__init__('Outline.txt')

    def write(self, string, counter, level=2, sign=''):
        """Record one outline entry at the given 0-based level and
        print it (with its resource-type label) as progress output."""
        indent = ' ' * level
        print('%s%s%s' % (indent, Outline.res_type[sign], string))
        self.write_string('%s%s {%s}%s' % (indent, string,
                                           counter[level], sign))
262 |
263 |
class WorkingDir(object):
    """Download-directory helper.

    Creates directories, "selects" subdirectories and builds file paths.

    Attributes:
        base_dir: root directory; every path is built from it.
        path: the currently selected directory (absolute).
    """

    def __init__(self, *base_dirs):
        """Join the parts into a directory, create it if missing, and
        chdir into it."""
        root = os.path.join(*base_dirs)
        if not os.path.isdir(root):
            os.makedirs(root)
        os.chdir(root)
        self.base_dir = os.getcwd()
        self.path = ''

    def change(self, *relative):
        """Select (and create if missing) a subdirectory of base_dir.

        This is a "fake" chdir: it only records the path; combine with
        file() to obtain real locations.
        """
        self.path = os.path.join(self.base_dir, *relative)
        if not os.path.isdir(self.path):
            os.makedirs(self.path)

    def file(self, file_name):
        """Return the full path of *file_name* under the selected dir."""
        return os.path.join(self.path, file_name)

    def exist(self, file_name):
        """Whether *file_name* exists under the selected dir."""
        return os.path.exists(os.path.join(self.path, file_name))

    def need_download(self, file_name, overwrite=False):
        """Whether the file should be downloaded; also prints its status."""
        need = overwrite or not self.exist(file_name)
        res_print(file_name, sign=">" if need else "!")
        return need
311 |
312 |
class Counter(object):
    """Hierarchical section counter.

    Attributes:
        counter: one integer per level.
    """

    def __init__(self, num_level=3):
        """Create *num_level* counters, all starting at zero."""
        self.num_level = num_level
        self.counter = [0 for _ in range(num_level)]

    def add(self, level):
        """Increment the counter at *level*; all deeper levels restart at 0."""
        for deeper in range(level + 1, self.num_level):
            self.counter[deeper] = 0
        self.counter[level] += 1

    def __str__(self):
        """All levels joined with dots, e.g. ``'2.1.3'``."""
        return '.'.join(str(part) for part in self.counter)

    def __getitem__(self, index):
        """Levels 0..index (inclusive) joined with dots."""
        return '.'.join(str(part) for part in self.counter[:index + 1])

    def reset(self):
        """Zero only the deepest (last) level."""
        self.counter[-1] = 0
347 |
348 |
def res_print(file_name, sign=">"):
    """Print a file that is about to be produced, prefixed with a
    status sign ('>' = will download, '!' = already present)."""
    prefix = '------{}'.format(sign)
    print(prefix, file_name)
353 |
354 |
def course_dir(course_name, institution):
    """Build a safe directory name from course and institution names,
    stripping characters that are illegal in file names."""
    raw = '%s - %s' % (course_name, institution)
    return Resource.regex_file.sub('', raw)
359 |
def file_input(file, origin_text="", message=""):
    """Collect user input by opening *file* in an external editor.

    Writes *origin_text* to the file, opens a platform editor, waits
    for the user to confirm via Enter, then reads the edited content
    back, deletes the file and returns the content.
    """
    with open(file, 'w', encoding='utf8') as f:
        f.write(origin_text)

    if SYS == 'Windows':
        os.startfile(file)
    elif SYS == 'Linux':
        subprocess.run('gedit "%s"' % file, shell=True, stdout=subprocess.PIPE)
    elif SYS == 'Darwin':
        subprocess.run('open -t "%s"' % file, shell=True, stdout=subprocess.PIPE)

    input(message)
    with open(file, 'r', encoding='utf8') as f:
        edited = f.read()
    os.remove(file)
    return edited
379 |
380 |
def parse_res_list(res_list, file, *operator):
    """Run every *operator* callable on each resource, optionally
    letting the user rename the resources first.

    If *file* is truthy it is used as a temp file for file_input() so
    the user can edit one name per line; otherwise renaming is skipped.
    """
    if file:
        names_text = '\n'.join(str(res) for res in res_list)
        edited = file_input(file, origin_text=names_text,
                            message='修改完文件名后按回车继续。')
        for res, new_name in zip(res_list, edited.split('\n')):
            res.name = new_name
            res.operation(*operator)
    else:
        for res in res_list:
            res.operation(*operator)
393 |
394 |
def store_cookies(mooc_type, restore=False):
    """Return the cookie dict for *mooc_type*, prompting if needed.

    Cookies are cached in cookies.json next to the program.  When
    *restore* is true or no cookie is cached for this site, the user
    is prompted and the cache file is rewritten.
    """

    def cookie_input():
        # On macOS a long cookie string can block stdin, so go through a file.
        if SYS == 'Darwin':
            return file_input('cookies_tmp.txt',
                              message='输入 Cookie 后保存,并回到终端回车继续...')
        print('输入 Cookie:')
        return input('> ')

    def cookie_to_json(raw_cookies):
        """Turn a semicolon-separated cookie string into a dict."""
        if not raw_cookies:
            return {}
        # Tolerate a pasted "Cookie:" header prefix.
        if raw_cookies[:7].lower() == 'cookie:':
            raw_cookies = raw_cookies[7:]
        pairs = (cookie.strip().split("=", 1)
                 for cookie in raw_cookies.split(';'))
        return {key: value for key, value in pairs}

    file_path = os.path.join(sys.path[0], "cookies.json")
    if os.path.isfile(file_path):
        with open(file_path, 'r') as cookies_file:
            cookies = json.load(cookies_file)
    else:
        cookies = {}

    if restore or not cookies.get(mooc_type):
        cookies[mooc_type] = cookie_to_json(cookie_input())
        with open(file_path, 'w') as f:
            json.dump(cookies, f, indent=2)

    return cookies[mooc_type]
436 |
437 |
def size_format(size, ndigits=2):
    """Format a byte count as a human-readable string.

    Args:
        size: number of bytes (may be negative).
        ndigits: decimal places to keep.

    Returns:
        e.g. ``"1.50 kB"``.  Sizes below 1 kB — including 0, which
        previously produced ``"0.00 "`` with an empty unit — use
        "Bytes".
    """
    flag = '-' if size < 0 else ''
    size = abs(size)
    units = ["Bytes", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", "BB"]
    # Default to the smallest unit so size == 0 still gets a label.
    unit, unit_size = units[0], 1
    for idx in range(len(units) - 1, -1, -1):
        if size >= 2 ** (idx * 10):
            unit, unit_size = units[idx], 2 ** (idx * 10)
            break
    return "{}{:.{}f} {}".format(flag, size / unit_size, ndigits, unit)
453 |
454 |
def get_playlist(playlist_type, path_type):
    """Return a playlist object of the requested type.

    Args:
        playlist_type: 'no' (no playlist), 'dpl' or 'm3u'.
        path_type: forwarded to the playlist constructor ('AP' or 'RP').

    Returns:
        A Dpl or M3u instance, or None for 'no'.

    Raises:
        ValueError: for an unknown playlist type (previously this
        surfaced as an UnboundLocalError).
    """
    if playlist_type == 'no':
        return None
    if playlist_type == 'dpl':
        return Dpl(path_type=path_type)
    if playlist_type == 'm3u':
        return M3u(path_type=path_type)
    raise ValueError("unknown playlist type: %r" % playlist_type)
465 |
466 |
def aria2_download(videos, workdir, overwrite=False):
    """Download videos with aria2 and show an overall progress bar.

    Args:
        videos: iterable of (url, file_name) pairs.
        workdir: directory the files are downloaded into.
        overwrite: discard any partial data and re-download.

    Blocks until every download reports "complete".
    """

    aria2 = Aria2()
    files = []

    for url, file_name in videos:
        file = Aria2File(aria2, url, file_name, workdir, overwrite=overwrite)
        files.append(file)

    # Progress display.
    process_bar_length = 50
    total_length = sum([file.get_length() for file in files])
    length_flag = False
    while True:
        # Keep re-summing the total until every task reports a non-zero
        # length (a task reports 0 before its size is known).
        if not length_flag:
            length_flag = True
            total_length = 0
            for file in files:
                length = file.get_length()
                if length == 0:
                    length_flag = False
                total_length += length

        speed = sum([file.get_speed() for file in files])
        completed_length = sum([file.get_complete_length() for file in files])
        # Guard against total_length == 0 (all sizes still unknown).
        len_done = (process_bar_length * completed_length // \
            total_length) if total_length else process_bar_length
        len_undone = process_bar_length - len_done
        log_string = '{}{} {}/{} {:12}'.format(
            "#" * len_done, "_" * len_undone, size_format(completed_length),
            size_format(total_length), size_format(speed)+"/s")
        print(log_string, end="\r")  # \r keeps the bar on one line
        time.sleep(1)

        # Move finished temp files to their final names.
        for file in files:
            if file.get_status() == "complete" and not file.renamed:
                file.rename()
        if all([file.get_status() == "complete" for file in files]):
            break

    print("视频已下载全部完成~")
510 |
--------------------------------------------------------------------------------
/moocs/xuetangx.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """学堂在线"""
3 |
4 | import json
5 | import sys
6 |
7 | from bs4 import BeautifulSoup
8 |
9 | from moocs.utils import *
10 | from utils.crawler import Crawler
11 |
12 | name = "xuetangx"
13 | need_cookies = True
14 | BASE_URL = 'http://www.xuetangx.com'
15 | CANDY = Crawler()
16 | CONFIG = {}
17 | FILES = {}
18 | VIDEOS = []
19 | exports = {}
20 | __all__ = ["name", "need_cookies", "start", "exports"]
21 |
22 |
def get_book(url):
    """Download every PDF e-book found on the course's book shelves."""
    nav_page = CANDY.get(url).text
    shelves = set(re.findall(r'/courses/.+/pdfbook/\d/', nav_page))
    for shelf_count, shelf in enumerate(shelves, 1):
        soup = BeautifulSoup(CANDY.get(BASE_URL + shelf).text, 'lxml')
        WORK_DIR.change('Books', str(shelf_count))
        for book_count, book in enumerate(soup.select('#booknav a'), 1):
            file_name = Resource.file_to_save(book.string) + '.pdf'
            if not WORK_DIR.need_download(file_name, CONFIG["overwrite"]):
                continue
            CANDY.download_bin(BASE_URL + book['rel'][0],
                               WORK_DIR.file(file_name))
37 |
38 |
def get_handout(url):
    """Fetch the course handout from the info page and save it to
    Handouts.html."""

    handouts_html = ClassicFile('Handouts.html')
    res = CANDY.get(url).text
    soup = BeautifulSoup(res, 'lxml')
    handouts = soup.find(class_='handouts')

    # Rewrite site-relative links into absolute ones so the saved file
    # still works offline.
    for link in handouts.select('a[href^="/"]'):
        link['href'] = BASE_URL + link['href']
    # NOTE(review): this template appears to have lost its surrounding
    # HTML tags at some point — confirm against version history.
    handouts_html.write_string('\n\n\n讲义\n\n'
                               '\n\n%s\n' % handouts.prettify())
52 |
53 |
def get_video(video):
    """Resolve a video's real URL and record it for download/rename.

    Prefers the high-quality stream ('quality20') and falls back to
    'quality10' when it is absent or empty.
    """
    file_name = video.file_name
    if not WORK_DIR.need_download(file_name + '.mp4', CONFIG["overwrite"]):
        return
    res = CANDY.get('http://xuetangx.com/videoid2source/' + video.meta).text
    # Parse once; the original parsed the JSON twice and used a bare
    # except that swallowed even decode errors and KeyboardInterrupt.
    sources = json.loads(res)['sources']
    try:
        video_url = sources['quality20'][0]
    except (KeyError, IndexError):
        video_url = sources['quality10'][0]
    FILES['videos'].write_string(video_url)
    FILES['renamer'].write(
        re.search(r'(\w+-[12]0.mp4)', video_url).group(1), file_name)
    VIDEOS.append((video_url, file_name + ".mp4"))
68 |
69 |
def get_content(url):
    """Walk the courseware page tree and collect every video.

    Writes the outline as it goes; *counter* numbers every outline
    entry while *video_counter* numbers videos only, so video ids stay
    contiguous.  Finally hands the collected videos to parse_res_list
    for optional renaming, playlist writing and URL resolution.
    """

    outline = Outline()
    counter = Counter()
    video_counter = Counter()
    video_list = []

    courseware = CANDY.get(url).text
    soup = BeautifulSoup(courseware, 'lxml')

    chapters = soup.find(id='accordion').find_all(class_='chapter')
    for chapter in chapters:
        counter.add(0)
        video_counter.add(0)
        chapter_title = chapter.h3.a.get_text(strip=True)
        outline.write(chapter_title, counter, 0)

        sections = chapter.select('ul a')
        for section_info in sections:
            counter.add(1)
            video_counter.add(1)
            section_url = BASE_URL + section_info['href']
            section_title = section_info.p.string.strip()

            outline.write(section_title, counter, 1)

            section_page = CANDY.get(section_url).text
            soup = BeautifulSoup(section_page, 'lxml')

            # Some pages require the MathPlayer plug-in and carry no
            # sequence list; skip the rest of this chapter then.
            try:
                tabs = soup.find(id='sequence-list').find_all('li')
            except AttributeError:
                break
            for tab_count, tab_info in enumerate(tabs, 1):
                counter.add(2)
                # The title attribute may contain newlines or duplicates,
                # so use data-page-title instead.
                tab_title = tab_info.a.get('data-page-title')

                outline.write(tab_title, counter)

                # Generic titles inherit the section title.
                if tab_title == 'Video' or tab_title == '视频' or tab_title == '':
                    tab_title = section_title

                tab_sequence = tab_info.a.get('aria-controls')

                # The tab content is stored as escaped HTML; parse it again.
                tab_escape = soup.find(id=tab_sequence).string
                tab = BeautifulSoup(tab_escape, 'lxml').div.div

                blocks = tab.find_all('div', class_='xblock')
                for block in blocks:
                    try:
                        # A few blocks carry no data-type attribute.
                        block_type = block['data-type']
                    except KeyError:
                        continue
                    if block_type == 'Video':
                        video_counter.add(2)
                        # NOTE(review): an old comment claimed runs of
                        # whitespace are collapsed here, but only strip()
                        # is applied.
                        video_name = block.h2.string.strip()

                        outline.write(video_name, video_counter,
                                      level=3, sign='#')

                        # Generic video names inherit the tab title.
                        if video_name == 'Video' or video_name == '视频' or video_name == '':
                            video_name = tab_title

                        video_id = block.div['data-ccsource']

                        video = Video(video_counter, video_name, video_id)
                        video_list.append(video)

                        if CONFIG['sub']:
                            get_subtitles(block.div['data-transcript-available-translations-url'],
                                          block.div['data-transcript-translation-url'],
                                          video.file_name)
    if video_list:
        WORK_DIR.change('Videos')
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist:
            parse_res_list(video_list, rename, playlist.write, get_video)
        else:
            parse_res_list(video_list, rename, get_video)
155 |
156 |
def get_subtitles(available, transcript, file_name):
    """Download the subtitles of one video in every available language.

    Args:
        available: relative URL listing available translations.
        transcript: relative base URL of the transcripts.
        file_name: base name of the subtitle file(s) to write.
    """
    subtitle_available_url = BASE_URL + available
    try:
        subtitle_available = CANDY.get(subtitle_available_url).json()
    except json.decoder.JSONDecodeError:
        # No subtitles for this video.
        return
    WORK_DIR.change('Videos')
    base_subtitle_url = BASE_URL + transcript + '/'
    multi_subtitle = len(subtitle_available) > 1
    # The original used rstrip('available_translations'), which strips a
    # *character set* and could eat part of the preceding path; remove
    # the literal suffix instead.
    suffix = 'available_translations'
    if subtitle_available_url.endswith(suffix):
        download_url = subtitle_available_url[:-len(suffix)] + 'download'
    else:
        download_url = subtitle_available_url + 'download'
    for subtitle_desc in subtitle_available:
        subtitle_url = base_subtitle_url + subtitle_desc
        # Presumably switches the session to this language so the
        # subsequent download returns it — confirm with the site API.
        CANDY.get(subtitle_url)
        if multi_subtitle:
            sub_file_name = file_name + '_' + \
                subtitle_desc.replace('_xuetangx', '') + '.srt'
        else:
            sub_file_name = file_name + '.srt'
        subtitle = CANDY.get(download_url).content
        with open(WORK_DIR.file(sub_file_name), 'wb') as subtitle_file:
            subtitle_file.write(subtitle)
180 |
181 |
def get_summary(url):
    """Derive and print the course folder name from the about page."""
    soup = BeautifulSoup(CANDY.get(url).text, 'lxml')

    course_name = soup.find(id='title1').string
    institution = soup.find(class_='courseabout_text').a.string

    dir_name = course_dir(course_name, institution)
    print(dir_name)
    return dir_name
194 |
195 |
def start(url, config, cookies=None):
    """Module entry point: verify cookies, then fetch books, handouts
    and videos.

    Args:
        url: course about-page URL (ends with 'about').
        config: crawler configuration dict, merged into CONFIG.
        cookies: cookie dict used for authentication.
    """
    global WORK_DIR
    CONFIG.update(config)

    CANDY.set_cookies(cookies)
    status = CANDY.get('http://www.xuetangx.com/header_ajax')
    if status.json()['login']:
        print('验证成功!')
    else:
        print('Cookie 失效。请获取新的 Cookie ')
        sys.exit(1)

    course_name = get_summary(url)

    WORK_DIR = WorkingDir(CONFIG['dir'], course_name)
    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['videos'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    # url.rstrip('about') stripped the characters {a, b, o, u, t} as a
    # set rather than the literal suffix; remove the suffix explicitly.
    base_url = url[:-len('about')] if url.endswith('about') else url
    handout = base_url + 'info'
    courseware = base_url + 'courseware'

    if CONFIG['doc']:
        # The handout page is a faster entry point for the books.
        get_book(handout)

    get_handout(handout)
    get_content(courseware)

    exports.update({
        "workdir": WORK_DIR,
        "spider": CANDY,
        "videos": VIDEOS
    })
232 |
--------------------------------------------------------------------------------
/moocs/xuetangx_next.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """学堂在线"""
3 |
4 | from moocs.utils import *
5 | from utils.crawler import Crawler
6 |
7 | name = "xuetangx_next"
8 | need_cookies = True
9 | CANDY = Crawler()
10 | CONFIG = {}
11 | FILES = {}
12 | VIDEOS = []
13 | exports = {}
14 | __all__ = ["name", "need_cookies", "start", "exports"]
15 |
16 |
def get_summary(url):
    """Fetch course info and derive the course folder name.

    Returns:
        (cid, sign, dir_name); sign and cid are also stored in CONFIG.
    """
    # .group("sign", "cid") requires *named* groups, which were missing
    # from the pattern ("(?P.+?)" is not even a valid group).
    match = re.match(r"https?://next.xuetangx.com/course/"
                     r"(?P<sign>.+?)/(?P<cid>.+)", url)
    sign, cid = match.group("sign", "cid")

    res = CANDY.get("https://next.xuetangx.com/api/v1/lms/learn/product/info?cid=%s&sign=%s" % (cid, sign))
    course_name = res.json()['data']['classroom_name']
    # The institution name is hard to obtain; use a fixed label for now.
    dir_name = course_dir(course_name, "学堂在线")

    print(dir_name)
    CONFIG['sign'] = sign
    CONFIG['cid'] = cid
    return cid, sign, dir_name
32 |
33 |
def parse_resource(resource):
    """Resolve one Video/Document resource to its real URL(s) and
    record or download it.

    ``resource.meta`` carries ``(item_id, item_info_id)`` as collected
    by get_resource.
    """
    cid, sign = CONFIG['cid'], CONFIG['sign']
    file_name = resource.file_name
    item_id, item_info_id = resource.meta
    res = CANDY.get("https://next.xuetangx.com/api/v1/lms/learn/leaf_info/%s/%s/?sign=%s" % (cid, item_id, sign),
                    headers={"xtbz": "xt"})
    if resource.type == 'Video':
        ccid = res.json()['data']['content_info']['media']['ccid']

        video_url_res = CANDY.get("https://next.xuetangx.com/api/v1/lms/service/playurl/%s/?appid=10000" % ccid)
        sources = video_url_res.json()['data']['sources']
        # Prefer high quality ('20') over low ('10').
        qualitys = ['20', '10']
        for qa in qualitys:
            if sources.get('quality' + qa):
                # The value is a list; multi-part videos have not been
                # observed, so only element 0 is used.
                video_url = sources['quality' + qa][0]
                break
        # NOTE(review): if neither quality exists, video_url below is
        # unbound and raises NameError — confirm this cannot happen.

        ext = '.mp4'
        if WORK_DIR.need_download(file_name + ext, CONFIG["overwrite"]):
            FILES['renamer'].write(video_url.split('?')[0].split('/')[-1], file_name, ext)
            FILES['video'].write_string(video_url)
            VIDEOS.append((video_url, file_name+ext))
        resource.ext = ext

        if not CONFIG['sub']:
            return
        # Multi-language subtitles are not supported yet (lg=0).
        subtitle_res = CANDY.get("https://next.xuetangx.com/api/v1/lms/service/subtitle_parse/?c_d=%s&lg=0" % ccid)
        if subtitle_res.status_code != 200:
            return
        subtitle_json = subtitle_res.json()
        starts, ends, texts = subtitle_json['start'], subtitle_json['end'], subtitle_json['text']
        subtitle = Subtitle(WORK_DIR.file(file_name + '.srt'))
        assert len(starts) == len(ends) == len(texts)
        # start/end appear to be in milliseconds — converted to seconds
        # for Subtitle.write.
        for i in range(len(starts)):
            subtitle.write(texts[i], starts[i]/1000, ends[i]/1000)

    elif resource.type == 'Document':
        if not WORK_DIR.need_download(file_name + '.pdf', CONFIG["overwrite"]):
            return
        # Multiple files per document have not been observed; only the
        # first download entry is used.
        downloads = res.json()['data']['content_info']['download']
        if downloads:
            pdf_url = downloads[0]['file_url']
            CANDY.download_bin(pdf_url, WORK_DIR.file(file_name + '.pdf'))
81 |
82 |
def get_resource(cid, sign):
    """Walk the course chapter tree and collect videos and documents.

    Writes the outline along the way, then hands the collected lists to
    parse_res_list for optional renaming and downloading.
    """
    outline = Outline()
    counter = Counter()

    video_list = []
    pdf_list = []

    res = CANDY.get("https://next.xuetangx.com/api/v1/lms/learn/course/chapter?cid=%s&sign=%s" % (cid, sign),
                    headers={"xtbz": "xt"})
    for chapter in res.json()['data']['course_chapter']:
        counter.add(0)
        outline.write(chapter['name'], counter, 0)

        for section in chapter['section_leaf_list']:
            counter.add(1)
            outline.write(section['name'], counter, 1)

            # Discussions and quizzes (leaf types 4/6) are ignored for now.
            for item in section.get('leaf_list', []):
                counter.add(2)
                item_id, item_name = item['id'], item['name']
                item_type, item_info_id = item['leaf_type'], item['leafinfo_id']
                if item_type == 0:  # video
                    outline.write(item_name, counter, 2, sign='#')
                    video_list.append(Video(counter, item_name, (item_id, item_info_id)))
                elif item_type == 3:  # document
                    # rstrip('.pdf') stripped the characters {., p, d, f}
                    # as a set (e.g. 'speed.pdf' -> 'spee'); cut the
                    # literal suffix instead.
                    if item_name.endswith('.pdf'):
                        item_name = item_name[:-len('.pdf')]
                    outline.write(item_name, counter, 2, sign='*')
                    if CONFIG['doc']:
                        pdf_list.append(Document(counter, item_name, (item_id, item_info_id)))

    if video_list:
        rename = WORK_DIR.file('Names.txt') if CONFIG['rename'] else False
        WORK_DIR.change('Videos')
        playlist = get_playlist(CONFIG["playlist_type"], CONFIG["playlist_path_type"])
        if playlist is not None:
            parse_res_list(video_list, rename, parse_resource, playlist.write)
        else:
            parse_res_list(video_list, rename, parse_resource)
    if pdf_list:
        WORK_DIR.change('PDFs')
        parse_res_list(pdf_list, None, parse_resource)
131 |
132 |
def start(url, config, cookies=None):
    """Module entry point: configure the session and working directory,
    then fetch every course resource."""
    global WORK_DIR
    CANDY.set_cookies(cookies)
    CONFIG.update(config)

    cid, sign, course_name = get_summary(url)

    WORK_DIR = WorkingDir(CONFIG['dir'], course_name)
    WORK_DIR.change('Videos')
    FILES['renamer'] = Renamer(WORK_DIR.file('Rename.{ext}'))
    FILES['video'] = ClassicFile(WORK_DIR.file('Videos.txt'))

    get_resource(cid, sign)

    exports.update(workdir=WORK_DIR, spider=CANDY, videos=VIDEOS)
154 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "course-crawler",
3 | "description": "一个基于 Python 3 的 MOOC 课程下载工具",
4 | "scripts": {
5 | "docs:dev": "vuepress dev docs",
6 | "docs:build": "vuepress build docs",
7 | "deploy": "bash scripts/deploy.sh"
8 | },
9 | "husky": {
10 | "hooks": {
11 | "pre-commit": "pretty-quick --staged"
12 | }
13 | },
14 | "devDependencies": {
15 | "husky": "^3.0.4",
16 | "prettier": "1.18.2",
17 | "pretty-quick": "^1.11.1",
18 | "vuepress": "^1.2.0"
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests==2.22.0
2 | beautifulsoup4==4.8.0
3 | lxml==4.4.1
4 | pycryptodome==3.9.0
5 |
--------------------------------------------------------------------------------
/scripts/deploy.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env sh
# Build the VuePress docs and force-push the result to GitHub Pages.

ACCESS_TOKEN=$1
USERNAME=SigureMo # your GitHub user name
REPO=course-crawler # if empty, publish to <USERNAME>.github.io
BRANCH=gh-pages # if empty, publish to the master branch
CNAME="" # the custom domain to publish under, if any

if [ $ACCESS_TOKEN ]
then TOKEN_PREFIX="${ACCESS_TOKEN}@"
else TOKEN_PREFIX=""
fi

if [ $BRANCH ]
then BRANCH_POSTFIX=":${BRANCH}"
else BRANCH_POSTFIX=""
fi

if [ $REPO ]
then REMOTE=https://${TOKEN_PREFIX}github.com/${USERNAME}/${REPO}.git
else REMOTE=https://${TOKEN_PREFIX}github.com/${USERNAME}/${USERNAME}.github.io.git
fi

# Abort on the first error.
set -e

# Build the static site.
npm run docs:build

# Enter the build output directory.
cd docs/.vuepress/dist

# When publishing to a custom domain, emit the CNAME file.
if [ $CNAME ]
then echo $CNAME > CNAME
fi

# Create a throwaway repository and force-push the build.
git init
git config user.name "GitHub Actions"
git config user.email "support@github.com"
git add -A
time=$(date "+%Y-%m-%d %H:%M:%S")
git commit -m "rebuild @${time}"
git push -f $REMOTE master${BRANCH_POSTFIX}

cd -
48 |
--------------------------------------------------------------------------------
/utils/aria2.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import subprocess
3 | import json
4 | import time
5 | import os
6 |
7 | from urllib.request import urlopen
8 |
9 | rpc_url = "http://localhost:{port}/jsonrpc"
10 |
11 |
class Aria2():
    """aria2 JSON-RPC client.

    Each RPC method below is a thin wrapper; the full interface is at
    http://aria2.github.io/manual/en/html/aria2c.html#rpc-interface
    """

    def __init__(self, aria2_path="aria2c", port=6800):
        # Path to the aria2c executable and the local RPC port to use.
        self.port = port
        self.rpc_url = rpc_url.format(port=port)
        self.aria2_path = aria2_path
        # aria2's stdout is redirected into this file.
        self.process_file = open("process.out", "w")
        assert self.is_installed(), "请配置正确的 aria2 路径"
        if not self.is_connected():
            self.process = self.init_rpc()
            # Give aria2 a moment to come up before the first RPC call.
            time.sleep(1)

    def __del__(self):
        """Make sure aria2 is shut down when this object is destroyed."""
        if self.is_connected():
            self.shutdown()
        self.process_file.close()
        try:
            os.remove(self.process_file.name)
        except:
            # NOTE(review): bare except — any failure to delete is only
            # reported, never raised (safe inside __del__).
            print("process.out 自动删除失败……")

    def rpc_api(method):
        """Decorator factory: replace the decorated stub entirely with a
        JSON-RPC call to *method*.

        The stub's own body is never executed; only its signature
        documents the expected arguments.  None arguments are filtered
        out of the params list.
        """
        def rpc_method(func):
            def new_func(self, *args):
                data = {
                    'jsonrpc': '2.0',
                    'id': 'qwer',  # arbitrary fixed request id
                    'method': method,
                    'params': list(filter(lambda arg: arg is not None, args)),
                }
                res = requests.post(
                    self.rpc_url, data=json.dumps(data), timeout=2)
                return res.json()["result"]
            return new_func
        return rpc_method

    @rpc_api(method="aria2.addUri")
    def add_uri(self, uris, options=None, position=None):
        """Add a download task for the given URIs; returns its gid."""
        pass

    @rpc_api(method="aria2.getGlobalStat")
    def get_global_stat(self):
        """Fetch global download statistics."""
        pass

    @rpc_api(method="aria2.shutdown")
    def shutdown(self):
        """Ask aria2 to shut down."""
        pass

    @rpc_api(method="aria2.tellStatus")
    def tell_status(self, gid, keys=None):
        """Fetch the status record of the download identified by *gid*."""
        pass

    def init_rpc(self):
        """Start the aria2 RPC server as a subprocess."""
        cmd = self.aria2_path + \
            ' --enable-rpc' \
            ' --rpc-listen-port %d' \
            ' --continue' \
            ' --max-concurrent-downloads=20' \
            ' --max-connection-per-server=10' \
            ' --rpc-max-request-size=1024M' % self.port

        return subprocess.Popen(cmd, shell=True, stdout=self.process_file)

    def is_connected(self):
        """Whether the aria2 RPC port answers at all."""
        try:
            requests.post(self.rpc_url)
            return True
        except requests.exceptions.ConnectionError:
            return False

    def is_installed(self):
        """Whether the aria2c executable can be run.

        aria2c exits with code 1 when invoked without arguments.
        """
        try:
            return subprocess.run([self.aria2_path], stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE).returncode == 1
        except FileNotFoundError:
            return False
102 |
103 |
class Aria2File():
    """One download task managed through an Aria2 instance.

    The file is downloaded to "<name>.t" and moved to its final name by
    rename() once complete, so an interrupted download never leaves a
    file that looks finished.
    """

    def __init__(self, aria2, url, file_name, dir, overwrite=False):
        """Queue *url* for download into *dir*/*file_name*."""
        self.aria2 = aria2
        self.path = os.path.join(dir, file_name)
        self.tmp_path = self.path + ".t"
        self.aria2_file = self.tmp_path + ".aria2"
        if overwrite:
            # Drop any stale partial download and its aria2 control file.
            for leftover in (self.tmp_path, self.aria2_file):
                if os.path.exists(leftover):
                    os.remove(leftover)
        self.gid = aria2.add_uri([url], {"dir": dir, "out": file_name + ".t"})
        self.renamed = False

    def _status(self):
        """Fetch the latest aria2 status record for this task."""
        return self.aria2.tell_status(self.gid)

    def get_length(self):
        """Total size in bytes, as reported by aria2."""
        return int(self._status()["totalLength"])

    def get_complete_length(self):
        """Bytes downloaded so far."""
        return int(self._status()["completedLength"])

    def get_status(self):
        """aria2 status string, e.g. 'active' or 'complete'."""
        return self._status()["status"]

    def get_speed(self):
        """Current download speed in bytes per second."""
        return int(self._status()["downloadSpeed"])

    def exists(self):
        """Whether the finished file already exists."""
        return os.path.exists(self.path)

    def rename(self):
        """Move the temporary file onto its final path."""
        if os.path.exists(self.path):
            os.remove(self.path)
        os.rename(self.tmp_path, self.path)
        self.renamed = True
145 |
--------------------------------------------------------------------------------
/utils/crawler.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 |
4 | import requests
5 |
6 |
class Crawler(requests.Session):
    """requests.Session preconfigured with a browser User-Agent, plus
    binary/text download helpers."""

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    }

    def __init__(self):
        super().__init__()
        self.headers.update(Crawler.header)

    def set_cookies(self, cookies):
        """Install a cookie dict on the session."""
        requests.utils.add_dict_to_cookiejar(self.cookies, cookies)

    def download_bin(self, url, file_path, stream=True, chunk_size=1024, **kw):
        """Download a binary file to *file_path* via a ".t" temp file.

        On failure the partial temp file is removed, a warning is
        printed, and the target file is left untouched.
        """
        res = self.get(url, stream=stream, **kw)
        tmp_path = file_path + ".t"
        try:
            with open(tmp_path, "wb") as f:
                if stream:
                    for chunk in res.iter_content(chunk_size=chunk_size):
                        if not chunk:
                            break
                        f.write(chunk)
                else:
                    f.write(res.content)
        except Exception:
            os.remove(tmp_path)
            print("[warn] {} failed to download".format(file_path))
            # Bug fix: previously execution fell through and tried to
            # rename the temp file that was just removed, raising
            # FileNotFoundError on every failed download.
            return
        if os.path.exists(file_path):
            os.remove(file_path)
        os.rename(tmp_path, file_path)

    def download_text(self, url, file_path, **kw):
        """Download text and save it as UTF-8, using the detected
        encoding to decode the response."""
        res = self.get(url, **kw)
        res.encoding = res.apparent_encoding
        with open(file_path, 'w', encoding='utf_8') as f:
            f.write(res.text)
50 |
--------------------------------------------------------------------------------