├── edx_dl
│   ├── __init__.py
│   ├── utils.py
│   ├── common.py
│   ├── parsing.py
│   └── edx_dl.py
├── test
│   ├── json
│   │   ├── empty.json
│   │   ├── minimal.json
│   │   ├── empty-text.json
│   │   ├── abridged-01.json
│   │   └── abridged-02.json
│   └── html
│       ├── index.txt
│       ├── new_sections_structure.html
│       └── empty_sections.html
├── MANIFEST.in
├── requirements-dev.txt
├── requirements.txt
├── edx-dl.py
├── tox.ini
├── .gitignore
├── .travis.yml
├── AUTHORS.md
├── .github
│   ├── ISSUE_TEMPLATE.md
│   └── PULL_REQUEST_TEMPLATE.md
├── setup.py
├── CONTRIBUTING.md
├── README.md
├── test_utils.py
├── test_edx_dl.py
├── test_parsing.py
└── LICENSE

/edx_dl/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/json/empty.json:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/json/minimal.json:
--------------------------------------------------------------------------------
1 | {}
2 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements*.txt
2 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pytest>=2.5
2 | pytest-cov
3 | pytest-xdist
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4>=4.1.3
2 | html5lib>=1.0b2
3 | six>=1.5.0
4 | youtube_dl>=2015.05.20
5 |
--------------------------------------------------------------------------------
/edx-dl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from edx_dl import edx_dl
5 |
6 | edx_dl.main()
7 |
--------------------------------------------------------------------------------
/test/json/empty-text.json:
--------------------------------------------------------------------------------
1 | {
2 |   "end": [
3 |     20428
4 |   ],
5 |   "start": [
6 |     18104
7 |   ],
8 |   "text": [
9 |     ""
10 |   ]
11 | }
12 |
--------------------------------------------------------------------------------
/test/json/abridged-01.json:
--------------------------------------------------------------------------------
1 | {
2 |   "end": [
3 |     20428
4 |   ],
5 |   "start": [
6 |     18104
7 |   ],
8 |   "text": [
9 |     "I am very glad to see everyone here\uff0c"
10 |   ]
11 | }
12 |
--------------------------------------------------------------------------------
/test/json/abridged-02.json:
--------------------------------------------------------------------------------
1 | {
2 |   "end": [
3 |     20428,
4 |     24721
5 |   ],
6 |   "start": [
7 |     18104,
8 |     20569
9 |   ],
10 |   "text": [
11 |     "I am very glad to see everyone here\uff0c",
12 |     "so let's enjoy the beauty of combinatorics together."
13 |   ]
14 | }
15 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py27,py34,pypy
3 |
4 | [testenv]
5 | downloadcache = .tox/_download/
6 |
7 | deps =
8 |     beautifulsoup4>=4.1.3
9 |     html5lib>=1.0b2
10 |     pytest>=2.5
11 |     six>=1.5.0
12 |     pytest-cov>=1.8.0
13 |     pytest-xdist>=1.8
14 |
15 | commands = py.test -v --cov edx_dl --cov-report html .
16 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | semantic.cache
3 | Downloaded/
4 | *~
5 | .*.swp
6 |
7 | # Byte-compiled / optimized / DLL files
8 | __pycache__/
9 | *.py[cod]
10 | *$py.class
11 | .ropeproject/
12 |
13 | # Distribution-related files
14 | *.egg-info
15 | /dist
16 | /build
17 |
18 | # Testing and CI files
19 | /.tox
20 | /.coverage
21 | /htmlcov
22 | .cache/
23 |
24 | # Application files
25 | edx-dl.cache
26 |
--------------------------------------------------------------------------------
/test/html/index.txt:
--------------------------------------------------------------------------------
1 | # this file maps the local HTML files to their pages of origin
2 | single_unit_multiple_subs.html: https://courses.edx.org/courses/edX/DemoX.1/2014/courseware/6156e0e685ee4a2ab017258108c0bccd/194bd1729fab47aba6507f737d9b90ba
3 | multiple_units.html: https://courses.edx.org/courses/BerkeleyX/CS184.1x/2012_Fall/courseware/Unit_0/L1
4 | from_html_single_unit_multiple_subs: https://mitprofessionalx.mit.edu
5 | new_sections_structure: https://courses.edx.org/courses/course-v1:Microsoft+DEV207.1x+1T2016/courseware/2e4818cb44e546e18777fa7e4b250574/
6 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |   - "2.7"
4 |   - "3.3"
5 |   - "3.4"
6 |   - "pypy"
7 | matrix:
8 |   allow_failures:
9 |     - python: "pypy"
10 | # command to install dependencies
11 | install:
12 |   - pip install -r requirements.txt
13 |   - pip install pytest pytest-cov pytest-xdist coveralls coverage
14 |
15 | # command to run tests
16 | script: py.test -v --cov edx_dl --cov-report html
17 |
18 | after_success:
19 |   coveralls
20 |
21 | notifications:
22 |   email:
23 |     - iemejia@gmail.com
24 |     - kidsshk3@gmail.com
25 |     - rbrito@ime.usp.br
26 |
--------------------------------------------------------------------------------
/AUTHORS.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | In lexicographic/alphabetical order, this file lists the authors of and
4 | contributors to the project. It is meant to recognize and credit their
5 | contributions.
6 |
7 | Inclusion of names in this file is completely voluntary, as some people
8 | may not want to be included given their potential employment requirements or
9 | other issues. We respect the contributors' wishes.
10 |
11 | To be included in this file, just send a pull request with your name, once
12 | you have at least one contribution to the project.
13 |
14 | # Contributors
15 |
16 | * Emad Shaaban
17 | * George Monkey
18 | * Ismaël Mejía
19 | * Rogério Theodoro de Brito
20 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | 🚨 Please review the [Troubleshooting](../#troubleshooting) section
2 | before reporting any issue. Also, don't forget to check the existing issues
3 | to avoid duplicates.
4 |
5 | ### Subject of the issue
6 | Describe your issue here.
7 |
8 | ### Your environment
9 | * Operating System (name/version):
10 | * Python version:
11 | * youtube-dl version:
12 | * edx-dl version:
13 |
14 | ### Steps to reproduce
15 | Tell us how to reproduce this issue. Please provide us with the course URL,
16 | and the specific subsection or unit if possible.
17 |
18 | ### Expected behaviour
19 | Tell us what should happen.
20 |
21 | ### Actual behaviour
22 | Tell us what happens instead. If the script fails, please copy the *entire*
23 | output of the command or the stack trace (don't forget to obfuscate your
24 | username and password). If you cannot copy the exception, attach a screenshot.
25 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | 🚨 Please review the [guidelines for contributing](CONTRIBUTING.md) to this repository.
2 |
3 | ## Proposed changes
4 |
5 | Describe the big picture of your changes here to communicate to the maintainers
6 | why we should accept this pull request. If it fixes a bug or resolves a feature
7 | request, be sure to link to that issue.
8 |
9 | ## Types of changes
10 |
11 | What types of changes does your code introduce?
12 | _Put an `x` in the boxes that apply_
13 |
14 | - [ ] Bugfix (non-breaking change which fixes an issue)
15 | - [ ] New feature (non-breaking change which adds functionality)
16 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
17 |
18 | ## Checklist
19 |
20 | _Put an `x` in the boxes that apply. You can also fill these out after creating
21 | the PR. If you're unsure about any of them, don't hesitate to ask. We're here
22 | to help! This is simply a reminder of what we are going to look for before
23 | merging your code._
24 |
25 | - [ ] I have read the [CONTRIBUTING](/CONTRIBUTING.md) doc
26 | - [ ] I agree to contribute my changes under the project's [LICENSE](/LICENSE)
27 | - [ ] I have checked that the unit tests pass locally with my changes
28 | - [ ] I have checked the style of the new code (lint/pep8)
29 | - [ ] I have added tests that prove my fix is effective or that my feature works
30 | - [ ] I have added necessary documentation (if appropriate)
31 |
32 | ## Further comments
33 |
34 | If this is a relatively large or complex change, please explain why you chose
35 | the solution you did and what alternatives you considered, etc.
36 |
37 | ### Reviewers
38 | If you know the person who can review your code, please add a @mention.
39 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from setuptools import setup
4 |
5 | # you can install this to a local test virtualenv like so:
6 | #   virtualenv venv
7 | #   ./venv/bin/pip install --editable .
8 | #   ./venv/bin/pip install --editable .[dev]  # with dev requirements, too
9 |
10 | #
11 | # FIXME: This won't work until we have a README file in .rst format (which
12 | # is what PyPI knows how to parse). In the meantime, we can use the following:
13 | #
14 | # pandoc --from=markdown --to=rst --output=README.rst README.md
15 | #
16 |
17 | setup(
18 |     name='edx-dl',
19 |     version='0.0',
20 |     maintainer='Ismaël Mejía, Rogério Theodoro de Brito',
21 |     maintainer_email='iemejia@gmail.com, rbrito@ime.usp.br',
22 |
23 |     license='LGPL',
24 |     url='https://github.com/shk3/edx-downloader',
25 |
26 |     install_requires=open('requirements.txt').readlines(),
27 |     extras_require=dict(
28 |         dev=open('requirements-dev.txt').readlines()
29 |     ),
30 |
31 |     description='Simple tool to download video and lecture materials from edx.org.',
32 |     long_description=open('README.rst', 'r').read(),
33 |     keywords=['edX', 'download', 'education', 'MOOCs', 'video'],
34 |     classifiers=[
35 |         'Development Status :: 4 - Beta',
36 |         'Environment :: Console',
37 |         'Intended Audience :: End Users/Desktop',
38 |         'License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)',
39 |         'Operating System :: OS Independent',
40 |         'Programming Language :: Python :: 2',
41 |         'Programming Language :: Python :: 2.7',
42 |         'Programming Language :: Python :: 3',
43 |         'Programming Language :: Python :: 3.3',
44 |         'Programming Language :: Python :: 3.4',
45 |         'Programming Language :: Python :: Implementation :: CPython',
46 |         'Programming Language :: Python :: Implementation :: PyPy',
47 |         'Programming Language :: Python',
48 |         'Topic :: Education',
49 |     ],
50 |
51 |     packages=["edx_dl"],
52 |     entry_points=dict(
53 |         console_scripts=[
54 |             'edx-dl=edx_dl.edx_dl:main'
55 |         ]
56 |     ),
57 |
58 |     platforms=['any'],
59 | )
60 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Please, before sending patches, read these brief comments. They are here to
2 | help the project keep its users happy with the program and to let the
3 | developers/maintainers feel good when changing code that other
4 | people contributed.
5 |
6 | For the record, when this document mentions "I", it mostly means that Rogério
7 | Brito (@rbrito) is the one to blame.
8 |
9 | # Write good commit messages
10 |
11 | When you write your pull request and your commit messages, please be
12 | detailed, explaining why you are doing what you are doing. Don't be afraid
13 | of being too verbose here. Also, please follow the highly recommended
14 | guidelines on how to write [good commit messages][commit-msgs].
15 |
16 | When in doubt, follow the model of the Linux kernel commit logs. Their
17 | commit messages are some of the best that I have seen. Also, the ffmpeg
18 | project has some good messages that I believe should be followed. If you
19 | are in a hurry, read the section named
20 | ["Contributing" from subsurface's README][contributing].
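
As an illustration only (this is a made-up message, not a real commit from
this project), a detailed commit message in that style might look like:

    parsing: handle the new edX section layout

    edX changed the markup of its courseware pages, which broke the
    extraction of sections. Teach the parser about the new layout and
    keep the old code path for sites that still use the classic one.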
21 |
22 | [commit-msgs]: https://robots.thoughtbot.com/5-useful-tips-for-a-better-commit-message
23 | [contributing]: https://github.com/torvalds/subsurface/blob/master/README#L71-L114
24 |
25 |
26 | # Test that your changes don't break existing functionality
27 |
28 | Make sure that you have all dependencies installed, for example via:
29 |
30 |     pip install -r requirements.txt
31 |     pip install -r requirements-dev.txt
32 |
33 | Run the test suite with
34 |
35 |     py.test -v --cov edx_dl --cov-report html
36 |
37 | If some test fails, please don't send your changes yet. Fix what broke
38 | before sending your pull request.
39 |
40 | If you need to change the test suite, explain in the commit message why it
41 | needs to be changed (e.g., the page layout or the authentication methods
42 | from edX changed, or they implemented a new kind of course).
43 |
44 | # Check for potential bugs
45 |
46 | Please help keep the code tidy by checking for any potential bugs with the
47 | help of [`pep8`][pep8], [`pyflakes`][pyflakes], and [`pylint`][pylint]. If
48 | you know of any other good tools for analyzing the code, let me know about
49 | them!
50 |
51 | [pep8]: https://pypi.python.org/pypi/pep8
52 | [pyflakes]: https://pypi.python.org/pypi/pyflakes/
53 | [pylint]: https://pypi.python.org/pypi/pylint
54 |
55 | If you happen to find any issue reported by these programs, I welcome you to
56 | fix it. Many of these issues are usually very easy to fix, and they are a
57 | great way to start contributing to this project (and to others in general).
58 | Furthermore, we all benefit from a better code base.
59 |
60 | # Changes in the tools that we use
61 |
62 | If you are proposing a substitute for a tool that we already use,
63 | take a few paragraphs to tell us why we should change.
64 |
65 | If we are not using something, it is most likely that one of the following
66 | options applies:
67 |
68 | 1. I (@rbrito) may not even know that what you are proposing exists, or I
69 |    have not yet "seen the light" as to why I should use it instead of what
70 |    I am currently using.
71 | 2. Even if I know about the tool, I may not know how to use it, or how it
72 |    would make me more productive. Educate me and we all will gain from a
73 |    better project.
74 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://travis-ci.org/coursera-dl/edx-dl.svg?branch=master)](https://travis-ci.org/coursera-dl/edx-dl)
2 | [![Coverage Status](https://coveralls.io/repos/coursera-dl/edx-dl/badge.svg?branch=master&service=github)](https://coveralls.io/github/coursera-dl/edx-dl?branch=master)
3 | [![Code Climate](https://codeclimate.com/github/coursera-dl/edx-dl/badges/gpa.svg)](https://codeclimate.com/github/coursera-dl/edx-dl)
4 |
5 | # Description
6 |
7 | `edx-dl` is a simple tool to download videos and lecture materials from Open
8 | edX-based sites. It requires a [Python][python] interpreter (>= 2.7) and
9 | very few other dependencies. It is platform independent, and should work
10 | fine under Unix (Linux, BSDs etc.), Windows or Mac OS X.
11 |
12 | We strongly recommend that, if you don't already have a Python interpreter
13 | installed, you [install Python >= 3.4][python3], if possible, since it
14 | has better security than Python 2.
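
If you are unsure which interpreter you have installed, you can check it
with:

    python --version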
15 |
16 | [python]: https://www.python.org/
17 | [python3]: https://www.python.org/downloads/
18 |
19 | # Dependencies
20 |
21 | To install all the dependencies, please do:
22 |
23 |     pip install -r requirements.txt
24 |
25 | ## youtube-dl
26 |
27 | One of the most important dependencies of `edx-dl` is `youtube-dl`. The
28 | installation step listed above already pulls in the most recent version of
29 | `youtube-dl` for you.
30 |
31 | Unfortunately, since many Open edX sites store their videos on Youtube and
32 | Youtube changes their layout from time to time, it may be necessary to
33 | upgrade your copy of `youtube-dl`. There are many ways to proceed here, but
34 | the simplest is to use:
35 |
36 |     pip install --upgrade youtube-dl
37 |
38 | # Quick Start
39 |
40 | Once you have installed everything, let `edx-dl.py` discover the
41 | courses in which you are enrolled by issuing:
42 |
43 |     python edx-dl.py -u user@user.com --list-courses
44 |
45 | From there, choose the course you are interested in, copy its URL and use it
46 | in the following command:
47 |
48 |     python edx-dl.py -u user@user.com COURSE_URL
49 |
50 | replacing `COURSE_URL` with the URL that you just copied in the first step.
51 | It should look something like:
52 | https://courses.edx.org/courses/edX/DemoX.1/2014/info
53 |
54 | Your downloaded videos will be placed in a new directory called
55 | `Downloaded`, inside your current directory, but you can also choose another
56 | destination with the `-o` argument.
57 |
58 | To see all available options and a brief description of what they do, simply
59 | execute:
60 |
61 |     python edx-dl.py --help
62 |
63 | *Important Note:* To use sites other than edx.org, you have to specify the
64 | site with the `-x` option. For example, use `-x stanford` if the course
65 | that you want to get is hosted on Stanford's site.
66 |
67 | # Reporting issues
68 |
69 | Before reporting any issue, please follow the steps below:
70 |
71 | 1. Verify that you are running the latest version of all the programs (both
72 |    of `edx-dl` and of `youtube-dl`). Use the following command if in doubt:
73 |
74 |         pip install --upgrade edx-dl
75 |
76 | 2. If the problem persists, feel free to [open an issue][issue] in our
77 |    bugtracker, filling in the issue template with *as much information as
78 |    possible*.
79 |
80 | [issue]: https://github.com/coursera-dl/edx-dl/issues
81 |
82 | # Supported sites
83 |
84 | These are the currently supported sites:
85 |
86 | - [edX](http://edx.org)
87 | - [Stanford](http://lagunita.stanford.edu/)
88 | - [University of Sydney](http://online.it.usyd.edu.au)
89 | - [France Université Numérique](https://www.france-universite-numerique-mooc.fr/)
90 | - [GW Online SEAS](http://openedx.seas.gwu.edu/) - George Washington University
91 | - [GW Online Open](http://mooc.online.gwu.edu/) - George Washington University
92 |
93 | This is the full [list of sites powered by Open edX][sites]. Not all of them
94 | are supported at the moment; we welcome you to contribute support for them
95 | by sending a pull request, or to request support via our [issue tracker][issue].
96 |
97 | [sites]: https://github.com/edx/edx-platform/wiki/Sites-powered-by-Open-edX
98 |
99 | # Authors
100 |
101 | See the contributors to the project in the [AUTHORS.md][authors] file. If
102 | you have contributed to the project, we would like to gladly credit you for
103 | your work. Just send us a note to be added to that list.
104 |
105 | [authors]: https://github.com/coursera-dl/edx-dl/blob/master/AUTHORS.md
106 |
--------------------------------------------------------------------------------
/edx_dl/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # This module contains generic functions, ideally useful to any other module
5 | from six.moves.urllib.request import urlopen, Request
6 | from six.moves import html_parser
7 |
8 | import errno
9 | import json
10 | import logging
11 | import os
12 | import string
13 | import subprocess
14 |
15 |
16 | def get_filename_from_prefix(target_dir, filename_prefix):
17 |     """
18 |     Return the basename for the corresponding filename_prefix.
19 |     """
20 |     # This whole function is not the nicest thing, but isolating it makes
21 |     # things clearer. A good refactoring would be to get the info from the
22 |     # video_url or the current output, to avoid the iteration from the
23 |     # current dir.
24 |     filenames = os.listdir(target_dir)
25 |     for name in filenames:  # Find the filename of the downloaded video
26 |         if name.startswith(filename_prefix):
27 |             basename, _ = os.path.splitext(name)
28 |             return basename
29 |     return None
30 |
31 |
32 | def execute_command(cmd, args):
33 |     """
34 |     Run the given external command cmd; honor args.ignore_errors on failure.
35 |     """
36 |     try:
37 |         subprocess.check_call(cmd)
38 |     except subprocess.CalledProcessError as e:
39 |         if args.ignore_errors:
40 |             logging.warning('External command error ignored: %s', e)
41 |         else:
42 |             raise
43 |
44 |
45 | def directory_name(initial_name):
46 |     """
47 |     Transform the name of a directory into an ASCII version
48 |     """
49 |     result = clean_filename(initial_name)
50 |     return result if result != "" else "course_folder"
51 |
52 |
53 | def get_page_contents(url, headers):
54 |     """
55 |     Get the contents of the page at the URL given by url. While making the
56 |     request, we use the headers given in the dictionary in headers.
57 |     """
58 |     result = urlopen(Request(url, None, headers))
59 |     try:
60 |         # for python3
61 |         charset = result.headers.get_content_charset(failobj="utf-8")
62 |     except AttributeError:  # for python2
63 |         charset = result.info().getparam('charset') or 'utf-8'
64 |     return result.read().decode(charset)
65 |
66 |
67 | def get_page_contents_as_json(url, headers):
68 |     """
69 |     Make a request to the url and immediately parse the result, assuming
70 |     it is formatted as JSON.
71 |     """
72 |     json_string = get_page_contents(url, headers)
73 |     json_object = json.loads(json_string)
74 |     return json_object
75 |
76 |
77 | def remove_duplicates(orig_list, seen=set()):
78 |     """
79 |     Return a tuple (new_list, new_seen): new_list keeps the elements of
80 |     orig_list not already seen, and new_seen is seen plus those elements.
81 |
82 |     The function tries to maintain the order of the elements in orig_list as
83 |     much as possible, only "removing" a given element if it appeared earlier
84 |     in orig_list or if it was already a member of seen.
85 |
86 |     This function does *not* modify any of its input parameters.
87 |     """
88 |     new_list = []
89 |     new_seen = set(seen)
90 |
91 |     for elem in orig_list:
92 |         if elem not in new_seen:
93 |             new_list.append(elem)
94 |             new_seen.add(elem)
95 |
96 |     return new_list, new_seen
97 |
98 |
99 | # The next functions come from coursera-dl/coursera
100 | def mkdir_p(path, mode=0o777):
101 |     """
102 |     Create the subdirectory hierarchy given in the path argument.
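    (Behaves like `mkdir -p`: intermediate directories are created as
    needed, and already-existing directories are not treated as errors.)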
103 |     """
104 |     try:
105 |         os.makedirs(path, mode)
106 |     except OSError as exc:
107 |         if exc.errno == errno.EEXIST and os.path.isdir(path):
108 |             pass
109 |         else:
110 |             raise
111 |
112 |
113 | def clean_filename(s, minimal_change=False):
114 |     """
115 |     Sanitize a string to be used as a filename.
116 |     If minimal_change is set to true, then we only strip the bare minimum
117 |     of characters that are problematic for filesystems (namely ':', '/',
118 |     '\x00' and '\n').
119 |     """
120 |
121 |     # First, unescape any HTML entities in the string
122 |     h = html_parser.HTMLParser()
123 |     s = h.unescape(s)
124 |
125 |     # Replace characters that are problematic for filesystems
126 |     s = (
127 |         s.replace(':', '-')
128 |         .replace('/', '-')
129 |         .replace('\x00', '-')
130 |         .replace('\n', '')
131 |     )
132 |
133 |     if minimal_change:
134 |         return s
135 |
136 |     s = s.replace('(', '').replace(')', '')  # strip parens, e.g. of time lengths
137 |     s = s.rstrip('.')  # Remove excess of trailing dots
138 |
139 |     s = s.strip().replace(' ', '_')
140 |     valid_chars = '-_.()%s%s' % (string.ascii_letters, string.digits)
141 |     return ''.join(c for c in s if c in valid_chars)
142 |
--------------------------------------------------------------------------------
/edx_dl/common.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Common type definitions and constants for edx-dl
5 |
6 | The classes in this module represent the structure of courses in edX. The
7 | structure is:
8 |
9 | * A Course contains Sections
10 | * Each Section contains Subsections
11 | * Each Subsection contains Units
12 |
13 | Notice that we don't represent the full tree structure for both performance
14 | and UX reasons:
15 |
16 | Course -> [Section] -> [SubSection] -> [Unit] -> [Video]
17 |
18 | In the script the data structures used are:
19 |
20 | 1. The data structures to represent the course information:
21 |    Course, Section->[SubSection]
22 |
23 | 2. The data structures to represent the chosen courses and sections:
24 |    selections = {Course: [Section]}
25 |
26 | 3. The data structure of all the downloadable resources, which represents
27 |    each subsection via its URL and the resources that can be extracted
28 |    from the Units it contains:
29 |    all_units = {Subsection.url: [Unit]}
30 |
31 | 4. The units can contain multiple videos:
32 |    Unit -> [Video]
33 | """
34 |
35 |
36 | class Course(object):
37 |     """
38 |     Course class represents course information.
39 |     """
40 |     def __init__(self, id, name, url, state):
41 |         """
42 |         @param id: The id of a course in edX is composed of the path
43 |             {organization}/{course_number}/{course_run}
44 |         @type id: str or None
45 |
46 |         @param name: Name of the course. The name is taken from the course
47 |             page's h3 header.
48 |         @type name: str
49 |
50 |         @param url: URL of the course.
51 |         @type url: str or None
52 |
53 |         @param state: State of the course. One of the following values:
54 |             * 'Not yet'
55 |             * 'Started'
56 |         @type state: str
57 |         """
58 |         self.id = id
59 |         self.name = name
60 |         self.url = url
61 |         self.state = state
62 |
63 |     def __repr__(self):
64 |         url = self.url if self.url else "None"
65 |         return self.name + ": " + url
66 |
67 |
68 | class Section(object):
69 |     """
70 |     Representation of a section of the course.
71 |     """
72 |     def __init__(self, position, name, url, subsections):
73 |         """
74 |         @param position: Integer position of the section in the list of
75 |             sections. Starts at 1.
76 |         @type position: int
77 |
78 |         @param name: Name of the section.
79 |         @type name: str
80 |
81 |         @param url: URL of the section. None when section contains no
82 |             subsections.
83 |         @type url: str or None
84 |
85 |         @param subsections: List of subsections.
86 |         @type subsections: [SubSection]
87 |         """
88 |         self.position = position
89 |         self.name = name
90 |         self.url = url
91 |         self.subsections = subsections
92 |
93 |
94 | class SubSection(object):
95 |     """
96 |     Representation of a subsection in a section.
97 |     """
98 |     def __init__(self, position, name, url):
99 |         """
100 |         @param position: Integer position of the subsection in the subsection
101 |             list. Starts at 1.
102 |         @type position: int
103 |
104 |         @param name: Name of the subsection.
105 |         @type name: str
106 |
107 |         @param url: URL of the subsection.
108 |         @type url: str
109 |         """
110 |         self.position = position
111 |         self.name = name
112 |         self.url = url
113 |
114 |     def __repr__(self):
115 |         return self.name + ": " + self.url
116 |
117 | class Unit(object):
118 |     """
119 |     Representation of a single unit of the course.
120 |     """
121 |     def __init__(self, videos, resources_urls):
122 |         """
123 |         @param videos: List of videos present in the unit.
124 |         @type videos: [Video]
125 |
126 |         @param resources_urls: List of additional resources that come along
127 |             with the unit. Resources include files with certain extensions
128 |             and youtube links.
129 |         @type resources_urls: [str]
130 |         """
131 |         self.videos = videos
132 |         self.resources_urls = resources_urls
133 |
134 |
135 | class Video(object):
136 |     """
137 |     Representation of a single video.
138 |     """
139 |     def __init__(self, video_youtube_url, available_subs_url,
140 |                  sub_template_url, mp4_urls):
141 |         """
142 |         @param video_youtube_url: Youtube link (if any).
143 |         @type video_youtube_url: str or None
144 |
145 |         @param available_subs_url: URL to the available subtitles.
146 |         @type available_subs_url: str
147 |
148 |         @param sub_template_url: Subtitle URL template; '%s' marks the language.
149 |         @type sub_template_url: str
150 |
151 |         @param mp4_urls: List of URLs to mp4 video files.
152 |         @type mp4_urls: [str]
153 |         """
154 |         self.video_youtube_url = video_youtube_url
155 |         self.available_subs_url = available_subs_url
156 |         self.sub_template_url = sub_template_url
157 |         self.mp4_urls = mp4_urls
158 |
159 |
160 | class ExitCode(object):
161 |     """
162 |     Class that contains all exit codes of the program.
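    These codes are meant to let callers of the program (shell scripts,
    for instance) distinguish between its failure modes.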
163 | """ 164 | OK = 0 165 | MISSING_CREDENTIALS = 1 166 | WRONG_EMAIL_OR_PASSWORD = 2 167 | MISSING_COURSE_URL = 3 168 | INVALID_COURSE_URL = 4 169 | UNKNOWN_PLATFORM = 5 170 | NO_DOWNLOADABLE_VIDEO = 6 171 | 172 | 173 | YOUTUBE_DL_CMD = ['youtube-dl', '--ignore-config'] 174 | DEFAULT_CACHE_FILENAME = 'edx-dl.cache' 175 | DEFAULT_FILE_FORMATS = ['e?ps', 'pdf', 'txt', 'doc', 'xls', 'ppt', 176 | 'docx', 'xlsx', 'pptx', 'odt', 'ods', 'odp', 'odg', 177 | 'zip', 'rar', 'gz', 'mp3'] 178 | -------------------------------------------------------------------------------- /test_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import unicode_literals 5 | 6 | import subprocess 7 | 8 | import pytest 9 | import six 10 | 11 | from edx_dl import utils 12 | 13 | 14 | def test_clean_filename(): 15 | strings = { 16 | '(23:90)': '23-90', 17 | '(:': '-', 18 | 'a téest &and a@noòtheèr': 'a_test_and_another', 19 | 'Lecture 2.7 - Evaluation and Operators (16:25)': 20 | 'Lecture_2.7_-_Evaluation_and_Operators_16-25', 21 | 'Week 3: Data and Abstraction': 22 | 'Week_3-_Data_and_Abstraction', 23 | ' (Week 1) BRANDING: Marketing Strategy and Brand Positioning': 24 | 'Week_1_BRANDING-__Marketing_Strategy_and_Brand_Positioning', 25 | 'test & " adfas': 'test___adfas', 26 | ' ': '' 27 | } 28 | for k, v in six.iteritems(strings): 29 | actual_res = utils.clean_filename(k) 30 | assert actual_res == v, actual_res 31 | 32 | 33 | def test_clean_filename_minimal_change(): 34 | strings = { 35 | '(23:90)': '(23-90)', 36 | '(:': '(-', 37 | 'a téest &and a@noòtheèr': 'a téest &and a@noòtheèr', 38 | 'Lecture 2.7 - Evaluation and Operators (16:25)': 39 | 'Lecture 2.7 - Evaluation and Operators (16-25)', 40 | 'Week 3: Data and Abstraction': 41 | 'Week 3- Data and Abstraction', 42 | ' (Week 1) BRANDING: Marketing Strategy and Brand Positioning': 43 | ' (Week 1) BRANDING- Marketing Strategy and Brand Positioning', 44 | 'test & " adfas': 'test & " adfas', 45 | ' ': u'\xa0' 46 | } 47 | for k, v in six.iteritems(strings): 48 | actual_res = utils.clean_filename(k, minimal_change=True) 49 | assert actual_res == v, actual_res 50 | 51 | 52 | @pytest.mark.skipif(True, 53 | reason="Needs change in interface") 54 | def test_execute_command_should_succeed(): 55 | actual_res = utils.execute_command(['ls', '--help']) 56 | assert actual_res == 0, actual_res 57 | 58 | 59 | @pytest.mark.skipif(True, 60 | reason="Needs change in interface") 61 | def test_execute_command_should_fail(): 62 | try: 63 | actual_res = utils.execute_command(['ls', '--help-does-not-exist']) 64 | except subprocess.CalledProcessError as e: 65 | assert True, "Expected exception thrown." 66 | else: 67 | assert False, "Unexpected exception (or no exception) thrown" 68 | 69 | # For the future 70 | # actual_res == 2, actual_res 71 | 72 | 73 | def test_get_filename_from_prefix(): 74 | target_dir = '.' 75 | 76 | cases = { 77 | 'requirements.txt': 'requirements', 78 | 'does-not-exist': None, 79 | # 'requirements': 'requirements-dev', # depends on filesystem! 
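        # (os.listdir returns entries in arbitrary order, so a prefix that
        # matches more than one file may yield any of them)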
80 | } 81 | 82 | for k, v in six.iteritems(cases): 83 | actual_res = utils.get_filename_from_prefix(target_dir, k) 84 | assert actual_res == v, actual_res 85 | 86 | 87 | def test_remove_duplicates_without_seen(): 88 | empty_set = set() 89 | lists = [ 90 | ([], [], empty_set), 91 | ([1], [1], {1}), 92 | ([1, 1], [1], {1}), 93 | 94 | ([None], [None], {None}), 95 | ([None, None], [None], {None}), 96 | ([1, None], [1, None], {1, None}), 97 | 98 | (['a'], ['a'], {'a'}), 99 | (['a', 'a'], ['a'], {'a'}), 100 | (['a', 'b'], ['a', 'b'], {'a', 'b'}), 101 | 102 | (['a', 'b', 'a'], ['a', 'b'], {'a', 'b'}), 103 | (['a', 'a', 'b'], ['a', 'b'], {'a', 'b'}), 104 | (['b', 'a', 'b'], ['b', 'a'], {'a', 'b'}), 105 | (['b', 'a', 'a'], ['b', 'a'], {'a', 'b'}), 106 | 107 | ([1, 2, 1, 2], [1, 2], {1, 2}), 108 | ] 109 | for l, reduced_l, seen in lists: 110 | actual_res = utils.remove_duplicates(l) 111 | assert actual_res == (reduced_l, seen), actual_res 112 | 113 | 114 | def test_remove_duplicates_with_seen(): 115 | empty_set = set() 116 | lists = [ 117 | ([], empty_set, [], empty_set), 118 | ([], {None}, [], {None}), 119 | ([], {1}, [], {1}), 120 | ([], {1, 2}, [], {1, 2}), 121 | 122 | ([1], empty_set, [1], {1}), 123 | ([1], {1}, [], {1}), 124 | 125 | ([1, 1], empty_set, [1], {1}), 126 | ([1, 1], {1}, [], {1}), 127 | ([1, 1], {None}, [1], {1, None}), 128 | ([1, 1], {2}, [1], {1, 2}), 129 | ([1, 1], {1, 2}, [], {1, 2}), 130 | 131 | ([None], empty_set, [None], {None}), 132 | ([None], {1}, [None], {1, None}), 133 | ([None], {1, 2}, [None], {1, 2, None}), 134 | ([None], {1, 2}, [None], {2, 1, None}), 135 | ([None], {1, 2}, [None], {None, 2, 1}), 136 | ([None], {1, 2}, [None], {2, None, 1}), 137 | ([None], {1, 2, None}, [], {1, 2, None}), 138 | 139 | ([1, None], empty_set, [1, None], {1, None}), 140 | ([1, None], {1}, [None], {1, None}), 141 | ([1, None], {None}, [1], {1, None}), 142 | ([1, None], {1, None}, [], {1, None}), 143 | ([1, None], {1, None, 2}, [], {1, None, 2}), 144 | 145 | ([None, 1], empty_set, [None, 1], {1, None}), 146 | ([None, 1], {1}, [None], {1, None}), 147 | ([None, 1], {None}, [1], {1, None}), 148 | ([None, 1], {1, None}, [], {1, None}), 149 | ([None, 1], {1, None, 2}, [], {1, None, 2}), 150 | 151 | (['a'], empty_set, ['a'], {'a'}), 152 | (['a'], {'a'}, [], {'a'}), 153 | (['a'], {None}, ['a'], {'a', None}), 154 | (['a'], {'b'}, ['a'], {'a', 'b'}), 155 | (['a'], {'a', 'b'}, [], {'a', 'b'}), 156 | 157 | (['a'], {'a', 'b', tuple()}, [], {'a', 'b', tuple()}), 158 | 159 | 160 | # (['a', 'a'], ['a'], {'a'}), 161 | # (['a', 'b'], ['a', 'b'], {'a', 'b'}), 162 | # (['a', 'b', 'a'], ['a', 'b'], {'a', 'b'}), 163 | # (['a', 'a', 'b'], ['a', 'b'], {'a', 'b'}), 164 | # (['b', 'a', 'b'], ['b', 'a'], {'a', 'b'}), 165 | # (['b', 'a', 'a'], ['b', 'a'], {'a', 'b'}), 166 | # ([1, 2, 1, 2], [1, 2], {1, 2}), 167 | # ([1, 2, 1, 2], [1, 2], {1, 2}), 168 | ] 169 | for l, seen_before, reduced_l, seen_after in lists: 170 | actual_res = utils.remove_duplicates(l, seen_before) 171 | assert actual_res == (reduced_l, seen_after), actual_res 172 | -------------------------------------------------------------------------------- /test_edx_dl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import pytest 5 | from edx_dl import edx_dl, parsing 6 | from edx_dl.common import Unit, Video, DEFAULT_FILE_FORMATS 7 | 8 | 9 | def test_failed_login(): 10 | resp = edx_dl.edx_login( 11 | edx_dl.LOGIN_API, edx_dl.edx_get_headers(), "guest", 
"guest") 12 | assert not resp.get('success', False) 13 | 14 | 15 | def test_remove_repeated_urls(): 16 | url = "test/html/multiple_units.html" 17 | site = 'https://courses.edx.org' 18 | with open(url, "r") as f: 19 | html_contents = f.read() 20 | page_extractor = parsing.CurrentEdXPageExtractor() 21 | units_extracted = page_extractor.extract_units_from_html(html_contents, 22 | site, 23 | DEFAULT_FILE_FORMATS) 24 | 25 | all_units = {url: units_extracted} 26 | filtered_units = edx_dl.remove_repeated_urls(all_units) 27 | num_all_urls = edx_dl.num_urls_in_units_dict(all_units) 28 | num_filtered_urls = edx_dl.num_urls_in_units_dict(filtered_units) 29 | 30 | assert num_all_urls == 18 31 | assert num_filtered_urls == 16 32 | assert num_all_urls != num_filtered_urls 33 | 34 | 35 | @pytest.fixture 36 | def all_units(): 37 | return { 38 | 'empty_section': [], 39 | 'nonempty_section': [Unit(videos=[], resources_urls=[]), 40 | Unit(videos=[Video(video_youtube_url=None, 41 | available_subs_url=None, 42 | sub_template_url=None, 43 | mp4_urls=[])], resources_urls=[]), 44 | Unit(videos=[Video(video_youtube_url=None, 45 | available_subs_url=None, 46 | sub_template_url=None, 47 | mp4_urls=['1', '2'])], resources_urls=['3']), 48 | ] 49 | } 50 | 51 | 52 | @pytest.fixture 53 | def unknown_units(): 54 | return { 55 | 'nonempty_section': ['shouldfail'] 56 | } 57 | 58 | 59 | @pytest.fixture 60 | def unknown_videos(): 61 | return { 62 | 'nonempty_section': [Unit(videos=['shoudfail'], resources_urls=['3'])] 63 | } 64 | 65 | 66 | def test_extract_urls_from_units(all_units): 67 | """ 68 | Make sure that urls are grabbed from both mp4_urls and from 69 | resources_urls of Unit class. 70 | """ 71 | urls = edx_dl.extract_urls_from_units(all_units, '%(url)s') 72 | expected = ['1\n', '2\n', '3\n'] 73 | assert sorted(urls) == sorted(expected) 74 | 75 | 76 | def test_extract_urls_from_units_unknown_units(unknown_units): 77 | """ 78 | Make sure that we only expect Units in the list of units. 79 | """ 80 | with pytest.raises(TypeError): 81 | edx_dl.extract_urls_from_units(unknown_units, '%(url)s') 82 | 83 | 84 | def test_extract_urls_from_units_unknown_videos(unknown_videos): 85 | """ 86 | Make sure that we only expect Video in the list of Unit videos. 87 | """ 88 | with pytest.raises(TypeError): 89 | edx_dl.extract_urls_from_units(unknown_videos, '%(url)s') 90 | 91 | 92 | def test_edx_get_subtitle(): 93 | """ 94 | Make sure Stanford subtitle URLs are distinguished from EdX ones. 
95 | """ 96 | 97 | def mock_get_page_contents(u, h): 98 | assert u == url 99 | assert h == headers 100 | return u 101 | 102 | def mock_get_page_contents_as_json(u, h): 103 | assert u == url 104 | assert h == headers 105 | return { 'start' : [123], 'end' : [456], 'text' : ["subtitle content"] } 106 | 107 | url = "https://lagunita.stanford.edu/courses/Engineering/QMSE02./Winter2016/xblock/i4x:;_;_Engineering;_QMSE02.;_video;_7f4f16e3eb294538aa8db4c43877132b/handler/transcript/download" 108 | headers = {} 109 | get_page_contents = lambda u, h: u 110 | 111 | expected = url 112 | actual = edx_dl.edx_get_subtitle(url, headers, mock_get_page_contents, mock_get_page_contents_as_json) 113 | assert expected == actual 114 | 115 | # Make sure Non-Stanford URLs still work 116 | url = "https://www.edx.org/could/be/more/realistic" 117 | 118 | expected = '0\n00:00:00,123 --> 00:00:00,456\nsubtitle content\n\n' 119 | actual = edx_dl.edx_get_subtitle(url, headers, mock_get_page_contents, mock_get_page_contents_as_json) 120 | assert expected == actual 121 | 122 | 123 | def test_extract_subtitle_urls(): 124 | text = """ 125 | <li class="video-tracks video-download-button"> 126 | <a href="/courses/Engineering/QMSE02./Winter2016/xblock/i4x:;_;_Engineering;_QMSE02.;_video;_1a4c7ff41e484a15927987b745a5c779/handler/transcript/download">Download transcript</a> 127 | <div class="a11y-menu-container"> 128 | <a class="a11y-menu-button" href="#" title=".srt" role="button" aria-disabled="false">.srt</a> 129 | <ol class="a11y-menu-list" role="menu"> 130 | <li class="a11y-menu-item active"> 131 | 132 | <a class="a11y-menu-item-link" href="#srt" title="SubRip (.srt) file" data-value="srt" role="menuitem" aria-disabled="false"> 133 | SubRip (.srt) file 134 | </a> 135 | </li> 136 | <li class="a11y-menu-item"> 137 | 138 | <a class="a11y-menu-item-link" href="#txt" title="Text (.txt) file" data-value="txt" role="menuitem" aria-disabled="false"> 139 | Text (.txt) file 140 | </a> 141 | </li> 142 | </ol> 143 | </div> 144 | </li> 145 | """ 146 | 147 | page_extractor = parsing.CurrentEdXPageExtractor() 148 | expected = (None, 'https://base.url/courses/Engineering/QMSE02./Winter2016/xblock/i4x:;_;_Engineering;_QMSE02.;_video;_1a4c7ff41e484a15927987b745a5c779/handler/transcript/download') 149 | actual = page_extractor.extract_subtitle_urls(text, "https://base.url") 150 | print("actual", actual) 151 | assert expected == actual 152 | -------------------------------------------------------------------------------- /test_parsing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import unicode_literals 5 | 6 | import json 7 | 8 | import pytest 9 | 10 | from edx_dl.common import DEFAULT_FILE_FORMATS 11 | 12 | from edx_dl.parsing import ( 13 | edx_json2srt, 14 | ClassicEdXPageExtractor, 15 | CurrentEdXPageExtractor, 16 | is_youtube_url, 17 | ) 18 | 19 | 20 | # Test conversion of JSON subtitles to srt 21 | def test_empty_json_subtitle(): 22 | with open('test/json/empty.json') as f: 23 | json_string = f.read() 24 | with pytest.raises(ValueError): 25 | json_contents = json.loads(json_string) 26 | 27 | 28 | @pytest.mark.parametrize( 29 | 'file,expected', [ 30 | ('test/json/empty-text.json', ''), 31 | ('test/json/minimal.json', ''), 32 | ('test/json/abridged-01.json', ('0\n' 33 | '00:00:18,104 --> 00:00:20,428\n' 34 | 'I am very glad to see everyone here,\n\n')), 35 | ('test/json/abridged-02.json', ('0\n' 36 | '00:00:18,104 --> 
00:00:20,428\n' 37 | 'I am very glad to see everyone here,\n\n' 38 | '1\n' 39 | '00:00:20,569 --> 00:00:24,721\n' 40 | 'so let\'s enjoy the beauty of combinatorics together.\n\n')) 41 | ] 42 | ) 43 | def test_subtitles_from_json(file, expected): 44 | with open(file) as f: 45 | json_contents = json.loads(f.read()) 46 | res = edx_json2srt(json_contents) 47 | assert res == expected 48 | 49 | 50 | # Test extraction of video/other assets from HTML 51 | def test_extract_units_from_html_single_unit_multiple_subs(): 52 | site = 'https://courses.edx.org' 53 | with open("test/html/single_unit_multiple_subs.html", "r") as f: 54 | units = CurrentEdXPageExtractor().extract_units_from_html(f.read(), 55 | site, 56 | DEFAULT_FILE_FORMATS) 57 | 58 | assert units[0].videos[0].video_youtube_url == 'https://youtube.com/watch?v=b7xgknqkQk8' 59 | assert units[0].videos[0].mp4_urls[0] == 'https://d2f1egay8yehza.cloudfront.net/edx-edx101/EDXSPCPJSP13-H010000_100.mp4' 60 | assert units[0].videos[0].sub_template_url == 'https://courses.edx.org/courses/edX/DemoX.1/2014/xblock/i4x:;_;_edX;_DemoX.1;_video;_14459340170c476bb65f73a0a08a076f/handler/transcript/translation/%s' 61 | 62 | 63 | def test_extract_multiple_units_multiple_resources(): 64 | site = 'https://courses.edx.org' 65 | with open("test/html/multiple_units.html", "r") as f: 66 | units = CurrentEdXPageExtractor().extract_units_from_html(f.read(), 67 | site, 68 | DEFAULT_FILE_FORMATS) 69 | assert len(units) == 3 70 | # this one has multiple speeds in the data-streams field 71 | assert 'https://youtube.com/watch?v=CJ482b9r_0g' in [video.video_youtube_url for video in units[0].videos] 72 | assert len(units[0].videos[0].mp4_urls) > 0 73 | assert 'https://s3.amazonaws.com/berkeley-cs184x/videos/overview-motivation.mp4' in units[0].videos[0].mp4_urls 74 | assert 'https://courses.edx.org/static/content-berkeley-cs184x~2012_Fall/slides/overview.pdf' in units[0].resources_urls 75 | 76 | 77 | def test_extract_multiple_units_no_youtube_ids(): 78 | site = 'https://courses.edx.org' 79 | with open("test/html/multiple_units_no_youtube_ids.html", "r") as f: 80 | units = ClassicEdXPageExtractor().extract_units_from_html(f.read(), 81 | site, 82 | DEFAULT_FILE_FORMATS) 83 | assert units[0].videos[0].video_youtube_url is None 84 | assert len(units[0].videos[0].mp4_urls) > 0 85 | 86 | 87 | def test_extract_multiple_units_youtube_link(): 88 | site = 'https://courses.edx.org' 89 | with open("test/html/multiple_units_youtube_link.html", "r") as f: 90 | units = CurrentEdXPageExtractor().extract_units_from_html(f.read(), 91 | site, 92 | DEFAULT_FILE_FORMATS) 93 | assert 'https://www.youtube.com/watch?v=5OXQypOAbdI' in units[0].resources_urls 94 | 95 | 96 | def test_extract_multiple_units_multiple_youtube_videos(): 97 | site = 'https://courses.edx.org' 98 | with open("test/html/multiple_units_multiple_youtube_videos.html", "r") as f: 99 | units = CurrentEdXPageExtractor().extract_units_from_html(f.read(), 100 | site, 101 | DEFAULT_FILE_FORMATS) 102 | assert len(units[0].videos) == 3 103 | assert 'https://youtube.com/watch?v=3atHHNa2UwI' in [video.video_youtube_url for video in units[0].videos] 104 | 105 | 106 | @pytest.mark.parametrize( 107 | 'file,num_sections_expected,num_subsections_expected', [ 108 | ('test/html/new_sections_structure.html', 2, 12), 109 | ('test/html/empty_sections.html', 0, 0) 110 | ] 111 | ) 112 | def test_extract_sections(file, num_sections_expected, num_subsections_expected): 113 | site = 'https://courses.edx.org' 114 | with open(file, "r") as f: 115 | 
sections = CurrentEdXPageExtractor().extract_sections_from_html(f.read(), site) 116 | assert len(sections) == num_sections_expected 117 | num_subsections = sum(len(section.subsections) for section in sections) 118 | assert num_subsections == num_subsections_expected 119 | 120 | 121 | def test_extract_courses_from_html(): 122 | site = 'https://courses.edx.org' 123 | with open("test/html/dashboard.html", "r") as f: 124 | courses = CurrentEdXPageExtractor().extract_courses_from_html(f.read(), site) 125 | assert len(courses) == 18 126 | available_courses = [course for course in courses if course.state == 'Started'] 127 | assert len(available_courses) == 14 128 | 129 | 130 | def test_is_youtube_url(): 131 | invalid_urls = [ 132 | 'http://www.google.com/', 'TODO', 133 | 'https://d2f1egay8yehza.cloudfront.net/mit-24118/MIT24118T314-V015000_DTH.mp4', 134 | 'https://courses.edx.org/courses/course-v1:MITx+24.118x+2T2015/xblock/block-v1:MITx+24.118x+2T2015+type@video+block@b1588e7cccff4d448f4f9676c81184d9/handler/transcript/available_translations' 135 | ] 136 | valid_urls = [ 137 | 'http://www.youtu.be/rjOpZ3i6pRo', 138 | 'http://www.youtube.com/watch?v=rjOpZ3i6pRo', 139 | 'http://youtu.be/rjOpZ3i6pRo', 140 | 'http://youtube.com/watch?v=rjOpZ3i6pRo', 141 | 'https://www.youtu.be/rjOpZ3i6pRo', 142 | 'https://www.youtube.com/watch?v=rjOpZ3i6pRo', 143 | 'https://youtu.be/rjOpZ3i6pRo', 144 | 'https://youtube.com/watch?v=rjOpZ3i6pRo', 145 | ] 146 | for url in invalid_urls: 147 | assert not is_youtube_url(url) 148 | for url in valid_urls: 149 | assert is_youtube_url(url) 150 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 
36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. 
A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 
166 |
--------------------------------------------------------------------------------
/edx_dl/parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Parsing and extraction functions
5 | """
6 | import re
7 | import json
8 |
9 | from datetime import timedelta, datetime
10 |
11 | from six.moves import html_parser
12 | from bs4 import BeautifulSoup as BeautifulSoup_
13 |
14 | from .common import Course, Section, SubSection, Unit, Video
15 |
16 | # Force use of bs4 with html5lib
17 | BeautifulSoup = lambda page: BeautifulSoup_(page, 'html5lib')
18 |
19 | def edx_json2srt(o):
20 |     """
21 |     Transform the dict 'o' into the srt subtitles format
22 |     """
23 |     if o == {}:
24 |         return ''
25 |
26 |     base_time = datetime(1, 1, 1)
27 |     output = []
28 |
29 |     for i, (s, e, t) in enumerate(zip(o['start'], o['end'], o['text'])):
30 |         if t == '':
31 |             continue
32 |
33 |         output.append(str(i) + '\n')
34 |
35 |         s = base_time + timedelta(seconds=s/1000.)
36 |         e = base_time + timedelta(seconds=e/1000.)
37 |         time_range = "%02d:%02d:%02d,%03d --> %02d:%02d:%02d,%03d\n" % \
38 |                      (s.hour, s.minute, s.second, s.microsecond/1000,
39 |                       e.hour, e.minute, e.second, e.microsecond/1000)
40 |
41 |         output.append(time_range)
42 |         output.append(t + "\n\n")
43 |
44 |     return ''.join(output)
45 |
46 |
47 | class PageExtractor(object):
48 |     """
49 |     Base class for page extractors.
50 |     Every subclass can represent a different layout for an OpenEdX site.
51 |     They should implement the given methods.
52 |
53 |     Usage:
54 |
55 |     >>> import parsing
56 |     >>> d = parsing.SubclassFromPageExtractor()
57 |     >>> units = d.extract_units_from_html(page, BASE_URL)
58 |     >>> ...
59 |     """
60 |
61 |     def extract_units_from_html(self, page, BASE_URL, file_formats):
62 |         """
63 |         Method to extract the resources (units) from the given page
64 |         """
65 |         raise NotImplementedError("Subclasses should implement this")
66 |
67 |     def extract_sections_from_html(self, page, BASE_URL):
68 |         """
69 |         Method to extract the sections (and subsections) from an html page
70 |         """
71 |         raise NotImplementedError("Subclasses should implement this")
72 |
73 |     def extract_courses_from_html(self, page, BASE_URL):
74 |         """
75 |         Method to extract the courses from an html page
76 |         """
77 |         raise NotImplementedError("Subclasses should implement this")
78 |
79 |
80 | class ClassicEdXPageExtractor(PageExtractor):
81 |
82 |     def extract_units_from_html(self, page, BASE_URL, file_formats):
83 |         """
84 |         Extract Units from the html of a subsection webpage as a list of
85 |         resources
86 |         """
87 |         # In this function we avoid using BeautifulSoup for performance
88 |         # reasons. Parsing HTML with regular expressions is really nasty;
89 |         # don't do this unless you have to!
90 |         re_units = re.compile('(<div?[^>]id="seq_contents_\d+".*?>.*?<\/div>)',
91 |                               re.DOTALL)
92 |         units = []
93 |
94 |         for unit_html in re_units.findall(page):
95 |             unit = self.extract_unit(unit_html, BASE_URL, file_formats)
96 |             if len(unit.videos) > 0 or len(unit.resources_urls) > 0:
97 |                 units.append(unit)
98 |         return units
99 |
100 |     def extract_unit(self, text, BASE_URL, file_formats):
101 |         """
102 |         Parses the <div> of each unit and extracts the urls of its resources
103 |         """
104 |         video_youtube_url = self.extract_video_youtube_url(text)
105 |         available_subs_url, sub_template_url = self.extract_subtitle_urls(text, BASE_URL)
106 |         mp4_urls = self.extract_mp4_urls(text)
107 |         videos = [Video(video_youtube_url=video_youtube_url,
108 |                         available_subs_url=available_subs_url,
109 |                         sub_template_url=sub_template_url,
110 |                         mp4_urls=mp4_urls)]
111 | 
112 |         resources_urls = self.extract_resources_urls(text, BASE_URL,
113 |                                                      file_formats)
114 |         return Unit(videos=videos, resources_urls=resources_urls)
115 | 
116 |     def extract_video_youtube_url(self, text):
117 |         re_video_youtube_url = re.compile(r'data-streams=".*?1.0\d+\:(?:.*?)(.{11})')
118 |         video_youtube_url = None
119 |         match_video_youtube_url = re_video_youtube_url.search(text)
120 | 
121 |         if match_video_youtube_url is not None:
122 |             video_id = match_video_youtube_url.group(1)
123 |             video_youtube_url = 'https://youtube.com/watch?v=' + video_id
124 | 
125 |         return video_youtube_url
126 | 
127 |     def extract_subtitle_urls(self, text, BASE_URL):
128 |         re_sub_template_url = re.compile(r'data-transcript-translation-url=(?:&#34;|")([^"&]*)(?:&#34;|")')
129 |         re_available_subs_url = re.compile(r'data-transcript-available-translations-url=(?:&#34;|")([^"&]*)(?:&#34;|")')
130 |         available_subs_url = None
131 |         sub_template_url = None
132 |         match_subs = re_sub_template_url.search(text)
133 | 
134 |         if match_subs:
135 |             match_available_subs = re_available_subs_url.search(text)
136 |             if match_available_subs:
137 |                 available_subs_url = BASE_URL + match_available_subs.group(1)
138 |                 sub_template_url = BASE_URL + match_subs.group(1) + "/%s"
139 | 
140 |         else:
141 |             re_available_subs_url = re.compile(r'href=(?:&#34;|")([^"&]+)(?:&#34;|")>Download transcript<')
142 |             match_available_subs = re_available_subs_url.search(text)
143 |             if match_available_subs:
144 |                 sub_template_url = BASE_URL + match_available_subs.group(1)
145 |                 available_subs_url = None
146 | 
147 |         return available_subs_url, sub_template_url
148 | 
149 |     def extract_mp4_urls(self, text):
150 |         """
151 |         Looks for available links to the mp4 version of the videos
152 |         """
153 |         # mp4 urls may appear in two places: in the data-sources field and
154 |         # as <a href> refs. This regex tries to match all of those
155 |         # appearances; we exclude the ';' character from the urls, since it
156 |         # is used to separate multiple urls in one string (';' is a valid
157 |         # url character, but it is not really common).
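        # For example, on hypothetical markup such as the following (the
        # urls are invented purely for illustration):
        #   >>> re.findall(r'(?:(https?://[^;]*?\.mp4))',
        #   ...            '["https://cdn.example.org/a.mp4;https://cdn.example.org/b.mp4"]')
        #   ['https://cdn.example.org/a.mp4', 'https://cdn.example.org/b.mp4']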
158 |         re_mp4_urls = re.compile(r'(?:(https?://[^;]*?\.mp4))')
159 |         mp4_urls = list(set(re_mp4_urls.findall(text)))
160 | 
161 |         return mp4_urls
162 | 
163 |     def extract_resources_urls(self, text, BASE_URL, file_formats):
164 |         """
165 |         Extract resources looking for references in the webpage and
166 |         matching the given file formats
167 |         """
168 |         formats = '|'.join(file_formats)
169 |         re_resources_urls = re.compile(r'<a href=(?:&#34;|")([^"&]*\.(?:' + formats + '))(?:&#34;|")')
170 |         resources_urls = [url
171 |                           if url.startswith('http')
172 |                           else BASE_URL + url
173 |                           for url in re_resources_urls.findall(text)]
174 | 
175 |         # we match links to youtube videos given as <a href> refs and add
176 |         # them to the download list
177 |         re_youtube_links = re.compile(r'<a href=(?:&#34;|")(https?\:\/\/(?:www\.)?(?:youtube\.com|youtu\.?be)\/.*?)(?:&#34;|")')
178 |         youtube_links = re_youtube_links.findall(text)
179 |         resources_urls += youtube_links
180 | 
181 |         return resources_urls
182 | 
183 |     def extract_sections_from_html(self, page, BASE_URL):
184 |         """
185 |         Extract sections (Section->SubSection) from the html page
186 |         """
187 |         def _make_url(section_soup):  # FIXME: Extract from here and test
188 |             try:
189 |                 return BASE_URL + section_soup.ul.a['href']
190 |             except AttributeError:
191 |                 # Section might be empty and contain no links
192 |                 return None
193 | 
194 |         def _get_section_name(section_soup):  # FIXME: Extract from here and test
195 |             try:
196 |                 return section_soup.h3.a.string.strip()
197 |             except AttributeError:
198 |                 return None
199 | 
200 |         def _make_subsections(section_soup):
201 |             try:
202 |                 subsections_soup = section_soup.ul.find_all("li")
203 |             except AttributeError:
204 |                 return []
205 |             # FIXME correct extraction of subsection.name (unicode)
206 |             subsections = [SubSection(position=i,
207 |                                       url=BASE_URL + s.a['href'],
208 |                                       name=s.p.string)
209 |                            for i, s in enumerate(subsections_soup, 1)]
210 | 
211 |             return subsections
212 | 
213 |         soup = BeautifulSoup(page)
214 |         sections_soup = soup.find_all('div', attrs={'class': 'chapter'})
215 | 
216 |         sections = [Section(position=i,
217 |                             name=_get_section_name(section_soup),
218 |                             url=_make_url(section_soup),
219 |                             subsections=_make_subsections(section_soup))
220 |                     for i, section_soup in enumerate(sections_soup, 1)]
221 |         # Filter out those sections for which name or url could not be parsed
222 |         sections = [section for section in sections
223 |                     if section.name and section.url]
224 | 
225 |         return sections
226 | 
227 |     def extract_courses_from_html(self, page, BASE_URL):
228 |         """
229 |         Extracts courses (Course) from the html page
230 |         """
231 |         soup = BeautifulSoup(page)
232 |         courses_soup = soup.find_all('article', 'course')
233 |         courses = []
234 | 
235 |         for course_soup in courses_soup:
236 |             course_id = None
237 |             course_name = course_soup.h3.text.strip()
238 |             course_url = None
239 |             course_state = 'Not yet'
240 |             try:
241 |                 # started courses include the course link in the href attribute
242 |                 course_url = BASE_URL + course_soup.a['href']
243 |                 if course_url.endswith('info') or course_url.endswith('info/'):
244 |                     course_state = 'Started'
245 |                 # The id of a course in edX is composed of the path
246 |                 # {organization}/{course_number}/{course_run}
247 |                 course_id = course_soup.a['href'][9:-5]  # strips '/courses/' and '/info'
248 |             except KeyError:
249 |                 pass
250 |             courses.append(Course(id=course_id,
251 |                                   name=course_name,
252 |                                   url=course_url,
253 |                                   state=course_state))
254 | 
255 |         return courses
256 | 
257 | 
258 | class CurrentEdXPageExtractor(ClassicEdXPageExtractor):
259 |     """
260 |     A page extractor for the recent layout of edX sites
261 |     """
262 |     def extract_unit(self, text, BASE_URL, file_formats):
263 |         re_metadata = re.compile(r'data-metadata=&#39;(.*?)&#39;')
264 |         videos = []
265 |         match_metadatas = re_metadata.findall(text)
266 |         for match_metadata in match_metadatas:
267 |             metadata = html_parser.HTMLParser().unescape(match_metadata)
268 |             metadata = json.loads(html_parser.HTMLParser().unescape(metadata))
269 |             video_youtube_url = None
270 |             re_video_speed = re.compile(r'1.0\d+\:(?:.*?)(.{11})')
271 |             match_video_youtube_url = re_video_speed.search(metadata['streams'])
272 |             if match_video_youtube_url is not None:
273 |                 video_id = match_video_youtube_url.group(1)
274 |                 video_youtube_url = 'https://youtube.com/watch?v=' + video_id
275 |             # notice that the concrete languages now come in
276 |             # metadata['transcriptLanguages'], so we could eventually build the full urls here:
277 |             # subtitles_download_urls = {sub_lang:
278 |             #                            BASE_URL + metadata['transcriptTranslationUrl'].replace('__lang__', sub_lang)
279 |             #                            for sub_lang in metadata['transcriptLanguages'].keys()}
280 |             available_subs_url = BASE_URL + metadata['transcriptAvailableTranslationsUrl']
281 |             sub_template_url = BASE_URL + metadata['transcriptTranslationUrl'].replace('__lang__', '%s')
282 |             mp4_urls = [url for url in metadata['sources'] if url.endswith('.mp4')]
283 |             videos.append(Video(video_youtube_url=video_youtube_url,
284 |                                 available_subs_url=available_subs_url,
285 |                                 sub_template_url=sub_template_url,
286 |                                 mp4_urls=mp4_urls))
287 | 
288 |         resources_urls = self.extract_resources_urls(text, BASE_URL,
289 |                                                      file_formats)
290 |         return Unit(videos=videos, resources_urls=resources_urls)
291 | 
292 |     def extract_sections_from_html(self, page, BASE_URL):
293 |         """
294 |         Extract sections (Section->SubSection) from the html page
295 |         """
296 |         def _make_url(section_soup):  # FIXME: Extract from here and test
297 |             try:
298 |                 return BASE_URL + section_soup.div.div.a['href']
299 |             except AttributeError:
300 |                 # Section might be empty and contain no links
301 |                 return None
302 | 
303 |         def _get_section_name(section_soup):  # FIXME: Extract from here and test
304 |             try:
305 |                 return section_soup['aria-label'][:-8]  # strips the trailing ' Submenu'
306 |             except AttributeError:
307 |                 return None
308 | 
309 |         def _make_subsections(section_soup):
310 |             try:
311 |                 subsections_soup = section_soup.find_all('div', attrs={'class': 'menu-item'})
312 |             except AttributeError:
313 |                 return []
314 |             # FIXME correct extraction of subsection.name (unicode)
315 |             subsections = [SubSection(position=i,
316 |                                       url=BASE_URL + s.a['href'],
317 |                                       name=s.p.string)
318 |                            for i, s in enumerate(subsections_soup, 1)]
319 | 
320 |             return subsections
321 | 
322 |         soup = BeautifulSoup(page)
323 |         sections_soup = soup.find_all('div', attrs={'class': 'chapter-content-container'})
324 | 
325 |         sections = [Section(position=i,
326 |                             name=_get_section_name(section_soup),
327 |                             url=_make_url(section_soup),
328 |                             subsections=_make_subsections(section_soup))
329 |                     for i, section_soup in enumerate(sections_soup, 1)]
330 |         # Filter out those sections for which name or url could not be parsed
331 |         sections = [section for section in sections
332 |                     if section.name and section.url]
333 | 
334 |         return sections
335 | 
336 | 
337 | def get_page_extractor(url):
338 |     """
339 |     Factory method that picks the page extractor matching the given url
340 |     """
341 |     if url.startswith('https://courses.edx.org'):
342 |         return CurrentEdXPageExtractor()
343 | 
344 |     return ClassicEdXPageExtractor()
345 | 
346 | 
347 | def is_youtube_url(url):
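    """
    Tell whether the given url points to youtube; returns the match object,
    or None if the url does not look like a youtube link.
    """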
348 |     re_youtube_url = re.compile(r'(https?\:\/\/(?:www\.)?(?:youtube\.com|youtu\.?be)\/.*?)')
349 |     return re_youtube_url.match(url)
350 | 
--------------------------------------------------------------------------------
/edx_dl/edx_dl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | Main module for the edx-dl downloader.
6 | It implements the command-line interface.
7 | """
8 | 
9 | import argparse
10 | import getpass
11 | import json
12 | import logging
13 | import os
14 | import pickle
15 | import re
16 | import sys
17 | 
18 | from functools import partial
19 | from multiprocessing.dummy import Pool as ThreadPool
20 | 
21 | from six.moves.http_cookiejar import CookieJar
22 | from six.moves.urllib.error import HTTPError, URLError
23 | from six.moves.urllib.parse import urlencode
24 | from six.moves.urllib.request import (
25 |     urlopen,
26 |     build_opener,
27 |     install_opener,
28 |     HTTPCookieProcessor,
29 |     Request,
30 |     urlretrieve,
31 | )
32 | 
33 | from .common import (
34 |     YOUTUBE_DL_CMD,
35 |     DEFAULT_CACHE_FILENAME,
36 |     Unit,
37 |     Video,
38 |     ExitCode,
39 |     DEFAULT_FILE_FORMATS,
40 | )
41 | from .parsing import (
42 |     edx_json2srt,
43 |     get_page_extractor,
44 |     is_youtube_url,
45 | )
46 | from .utils import (
47 |     clean_filename,
48 |     directory_name,
49 |     execute_command,
50 |     get_filename_from_prefix,
51 |     get_page_contents,
52 |     get_page_contents_as_json,
53 |     mkdir_p,
54 |     remove_duplicates,
55 | )
56 | 
57 | 
58 | OPENEDX_SITES = {
59 |     'edx': {
60 |         'url': 'https://courses.edx.org',
61 |         'courseware-selector': ('nav', {'aria-label': 'Course Navigation'}),
62 |     },
63 |     'stanford': {
64 |         'url': 'https://lagunita.stanford.edu',
65 |         'courseware-selector': ('nav', {'aria-label': 'Course Navigation'}),
66 |     },
67 |     'usyd-sit': {
68 |         'url': 'http://online.it.usyd.edu.au',
69 |         'courseware-selector': ('nav', {'aria-label': 'Course Navigation'}),
70 |     },
71 |     'fun': {
72 |         'url': 'https://www.fun-mooc.fr',
73 |         'courseware-selector': ('section', {'aria-label': 'Menu du cours'}),
74 |     },
75 |     'gwu-seas': {
76 |         'url': 'http://openedx.seas.gwu.edu',
77 |         'courseware-selector': ('nav', {'aria-label': 'Course Navigation'}),
78 |     },
79 |     'gwu-open': {
80 |         'url': 'http://mooc.online.gwu.edu',
81 |         'courseware-selector': ('nav', {'aria-label': 'Course Navigation'}),
82 |     },
83 |     'mitprox': {
84 |         'url': 'https://mitprofessionalx.mit.edu',
85 |         'courseware-selector': ('nav', {'aria-label': 'Course Navigation'}),
86 |     },
87 |     'bits': {
88 |         'url': 'http://any-learn.bits-pilani.ac.in',
89 |         'courseware-selector': ('nav', {'aria-label': 'Course Navigation'}),
90 |     }
91 | }
92 | BASE_URL = OPENEDX_SITES['edx']['url']
93 | EDX_HOMEPAGE = BASE_URL + '/login_ajax'
94 | LOGIN_API = BASE_URL + '/login_ajax'
95 | DASHBOARD = BASE_URL + '/dashboard'
96 | COURSEWARE_SEL = OPENEDX_SITES['edx']['courseware-selector']
97 | 
98 | 
99 | def change_openedx_site(site_name):
100 |     """
101 |     Changes the Open edX site to the one identified by the given key
102 |     """
103 |     global BASE_URL
104 |     global EDX_HOMEPAGE
105 |     global LOGIN_API
106 |     global DASHBOARD
107 |     global COURSEWARE_SEL
108 | 
109 |     sites = sorted(OPENEDX_SITES.keys())
110 |     if site_name not in sites:
111 |         logging.error("OpenEdX platform should be one of: %s", ', '.join(sites))
112 |         sys.exit(ExitCode.UNKNOWN_PLATFORM)
113 | 
114 |     BASE_URL = OPENEDX_SITES[site_name]['url']
115 |     EDX_HOMEPAGE = BASE_URL + '/login_ajax'
116 |     LOGIN_API = BASE_URL + '/login_ajax'
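    # Note that EDX_HOMEPAGE and LOGIN_API point to the same '/login_ajax'
    # endpoint: the former is used as the Referer header and to bootstrap
    # the CSRF cookie (see edx_get_headers), the latter to POST credentials.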
117 |     DASHBOARD = BASE_URL + '/dashboard'
118 |     COURSEWARE_SEL = OPENEDX_SITES[site_name]['courseware-selector']
119 | 
120 | 
121 | def _display_courses(courses):
122 |     """
123 |     List the courses in which the user is enrolled.
124 |     """
125 |     logging.info('You can access %d courses', len(courses))
126 | 
127 |     for i, course in enumerate(courses, 1):
128 |         logging.info('%2d - %s [%s]', i, course.name, course.id)
129 |         logging.info('     %s', course.url)
130 | 
131 | 
132 | def get_courses_info(url, headers):
133 |     """
134 |     Extracts the courses information from the dashboard.
135 |     """
136 |     logging.info('Extracting course information from dashboard.')
137 | 
138 |     page = get_page_contents(url, headers)
139 |     page_extractor = get_page_extractor(url)
140 |     courses = page_extractor.extract_courses_from_html(page, BASE_URL)
141 | 
142 |     logging.debug('Data extracted: %s', courses)
143 | 
144 |     return courses
145 | 
146 | 
147 | def _get_initial_token(url):
148 |     """
149 |     Create initial connection to get authentication token for future
150 |     requests.
151 | 
152 |     Returns a string to be used in subsequent connections with the
153 |     X-CSRFToken header or the empty string if we didn't find any token in
154 |     the cookies.
155 |     """
156 |     logging.info('Getting initial CSRF token.')
157 | 
158 |     cookiejar = CookieJar()
159 |     opener = build_opener(HTTPCookieProcessor(cookiejar))
160 |     install_opener(opener)
161 |     opener.open(url)
162 | 
163 |     for cookie in cookiejar:
164 |         if cookie.name == 'csrftoken':
165 |             logging.info('Found CSRF token.')
166 |             return cookie.value
167 | 
168 |     logging.warn('Did not find the CSRF token.')
169 |     return ''
170 | 
171 | 
172 | def get_available_sections(url, headers):
173 |     """
174 |     Extracts the sections and subsections from a given url
175 |     """
176 |     logging.debug("Extracting sections for: " + url)
177 | 
178 |     page = get_page_contents(url, headers)
179 |     page_extractor = get_page_extractor(url)
180 |     sections = page_extractor.extract_sections_from_html(page, BASE_URL)
181 | 
182 |     logging.debug("Extracted sections: " + str(sections))
183 |     return sections
184 | 
185 | 
186 | def edx_get_subtitle(url, headers, get_page_contents=get_page_contents, get_page_contents_as_json=get_page_contents_as_json):
187 |     """
188 |     Return a string with the subtitles content from the url or None if no
189 |     subtitles are available.
190 |     """
191 |     try:
192 |         if ';' in url:  # non-JSON format (e.g. Stanford)
193 |             return get_page_contents(url, headers)
194 |         else:
195 |             json_object = get_page_contents_as_json(url, headers)
196 |             return edx_json2srt(json_object)
197 |     except URLError as exception:
198 |         logging.warn('edX subtitles (error: %s)', exception)
199 |         return None
200 |     except ValueError as exception:
201 |         logging.warn('edX subtitles (error: %s)', exception)
202 |         return None
203 | 
204 | 
205 | def edx_login(url, headers, username, password):
206 |     """
207 |     Log the user into the Open edX site.
208 |     """
209 |     logging.info('Logging into Open edX site: %s', url)
210 | 
211 |     post_data = urlencode({'email': username,
212 |                            'password': password,
213 |                            'remember': False}).encode('utf-8')
214 | 
215 |     request = Request(url, post_data, headers)
216 |     response = urlopen(request)
217 |     resp = json.loads(response.read().decode('utf-8'))
218 | 
219 |     return resp
220 | 
221 | 
222 | def parse_args():
223 |     """
224 |     Parse the arguments/options passed to the program on the command line.
225 | """ 226 | parser = argparse.ArgumentParser(prog='edx-dl', 227 | description='Get videos from the OpenEdX platform', 228 | epilog='For further use information,' 229 | 'see the file README.md',) 230 | # positional 231 | parser.add_argument('course_urls', 232 | nargs='*', 233 | action='store', 234 | default=[], 235 | help='target course urls ' 236 | '(e.g., https://courses.edx.org/courses/BerkeleyX/CS191x/2013_Spring/info)') 237 | 238 | # optional 239 | parser.add_argument('-u', 240 | '--username', 241 | required=True, 242 | action='store', 243 | help='your edX username (email)') 244 | 245 | parser.add_argument('-p', 246 | '--password', 247 | action='store', 248 | help='your edX password, ' 249 | 'beware: it might be visible to other users on your system') 250 | 251 | parser.add_argument('-f', 252 | '--format', 253 | dest='format', 254 | action='store', 255 | default=None, 256 | help='format of videos to download') 257 | 258 | parser.add_argument('-s', 259 | '--with-subtitles', 260 | dest='subtitles', 261 | action='store_true', 262 | default=False, 263 | help='download subtitles with the videos') 264 | 265 | parser.add_argument('-o', 266 | '--output-dir', 267 | action='store', 268 | dest='output_dir', 269 | help='store the files to the specified directory', 270 | default='Downloaded') 271 | 272 | parser.add_argument('-i', 273 | '--ignore-errors', 274 | dest='ignore_errors', 275 | action='store_true', 276 | default=False, 277 | help='continue on download errors, to avoid stopping large downloads') 278 | 279 | sites = sorted(OPENEDX_SITES.keys()) 280 | parser.add_argument('-x', 281 | '--platform', 282 | action='store', 283 | dest='platform', 284 | help='OpenEdX platform, one of: %s' % ', '.join(sites), 285 | default='edx') 286 | 287 | parser.add_argument('--list-courses', 288 | dest='list_courses', 289 | action='store_true', 290 | default=False, 291 | help='list available courses') 292 | 293 | parser.add_argument('--filter-section', 294 | dest='filter_section', 295 | action='store', 296 | default=None, 297 | help='filters sections to be downloaded') 298 | 299 | parser.add_argument('--list-sections', 300 | dest='list_sections', 301 | action='store_true', 302 | default=False, 303 | help='list available sections') 304 | 305 | parser.add_argument('--youtube-dl-options', 306 | dest='youtube_dl_options', 307 | action='store', 308 | default='', 309 | help='set extra options to pass to youtube-dl') 310 | 311 | parser.add_argument('--prefer-cdn-videos', 312 | dest='prefer_cdn_videos', 313 | action='store_true', 314 | default=False, 315 | help='prefer CDN video downloads over youtube (BETA)') 316 | 317 | parser.add_argument('--export-filename', 318 | dest='export_filename', 319 | default=None, 320 | help='filename where to put an exported list of urls. ' 321 | 'Use dash "-" to output to stdout. ' 322 | 'Download will not be performed if this option is ' 323 | 'present') 324 | 325 | parser.add_argument('--export-format', 326 | dest='export_format', 327 | default='%(url)s', 328 | help='export format string. Old-style python formatting ' 329 | 'is used. Available variables: %%(url)s. 
330 |                         '"%%(url)s"')
331 | 
332 |     parser.add_argument('--list-file-formats',
333 |                         dest='list_file_formats',
334 |                         action='store_true',
335 |                         default=False,
336 |                         help='list the default file formats extracted')
337 | 
338 |     parser.add_argument('--file-formats',
339 |                         dest='file_formats',
340 |                         action='store',
341 |                         default=None,
342 |                         help='appends file formats to be extracted (comma '
343 |                         'separated)')
344 | 
345 |     parser.add_argument('--overwrite-file-formats',
346 |                         dest='overwrite_file_formats',
347 |                         action='store_true',
348 |                         default=False,
349 |                         help='if set, overwrites the file formats to be '
350 |                         'extracted')
351 | 
352 |     parser.add_argument('--cache',
353 |                         dest='cache',
354 |                         action='store_true',
355 |                         default=False,
356 |                         help='create and use a cache of extracted resources')
357 | 
358 |     parser.add_argument('--dry-run',
359 |                         dest='dry_run',
360 |                         action='store_true',
361 |                         default=False,
362 |                         help='makes a dry run, only lists the resources')
363 | 
364 |     parser.add_argument('--sequential',
365 |                         dest='sequential',
366 |                         action='store_true',
367 |                         default=False,
368 |                         help='extracts the resources from the pages sequentially')
369 | 
370 |     parser.add_argument('--quiet',
371 |                         dest='quiet',
372 |                         action='store_true',
373 |                         default=False,
374 |                         help='omit as many messages as possible, only printing errors')
375 | 
376 |     parser.add_argument('--debug',
377 |                         dest='debug',
378 |                         action='store_true',
379 |                         default=False,
380 |                         help='print lots of debug information')
381 | 
382 |     args = parser.parse_args()
383 | 
384 |     # Initialize the logging system first so that other functions
385 |     # can use it right away.
386 |     if args.debug:
387 |         logging.basicConfig(level=logging.DEBUG,
388 |                             format='%(name)s[%(funcName)s] %(message)s')
389 |     elif args.quiet:
390 |         logging.basicConfig(level=logging.ERROR,
391 |                             format='%(name)s: %(message)s')
392 |     else:
393 |         logging.basicConfig(level=logging.INFO,
394 |                             format='%(message)s')
395 | 
396 |     return args
397 | 
398 | 
399 | def edx_get_headers():
400 |     """
401 |     Build the Open edX headers to create future requests.
402 |     """
403 |     logging.info('Building initial headers for future requests.')
404 | 
405 |     headers = {
406 |         'User-Agent': 'edX-downloader/0.01',
407 |         'Accept': 'application/json, text/javascript, */*; q=0.01',
408 |         'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
409 |         'Referer': EDX_HOMEPAGE,
410 |         'X-Requested-With': 'XMLHttpRequest',
411 |         'X-CSRFToken': _get_initial_token(EDX_HOMEPAGE),
412 |     }
413 | 
414 |     logging.debug('Headers built: %s', headers)
415 |     return headers
416 | 
417 | 
418 | def extract_units(url, headers, file_formats):
419 |     """
420 |     Parses a webpage and extracts its resources, e.g. video urls and subtitle urls.
421 | """ 422 | logging.info("Processing '%s'", url) 423 | 424 | page = get_page_contents(url, headers) 425 | page_extractor = get_page_extractor(url) 426 | units = page_extractor.extract_units_from_html(page, BASE_URL, file_formats) 427 | 428 | return units 429 | 430 | 431 | def extract_all_units_in_sequence(urls, headers, file_formats): 432 | """ 433 | Returns a dict of all the units in the selected_sections: {url, units} 434 | sequentially, this is clearer for debug purposes 435 | """ 436 | logging.info('Extracting all units information in sequentially.') 437 | logging.debug('urls: ' + str(urls)) 438 | 439 | units = [extract_units(url, headers, file_formats) for url in urls] 440 | all_units = dict(zip(urls, units)) 441 | 442 | return all_units 443 | 444 | 445 | def extract_all_units_in_parallel(urls, headers, file_formats): 446 | """ 447 | Returns a dict of all the units in the selected_sections: {url, units} 448 | in parallel 449 | """ 450 | logging.info('Extracting all units information in parallel.') 451 | logging.debug('urls: ' + str(urls)) 452 | 453 | mapfunc = partial(extract_units, file_formats=file_formats, headers=headers) 454 | pool = ThreadPool(16) 455 | units = pool.map(mapfunc, urls) 456 | pool.close() 457 | pool.join() 458 | all_units = dict(zip(urls, units)) 459 | 460 | return all_units 461 | 462 | 463 | def _display_sections_menu(course, sections): 464 | """ 465 | List the weeks for the given course. 466 | """ 467 | num_sections = len(sections) 468 | 469 | logging.info('%s [%s] has %d sections so far', course.name, course.id, num_sections) 470 | for i, section in enumerate(sections, 1): 471 | logging.info('%2d - Download %s videos', i, section.name) 472 | 473 | 474 | def _filter_sections(index, sections): 475 | """ 476 | Get the sections for the given index. 477 | 478 | If the index is not valid (that is, None, a non-integer, a negative 479 | integer, or an integer above the number of the sections), we choose all 480 | sections. 481 | """ 482 | num_sections = len(sections) 483 | 484 | logging.info('Filtering sections') 485 | 486 | if index is not None: 487 | try: 488 | index = int(index) 489 | if index > 0 and index <= num_sections: 490 | logging.info('Sections filtered to: %d', index) 491 | return [sections[index - 1]] 492 | else: 493 | pass # log some info here 494 | except ValueError: 495 | pass # log some info here 496 | else: 497 | pass # log some info here 498 | 499 | return sections 500 | 501 | 502 | def _display_sections(sections): 503 | """ 504 | Displays a tree of section(s) and subsections 505 | """ 506 | logging.info('Downloading %d section(s)', len(sections)) 507 | 508 | for section in sections: 509 | logging.info('Section %2d: %s', section.position, section.name) 510 | for subsection in section.subsections: 511 | logging.info(' %s', subsection.name) 512 | 513 | 514 | def parse_courses(args, available_courses): 515 | """ 516 | Parses courses options and returns the selected_courses. 
517 | """ 518 | if args.list_courses: 519 | _display_courses(available_courses) 520 | exit(ExitCode.OK) 521 | 522 | if len(args.course_urls) == 0: 523 | logging.error('You must pass the URL of at least one course, check the correct url with --list-courses') 524 | exit(ExitCode.MISSING_COURSE_URL) 525 | 526 | selected_courses = [available_course 527 | for available_course in available_courses 528 | for url in args.course_urls 529 | if available_course.url == url] 530 | if len(selected_courses) == 0: 531 | logging.error('You have not passed a valid course url, check the correct url with --list-courses') 532 | exit(ExitCode.INVALID_COURSE_URL) 533 | return selected_courses 534 | 535 | 536 | def parse_sections(args, selections): 537 | """ 538 | Parses sections options and returns selections filtered by 539 | selected_sections 540 | """ 541 | if args.list_sections: 542 | for selected_course, selected_sections in selections.items(): 543 | _display_sections_menu(selected_course, selected_sections) 544 | exit(ExitCode.OK) 545 | 546 | if not args.filter_section: 547 | return selections 548 | 549 | filtered_selections = {selected_course: 550 | _filter_sections(args.filter_section, selected_sections) 551 | for selected_course, selected_sections in selections.items()} 552 | return filtered_selections 553 | 554 | 555 | def parse_file_formats(args): 556 | """ 557 | parse options for file formats and builds the array to be used 558 | """ 559 | file_formats = DEFAULT_FILE_FORMATS 560 | 561 | if args.list_file_formats: 562 | logging.info(file_formats) 563 | exit(ExitCode.OK) 564 | 565 | if args.overwrite_file_formats: 566 | file_formats = [] 567 | 568 | if args.file_formats: 569 | new_file_formats = args.file_formats.split(",") 570 | file_formats.extend(new_file_formats) 571 | 572 | logging.debug("file_formats: %s", file_formats) 573 | return file_formats 574 | 575 | 576 | def _display_selections(selections): 577 | """ 578 | Displays the course, sections and subsections to be downloaded 579 | """ 580 | for selected_course, selected_sections in selections.items(): 581 | logging.info('Downloading %s [%s]', 582 | selected_course.name, selected_course.id) 583 | _display_sections(selected_sections) 584 | 585 | 586 | def parse_units(all_units): 587 | """ 588 | Parses units options and corner cases 589 | """ 590 | flat_units = [unit for units in all_units.values() for unit in units] 591 | if len(flat_units) < 1: 592 | logging.warn('No downloadable video found.') 593 | exit(ExitCode.NO_DOWNLOADABLE_VIDEO) 594 | 595 | 596 | def get_subtitles_urls(available_subs_url, sub_template_url, headers): 597 | """ 598 | Request the available subs and builds the urls to download subs 599 | """ 600 | if available_subs_url is not None and sub_template_url is not None: 601 | try: 602 | available_subs = get_page_contents_as_json(available_subs_url, 603 | headers) 604 | except HTTPError: 605 | available_subs = ['en'] 606 | 607 | return {sub_lang: sub_template_url % sub_lang 608 | for sub_lang in available_subs} 609 | 610 | elif sub_template_url is not None: 611 | try: 612 | available_subs = get_page_contents(sub_template_url, 613 | headers) 614 | except HTTPError: 615 | available_subs = ['en'] 616 | 617 | return {'en': sub_template_url} 618 | 619 | return {} 620 | 621 | 622 | def _build_subtitles_downloads(video, target_dir, filename_prefix, headers): 623 | """ 624 | Builds a dict {url: filename} for the subtitles, based on the 625 | filename_prefix of the video 626 | """ 627 | downloads = {} 628 | filename = 
629 | 
630 |     if filename is None:
631 |         logging.warn('No video downloaded for %s', filename_prefix)
632 |         return downloads
633 |     if video.sub_template_url is None:
634 |         logging.warn('No subtitles downloaded for %s', filename_prefix)
635 |         return downloads
636 | 
637 |     # This is a fix for retried downloads: the detected extension would then
638 |     # be .lang (from .lang.srt), so the matching would not correctly detect
639 |     # the subtitle name
640 |     re_is_subtitle = re.compile(r'(.*)(?:\.[a-z]{2})')
641 |     match_subtitle = re_is_subtitle.match(filename)
642 |     if match_subtitle:
643 |         filename = match_subtitle.group(1)
644 | 
645 |     subtitles_download_urls = get_subtitles_urls(video.available_subs_url,
646 |                                                  video.sub_template_url,
647 |                                                  headers)
648 |     for sub_lang, sub_url in subtitles_download_urls.items():
649 |         subs_filename = os.path.join(target_dir,
650 |                                      filename + '.' + sub_lang + '.srt')
651 |         downloads[sub_url] = subs_filename
652 |     return downloads
653 | 
654 | 
655 | def _build_url_downloads(urls, target_dir, filename_prefix):
656 |     """
657 |     Builds a dict {url: filename} for the given urls. For a youtube url it
658 |     uses the filename template that youtube-dl expects; otherwise it just
659 |     takes the file name from the url
660 |     """
661 |     downloads = {url:
662 |                  _build_filename_from_url(url, target_dir, filename_prefix)
663 |                  for url in urls}
664 |     return downloads
665 | 
666 | 
667 | def _build_filename_from_url(url, target_dir, filename_prefix):
668 |     """
669 |     Builds the appropriate filename for the given args
670 |     """
671 |     if is_youtube_url(url):
672 |         filename_template = filename_prefix + "-%(title)s-%(id)s.%(ext)s"
673 |         filename = os.path.join(target_dir, filename_template)
674 |     else:
675 |         original_filename = url.rsplit('/', 1)[1]
676 |         filename = os.path.join(target_dir,
677 |                                 filename_prefix + '-' + original_filename)
678 | 
679 |     return filename
680 | 
681 | 
682 | def download_url(url, filename, headers, args):
683 |     """
684 |     Downloads the given url into filename.
685 |     """
686 | 
687 |     if is_youtube_url(url):
688 |         download_youtube_url(url, filename, headers, args)
689 |     else:
690 |         import ssl
691 |         # FIXME: Ugly hack for coping with broken SSL sites:
692 |         # https://www.cs.duke.edu/~angl/papers/imc10-cloudcmp.pdf
693 |         #
694 |         # We should really ask the user if they want to stop the downloads
695 |         # or if they are OK proceeding without verification.
696 |         #
697 |         # Note that skipping verification by default could be a problem for
698 |         # people's lives if they happen to live in dictatorial countries.
699 |         #
700 |         # Note: The mess with various exceptions being caught (and their
701 |         # order) is due to different behaviors in different Python versions
702 |         # (e.g., 2.7 vs. 3.4).
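        # An opt-in escape hatch could look like the sketch below (not wired
        # to any existing command-line flag; shown only to document the idea):
        #
        #   context = ssl._create_unverified_context()
        #   response = urlopen(url, context=context)
        #
        # For now we simply attempt the download with the default (verifying)
        # behavior and let --ignore-errors decide whether a failure is fatal.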
703 |         try:
704 |             urlretrieve(url, filename)
705 |         except Exception as e:
706 |             logging.warn('Got SSL/Connection error: %s', e)
707 |             if not args.ignore_errors:
708 |                 logging.warn('Hint: if you want to ignore this error, add '
709 |                              '--ignore-errors option to the command line')
710 |                 raise e
711 |             else:
712 |                 logging.warn('SSL/Connection error ignored: %s', e)
713 | 
714 | 
715 | def download_youtube_url(url, filename, headers, args):
716 |     """
717 |     Downloads a youtube URL and applies the filters from args
718 |     """
719 |     logging.info('Downloading video with URL %s from YouTube.', url)
720 |     video_format_option = args.format + '/mp4' if args.format else 'mp4'
721 |     cmd = YOUTUBE_DL_CMD + ['-o', filename, '-f', video_format_option]
722 | 
723 |     if args.subtitles:
724 |         cmd.append('--all-subs')
725 |     cmd.extend(args.youtube_dl_options.split())
726 |     cmd.append(url)
727 | 
728 |     execute_command(cmd, args)
729 | 
730 | 
731 | def download_subtitle(url, filename, headers, args):
732 |     """
733 |     Downloads the subtitle from the url and transforms it to the srt format
734 |     """
735 |     subs_string = edx_get_subtitle(url, headers)
736 |     if subs_string:
737 |         full_filename = os.path.join(os.getcwd(), filename)
738 |         with open(full_filename, 'wb+') as f:
739 |             f.write(subs_string.encode('utf-8'))
740 | 
741 | 
742 | def skip_or_download(downloads, headers, args, f=download_url):
743 |     """
744 |     Downloads each url into its filename using the download function f;
745 |     urls whose filename already exists are skipped
746 |     """
747 |     for url, filename in downloads.items():
748 |         if os.path.exists(filename):
749 |             logging.info('[skipping] %s => %s', url, filename)
750 |             continue
751 |         else:
752 |             logging.info('[download] %s => %s', url, filename)
753 |         if args.dry_run:
754 |             continue
755 |         f(url, filename, headers, args)
756 | 
757 | 
758 | def download_video(video, args, target_dir, filename_prefix, headers):
759 |     if args.prefer_cdn_videos or video.video_youtube_url is None:
760 |         mp4_downloads = _build_url_downloads(video.mp4_urls, target_dir,
761 |                                              filename_prefix)
762 |         skip_or_download(mp4_downloads, headers, args)
763 |     else:
764 |         if video.video_youtube_url is not None:
765 |             youtube_downloads = _build_url_downloads([video.video_youtube_url],
766 |                                                      target_dir,
767 |                                                      filename_prefix)
768 |             skip_or_download(youtube_downloads, headers, args)
769 | 
770 |     # the behavior with subtitles is different, since the subtitles don't know
771 |     # the destination name until the video is downloaded with youtube-dl;
772 |     # also, subtitles must be transformed from the raw data to the srt format
773 |     if args.subtitles:
774 |         sub_downloads = _build_subtitles_downloads(video, target_dir,
775 |                                                    filename_prefix, headers)
776 |         skip_or_download(sub_downloads, headers, args, download_subtitle)
777 | 
778 | 
779 | def download_unit(unit, args, target_dir, filename_prefix, headers):
780 |     """
781 |     Downloads the urls in unit based on args in the given target_dir
782 |     with filename_prefix
783 |     """
784 |     if len(unit.videos) == 1:
785 |         download_video(unit.videos[0], args, target_dir, filename_prefix,
786 |                        headers)
787 |     else:
788 |         # we change the filename_prefix to avoid conflicts when downloading
789 |         # subtitles
790 |         for i, video in enumerate(unit.videos, 1):
791 |             new_prefix = filename_prefix + ('-%02d' % i)
792 |             download_video(video, args, target_dir, new_prefix, headers)
793 | 
794 |     res_downloads = _build_url_downloads(unit.resources_urls, target_dir,
795 |                                          filename_prefix)
796 |     skip_or_download(res_downloads, headers, args)
797 | 
798 | 
799 | def download(args, selections, all_units, headers):
800 |     """
801 |     Downloads all the resources based on the selections
802 |     """
803 |     logging.info("Output directory: " + args.output_dir)
804 | 
805 |     # Download Videos
806 |     # notice that we could iterate over all_units, but we prefer to do it over
807 |     # sections/subsections to add correct prefixes and show nicer information.
808 | 
809 |     for selected_course, selected_sections in selections.items():
810 |         coursename = directory_name(selected_course.name)
811 |         for selected_section in selected_sections:
812 |             section_dirname = "%02d-%s" % (selected_section.position,
813 |                                            selected_section.name)
814 |             target_dir = os.path.join(args.output_dir, coursename,
815 |                                       clean_filename(section_dirname))
816 |             mkdir_p(target_dir)
817 |             counter = 0
818 |             for subsection in selected_section.subsections:
819 |                 units = all_units.get(subsection.url, [])
820 |                 for unit in units:
821 |                     counter += 1
822 |                     filename_prefix = "%02d" % counter
823 |                     download_unit(unit, args, target_dir, filename_prefix,
824 |                                   headers)
825 | 
826 | 
827 | def remove_repeated_urls(all_units):
828 |     """
829 |     Removes repeated urls from the units; it does not consider subtitles.
830 |     This is done to avoid repeated downloads.
831 |     """
832 |     existing_urls = set()
833 |     filtered_units = {}
834 |     for url, units in all_units.items():
835 |         reduced_units = []
836 |         for unit in units:
837 |             videos = []
838 |             for video in unit.videos:
839 |                 # we don't analyze the subtitles for repetition since
840 |                 # their size is negligible for the goal of this function
841 |                 video_youtube_url = None
842 |                 if video.video_youtube_url not in existing_urls:
843 |                     video_youtube_url = video.video_youtube_url
844 |                     existing_urls.add(video_youtube_url)
845 | 
846 |                 mp4_urls, existing_urls = remove_duplicates(video.mp4_urls, existing_urls)
847 | 
848 |                 if video_youtube_url is not None or len(mp4_urls) > 0:
849 |                     videos.append(Video(video_youtube_url=video_youtube_url,
850 |                                         available_subs_url=video.available_subs_url,
851 |                                         sub_template_url=video.sub_template_url,
852 |                                         mp4_urls=mp4_urls))
853 | 
854 |             resources_urls, existing_urls = remove_duplicates(unit.resources_urls, existing_urls)
855 | 
856 |             if len(videos) > 0 or len(resources_urls) > 0:
857 |                 reduced_units.append(Unit(videos=videos,
858 |                                           resources_urls=resources_urls))
859 | 
860 |         filtered_units[url] = reduced_units
861 |     return filtered_units
862 | 
863 | 
864 | def num_urls_in_units_dict(units_dict):
865 |     """
866 |     Counts the number of urls in an all_units dict; it ignores subtitles
867 |     in its counting.
868 |     """
869 |     num_urls = 0
870 | 
871 |     for units in units_dict.values():
872 |         for unit in units:
873 |             for video in unit.videos:
874 |                 num_urls += int(video.video_youtube_url is not None)
875 |                 num_urls += int(video.available_subs_url is not None)
876 |                 num_urls += int(video.sub_template_url is not None)
877 |                 num_urls += len(video.mp4_urls)
878 |             num_urls += len(unit.resources_urls)
879 | 
880 |     return num_urls
881 | 
882 | 
883 | def extract_all_units_with_cache(all_urls, headers, file_formats,
884 |                                  filename=DEFAULT_CACHE_FILENAME,
885 |                                  extractor=extract_all_units_in_parallel):
886 |     """
887 |     Extracts the units which are not in the cache and extracts their
888 |     resources; returns the full dict of units (cached+new).
889 | 
890 |     The cache is used to improve speed because it avoids requesting already
891 |     known (and extracted) objects from URLs.
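    (On disk the cache is simply a pickled {url: units} dict; see
    write_units_to_cache below.)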
    This is useful for following courses
892 |     week by week, since we won't parse the already known subsections/units;
893 |     additionally, it speeds up development of code unrelated to extraction.
894 |     """
895 |     cached_units = {}
896 | 
897 |     if os.path.exists(filename):
898 |         with open(filename, 'rb') as f:
899 |             cached_units = pickle.load(f)
900 | 
901 |     # we filter out the already cached urls
902 |     new_urls = [url for url in all_urls if url not in cached_units]
903 |     logging.info('loading %d urls from cache [%s]', len(cached_units.keys()),
904 |                  filename)
905 |     new_units = extractor(new_urls, headers, file_formats)
906 |     all_units = cached_units.copy()
907 |     all_units.update(new_units)
908 | 
909 |     return all_units
910 | 
911 | 
912 | def write_units_to_cache(units, filename=DEFAULT_CACHE_FILENAME):
913 |     """
914 |     Writes the units dict to the cache file
915 |     """
916 |     logging.info('writing %d urls to cache [%s]', len(units.keys()),
917 |                  filename)
918 |     with open(filename, 'wb') as f:
919 |         pickle.dump(units, f)
920 | 
921 | 
922 | def extract_urls_from_units(all_units, format_):
923 |     """
924 |     Extract urls from units into a set of strings. Format is specified by
925 |     the user. The original purpose of this function is to export urls into
926 |     a file for an external downloader.
927 |     """
928 |     all_urls = set()
929 | 
930 |     # Collect all urls into a set to remove duplicates
931 |     for units in all_units.values():
932 |         for unit in units:
933 |             if isinstance(unit, Unit):
934 |                 for video in unit.videos:
935 |                     if isinstance(video, Video):
936 |                         for url in video.mp4_urls:
937 |                             all_urls.add('%s\n' % (format_ % {'url': url}))
938 |                     else:
939 |                         raise TypeError('Unknown unit video type (%s) occurred '
940 |                                         'while exporting urls' % type(video))
941 |                 for url in unit.resources_urls:
942 |                     all_urls.add('%s\n' % (format_ % {'url': url}))
943 |             else:
944 |                 raise TypeError('Unknown unit type (%s) occurred while '
945 |                                 'exporting urls' % type(unit))
946 |     return list(all_urls)
947 | 
948 | 
949 | def save_urls_to_file(urls, filename):
950 |     """
951 |     Save urls to file. Filename is specified by the user. The original
952 |     purpose of this function is to export urls into a file for an external
953 |     downloader.
954 |     """
955 |     file_ = sys.stdout if filename == '-' else open(filename, 'w')
956 |     file_.writelines(urls)
957 |     if file_ is not sys.stdout: file_.close()  # don't close stdout
958 | 
959 | 
960 | def main():
961 |     """
962 |     Main program function
963 |     """
964 |     args = parse_args()
965 |     file_formats = parse_file_formats(args)
966 | 
967 |     change_openedx_site(args.platform)
968 | 
969 |     # Query password, if not already passed by command line.
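    # (getpass writes its prompt to stderr, so stdout stays clean even when
    # '--export-filename -' is used to send the exported url list to stdout.)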
970 |     if not args.password:
971 |         args.password = getpass.getpass(stream=sys.stderr)
972 | 
973 |     if not args.username or not args.password:
974 |         logging.error("You must supply username and password to log-in")
975 |         exit(ExitCode.MISSING_CREDENTIALS)
976 | 
977 |     # Prepare Headers
978 |     headers = edx_get_headers()
979 | 
980 |     # Login
981 |     resp = edx_login(LOGIN_API, headers, args.username, args.password)
982 |     if not resp.get('success', False):
983 |         logging.error(resp.get('value', "Wrong Email or Password."))
984 |         exit(ExitCode.WRONG_EMAIL_OR_PASSWORD)
985 | 
986 |     # Parse and select the available courses
987 |     courses = get_courses_info(DASHBOARD, headers)
988 |     available_courses = [course for course in courses if course.state == 'Started']
989 |     selected_courses = parse_courses(args, available_courses)
990 | 
991 |     # Parse the sections and build the selections dict filtered by sections
992 |     all_selections = {selected_course:
993 |                       get_available_sections(selected_course.url.replace('info', 'courseware'), headers)
994 |                       for selected_course in selected_courses}
995 |     selections = parse_sections(args, all_selections)
996 |     _display_selections(selections)
997 | 
998 |     # Extract the unit information (downloadable resources)
999 |     # This parses the HTML of all the subsection.url and extracts
1000 |     # the URLs of the resources as Units.
1001 |     all_urls = [subsection.url
1002 |                 for selected_sections in selections.values()
1003 |                 for selected_section in selected_sections
1004 |                 for subsection in selected_section.subsections]
1005 | 
1006 |     extractor = extract_all_units_in_parallel
1007 |     if args.sequential:
1008 |         extractor = extract_all_units_in_sequence
1009 | 
1010 |     if args.cache:
1011 |         all_units = extract_all_units_with_cache(all_urls, headers,
1012 |                                                  file_formats,
1013 |                                                  extractor=extractor)
1014 |     else:
1015 |         all_units = extractor(all_urls, headers, file_formats)
1016 | 
1017 |     parse_units(all_units)
1018 | 
1019 |     if args.cache:
1020 |         write_units_to_cache(all_units)
1021 | 
1022 |     # This removes all repeated important urls
1023 |     # FIXME: This is not the best way to do it but it is the simplest; a
1024 |     # better approach would be to create symbolic or hard links for the
1025 |     # repeated units to avoid losing information
1026 |     filtered_units = remove_repeated_urls(all_units)
1027 |     num_all_urls = num_urls_in_units_dict(all_units)
1028 |     num_filtered_urls = num_urls_in_units_dict(filtered_units)
1029 |     logging.warn('Removed %d duplicated urls from %d in total',
1030 |                  (num_all_urls - num_filtered_urls), num_all_urls)
1031 | 
1032 |     # finally we download or export all the resources
1033 |     if args.export_filename is not None:
1034 |         logging.info('exporting urls to file %s', args.export_filename)
1035 |         urls = extract_urls_from_units(all_units, args.export_format)
1036 |         save_urls_to_file(urls, args.export_filename)
1037 |     else:
1038 |         download(args, selections, all_units, headers)
1039 | 
1040 | 
1041 | if __name__ == '__main__':
1042 |     try:
1043 |         main()
1044 |     except KeyboardInterrupt:
1045 |         logging.warn("\n\nCTRL-C detected, shutting down....")
1046 |         sys.exit(ExitCode.OK)
1047 | 
--------------------------------------------------------------------------------
/test/html/new_sections_structure.html:
--------------------------------------------------------------------------------
[HTML test fixture for the current edX courseware layout (page title: "DEV207.1x Courseware | edX"); the markup was stripped when this dump was produced and only scattered text survived. See test/html/index.txt for its origin url.]
--------------------------------------------------------------------------------
/test/html/empty_sections.html:
--------------------------------------------------------------------------------
[HTML test fixture for a courseware page with empty sections (page title: "HTML5.1x Courseware | edX"); as above, the markup was stripped in this dump and only scattered text survived.]
--------------------------------------------------------------------------------