├── edx_dl
├── __init__.py
├── utils.py
├── common.py
├── parsing.py
└── edx_dl.py
├── test
├── json
│ ├── empty.json
│ ├── minimal.json
│ ├── empty-text.json
│ ├── abridged-01.json
│ └── abridged-02.json
└── html
│ ├── index.txt
│ ├── new_sections_structure.html
│ └── empty_sections.html
├── MANIFEST.in
├── requirements-dev.txt
├── requirements.txt
├── edx-dl.py
├── tox.ini
├── .gitignore
├── .travis.yml
├── AUTHORS.md
├── .github
├── ISSUE_TEMPLATE.md
└── PULL_REQUEST_TEMPLATE.md
├── setup.py
├── CONTRIBUTING.md
├── README.md
├── test_utils.py
├── test_edx_dl.py
├── test_parsing.py
└── LICENSE
/edx_dl/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/json/empty.json:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/json/minimal.json:
--------------------------------------------------------------------------------
1 | {}
2 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements*.txt
2 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pytest>=2.5
2 | pytest-cov
3 | pytest-xdist
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4>=4.1.3
2 | html5lib>=1.0b2
3 | six>=1.5.0
4 | youtube_dl>=2015.05.20
5 |
--------------------------------------------------------------------------------
/edx-dl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
from edx_dl.edx_dl import main

# Thin launcher: hand control straight to the package's command-line entry
# point so the tool can be run from a source checkout.
main()
7 |
--------------------------------------------------------------------------------
/test/json/empty-text.json:
--------------------------------------------------------------------------------
1 | {
2 | "end": [
3 | 20428
4 | ],
5 | "start": [
6 | 18104
7 | ],
8 | "text": [
9 | ""
10 | ]
11 | }
12 |
--------------------------------------------------------------------------------
/test/json/abridged-01.json:
--------------------------------------------------------------------------------
1 | {
2 | "end": [
3 | 20428
4 | ],
5 | "start": [
6 | 18104
7 | ],
8 | "text": [
9 | "I am very glad to see everyone here\uff0c"
10 | ]
11 | }
12 |
--------------------------------------------------------------------------------
/test/json/abridged-02.json:
--------------------------------------------------------------------------------
1 | {
2 | "end": [
3 | 20428,
4 | 24721
5 | ],
6 | "start": [
7 | 18104,
8 | 20569
9 | ],
10 | "text": [
11 | "I am very glad to see everyone here\uff0c",
12 | "so let's enjoy the beauty of combinatorics together."
13 | ]
14 | }
15 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py27,py34,pypy
3 |
4 | [testenv]
5 | downloadcache = .tox/_download/
6 |
7 | deps =
8 | beautifulsoup4>=4.1.3
9 | html5lib>=1.0b2
10 | pytest>=2.5
11 | six>=1.5.0
12 | pytest-cov>=1.8.0
13 | pytest-xdist>=1.8
14 |
15 | commands = py.test -v --cov edx_dl --cov-report html .
16 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | semantic.cache
3 | Downloaded/
4 | *~
5 | .*.swp
6 |
7 | # Byte-compiled / optimized / DLL files
8 | __pycache__/
9 | *.py[cod]
10 | *$py.class
11 | .ropeproject/
12 |
13 | # Distribution-related files
14 | *.egg-info
15 | /dist
16 | /build
17 |
18 | # Testing and CI files
19 | /.tox
20 | /.coverage
21 | /htmlcov
22 | .cache/
23 |
24 | # Application files
25 | edx-dl.cache
26 |
--------------------------------------------------------------------------------
/test/html/index.txt:
--------------------------------------------------------------------------------
1 | # this file has a mapping of the local htmls and their origin
2 | single_unit_multiple_subs.html: https://courses.edx.org/courses/edX/DemoX.1/2014/courseware/6156e0e685ee4a2ab017258108c0bccd/194bd1729fab47aba6507f737d9b90ba
3 | multiple_units.html: https://courses.edx.org/courses/BerkeleyX/CS184.1x/2012_Fall/courseware/Unit_0/L1
4 | from_html_single_unit_multiple_subs: https://mitprofessionalx.mit.edu
5 | new_sections_structure: https://courses.edx.org/courses/course-v1:Microsoft+DEV207.1x+1T2016/courseware/2e4818cb44e546e18777fa7e4b250574/
6 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "2.7"
4 | - "3.3"
5 | - "3.4"
6 | - "pypy"
7 | matrix:
8 | allow_failures:
9 | - python: "pypy"
10 | # command to install dependencies
11 | install:
12 | - pip install -r requirements.txt
13 | - pip install pytest pytest-cov pytest-xdist coveralls coverage
14 |
15 | # command to run tests
16 | script: py.test -v --cov edx_dl --cov-report html
17 |
18 | after_success:
19 | coveralls
20 |
21 | notifications:
22 | email:
23 | - iemejia@gmail.com
24 | - kidsshk3@gmail.com
25 | - rbrito@ime.usp.br
26 |
--------------------------------------------------------------------------------
/AUTHORS.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | In lexicographic/alphabetical order, this file lists authors and
4 | contributors to the project. It is meant to recognize and credit their
5 | contributions to the project.
6 |
7 | Introduction of names in this file is completely voluntary, as some people
8 | may not want to be included given their potential employment requirements or
9 | other issues. We respect the contributor's wishes.
10 |
11 | To be included in this file, just send a pull request with your name, once
12 | you have at least one contribution to the project.
13 |
14 | # Contributors
15 |
16 | * Emad Shaaban
17 | * George Monkey
18 | * Ismaël Mejía
19 | * Rogério Theodoro de Brito
20 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | 🚨Please review the [Troubleshooting](../#troubleshooting) section
2 | before reporting any issue. Don't forget also to check the current issues to
3 | avoid duplicates.
4 |
5 | ### Subject of the issue
6 | Describe your issue here.
7 |
8 | ### Your environment
9 | * Operating System (name/version):
10 | * Python version:
11 | * youtube-dl version:
12 | * edx-dl version:
13 |
14 | ### Steps to reproduce
15 | Tell us how to reproduce this issue. Please provide us the course URL, and the
16 | specific subsection or unit if possible.
17 |
18 | ### Expected behaviour
19 | Tell us what should happen.
20 |
21 | ### Actual behaviour
22 | Tell us what happens instead. If the script fails, please copy the *entire*
23 | output of the command or the stacktrace (don't forget to obfuscate your
24 | username and password). If you cannot copy the exception, attach a screenshot.
25 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | 🚨Please review the [guidelines for contributing](CONTRIBUTING.md) to this repository.
2 |
3 | ## Proposed changes
4 |
5 | Describe the big picture of your changes here to communicate to the maintainers
6 | why we should accept this pull request. If it fixes a bug or resolves a feature
7 | request, be sure to link to that issue.
8 |
9 | ## Types of changes
10 |
11 | What types of changes does your code introduce?
12 | _Put an `x` in the boxes that apply_
13 |
14 | - [ ] Bugfix (non-breaking change which fixes an issue)
15 | - [ ] New feature (non-breaking change which adds functionality)
16 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
17 |
18 | ## Checklist
19 |
20 | _Put an `x` in the boxes that apply. You can also fill these out after creating
21 | the PR. If you're unsure about any of them, don't hesitate to ask. We're here
22 | to help! This is simply a reminder of what we are going to look for before
23 | merging your code._
24 |
25 | - [ ] I have read the [CONTRIBUTING](/CONTRIBUTING.md) doc
26 | - [ ] I agree to contribute my changes under the project's [LICENSE](/LICENSE)
27 | - [ ] I have checked that the unit tests pass locally with my changes
28 | - [ ] I have checked the style of the new code (lint/pep).
29 | - [ ] I have added tests that prove my fix is effective or that my feature works
30 | - [ ] I have added necessary documentation (if appropriate)
31 |
32 | ## Further comments
33 |
34 | If this is a relatively large or complex change, please explain why you chose
35 | the solution you did and what alternatives you considered, etc.
36 |
37 | ### Reviewers
38 | If you know the person who can review your code please add a @mention.
39 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from setuptools import setup
4 |
5 | # you can install this to a local test virtualenv like so:
6 | # virtualenv venv
7 | # ./venv/bin/pip install --editable .
8 | # ./venv/bin/pip install --editable .[dev] # with dev requirements, too
9 |
10 | #
11 | # FIXME: This won't work until we have a README file in .rst format (which
12 | # is what PyPI knows how to parse). In the mean time, we can use the following:
13 | #
14 | # pandoc --from=markdown --to=rst --output=README.rst README.md
15 | #
16 |
def _read_requirements(path):
    """
    Return the requirement specifiers listed in the file at path, one per
    non-blank line, with surrounding whitespace removed.
    """
    with open(path) as handle:
        return [line.strip() for line in handle if line.strip()]


def _read_long_description():
    """
    Return the text of README.rst, falling back to README.md and finally to
    an empty string, so that running this script does not fail while the
    .rst README has not been generated yet.
    """
    import io  # local import: io.open accepts an encoding on Python 2 too

    for candidate in ('README.rst', 'README.md'):
        try:
            with io.open(candidate, 'r', encoding='utf-8') as handle:
                return handle.read()
        except IOError:
            continue
    return ''


setup(
    name='edx-dl',
    version='0.0',
    maintainer='Ismaël Mejía, Rogério Theodoro de Brito',
    maintainer_email='iemejia@gmail.com, rbrito@ime.usp.br',

    license='LGPL',
    url='https://github.com/shk3/edx-downloader',

    install_requires=_read_requirements('requirements.txt'),
    extras_require=dict(
        dev=_read_requirements('requirements-dev.txt')
    ),

    description='Simple tool to download video and lecture materials from edx.org.',
    long_description=_read_long_description(),
    keywords=['edX', 'download', 'education', 'MOOCs', 'video'],
    classifiers=[
        'Development Status :: 4 - Beta',
        'Environment :: Console',
        'Intended Audience :: End Users/Desktop',
        'License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)',
        'Operating System :: OS Independent',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: Implementation :: CPython',
        'Programming Language :: Python :: Implementation :: PyPy',
        'Programming Language :: Python',
        'Topic :: Education',
    ],

    packages=["edx_dl"],
    entry_points=dict(
        console_scripts=[
            'edx-dl=edx_dl.edx_dl:main'
        ]
    ),

    platforms=['any'],
)
60 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Please, before sending patches, read these brief comments. They are here to
2 | help the project have both its users happy using the program and the
3 | developers/maintainers feel good when trying to change code that other
4 | people contributed.
5 |
6 | For the record, when this document mentions "I", it mostly means that Rogério
7 | Brito (@rbrito) is the one to blame.
8 |
9 | # Write good commit messages
10 |
11 | When you write your pull request and your commit messages, please, be
12 | detailed, explaining why you are doing what you are doing. Don't be afraid
13 | of being too verbose here. Also, please follow the highly recommended
14 | guidelines on how to write [good commit messages][commit-msgs].
15 |
16 | When in doubt, follow the model of the Linux kernel commit logs. Their
17 | commit messages are some of the best that I have seen. Also, the ffmpeg
18 | project has some good messages that I believe should be followed. If you are in a
19 | hurry, read the section named
20 | ["Contributing" from subsurface's README][contributing].
21 |
22 | [commit-msgs]: https://robots.thoughtbot.com/5-useful-tips-for-a-better-commit-message
23 | [contributing]: https://github.com/torvalds/subsurface/blob/master/README#L71-L114
24 |
25 |
26 | # Test that your changes don't break existing functionality
27 |
28 | Make sure that you have all dependencies installed, like via:
29 |
30 | pip install -r requirements.txt
31 | pip install -r requirements-dev.txt
32 |
33 | Run the test suite with
34 |
35 | py.test -v --cov edx_dl --cov-report html
36 |
37 | If some test fails, please don't send your changes yet. Fix what broke
38 | before sending your pull request.
39 |
40 | If you need to change the test suite, explain in the commit message why it
41 | needs to be changed (e.g., the page layout or the authentication methods
42 | from edX changed, or they implemented a new kind of course).
43 |
44 | # Check for potential bugs
45 |
46 | Please, help keep the code tidy by checking for any potential bugs with the
47 | help of [`pep8`][pep8], [`pyflakes`][pyflakes], and [`pylint`][pylint]. If
48 | you know of any other good tools for analyzing the code, let me know about
49 | them!
50 |
51 | [pep8]: https://pypi.python.org/pypi/pep8
52 | [pyflakes]: https://pypi.python.org/pypi/pyflakes/
53 | [pylint]: https://pypi.python.org/pypi/pylint
54 |
55 | If you happen to find any issue reported by these programs, I welcome you to
56 | fix them. Many of the issues are usually very easy to fix and they are a
57 | great way to start contributing to this (and other projects in general).
58 | Furthermore, we all benefit from a better code base.
59 |
60 | # Changes in the tools that we use
61 |
62 | If you are proposing the use of a substitute of a tool that we already use,
63 | take a few paragraphs to tell us why we would like to change.
64 |
65 | If we are not using something, it is most likely that one of the following
66 | options applies:
67 |
68 | 1. I (@rbrito) may not even know that what you are proposing exists or have
69 | not yet "seen the light" as to why I should use it instead of using what
70 | I am currently using.
71 | 2. Even if I know about the tool, I may not know how to use it, or how it
72 | would make me more productive. Educate me and we all will gain from a
73 | better project.
74 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [Build Status](https://travis-ci.org/coursera-dl/edx-dl)
2 | [Coverage Status](https://coveralls.io/github/coursera-dl/edx-dl?branch=master)
3 | [Code Climate](https://codeclimate.com/github/coursera-dl/edx-dl)
4 |
5 | # Description
6 |
7 | `edx-dl` is a simple tool to download videos and lecture materials from Open
8 | edX-based sites. It requires a [Python][python] interpreter (>= 2.7) and
9 | very few other dependencies. It is platform independent, and should work
10 | fine under Unix (Linux, BSDs etc.), Windows or Mac OS X.
11 |
12 | We strongly recommend that, if you don't already have a Python interpreter
13 | installed, you [install Python >= 3.4][python3], if possible, since it
14 | has better security than Python 2.
15 |
16 | [python]: https://www.python.org/
17 | [python3]: https://www.python.org/downloads/
18 |
19 | # Dependencies
20 |
21 | To install all the dependencies please do:
22 |
23 | pip install -r requirements.txt
24 |
25 | ## youtube-dl
26 |
27 | One of the most important dependencies of `edx-dl` is `youtube-dl`. The
28 | installation step listed above already pulls in the most recent version of
29 | `youtube-dl` for you.
30 |
31 | Unfortunately, since many Open edX sites store their videos on Youtube and
32 | Youtube changes their layout from time to time, it may be necessary to
33 | upgrade your copy of `youtube-dl`. There are many ways to proceed here, but
34 | the simplest is to simply use:
35 |
36 | pip install --upgrade youtube-dl
37 |
38 | # Quick Start
39 |
40 | Once you have installed everything, to use `edx-dl.py`, let it discover the
41 | courses in which you are enrolled, by issuing:
42 |
43 | python edx-dl.py -u user@user.com --list-courses
44 |
45 | From there, choose the course you are interested in, copy its URL and use it
46 | in the following command:
47 |
48 | python edx-dl.py -u user@user.com COURSE_URL
49 |
50 | replacing `COURSE_URL` with the URL that you just copied in the first step.
51 | It should look something like:
52 | https://courses.edx.org/courses/edX/DemoX.1/2014/info
53 |
54 | Your downloaded videos will be placed in a new directory called
55 | `Downloaded`, inside your current directory, but you can also choose another
56 | destination with the `-o` argument.
57 |
58 | To see all available options and a brief description of what they do, simply
59 | execute:
60 |
61 | python edx-dl.py --help
62 |
63 | *Important Note:* To use sites other than edx.org, you have to specify the
64 | site along with the `-x` option. For example, `-x stanford`, if the course
65 | that you want to get is hosted on Stanford's site.
66 |
67 | # Reporting issues
68 |
69 | Before reporting any issue please follow the steps below:
70 |
71 | 1. Verify that you are running the latest version of all the programs (both
72 | of `edx-dl` and of `youtube-dl`). Use the following command if in doubt:
73 |
74 | pip install --upgrade edx-dl
75 |
76 | 2. If the problem persists, feel free to [open an issue][issue] in our
77 | bugtracker. Please fill in the issue template with *as much information as
78 | possible*.
79 |
80 | [issue]: https://github.com/coursera-dl/edx-dl/issues
81 |
82 | # Supported sites
83 |
84 | These are the current supported sites:
85 |
86 | - [edX](http://edx.org)
87 | - [Stanford](http://lagunita.stanford.edu/)
88 | - [University of Sydney](http://online.it.usyd.edu.au)
89 | - [France Université Numérique](https://www.france-universite-numerique-mooc.fr/)
90 | - [GW Online SEAS](http://openedx.seas.gwu.edu/) - George Washington University
91 | - [GW Online Open](http://mooc.online.gwu.edu/) - George Washington University
92 |
93 | This is the full [list of sites powered by Open edX][sites]. Not all of them
94 | are supported at the moment, but we welcome you to contribute support for them
95 | and send a pull request, also via our [issue tracker][issue].
96 |
97 | [sites]: https://github.com/edx/edx-platform/wiki/Sites-powered-by-Open-edX
98 |
99 | # Authors
100 |
101 | See the contributors to the project in the [AUTHORS.md][authors] file. If
102 | you have contributed to the project, we would like to gladly credit you for
103 | your work. Just send us a note to be added to that list.
104 |
105 | [authors]: https://github.com/coursera-dl/edx-dl/blob/master/AUTHORS.md
106 |
--------------------------------------------------------------------------------
/edx_dl/utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # This module contains generic functions, ideally useful to any other module
5 | from six.moves.urllib.request import urlopen, Request
6 | from six.moves import html_parser
7 |
8 | import errno
9 | import json
10 | import logging
11 | import os
12 | import string
13 | import subprocess
14 |
15 |
def get_filename_from_prefix(target_dir, filename_prefix):
    """
    Look in target_dir for an entry whose name begins with filename_prefix
    and return that name without its extension, or None when nothing matches.
    """
    # NOTE: listing the directory is a workaround. Deriving the name from the
    # video URL or from the downloader's output would avoid the scan.
    candidates = (
        os.path.splitext(entry)[0]
        for entry in os.listdir(target_dir)
        if entry.startswith(filename_prefix)
    )
    # The first matching entry (in listing order) wins.
    return next(candidates, None)
30 |
31 |
def execute_command(cmd, args):
    """
    Run the external command cmd and wait for it to finish.

    @param cmd: Command and its arguments, in the form accepted by
        subprocess.check_call.

    @param args: Object exposing an ignore_errors attribute. When it is
        true, a failing command is only logged as a warning; otherwise the
        failure is propagated.

    @raise subprocess.CalledProcessError: If the command exits with a
        non-zero status and args.ignore_errors is false.
    """
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError as e:
        if not args.ignore_errors:
            # A bare raise re-raises the active exception unchanged.
            raise
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning('External command error ignored: %s', e)
43 |
44 |
def directory_name(initial_name):
    """
    Build a directory name from initial_name by passing it through
    clean_filename; when nothing is left after cleaning, use the placeholder
    "course_folder" instead.
    """
    return clean_filename(initial_name) or "course_folder"
51 |
52 |
def get_page_contents(url, headers):
    """
    Get the contents of the page at the URL given by url, decoded to text.

    @param url: Address of the page to fetch.

    @param headers: Dictionary of headers sent along with the request.

    @return: The body of the response, decoded with the charset announced by
        the response (utf-8 when none is announced).
    """
    result = urlopen(Request(url, None, headers))
    try:
        try:
            # for python3
            charset = result.headers.get_content_charset(failobj="utf-8")
        except AttributeError:
            # Header objects without get_content_charset (python2) expose
            # the charset through getparam instead.  Only AttributeError is
            # caught so that unrelated failures are not silently masked.
            charset = result.info().getparam('charset') or 'utf-8'
        return result.read().decode(charset)
    finally:
        # Always release the underlying connection/file handle.
        result.close()
65 |
66 |
def get_page_contents_as_json(url, headers):
    """
    Fetch the page at url (sending the given headers) and parse its body,
    assuming it is formatted as json. Returns the decoded json object.
    """
    return json.loads(get_page_contents(url, headers))
75 |
76 |
def remove_duplicates(orig_list, seen=None):
    """
    Returns a tuple (new_list, new_seen).

    new_list contains the elements of orig_list, in their original order,
    skipping every element that appeared earlier in orig_list or that was
    already a member of the (optional) collection seen.

    new_seen is a new set holding the members of seen plus every element of
    orig_list.

    This function does *not* modify any of its input parameters.
    """
    # Work on a copy so the caller's collection is never mutated; None (the
    # default) stands for "nothing seen yet".
    new_seen = set() if seen is None else set(seen)
    new_list = []

    for elem in orig_list:
        if elem not in new_seen:
            new_list.append(elem)
            new_seen.add(elem)

    return new_list, new_seen
97 |
98 |
99 | # The next functions come from coursera-dl/coursera
def mkdir_p(path, mode=0o777):
    """
    Create the directory given by path, including any missing parents,
    using the given mode. An already existing directory is not an error.
    """
    try:
        os.makedirs(path, mode)
    except OSError as exc:
        # Only "already exists, and it is a directory" is tolerated; any
        # other failure is propagated to the caller.
        already_there = exc.errno == errno.EEXIST and os.path.isdir(path)
        if not already_there:
            raise
111 |
112 |
def clean_filename(s, minimal_change=False):
    """
    Sanitize a string to be used as a filename.

    HTML character references in s are decoded first. Then ':', '/' and NUL
    are replaced by '-' and newlines are dropped.

    If minimal_change is set to true, nothing else is done: only that bare
    minimum of characters that are problematic for filesystems is touched.

    Otherwise parentheses are removed, trailing dots and surrounding
    whitespace are stripped, spaces become underscores and every character
    that is not an ASCII letter, a digit or one of '-', '_', '.' is dropped.
    """

    # First, decode HTML character references (e.g. "&amp;" -> "&").
    # HTMLParser.unescape no longer exists on recent Python 3 releases, so
    # prefer html.unescape and only fall back to the parser method where the
    # html module does not provide it.
    try:
        from html import unescape
    except ImportError:
        unescape = html_parser.HTMLParser().unescape
    s = unescape(s)

    # Replace the characters that filesystems cannot cope with
    s = (
        s.replace(':', '-')
        .replace('/', '-')
        .replace('\x00', '-')
        .replace('\n', '')
    )

    if minimal_change:
        return s

    # Drop parentheses, e.g. those around a trailing time length "(16:25)"
    s = s.replace('(', '').replace(')', '')
    s = s.rstrip('.')  # Remove excess of trailing dots

    s = s.strip().replace(' ', '_')
    valid_chars = '-_.()%s%s' % (string.ascii_letters, string.digits)
    return ''.join(c for c in s if c in valid_chars)
142 |
--------------------------------------------------------------------------------
/edx_dl/common.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Common type definitions and constants for edx-dl
5 |
6 | The classes in this module represent the structure of courses in edX. The
7 | structure is:
8 |
9 | * A Course contains Sections
10 | * Each Section contains Subsections
11 | * Each Subsection contains Units
12 |
13 | Notice that we don't represent the full tree structure for both performance
14 | and UX reasons:
15 |
16 | Course -> [Section] -> [SubSection] -> [Unit] -> [Video]
17 |
18 | In the script the data structures used are:
19 |
20 | 1. The data structures to represent the course information:
21 | Course, Section->[SubSection]
22 |
23 | 2. The data structures to represent the chosen courses and sections:
24 | selections = {Course, [Section]}
25 |
26 | 3. The data structure of all the downloadable resources, which represents each
27 | subsection via its URL and the list of resources that can be extracted from
28 | the Units it contains:
29 | all_units = {Subsection.url: [Unit]}
30 |
31 | 4. The units can contain multiple videos:
32 | Unit -> [Video]
33 | """
34 |
35 |
class Course(object):
    """
    Holds the information describing one course.
    """
    def __init__(self, id, name, url, state):
        """
        @param id: Identifier of the course. In edX it is composed by the
            path {organization}/{course_number}/{course_run}
        @type id: str or None

        @param name: Name of the course, as taken from the h3 header of the
            course page.
        @type name: str

        @param url: URL of the course.
        @type url: str or None

        @param state: State of the course. One of the following values:
            * 'Not yet'
            * 'Started'
        @type state: str
        """
        self.id, self.name = id, name
        self.url, self.state = url, state

    def __repr__(self):
        # A missing (or empty) URL is rendered as the text "None".
        return self.name + ": " + (self.url or "None")
66 |
67 |
class Section(object):
    """
    One section of a course, together with its subsections.
    """
    def __init__(self, position, name, url, subsections):
        """
        @param position: Integer position of the section in the list of
            sections. Starts at 1.
        @type position: int

        @param name: Name of the section.
        @type name: str

        @param url: URL of the section. None when the section contains no
            subsections.
        @type url: str or None

        @param subsections: List of the subsections of this section.
        @type subsections: [SubSection]
        """
        self.position, self.name = position, name
        self.url, self.subsections = url, subsections
92 |
93 |
class SubSection(object):
    """
    One subsection inside a section of a course.
    """
    def __init__(self, position, name, url):
        """
        @param position: Integer position of the subsection in the
            subsection list. Starts at 1.
        @type position: int

        @param name: Name of the subsection.
        @type name: str

        @param url: URL of the subsection.
        @type url: str
        """
        self.position, self.name, self.url = position, name, url

    def __repr__(self):
        label = self.name + ": "
        return label + self.url
116 |
class Unit(object):
    """
    A single unit of the course: its videos plus any extra resources.
    """
    def __init__(self, videos, resources_urls):
        """
        @param videos: List of the videos present in the unit.
        @type videos: [Video]

        @param resources_urls: List of additional resources that come along
            with the unit. Resources include files with certain extensions
            and youtube links.
        @type resources_urls: [str]
        """
        self.videos, self.resources_urls = videos, resources_urls
133 |
134 |
class Video(object):
    """
    A single video, described by the URLs it can be fetched from.
    """
    def __init__(self, video_youtube_url, available_subs_url,
                 sub_template_url, mp4_urls):
        """
        @param video_youtube_url: Youtube link (if any).
        @type video_youtube_url: str or None

        @param available_subs_url: URL to the available subtitles.
        @type available_subs_url: str

        @param sub_template_url: Not documented in this module; presumably
            a URL template used to fetch the subtitles (TODO: confirm).
        @type sub_template_url: str

        @param mp4_urls: List of URLs to mp4 video files.
        @type mp4_urls: [str]
        """
        self.video_youtube_url, self.available_subs_url = (
            video_youtube_url, available_subs_url)
        self.sub_template_url, self.mp4_urls = sub_template_url, mp4_urls
158 |
159 |
class ExitCode(object):
    """
    Namespace grouping every exit code of the program.
    """
    # Success
    OK = 0

    # Failure codes; the values are kept explicit so they stay stable.
    MISSING_CREDENTIALS = 1
    WRONG_EMAIL_OR_PASSWORD = 2
    MISSING_COURSE_URL = 3
    INVALID_COURSE_URL = 4
    UNKNOWN_PLATFORM = 5
    NO_DOWNLOADABLE_VIDEO = 6
171 |
172 |
# Base command line for youtube-dl; --ignore-config is always part of it.
YOUTUBE_DL_CMD = ['youtube-dl', '--ignore-config']

# Default name of the cache file.
DEFAULT_CACHE_FILENAME = 'edx-dl.cache'

# Default file formats, grouped by kind.
# NOTE(review): 'e?ps' looks like a regex fragment, so these are presumably
# matched as patterns rather than compared literally -- confirm in the consumer.
DEFAULT_FILE_FORMATS = (
    ['e?ps', 'pdf', 'txt']
    + ['doc', 'xls', 'ppt']
    + ['docx', 'xlsx', 'pptx']
    + ['odt', 'ods', 'odp', 'odg']
    + ['zip', 'rar', 'gz']
    + ['mp3']
)
178 |
--------------------------------------------------------------------------------
/test_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from __future__ import unicode_literals
5 |
6 | import subprocess
7 |
8 | import pytest
9 | import six
10 |
11 | from edx_dl import utils
12 |
13 |
def test_clean_filename():
    # Maps each raw name to the fully sanitized file name expected from
    # utils.clean_filename in its default mode.
    strings = {
        '(23:90)': '23-90',
        '(:': '-',
        'a téest &and a@noòtheèr': 'a_test_and_another',
        'Lecture 2.7 - Evaluation and Operators (16:25)':
        'Lecture_2.7_-_Evaluation_and_Operators_16-25',
        'Week 3: Data and Abstraction':
        'Week_3-_Data_and_Abstraction',
        # Two spaces follow the colon on purpose: each one becomes an
        # underscore, which is what produces the "-__" in the expectation.
        ' (Week 1) BRANDING:  Marketing Strategy and Brand Positioning':
        'Week_1_BRANDING-__Marketing_Strategy_and_Brand_Positioning',
        'test & " adfas': 'test___adfas',
        ' ': ''
    }
    for k, v in six.iteritems(strings):
        actual_res = utils.clean_filename(k)
        assert actual_res == v, actual_res
31 |
32 |
def test_clean_filename_minimal_change():
    # Maps each raw name to the result expected from utils.clean_filename
    # with minimal_change=True, where only ':', '/', NUL and newlines are
    # touched.
    strings = {
        '(23:90)': '(23-90)',
        '(:': '(-',
        'a téest &and a@noòtheèr': 'a téest &and a@noòtheèr',
        'Lecture 2.7 - Evaluation and Operators (16:25)':
        'Lecture 2.7 - Evaluation and Operators (16-25)',
        'Week 3: Data and Abstraction':
        'Week 3- Data and Abstraction',
        ' (Week 1) BRANDING: Marketing Strategy and Brand Positioning':
        ' (Week 1) BRANDING- Marketing Strategy and Brand Positioning',
        'test & " adfas': 'test & " adfas',
        # Non-breaking space, written as an escape so that it cannot be
        # mistaken for (or normalized into) a plain ASCII space.
        '\xa0': u'\xa0'
    }
    for k, v in six.iteritems(strings):
        actual_res = utils.clean_filename(k, minimal_change=True)
        assert actual_res == v, actual_res
50 |
51 |
@pytest.mark.skipif(True,
                    reason="Needs change in interface")
def test_execute_command_should_succeed():
    # Always skipped for now: see the reason given in the marker above.
    exit_status = utils.execute_command(['ls', '--help'])
    assert exit_status == 0, exit_status
57 |
58 |
@pytest.mark.skipif(True,
                    reason="Needs change in interface")
def test_execute_command_should_fail():
    # Always skipped for now: see the reason given in the marker above.
    expected_exception_thrown = False
    try:
        utils.execute_command(['ls', '--help-does-not-exist'])
    except subprocess.CalledProcessError:
        expected_exception_thrown = True

    assert expected_exception_thrown, \
        "Unexpected exception (or no exception) thrown"

    # For the future
    # actual_res == 2, actual_res
71 |
72 |
def test_get_filename_from_prefix():
    # The lookup runs against the current directory, so the cases below rely
    # on the files present where the test suite is started.
    search_dir = '.'

    # (prefix, expected basename) pairs
    expectations = [
        ('requirements.txt', 'requirements'),
        ('does-not-exist', None),
        # ('requirements', 'requirements-dev'),  # depends on filesystem!
    ]

    for prefix, expected in expectations:
        found = utils.get_filename_from_prefix(search_dir, prefix)
        assert found == expected, found
85 |
86 |
def test_remove_duplicates_without_seen():
    """
    Call utils.remove_duplicates with the list only.

    The expected result is a pair: the list with repeats dropped (first
    occurrences kept in their original order) and the set of all elements.
    """
    # (input list, expected reduced list, expected set of seen elements)
    cases = [
        ([], [], set()),
        ([1], [1], {1}),
        ([1, 1], [1], {1}),

        ([None], [None], {None}),
        ([None, None], [None], {None}),
        ([1, None], [1, None], {1, None}),

        (['a'], ['a'], {'a'}),
        (['a', 'a'], ['a'], {'a'}),
        (['a', 'b'], ['a', 'b'], {'a', 'b'}),

        (['a', 'b', 'a'], ['a', 'b'], {'a', 'b'}),
        (['a', 'a', 'b'], ['a', 'b'], {'a', 'b'}),
        (['b', 'a', 'b'], ['b', 'a'], {'a', 'b'}),
        (['b', 'a', 'a'], ['b', 'a'], {'a', 'b'}),

        ([1, 2, 1, 2], [1, 2], {1, 2}),
    ]
    for original, reduced, seen in cases:
        expected = (reduced, seen)
        actual_res = utils.remove_duplicates(original)
        assert actual_res == expected, actual_res
112 |
113 |
def test_remove_duplicates_with_seen():
    """
    Call utils.remove_duplicates with a list and a pre-populated ``seen`` set.

    Elements already in ``seen`` are expected to be dropped from the
    returned list, and the returned set is expected to hold the old ``seen``
    elements plus everything from the list.
    """
    # One empty set object is shared by every case that starts with nothing
    # seen, exactly as the cases were originally written.
    nothing_seen = set()

    # (input list, seen before, expected reduced list, expected seen after)
    cases = [
        ([], nothing_seen, [], nothing_seen),
        ([], {None}, [], {None}),
        ([], {1}, [], {1}),
        ([], {1, 2}, [], {1, 2}),

        ([1], nothing_seen, [1], {1}),
        ([1], {1}, [], {1}),

        ([1, 1], nothing_seen, [1], {1}),
        ([1, 1], {1}, [], {1}),
        ([1, 1], {None}, [1], {1, None}),
        ([1, 1], {2}, [1], {1, 2}),
        ([1, 1], {1, 2}, [], {1, 2}),

        ([None], nothing_seen, [None], {None}),
        ([None], {1}, [None], {1, None}),
        ([None], {1, 2}, [None], {1, 2, None}),
        ([None], {1, 2}, [None], {2, 1, None}),
        ([None], {1, 2}, [None], {None, 2, 1}),
        ([None], {1, 2}, [None], {2, None, 1}),
        ([None], {1, 2, None}, [], {1, 2, None}),

        ([1, None], nothing_seen, [1, None], {1, None}),
        ([1, None], {1}, [None], {1, None}),
        ([1, None], {None}, [1], {1, None}),
        ([1, None], {1, None}, [], {1, None}),
        ([1, None], {1, None, 2}, [], {1, None, 2}),

        ([None, 1], nothing_seen, [None, 1], {1, None}),
        ([None, 1], {1}, [None], {1, None}),
        ([None, 1], {None}, [1], {1, None}),
        ([None, 1], {1, None}, [], {1, None}),
        ([None, 1], {1, None, 2}, [], {1, None, 2}),

        (['a'], nothing_seen, ['a'], {'a'}),
        (['a'], {'a'}, [], {'a'}),
        (['a'], {None}, ['a'], {'a', None}),
        (['a'], {'b'}, ['a'], {'a', 'b'}),
        (['a'], {'a', 'b'}, [], {'a', 'b'}),

        (['a'], {'a', 'b', tuple()}, [], {'a', 'b', tuple()}),

        # Disabled: these still have the three-element shape of the
        # "without seen" test and would not unpack in the loop below.
        # (['a', 'a'], ['a'], {'a'}),
        # (['a', 'b'], ['a', 'b'], {'a', 'b'}),
        # (['a', 'b', 'a'], ['a', 'b'], {'a', 'b'}),
        # (['a', 'a', 'b'], ['a', 'b'], {'a', 'b'}),
        # (['b', 'a', 'b'], ['b', 'a'], {'a', 'b'}),
        # (['b', 'a', 'a'], ['b', 'a'], {'a', 'b'}),
        # ([1, 2, 1, 2], [1, 2], {1, 2}),
        # ([1, 2, 1, 2], [1, 2], {1, 2}),
    ]
    for original, seen_before, reduced, seen_after in cases:
        expected = (reduced, seen_after)
        actual_res = utils.remove_duplicates(original, seen_before)
        assert actual_res == expected, actual_res
172 |
--------------------------------------------------------------------------------
/test_edx_dl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import pytest
5 | from edx_dl import edx_dl, parsing
6 | from edx_dl.common import Unit, Video, DEFAULT_FILE_FORMATS
7 |
8 |
def test_failed_login():
    """Logging in as "guest"/"guest" must not be reported as a success."""
    headers = edx_dl.edx_get_headers()
    response = edx_dl.edx_login(edx_dl.LOGIN_API, headers, "guest", "guest")
    logged_in = response.get('success', False)
    assert not logged_in
13 |
14 |
def test_remove_repeated_urls():
    """
    Extract the units of the multiple_units.html fixture and check that
    edx_dl.remove_repeated_urls lowers the url count from 18 to 16.
    """
    # The fixture path doubles as the key of the units dictionary.
    html_path = "test/html/multiple_units.html"
    with open(html_path, "r") as html_file:
        html_contents = html_file.read()

    extractor = parsing.CurrentEdXPageExtractor()
    units_extracted = extractor.extract_units_from_html(
        html_contents, 'https://courses.edx.org', DEFAULT_FILE_FORMATS)

    all_units = {html_path: units_extracted}
    filtered_units = edx_dl.remove_repeated_urls(all_units)

    num_all_urls = edx_dl.num_urls_in_units_dict(all_units)
    num_filtered_urls = edx_dl.num_urls_in_units_dict(filtered_units)

    assert num_all_urls == 18
    assert num_filtered_urls == 16
    assert num_all_urls != num_filtered_urls
33 |
34 |
@pytest.fixture
def all_units():
    """
    Units dictionary with one empty section and one section of three units:
    a unit with no content, a unit whose only video has no mp4 urls, and a
    unit with a two-url video plus one resource url.
    """
    def video_with(mp4_urls):
        # Only the mp4 urls vary between the videos of this fixture.
        return Video(video_youtube_url=None,
                     available_subs_url=None,
                     sub_template_url=None,
                     mp4_urls=mp4_urls)

    nonempty_section = [
        Unit(videos=[], resources_urls=[]),
        Unit(videos=[video_with([])], resources_urls=[]),
        Unit(videos=[video_with(['1', '2'])], resources_urls=['3']),
    ]
    return {
        'empty_section': [],
        'nonempty_section': nonempty_section,
    }
50 |
51 |
@pytest.fixture
def unknown_units():
    """Units dictionary whose section holds a plain string instead of a Unit."""
    not_a_unit = 'shouldfail'
    return {'nonempty_section': [not_a_unit]}
57 |
58 |
@pytest.fixture
def unknown_videos():
    """Units dictionary whose Unit lists a plain string instead of a Video."""
    broken_unit = Unit(videos=['shoudfail'], resources_urls=['3'])
    return {'nonempty_section': [broken_unit]}
64 |
65 |
def test_extract_urls_from_units(all_units):
    """
    Urls must be collected from the mp4_urls of the videos as well as from
    the resources_urls of each Unit.
    """
    extracted = edx_dl.extract_urls_from_units(all_units, '%(url)s')
    wanted = ['1\n', '2\n', '3\n']
    # Only the content matters, not the order in which urls come back.
    assert sorted(extracted) == sorted(wanted)
74 |
75 |
def test_extract_urls_from_units_unknown_units(unknown_units):
    """
    Anything other than a Unit in the list of units must be rejected with
    a TypeError.
    """
    url_format = '%(url)s'
    with pytest.raises(TypeError):
        edx_dl.extract_urls_from_units(unknown_units, url_format)
82 |
83 |
def test_extract_urls_from_units_unknown_videos(unknown_videos):
    """
    Anything other than a Video in the videos of a Unit must be rejected
    with a TypeError.
    """
    url_format = '%(url)s'
    with pytest.raises(TypeError):
        edx_dl.extract_urls_from_units(unknown_videos, url_format)
90 |
91 |
def test_edx_get_subtitle():
    """
    Make sure Stanford subtitle URLs are distinguished from EdX ones.

    For the Stanford (lagunita) download URL the expected result is whatever
    the plain page getter returns; for the other URL the expected result is
    the srt text built from the JSON getter's data.
    """

    # Both mocks read ``url`` and ``headers`` from the enclosing scope when
    # they are called, so they always check against the values that are
    # current at that moment, including after ``url`` is rebound below.
    def mock_get_page_contents(u, h):
        assert u == url
        assert h == headers
        return u

    def mock_get_page_contents_as_json(u, h):
        assert u == url
        assert h == headers
        return {'start': [123], 'end': [456], 'text': ["subtitle content"]}

    url = "https://lagunita.stanford.edu/courses/Engineering/QMSE02./Winter2016/xblock/i4x:;_;_Engineering;_QMSE02.;_video;_7f4f16e3eb294538aa8db4c43877132b/handler/transcript/download"
    headers = {}

    expected = url
    actual = edx_dl.edx_get_subtitle(url, headers,
                                     mock_get_page_contents,
                                     mock_get_page_contents_as_json)
    assert expected == actual

    # Make sure Non-Stanford URLs still work
    url = "https://www.edx.org/could/be/more/realistic"

    expected = '0\n00:00:00,123 --> 00:00:00,456\nsubtitle content\n\n'
    actual = edx_dl.edx_get_subtitle(url, headers,
                                     mock_get_page_contents,
                                     mock_get_page_contents_as_json)
    assert expected == actual
121 |
122 |
def test_extract_subtitle_urls():
    """
    A "Download transcript" link must be returned as an absolute URL in the
    second slot of the result, with None in the first slot.
    """
    text = """
<li class="video-tracks video-download-button">
<a href="/courses/Engineering/QMSE02./Winter2016/xblock/i4x:;_;_Engineering;_QMSE02.;_video;_1a4c7ff41e484a15927987b745a5c779/handler/transcript/download">Download transcript</a>
<div class="a11y-menu-container">
<a class="a11y-menu-button" href="#" title=".srt" role="button" aria-disabled="false">.srt</a>
<ol class="a11y-menu-list" role="menu">
<li class="a11y-menu-item active">

<a class="a11y-menu-item-link" href="#srt" title="SubRip (.srt) file" data-value="srt" role="menuitem" aria-disabled="false">
SubRip (.srt) file
</a>
</li>
<li class="a11y-menu-item">

<a class="a11y-menu-item-link" href="#txt" title="Text (.txt) file" data-value="txt" role="menuitem" aria-disabled="false">
Text (.txt) file
</a>
</li>
</ol>
</div>
</li>
"""

    page_extractor = parsing.CurrentEdXPageExtractor()
    expected = (None, 'https://base.url/courses/Engineering/QMSE02./Winter2016/xblock/i4x:;_;_Engineering;_QMSE02.;_video;_1a4c7ff41e484a15927987b745a5c779/handler/transcript/download')
    actual = page_extractor.extract_subtitle_urls(text, "https://base.url")
    # On a mismatch pytest already reports both values, so no print is needed.
    assert expected == actual
152 |
--------------------------------------------------------------------------------
/test_parsing.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from __future__ import unicode_literals
5 |
6 | import json
7 |
8 | import pytest
9 |
10 | from edx_dl.common import DEFAULT_FILE_FORMATS
11 |
12 | from edx_dl.parsing import (
13 | edx_json2srt,
14 | ClassicEdXPageExtractor,
15 | CurrentEdXPageExtractor,
16 | is_youtube_url,
17 | )
18 |
19 |
20 | # Test conversion of JSON subtitles to srt
def test_empty_json_subtitle():
    """The contents of empty.json must make json.loads raise ValueError."""
    with open('test/json/empty.json') as f:
        json_string = f.read()

    # Only the parsing step is expected to raise, so only it is guarded.
    with pytest.raises(ValueError):
        json.loads(json_string)
26 |
27 |
@pytest.mark.parametrize(
    'file,expected', [
        ('test/json/empty-text.json', ''),
        ('test/json/minimal.json', ''),
        ('test/json/abridged-01.json',
         '0\n'
         '00:00:18,104 --> 00:00:20,428\n'
         'I am very glad to see everyone here,\n\n'),
        ('test/json/abridged-02.json',
         '0\n'
         '00:00:18,104 --> 00:00:20,428\n'
         'I am very glad to see everyone here,\n\n'
         '1\n'
         '00:00:20,569 --> 00:00:24,721\n'
         "so let's enjoy the beauty of combinatorics together.\n\n"),
    ]
)
def test_subtitles_from_json(file, expected):
    """Each JSON fixture, converted by edx_json2srt, must give ``expected``."""
    with open(file) as json_file:
        subtitles = json.load(json_file)
    assert edx_json2srt(subtitles) == expected
48 |
49 |
50 | # Test extraction of video/other assets from HTML
def test_extract_units_from_html_single_unit_multiple_subs():
    """
    Check the youtube url, first mp4 url and subtitle template url of the
    first video extracted from single_unit_multiple_subs.html.
    """
    with open("test/html/single_unit_multiple_subs.html", "r") as html_file:
        page = html_file.read()

    extractor = CurrentEdXPageExtractor()
    units = extractor.extract_units_from_html(
        page, 'https://courses.edx.org', DEFAULT_FILE_FORMATS)

    first_video = units[0].videos[0]
    assert first_video.video_youtube_url == 'https://youtube.com/watch?v=b7xgknqkQk8'
    assert first_video.mp4_urls[0] == 'https://d2f1egay8yehza.cloudfront.net/edx-edx101/EDXSPCPJSP13-H010000_100.mp4'
    assert first_video.sub_template_url == 'https://courses.edx.org/courses/edX/DemoX.1/2014/xblock/i4x:;_;_edX;_DemoX.1;_video;_14459340170c476bb65f73a0a08a076f/handler/transcript/translation/%s'
61 |
62 |
def test_extract_multiple_units_multiple_resources():
    """
    multiple_units.html must give three units; the first one must expose
    the expected youtube url, mp4 url and pdf resource url.
    """
    with open("test/html/multiple_units.html", "r") as html_file:
        page = html_file.read()

    extractor = CurrentEdXPageExtractor()
    units = extractor.extract_units_from_html(
        page, 'https://courses.edx.org', DEFAULT_FILE_FORMATS)
    assert len(units) == 3

    first_unit = units[0]
    # this one has multiple speeds in the data-streams field
    youtube_urls = [video.video_youtube_url for video in first_unit.videos]
    assert 'https://youtube.com/watch?v=CJ482b9r_0g' in youtube_urls

    first_mp4_urls = first_unit.videos[0].mp4_urls
    assert len(first_mp4_urls) > 0
    assert 'https://s3.amazonaws.com/berkeley-cs184x/videos/overview-motivation.mp4' in first_mp4_urls
    assert 'https://courses.edx.org/static/content-berkeley-cs184x~2012_Fall/slides/overview.pdf' in first_unit.resources_urls
75 |
76 |
def test_extract_multiple_units_no_youtube_ids():
    """
    With the classic extractor, the first video of the fixture must have no
    youtube url but at least one mp4 url.
    """
    with open("test/html/multiple_units_no_youtube_ids.html", "r") as html_file:
        page = html_file.read()

    extractor = ClassicEdXPageExtractor()
    units = extractor.extract_units_from_html(
        page, 'https://courses.edx.org', DEFAULT_FILE_FORMATS)

    first_video = units[0].videos[0]
    assert first_video.video_youtube_url is None
    assert len(first_video.mp4_urls) > 0
85 |
86 |
def test_extract_multiple_units_youtube_link():
    """The youtube link of the fixture must show up in the resource urls."""
    with open("test/html/multiple_units_youtube_link.html", "r") as html_file:
        page = html_file.read()

    extractor = CurrentEdXPageExtractor()
    units = extractor.extract_units_from_html(
        page, 'https://courses.edx.org', DEFAULT_FILE_FORMATS)

    resources = units[0].resources_urls
    assert 'https://www.youtube.com/watch?v=5OXQypOAbdI' in resources
94 |
95 |
def test_extract_multiple_units_multiple_youtube_videos():
    """
    The first unit of the fixture must hold three videos, one of them with
    the expected youtube url.
    """
    with open("test/html/multiple_units_multiple_youtube_videos.html", "r") as html_file:
        page = html_file.read()

    extractor = CurrentEdXPageExtractor()
    units = extractor.extract_units_from_html(
        page, 'https://courses.edx.org', DEFAULT_FILE_FORMATS)

    videos = units[0].videos
    assert len(videos) == 3
    youtube_urls = [video.video_youtube_url for video in videos]
    assert 'https://youtube.com/watch?v=3atHHNa2UwI' in youtube_urls
104 |
105 |
@pytest.mark.parametrize(
    'file,num_sections_expected,num_subsections_expected', [
        ('test/html/new_sections_structure.html', 2, 12),
        ('test/html/empty_sections.html', 0, 0),
    ]
)
def test_extract_sections(file, num_sections_expected, num_subsections_expected):
    """
    Count the sections extracted from ``file`` and the subsections summed
    over all of them.
    """
    with open(file, "r") as html_file:
        page = html_file.read()

    extractor = CurrentEdXPageExtractor()
    sections = extractor.extract_sections_from_html(
        page, 'https://courses.edx.org')
    assert len(sections) == num_sections_expected

    subsection_counts = [len(section.subsections) for section in sections]
    assert sum(subsection_counts) == num_subsections_expected
119 |
120 |
def test_extract_courses_from_html():
    """
    dashboard.html must give 18 courses, 14 of them in the 'Started' state.
    """
    with open("test/html/dashboard.html", "r") as html_file:
        page = html_file.read()

    extractor = CurrentEdXPageExtractor()
    courses = extractor.extract_courses_from_html(
        page, 'https://courses.edx.org')
    assert len(courses) == 18

    started = sum(1 for course in courses if course.state == 'Started')
    assert started == 14
128 |
129 |
def test_is_youtube_url():
    """
    is_youtube_url must reject the non-youtube strings below and accept
    every http/https, www/non-www, youtube.com/youtu.be combination.
    """
    rejected = (
        'http://www.google.com/',
        'TODO',
        'https://d2f1egay8yehza.cloudfront.net/mit-24118/MIT24118T314-V015000_DTH.mp4',
        'https://courses.edx.org/courses/course-v1:MITx+24.118x+2T2015/xblock/block-v1:MITx+24.118x+2T2015+type@video+block@b1588e7cccff4d448f4f9676c81184d9/handler/transcript/available_translations',
    )
    accepted = (
        'http://www.youtu.be/rjOpZ3i6pRo',
        'http://www.youtube.com/watch?v=rjOpZ3i6pRo',
        'http://youtu.be/rjOpZ3i6pRo',
        'http://youtube.com/watch?v=rjOpZ3i6pRo',
        'https://www.youtu.be/rjOpZ3i6pRo',
        'https://www.youtube.com/watch?v=rjOpZ3i6pRo',
        'https://youtu.be/rjOpZ3i6pRo',
        'https://youtube.com/watch?v=rjOpZ3i6pRo',
    )
    for candidate in rejected:
        assert not is_youtube_url(candidate)
    for candidate in accepted:
        assert is_youtube_url(candidate)
150 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/edx_dl/parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Parsing and extraction functions
5 | """
6 | import re
7 | import json
8 |
9 | from datetime import timedelta, datetime
10 |
11 | from six.moves import html_parser
12 | from bs4 import BeautifulSoup as BeautifulSoup_
13 |
14 | from .common import Course, Section, SubSection, Unit, Video
15 |
# Force use of bs4 with html5lib
def BeautifulSoup(page):
    """
    Parse ``page`` with bs4, always passing 'html5lib' as the parser so the
    rest of this module never has to pick one.
    """
    return BeautifulSoup_(page, 'html5lib')
18 |
def _srt_timestamp(milliseconds):
    """
    Format an offset in milliseconds as an srt ``HH:MM:SS,mmm`` timestamp.
    """
    # Integer arithmetic only: no float rounding, and an offset of 24 hours
    # or more keeps counting hours instead of wrapping back to 00.
    seconds, millis = divmod(int(milliseconds), 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return "%02d:%02d:%02d,%03d" % (hours, minutes, seconds, millis)


def edx_json2srt(o):
    """
    Transform the dict 'o' into the srt subtitles format

    'o' holds three parallel lists under the keys 'start', 'end' and 'text';
    'start' and 'end' are offsets in milliseconds. An empty (or None) 'o'
    gives an empty string. A non-empty 'o' lacking one of the keys raises
    KeyError.

    Returns the subtitles as a single string.
    """
    if not o:
        return ''

    output = []

    # Cues are numbered by their position in the input (starting at 0), so
    # a skipped empty cue leaves a gap in the numbering.
    for i, (s, e, t) in enumerate(zip(o['start'], o['end'], o['text'])):
        if t == '':
            continue

        output.append(str(i) + '\n')
        output.append("%s --> %s\n" % (_srt_timestamp(s), _srt_timestamp(e)))
        output.append(t + "\n\n")

    return ''.join(output)
45 |
46 |
class PageExtractor(object):
    """
    Base class for PageExtractor
    Every subclass can represent a different layout for an OpenEdX site.
    They should implement the given methods.

    Usage:

    >>> from edx_dl import parsing
    >>> d = parsing.SubclassFromPageExtractor()
    >>> units = d.extract_units_from_html(page, BASE_URL, file_formats)
    >>> ...
    """

    def extract_units_from_html(self, page, BASE_URL, file_formats):
        """
        Method to extract the resources (units) from the given page

        Takes the html of the page, the base url of the site and the file
        formats of interest. This base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError("Subclasses should implement this")

    def extract_sections_from_html(self, page, BASE_URL):
        """
        Method to extract the sections (and subsections) from an html page

        Takes the html of the page and the base url of the site. This base
        implementation always raises NotImplementedError.
        """
        raise NotImplementedError("Subclasses should implement this")

    def extract_courses_from_html(self, page, BASE_URL):
        """
        Method to extract the courses from an html page

        Takes the html of the page and the base url of the site. This base
        implementation always raises NotImplementedError.
        """
        raise NotImplementedError("Subclasses should implement this")
78 |
79 |
80 | class ClassicEdXPageExtractor(PageExtractor):
81 |
82 | def extract_units_from_html(self, page, BASE_URL, file_formats):
83 | """
84 | Extract Units from the html of a subsection webpage as a list of
85 | resources
86 | """
87 | # in this function we avoid using beautifulsoup for performance reasons
88 | # parsing html with regular expressions is really nasty, don't do this if
89 | # you don't need to !
90 | re_units = re.compile('(
]id="seq_contents_\d+".*?>.*?<\/div>)',
91 | re.DOTALL)
92 | units = []
93 |
94 | for unit_html in re_units.findall(page):
95 | unit = self.extract_unit(unit_html, BASE_URL, file_formats)
96 | if len(unit.videos) > 0 or len(unit.resources_urls) > 0:
97 | units.append(unit)
98 | return units
99 |
100 | def extract_unit(self, text, BASE_URL, file_formats):
101 | """
102 | Parses the
Please note: The edX support team is English speaking. While we will do our best to address your inquiry in any language, our responses will be in English.
238 |
239 |
240 |
241 |
242 |
248 |
249 |
250 |
251 |
264 |
265 |
266 |
267 |
273 |
274 |
275 |
Thank You!
276 |
277 |
278 |
279 |
280 |
281 | Thank you for your inquiry or feedback. We typically respond to a request within one business day (Monday to Friday, 13:00 UTC to 21:00 UTC.) In the meantime, please review our detailed FAQs where most questions have already been answered.
282 |
Please note: The edX support team is English
316 | speaking. While we will do our best to address your inquiry in any
317 | language, our responses will be in English.
318 |
319 |
320 |
321 |
322 |
328 |
329 |
330 |
331 |
344 |
345 |
346 |
347 |
353 |
354 |
355 |
Thank You!
356 |
357 |
358 |
359 |
360 |
361 | Thank you for your inquiry or feedback. We typically respond to a
362 | request within one business day (Monday to Friday, 13:00 UTC to 21:00
363 | UTC.) In the meantime, please review our detailed FAQs where most questions have already been answered.
364 |