├── tests
├── __init__.py
├── utils
│ ├── __init__.py
│ └── utils_tests.py
└── models
│ ├── __init__.py
│ ├── jobs_tests.py
│ └── export_request_tests.py
├── courseraresearchexports
├── __init__.py
├── db
│ ├── __init__.py
│ └── db.py
├── exports
│ ├── __init__.py
│ ├── api.py
│ └── utils.py
├── containers
│ ├── __init__.py
│ ├── utils.py
│ └── client.py
├── constants
│ ├── __init__.py
│ ├── container_constants.py
│ ├── db_constants.py
│ └── api_constants.py
├── models
│ ├── __init__.py
│ ├── ContainerInfo.py
│ ├── ClickstreamDownloadLinksRequest.py
│ ├── utils.py
│ ├── ExportDb.py
│ ├── ExportRequestWithMetadata.py
│ └── ExportRequest.py
├── commands
│ ├── __init__.py
│ ├── version.py
│ ├── utils.py
│ ├── db.py
│ ├── containers.py
│ └── jobs.py
├── sql
│ ├── demographic_survey.sql
│ └── enrollments.sql
└── main.py
├── MANIFEST.in
├── test_requirements.txt
├── tox.ini
├── .travis.yml
├── .gitignore
├── setup.py
├── LICENSE
└── README.rst
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/courseraresearchexports/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include courseraresearchexports/sql *
2 |
--------------------------------------------------------------------------------
/test_requirements.txt:
--------------------------------------------------------------------------------
1 | mock==1.0.1
2 | nose==1.3.7
3 | pep8==1.6.2
4 | testfixtures==4.1.2
5 |
--------------------------------------------------------------------------------
/courseraresearchexports/db/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "db"
3 | ]
4 |
5 | from . import * # noqa
6 |
--------------------------------------------------------------------------------
/courseraresearchexports/exports/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "api",
3 | "utils"
4 | ]
5 |
6 | from . import * # noqa
7 |
--------------------------------------------------------------------------------
/courseraresearchexports/containers/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "client",
3 | "utils"
4 | ]
5 |
6 | from . import * # noqa
7 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py27
3 | [testenv]
4 | deps =
5 | nose
6 | mock
7 | testfixtures
8 | commands =
9 | nosetests
10 |
--------------------------------------------------------------------------------
/courseraresearchexports/constants/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "api_constants",
3 | "db_constants",
4 | "container_constants"
5 | ]
6 |
7 | from . import * # noqa
8 |
--------------------------------------------------------------------------------
/courseraresearchexports/models/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "ExportRequestWithMetadata",
3 | "ExportRequest",
4 | "ClickstreamDownloadLinksRequest",
5 | "ContainerInfo",
6 | "ExportDb",
7 | "utils"
8 | ]
9 |
10 | from . import * # noqa
11 |
--------------------------------------------------------------------------------
/courseraresearchexports/commands/__init__.py:
--------------------------------------------------------------------------------
1 | "Commands and their implementations for Coursera's research export tools."
2 |
3 | __all__ = [
4 | "version",
5 | "jobs",
6 | "containers",
7 | "db",
8 | "utils"
9 | ]
10 |
11 | from . import * # noqa
12 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "2.7"
4 |
5 | # command to install dependencies
6 | install:
7 | - "pip install ."
8 | - "pip install -r test_requirements.txt"
9 |
10 | # command to run tests & check style
11 | script:
12 | - nosetests
13 | - pep8 courseraresearchexports tests
14 |
--------------------------------------------------------------------------------
/courseraresearchexports/constants/container_constants.py:
--------------------------------------------------------------------------------
1 | import os
2 |
# Constants shared by the container-management code.

# Label value identifying this tool's docker resources (presumably attached
# to the containers it creates -- confirm against containers/client.py).
COURSERA_DOCKER_LABEL = 'courseraResearchExport'

# Per-user directory under the home folder; `~` is expanded at import time.
COURSERA_LOCAL_FOLDER = os.path.expanduser('~/.coursera/exports/')

# Postgres image reference in `repository:tag` form.
POSTGRES_DOCKER_IMAGE = 'postgres:9.5'

# Messages looked for in the Postgres container output (presumably matched
# against the container logs to detect start-up progress -- confirm).
POSTGRES_INIT_MSG = 'PostgreSQL init process complete; ready for start up.'
POSTGRES_READY_MSG = 'database system is ready to accept connections'
8 |
--------------------------------------------------------------------------------
/courseraresearchexports/constants/db_constants.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
# Maps each bracketed user-id placeholder (as written in the bundled SQL,
# e.g. `[demographics_user_id]`) to the name of a table in the export.
# NOTE(review): presumably the placeholder is replaced with the hashed
# user-id column found in that table -- confirm against the db module.
HASHED_USER_ID_COLUMN_TO_SOURCE_TABLE = dict((
    ('[partner_user_id]', 'users'),
    ('[demographics_user_id]', 'demographics_answers'),
    ('[feedback_user_id]', 'feedback_course_ratings'),
    ('[assessments_user_id]', 'assessment_actions'),
    ('[peer_assignments_user_id]', 'peer_submissions'),
    ('[discussions_user_id]', 'discussion_answers'),
    ('[programming_assignments_user_id]', 'programming_submissions'),
))
24 |
--------------------------------------------------------------------------------
/tests/utils/utils_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright 2016 Coursera
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from courseraresearchexports.models import utils
18 | from mock import Mock
19 | from mock import patch
20 | import requests
21 |
# Fixtures: the fake API payload carries the partner id as a string, while
# the test expects the lookup to hand back the integer form.
fake_partner_short_name = 'fake_partner_short_name'
fake_partner_id = 1
fake_partner_response = {'elements': [{"id": str(fake_partner_id)}]}


@patch.object(requests, 'get')
def test_partner_id_lookup(mockget):
    """lookup_partner_id_by_short_name returns the id from the API payload."""
    # Stub `requests.get` so that its response's `.json()` yields the
    # canned partner payload.
    mockget.return_value = Mock(
        **{'json.return_value': fake_partner_response})

    inferred_partner_id = utils.lookup_partner_id_by_short_name(
        fake_partner_short_name)

    assert inferred_partner_id == fake_partner_id
36 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
91 | # IDEA project settings
92 | .idea
93 |
94 | # Ignore DS_STORE
95 | .DS_Store
96 |
--------------------------------------------------------------------------------
/courseraresearchexports/constants/api_constants.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
# Application name used for research-export API access (presumably the
# OAuth2 client application name -- confirm against the exports module).
RESEARCH_EXPORTS_APP = 'manage_research_exports'

# Coursera API endpoints.
RESEARCH_EXPORTS_API = 'https://www.coursera.org/api/onDemandExports.v2/'
COURSE_API = 'https://www.coursera.org/api/onDemandCourses.v1/'
PARTNER_API = 'https://www.coursera.org/api/partners.v1/'
CLICKSTREAM_API = 'https://www.coursera.org/api/clickstreamExportsDownload.v1/'

# Anonymity levels and the list of all known values.
ANONYMITY_LEVEL_COORDINATOR = 'HASHED_IDS_NO_PII'
ANONYMITY_LEVEL_ISOLATED = 'HASHED_IDS_WITH_ISOLATED_UGC_NO_PII'
ANONYMITY_LEVELS = [
    ANONYMITY_LEVEL_COORDINATOR,
    ANONYMITY_LEVEL_ISOLATED,
]

# Export types and the list of all known values.
EXPORT_TYPE_TABLES = 'RESEARCH_WITH_SCHEMAS'
EXPORT_TYPE_CLICKSTREAM = 'RESEARCH_EVENTING'
EXPORT_TYPE_GRADEBOOK = 'GRADEBOOK'
EXPORT_TYPES = [
    EXPORT_TYPE_TABLES,
    EXPORT_TYPE_CLICKSTREAM,
    EXPORT_TYPE_GRADEBOOK,
]

# Known schema names.
SCHEMA_NAMES = [
    'demographics',
    'users',
    'course_membership',
    'course_progress',
    'feedback',
    'assessments',
    'course_grades',
    'peer_assignments',
    'staff_graded_assignments',
    'discussions',
    'programming_assignments',
    'course_content',
    'ecb',
    'notebooks',
    'transactions',
]
45 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 |
def readme():
    """Return the text of README.rst, used as the package long description.

    The path is relative to the current working directory, so setup.py is
    expected to be run from the project root.
    """
    with open('README.rst') as f:
        return f.read()


setup(
    name='courseraresearchexports',
    version='0.0.29',
    description='Command line tool for convenient access to '
                'Coursera Research Data Exports.',
    long_description=readme(),
    # The long description is read from README.rst, so it must be declared
    # as reStructuredText rather than Markdown.
    long_description_content_type='text/x-rst',
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python :: 2.7',
    ],
    keywords='coursera',
    url='https://github.com/coursera/courseraresearchexports',
    author='Chris Liu',
    author_email='cliu@coursera.org',
    license='Apache',
    entry_points={
        'console_scripts': [
            'courseraresearchexports = courseraresearchexports.main:main',
        ],
    },
    packages=['courseraresearchexports',
              'courseraresearchexports.commands',
              'courseraresearchexports.constants',
              'courseraresearchexports.exports',
              'courseraresearchexports.containers',
              'courseraresearchexports.models',
              'courseraresearchexports.db'],
    install_requires=[
        'argcomplete>=1.4.1',
        'courseraoauth2client>=0.0.1',
        'requests>=2.7.0,<2.11',
        'docker-py>=1.2.3',
        'tqdm>=4.8.4',
        'tabulate>=0.7.5',
        'python-dateutil>=2.5.3',
        'SQLAlchemy>=1.0.15',
        'psycopg2>=2.6.2'
    ],
    test_suite='nose.collector',
    tests_require=['nose', 'nose-cover3'],
    # IMPORTANT: This makes MANIFEST.in work. DO NOT USE `package_data`, as
    # it does not work with sdist correctly.
    # See http://flask.pocoo.org/docs/0.11/patterns/distribute/ for details
    include_package_data=True,
    zip_safe=False
)
56 |
--------------------------------------------------------------------------------
/courseraresearchexports/commands/version.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Coursera's command line SDK for interacting with research data exports.
17 |
18 | You may install it from source, or via pip.
19 | """
20 |
21 | import sys
22 | import logging
23 |
24 |
def command_version(args):
    """Implements the version subcommand.

    Logs (at INFO level) the version of the installed
    `courseraresearchexports` distribution, or a hint to install the project
    when this module is not being run from that installed distribution.

    :param args: parsed arguments; only `args.quiet` is read.
    """

    # See http://stackoverflow.com/questions/17583443
    import os.path
    from pkg_resources import DistributionNotFound, get_distribution

    package = 'courseraresearchexports'
    not_installed = 'Please install this project with setup.py'

    try:
        dist = get_distribution(package)
    except DistributionNotFound:
        version = not_installed
    else:
        # Normalize case for Windows systems
        installed_package_dir = os.path.join(
            os.path.normcase(dist.location), package)
        running_from = os.path.normcase(__file__)
        if running_from.startswith(installed_package_dir):
            version = dist.version
        else:
            # not installed, but there is another version that *is*
            version = not_installed

    if args.quiet and args.quiet > 0:
        message = version
    else:
        message = "Your {prog}'s version is:\n\t{version}".format(
            prog=sys.argv[0], version=version)
    logging.info(message)
53 |
54 |
def parser(subparsers):
    """Register the `version` subcommand and return its parser.

    :param subparsers: the object returned by
        `argparse.ArgumentParser.add_subparsers()`.
    :return parser_version: the sub-parser, with `func` defaulting to
        `command_version`.
    """
    version_parser = subparsers.add_parser(
        'version',
        help="Output the version of %(prog)s to the console.",
    )
    version_parser.set_defaults(func=command_version)
    return version_parser
65 |
--------------------------------------------------------------------------------
/courseraresearchexports/models/ContainerInfo.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import dateutil.parser
16 |
17 |
class ContainerInfo(object):
    """
    Represents the relevant information about a docker container used to store
    a database of Coursera Export data.
    """

    def __init__(self, name=None, id=None, host_port=None, host_ip=None,
                 creation_time=None, database_name=None, status=None):
        self.name = name
        self.id = id
        # First 12 characters of the full id, or None when no id is given.
        self.short_id = id[:12] if id else None
        self.host_port = host_port
        self.host_ip = host_ip
        self.creation_time = creation_time
        self.status = status
        self.database_name = database_name

    @staticmethod
    def _published_host_ip(ports):
        """
        Return the host IP that container port 5432/tcp is published on.
        :param ports: the `NetworkSettings.Ports` mapping of an inspected
            container; may be None or empty, or map the port to None.
        :return host_ip: the `HostIp` of the first binding, or None when the
            port has no bindings.
        """
        bindings = (ports or {}).get('5432/tcp')
        if not bindings:
            return None
        return bindings[0].get('HostIp')

    @classmethod
    def from_container(cls, container_name, docker_client):
        """
        Create ContainerInfo using the response from docker-py Client's
        `inspect_container` method.
        :param container_name: name (or id) passed to `inspect_container`
        :param docker_client: object exposing `inspect_container(name)`
        :return container_info: ContainerInfo
        :raises KeyError: if the inspect response lacks an expected key, such
            as the `5432/tcp` host port binding or the `database_name` label
        """
        container_dict = docker_client.inspect_container(container_name)
        port_bindings = container_dict['HostConfig']['PortBindings']
        published_ports = container_dict['NetworkSettings']['Ports']

        assigned_port = int(port_bindings['5432/tcp'][0]['HostPort'])

        return cls(
            name=container_dict['Name'][1:],  # remove prepended '/'
            id=container_dict['Id'],
            creation_time=dateutil.parser.parse(container_dict['Created']),
            database_name=container_dict['Config']['Labels']['database_name'],
            status=container_dict['State']['Status'],
            host_port=assigned_port,
            # None (rather than an empty mapping or an exception) when the
            # container currently has no published ports.
            host_ip=cls._published_host_ip(published_ports))
59 |
--------------------------------------------------------------------------------
/tests/models/jobs_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright 2016 Coursera
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from courseraresearchexports.commands import jobs
18 | from courseraresearchexports.models.ExportRequest import ExportRequest
19 | from courseraresearchexports.models.ExportRequestWithMetadata import \
20 | ExportRequestWithMetadata
21 | from mock import MagicMock
22 | from mock import patch
23 | import argparse
24 |
25 |
# Fixture values shared by the tests in this module.
fake_course_id = 'fake_course_id'
fake_course_slug = 'fake_course_slug'


@patch('courseraresearchexports.commands.jobs.api.get_all')
def test_get_all(api_get_all):
    """jobs.get_all calls api.get_all with no arguments."""
    api_get_all.return_value = []
    empty_args = argparse.Namespace()

    jobs.get_all(empty_args)

    api_get_all.assert_any_call()
37 |
38 |
@patch('courseraresearchexports.models.utils.lookup_course_slug_by_id')
@patch('courseraresearchexports.commands.jobs.api.get')
def test_get(api_get, lookup_course_slug_by_id):
    """jobs.get forwards `args.id` to api.get."""
    # The slug lookup is stubbed so that no real lookup is performed.
    lookup_course_slug_by_id.return_value = fake_course_slug
    export_request = ExportRequestWithMetadata(course_id=fake_course_id)
    api_get.return_value = [export_request]

    jobs.get(argparse.Namespace(id=fake_course_id))

    api_get.assert_called_with(fake_course_id)
52 |
53 |
@patch('courseraresearchexports.commands.jobs.api.post')
def test_request(api_post):
    """jobs.request_tables posts a request carrying the given course id."""
    api_post.return_value = [
        ExportRequestWithMetadata(course_id=fake_course_id),
    ]
    # Only the course id is supplied; every other option is left unset.
    args = argparse.Namespace(
        course_id=fake_course_id,
        course_slug=None,
        partner_id=None,
        partner_short_name=None,
        group_id=None,
        export_type=None,
        user_id_hashing=None,
        purpose=None,
        schemas=None,
    )

    jobs.request_tables(args)

    # api.post must have received exactly one positional argument.
    [export_request] = api_post.call_args[0]
    assert export_request.course_id == fake_course_id
74 |
--------------------------------------------------------------------------------
/courseraresearchexports/models/ClickstreamDownloadLinksRequest.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 |
17 | from courseraresearchexports.models import utils
18 |
19 |
class ClickstreamDownloadLinksRequest(object):
    """
    Represents a request for clickstream download links.
    """

    def __init__(self, course_id=None, partner_id=None, interval=None,
                 **kwargs):
        """
        :param course_id: id of the course to scope the request to
        :param partner_id: id of the partner to scope the request to
        :param interval: optional pair whose first and second items are sent
            as `startDate` and `endDate`
        :param kwargs: any other keyword arguments are accepted and ignored,
            so a full set of parsed arguments can be passed through
        """
        self.course_id = course_id
        self.partner_id = partner_id
        self.interval = interval

    @classmethod
    def from_args(cls, **kwargs):
        """
        Create a ClickstreamDownloadLinkRequest from arguments. Performs
        course_id/partner_id inference.
        :param kwargs: request arguments; `course_slug` and
            `partner_short_name` are resolved to ids when the matching id is
            not supplied
        :return eventing_links_request: ClickstreamDownloadLinksRequest
        :raises ValueError: if a `group_id` is supplied
        """
        # Reject group scope up front, so the check cannot be bypassed by
        # also passing a course slug or partner short name.
        if kwargs.get('group_id'):
            logging.error(
                'Eventing exports by group is not currently supported. '
                'Please see: '
                'https://partner.coursera.help/hc/articles/360021121132'
            )
            raise ValueError('Eventing exports by group is not supported.')

        if kwargs.get('course_slug') and not kwargs.get('course_id'):
            kwargs['course_id'] = utils.lookup_course_id_by_slug(
                kwargs['course_slug'])
        elif kwargs.get('partner_short_name') and not kwargs.get('partner_id'):
            kwargs['partner_id'] = \
                utils.lookup_partner_id_by_short_name(
                    kwargs['partner_short_name'])

        return cls(**kwargs)

    @property
    def scope(self):
        """
        API specific format for request scope context.
        :return scope: course scope if `course_id` is set, else partner scope
            if `partner_id` is set, else None
        """
        if self.course_id:
            return 'courseContext~{}'.format(self.course_id)
        elif self.partner_id:
            return 'partnerContext~{}'.format(self.partner_id)
        return None

    def to_url_params(self):
        """
        API specific parameters for POST request.
        :return url_params: dict with `action` and `scope`, plus `startDate`
            and `endDate` when an interval is set
        """
        url_params = {'action': 'generateLinks', 'scope': self.scope}
        if self.interval:
            url_params['startDate'] = self.interval[0]
            url_params['endDate'] = self.interval[1]

        return url_params
78 |
--------------------------------------------------------------------------------
/courseraresearchexports/sql/demographic_survey.sql:
--------------------------------------------------------------------------------
/*
demographic_survey
This query partially denormalizes the demographics tables to create aggregate
information about the users in the present data export. It returns one row per
user, built from the answers to demographics questions 11 through 25.

Columns
[demographics_user_id] (bracketed placeholder for the user id column;
    presumably substituted before the query is run)
demographic_survey_submission_dt
demographic_survey_gender
demographic_survey_age
demographic_survey_country_cd_of_birth
demographic_survey_us_postal_code
demographic_survey_spanish_hispanic_or_latino_descent
demographic_survey_race
demographic_survey_highest_level_of_schooling
demographic_survey_currently_enrolled_in_an_educational_program
demographic_survey_level_of_current_educational_program
demographic_survey_subject_area_of_degree
demographic_survey_current_employment_status
demographic_survey_area_of_industry_currently_employed_in
demographic_survey_english_proficiency
demographic_survey_other_languages_spoken
*/

SELECT
    a.[demographics_user_id]
    ,MAX(a.submission_ts::DATE) AS demographic_survey_submission_dt
    ,MAX(CASE WHEN a.question_id = 11
        THEN c.choice_desc END) AS demographic_survey_gender
    ,MAX(CASE WHEN a.question_id = 12
        THEN DATE_PART('y', CURRENT_DATE) - a.answer_int END) AS demographic_survey_age
    ,UPPER(LEFT(MAX(CASE WHEN a.question_id = 13
        THEN c.choice_desc END), 2)) AS demographic_survey_country_cd_of_birth
    ,MAX(CASE WHEN a.question_id = 15
        THEN a.answer_int END) AS demographic_survey_us_postal_code
    ,MAX(CASE WHEN a.question_id = 16
        THEN c.choice_desc END) AS demographic_survey_spanish_hispanic_or_latino_descent
    -- Multi-select answers are collapsed into one ';'-separated string.
    ,RTRIM(STRING_AGG(CASE WHEN a.question_id = 17 THEN c.choice_desc END, ';')) AS demographic_survey_race
    ,MAX(CASE WHEN a.question_id = 18
        THEN c.choice_desc END) AS demographic_survey_highest_level_of_schooling
    ,MAX(CASE WHEN a.question_id = 19
        THEN c.choice_desc END) AS demographic_survey_currently_enrolled_in_an_educational_program
    ,MAX(CASE WHEN a.question_id = 20
        THEN c.choice_desc END) AS demographic_survey_level_of_current_educational_program
    ,RTRIM(STRING_AGG(CASE WHEN a.question_id = 21
        THEN c.choice_desc END, ';')) AS demographic_survey_subject_area_of_degree
    ,MAX(CASE WHEN a.question_id = 22
        THEN c.choice_desc END) AS demographic_survey_current_employment_status
    ,MAX(CASE WHEN a.question_id = 23
        THEN c.choice_desc END) AS demographic_survey_area_of_industry_currently_employed_in
    ,MAX(CASE WHEN a.question_id = 24
        THEN c.choice_desc END) AS demographic_survey_english_proficiency
    ,RTRIM(STRING_AGG(CASE WHEN a.question_id = 25
        THEN c.choice_desc END, ';')) AS demographic_survey_other_languages_spoken
FROM demographics_answers a
-- USING already equates question_id and choice_id between the two tables,
-- so no further join predicates are needed in the WHERE clause.
-- NOTE(review): this is an inner join, so answers without a matching choice
-- row are dropped; questions 12 and 15 read a.answer_int -- confirm those
-- answer rows carry a choice_id.
JOIN demographics_choices c USING (question_id, choice_id)
WHERE a.question_id BETWEEN 11 AND 25
GROUP BY 1
61 |
--------------------------------------------------------------------------------
/courseraresearchexports/commands/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 | import sys
17 |
18 | import requests
19 |
20 |
def add_logging_parser(main_parser):
    """Add the logging-related options to *main_parser*.

    Sets `set_logging_level` as the parser's `setup_logging` default, adds
    the mutually exclusive `--verbose`/`-v` and `--quiet`/`-q` counting
    flags, and adds the `--silence-urllib3` switch.

    :param main_parser: the argparse parser to extend.
    :return verbosity_group: the mutually exclusive verbose/quiet group.
    """
    main_parser.set_defaults(setup_logging=set_logging_level)

    verbosity_group = main_parser.add_mutually_exclusive_group(required=False)
    counting_flags = (
        (('--verbose', '-v'),
         'Output more verbose logging. Can be specified multiple times.'),
        (('--quiet', '-q'),
         'Output less information to the console during operation. Can be '
         'specified multiple times.'),
    )
    for flag_names, help_text in counting_flags:
        verbosity_group.add_argument(
            *flag_names, action='count', help=help_text)

    urllib3_help = (
        'Silence urllib3 warnings. See '
        'https://urllib3.readthedocs.org/en/latest/security.html for details.')
    main_parser.add_argument(
        '--silence-urllib3', action='store_true', help=urllib3_help)

    return verbosity_group
46 |
47 |
def set_logging_level(args):
    """Computes and sets the logging level from the parsed arguments.

    The root logger defaults to INFO, with the `sqlalchemy.engine` and
    `requests.packages.urllib3` loggers held at WARNING. `verbose` raises the
    detail (1 -> DEBUG, 2+ -> level 5) and also applies that level to
    `sqlalchemy.engine`; `quiet` lowers it (1 -> WARNING, 2+ -> ERROR).
    `verbose` takes precedence when both are set.

    :param args: parsed arguments; `verbose`, `quiet` and `silence_urllib3`
        are read, and each may be absent.
    :raises SystemExit: with code 2 when `verbose` or `quiet` is set to a
        value that is not greater than zero.
    """
    trace_level = 5  # "Trace" level, below logging.DEBUG

    logging.basicConfig()
    sqlalchemy_logger = logging.getLogger('sqlalchemy.engine')
    urllib3_logger = logging.getLogger('requests.packages.urllib3')
    sqlalchemy_logger.setLevel(logging.WARNING)
    urllib3_logger.setLevel(logging.WARNING)

    level = logging.INFO
    verbose = getattr(args, 'verbose', None)
    quiet = getattr(args, 'quiet', None)

    if verbose is not None:
        urllib3_logger.setLevel(logging.NOTSET)  # Unset
        if verbose > 1:
            level = trace_level
        elif verbose > 0:
            level = logging.DEBUG
        else:
            logging.critical("verbose is an unexpected value. {} exiting."
                             .format(verbose))
            sys.exit(2)
        sqlalchemy_logger.setLevel(level)
    elif quiet is not None:
        if quiet > 1:
            level = logging.ERROR
        elif quiet > 0:
            level = logging.WARNING
        else:
            logging.critical("quiet is an unexpected value. {} exiting."
                             .format(quiet))
            # Exit as the message says, matching the verbose branch.
            sys.exit(2)

    logging.getLogger().setLevel(level)

    if getattr(args, 'silence_urllib3', False):
        # See: https://urllib3.readthedocs.org/en/latest/security.html
        requests.packages.urllib3.disable_warnings()
80 |
--------------------------------------------------------------------------------
/courseraresearchexports/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # PYTHON_ARGCOMPLETE_OK
4 |
5 | # Copyright 2016 Coursera
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | """
20 | Coursera's tools for interacting with research data exports.
21 |
22 | You may install it from source, or via pip.
23 | """
24 |
25 | import argcomplete
26 | import argparse
27 | import logging
28 | import sys
29 |
30 | from courseraresearchexports import commands
31 | from courseraresearchexports.commands import utils
32 |
33 |
34 | def build_parser():
35 | """
36 | Build an argparse argument parser to parse the command line.
37 | """
38 |
39 | parser = argparse.ArgumentParser(
40 | description="""Coursera tools for interacting with research exports.
41 | There are a number of subcommands, each with their own help
42 | documentation. Feel free to view them by executing `%(prog)s
43 | SUB_COMMAND -h`. For example: `%(prog)s jobs -h`.""",
44 | epilog="""Please file bugs on github at:
45 | https://github.com/coursera/courseraresearchexports/issues. If you
46 | would like to contribute to this tool's development, check us out at:
47 | https://github.com/coursera/courseraresarchexports""")
48 |
49 | utils.add_logging_parser(parser)
50 |
51 | # We have a number of subcommands. These subcommands have their own
52 | # subparsers. Each subcommand should set a default value for the 'func'
53 | # option. We then call the parsed 'func' function, and execution carries on
54 | # from there.
55 | subparsers = parser.add_subparsers()
56 |
57 | # create the parser for the version subcommand.
58 | commands.version.parser(subparsers)
59 |
60 | # create the parser for the jobs subcommand.
61 | commands.jobs.parser(subparsers)
62 |
63 | # create the parser for the containers subcommand.
64 | commands.containers.parser(subparsers)
65 |
66 | # create the parser for the db subcommand.
67 | commands.db.parser(subparsers)
68 |
69 | return parser
70 |
71 |
def main():
    """
    Boots up the command line tool

    Parses the command line, configures logging and dispatches to the
    function the selected subcommand registered under `func`.

    :return: whatever the subcommand function returns
    """
    logging.captureWarnings(True)
    parser = build_parser()

    argcomplete.autocomplete(parser)

    args = parser.parse_args()
    # Configure logging
    # NOTE(review): `setup_logging` is presumably installed on the namespace
    # by utils.add_logging_parser -- confirm.
    args.setup_logging(args)
    # Dispatch into the appropriate subcommand function.
    try:
        return args.func(args)
    except Exception:
        # SystemExit and KeyboardInterrupt do not derive from Exception, so
        # deliberate exits and Ctrl-C propagate instead of being reported as
        # a failed command.
        logging.exception('Problem when running command. Sorry!')
        sys.exit(1)
92 |
93 |
# Run the command line tool when this module is executed as a script.
if __name__ == "__main__":
    main()
96 |
--------------------------------------------------------------------------------
/courseraresearchexports/sql/enrollments.sql:
--------------------------------------------------------------------------------
1 | /*
2 | enrollments
3 | An enrollment is a unique learner-course pair. Many tables log a learner's
4 | interactions in a course, and this view will aggregate key metrics for simple
5 | reporting purposes.
6 |
7 | Columns
8 | coursera_user_id
9 | course_id
10 | commenced_dt
11 | is_enrollment_active
12 | activity_first_dt
13 | activity_last_dt
14 | num_days_active
15 | is_enrollment_completed
16 | completion_dt
17 | was_paid_or_finaid
18 | */
19 |
/*
Any user that reaches the LEARNER membership role in a course is considered a
commenced enrollment. This excludes users who pre-enroll in the course and
then unenroll before the course starts.

NOTE(review): the bracketed [partner_user_id] identifier looks like a
placeholder that is substituted before the query runs -- confirm.
*/
WITH enrollment_commenced AS (
    SELECT
        cm.[partner_user_id]
        ,course_id
        ,MIN(course_membership_ts)::DATE AS commenced_dt
    FROM course_memberships AS cm
    WHERE
        course_membership_role = 'LEARNER'
    GROUP BY 1,2
)

/*
Learners' progress on course items (e.g. lectures, quizzes, etc.) is
summarized in the course_progress table. Generate their "activity" metrics with
aggregate functions.
*/
,enrollment_progress AS (
    SELECT
        cp.[partner_user_id]
        ,course_id
        ,MIN(course_progress_ts)::DATE AS activity_first_dt
        ,MAX(course_progress_ts)::DATE AS activity_last_dt
        ,COUNT(DISTINCT course_progress_ts::DATE) AS num_days_active
    FROM course_progress AS cp -- contains 'started' or 'completed' progress
    GROUP BY 1,2
)

/*
Learners who complete the course are logged by reaching one of two passing
states in the course_grades table. Generate when they first pass.
*/
,enrollment_completed AS (
    SELECT
        cg.[partner_user_id]
        ,course_id
        ,MIN(course_grade_ts)::DATE AS completion_dt
    FROM course_grades AS cg -- contains when the learner reached the highest grade
    WHERE
        course_passing_state_id IN (1,2) -- 'passed' or 'verified passed' states
    GROUP BY 1,2
)

/*
Learners can own the course, either by payment or receiving financial aid.
*/
,enrollment_ownership AS (
    SELECT
        uccp.[partner_user_id]
        ,course_id
        ,was_payment OR was_finaid_grant AS was_paid_or_finaid
    FROM users_courses__certificate_payments AS uccp
)

/*
Combine all learner-course stats into one final table. Every commenced
enrollment is kept; the LEFT JOINs leave activity, completion and ownership
columns NULL (or FALSE after COALESCE) when no matching row exists.
*/
SELECT
    ec.[partner_user_id]
    ,course_id
    ,commenced_dt
    ,activity_first_dt IS NOT NULL AS is_enrollment_active
    ,activity_first_dt
    ,activity_last_dt
    ,num_days_active
    ,completion_dt IS NOT NULL AS is_enrollment_completed
    ,completion_dt
    ,COALESCE(was_paid_or_finaid, FALSE) AS was_paid_or_finaid
FROM enrollment_commenced AS ec
LEFT JOIN enrollment_progress
    USING ([partner_user_id], course_id)
LEFT JOIN enrollment_completed
    USING ([partner_user_id], course_id)
LEFT JOIN enrollment_ownership
    USING ([partner_user_id], course_id)
99 |
100 |
--------------------------------------------------------------------------------
/courseraresearchexports/exports/api.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Coursera's wrapper for data exports API.
17 | """
18 |
19 | import requests
20 | from courseraoauth2client import oauth2
21 | from courseraresearchexports.models.utils import requests_response_to_model
22 | from courseraresearchexports.constants.api_constants import \
23 | RESEARCH_EXPORTS_APP, RESEARCH_EXPORTS_API, CLICKSTREAM_API
24 | from courseraresearchexports.models.ExportRequestWithMetadata import \
25 | ExportRequestWithMetadata
26 |
27 |
@requests_response_to_model(ExportRequestWithMetadata.from_response)
def get(export_job_id):
    """
    Fetch one data export job from Coursera's Research Export Resource.

    The HTTP response returned here is converted by the decorator using
    ExportRequestWithMetadata.from_response.
    :param export_job_id: id of the export job, joined onto the API url
    :return export_request_with_metadata: [ExportRequestWithMetaData]
    """
    authorizer = oauth2.build_oauth2(
        app=RESEARCH_EXPORTS_APP).build_authorizer()
    job_url = requests.compat.urljoin(RESEARCH_EXPORTS_API, export_job_id)

    return requests.get(url=job_url, auth=authorizer)
42 |
43 |
@requests_response_to_model(ExportRequestWithMetadata.from_response)
def get_all():
    """
    List the data export job requests created by the authorized user via
    Coursera's Research Exports Resource (query `q=my`). Limited to the 100
    most recent requests.
    :return export_requests: [ExportRequestWithMetaData]
    """
    authorizer = oauth2.build_oauth2(
        app=RESEARCH_EXPORTS_APP).build_authorizer()
    query = {'q': 'my'}

    return requests.get(
        url=RESEARCH_EXPORTS_API,
        auth=authorizer,
        params=query)
58 |
59 |
@requests_response_to_model(ExportRequestWithMetadata.from_response)
def post(export_request):
    """
    Submit a new data export job; the request body is the json produced by
    export_request.to_json().
    :param export_request: request object providing to_json()
    :return export_request_with_metadata: [ExportRequestWithMetadata]
    """
    authorizer = oauth2.build_oauth2(
        app=RESEARCH_EXPORTS_APP).build_authorizer()
    payload = export_request.to_json()

    return requests.post(
        url=RESEARCH_EXPORTS_API,
        json=payload,
        auth=authorizer)
74 |
75 |
@requests_response_to_model(lambda response: response.json())
def get_clickstream_download_links(clickstream_download_links_request):
    """
    Request the download links for clickstream exports in a given scope.

    The decorator returns the decoded json body of the response.
    :param clickstream_download_links_request: ClickstreamDownloadLinksRequest
    """
    authorizer = oauth2.build_oauth2(
        app=RESEARCH_EXPORTS_APP).build_authorizer()
    url_params = clickstream_download_links_request.to_url_params()

    return requests.post(
        url=CLICKSTREAM_API,
        params=url_params,
        auth=authorizer)
89 |
--------------------------------------------------------------------------------
/courseraresearchexports/containers/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import argparse
16 | from io import BytesIO
17 | import logging
18 | import os
19 | import tarfile
20 | import time
21 | import zipfile
22 |
23 | from docker import Client
24 |
25 |
def extract_zip_archive(archive, dest, delete_archive=True):
    """
    Extracts a zip archive to `dest`
    :param archive: path of the zip archive to extract
    :param dest: folder the archive contents are extracted into
    :param delete_archive: delete the archive after extracting
    :return dest: the folder the contents were extracted into
    :raises: re-raises any error from opening, extracting or deleting the
        archive after logging it
    """
    try:
        logging.debug('Extracting archive to {}'.format(dest))
        with zipfile.ZipFile(archive, 'r') as z:
            z.extractall(dest)
        # Only reached when extraction succeeded, and after the ZipFile
        # handle has been closed.
        if delete_archive:
            os.remove(archive)
    except Exception:
        logging.error('Error in extracting zip archive {} to {}'.format(
            archive, dest))
        raise

    return dest
44 |
45 |
def create_tar_archive(str, name='init-user-db.sh'):
    """
    Build an in-memory tar archive holding a single file, following
    https://gist.github.com/zbyte64/6800eae10ce082bb78f0b7a2cca5cbc2

    :param str: text content of the file; stored utf8-encoded
    :param name: name of the file inside the archive
    :return: BytesIO with the archive bytes, rewound to the start
    """
    payload = str.encode('utf8')

    # Describe the single member: its size is required by addfile and the
    # modification time is set to "now".
    member = tarfile.TarInfo(name)
    member.size = len(payload)
    member.mtime = time.time()

    tar_buffer = BytesIO()
    tar_writer = tarfile.TarFile(fileobj=tar_buffer, mode='w')
    tar_writer.addfile(member, BytesIO(payload))
    tar_writer.close()

    # Rewind so the caller can read the archive from the beginning.
    tar_buffer.seek(0)
    return tar_buffer
64 |
65 |
def get_next_available_port(containers_info):
    """
    Pick the host port to map the next postgres container to: one above the
    highest `host_port` in use, or 5433 when there are no containers.
    :param containers_info: iterable of objects exposing `host_port`
    :return port:
    """
    used_ports = []
    for info in containers_info:
        used_ports.append(info.host_port)

    if not used_ports:
        return 5433

    return max(used_ports) + 1
75 |
76 |
def is_container_running(container_name, docker_client):
    """
    Report the `Running` flag from the container's inspected state.
    :param container_name: container to inspect
    :param docker_client: client providing inspect_container()
    :return isRunning: Boolean
    """
    state = docker_client.inspect_container(container_name)['State']

    return state['Running']
87 |
88 |
def docker_client_arg_parser():
    """Builds an argparse parser for docker client connection flags."""
    # The following flags are shared by the subcommands that talk to docker.
    # We centralize all these options here. add_help=False lets the parser
    # be used as a `parents=` entry without a clashing -h option.
    docker_parser = argparse.ArgumentParser(add_help=False)
    docker_parser.add_argument(
        '--docker-url',
        help='The url of the docker daemon.')
    docker_parser.add_argument(
        '--timeout',
        type=int,
        default=60,
        help='Set the default timeout when interacting with the docker daemon')
    return docker_parser
103 |
104 |
def docker_client(docker_url=None, timeout=60):
    """
    Attempts to create a docker client.

    - docker_url: base url for docker; left to the client's default when
      not given
    - timeout: timeout for docker client
    - returns: a docker-py client
    """
    client_kwargs = {'timeout': timeout, 'version': 'auto'}
    if docker_url:
        client_kwargs['base_url'] = docker_url

    return Client(**client_kwargs)
122 |
--------------------------------------------------------------------------------
/courseraresearchexports/models/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
import functools
import logging

import requests

from courseraresearchexports.constants.api_constants import \
    COURSE_API, PARTNER_API
21 |
22 |
def requests_response_to_model(response_transformer):
    """
    Creates a decorator that handles errors in the response from an API call
    and transforms a successful response with response_transformer
    :param response_transformer: function(response) -> Any
    :return: decorator for functions that return a requests response
    """
    def response_transform_decorator(original_func):
        """
        Creates wrapper around a function that returns response
        """
        # Keep the wrapped function's __name__ and __doc__ so decorated
        # functions stay identifiable in logs, help text and debuggers.
        @functools.wraps(original_func)
        def response_transformer_wrapper(*args, **kwargs):
            """
            Log errors and apply response_transformer to the response
            """
            # Bound up front so the handler below never reads an unbound
            # local when original_func itself raises HTTPError.
            response = None
            try:
                response = original_func(*args, **kwargs)
                response.raise_for_status()

            except requests.exceptions.HTTPError as err:
                # Compare with None explicitly: a requests Response is
                # falsy for error status codes.
                if response is None:
                    response = err.response
                if response is None:
                    logging.error('Request failed: {}'.format(err))
                    raise

                help_string = ('Please consult the Coursera Data '
                               'Exports Guide for further assistance: '
                               'https://partner.coursera.help/hc/en-us/articles/360021121132.')  # noqa

                if response.status_code == 403:
                    help_string = ('Please authorize this application '
                                   'by running:\n'
                                   '\t$ courseraoauth2client config authorize --app manage_research_exports\n'  # noqa
                                   'See https://github.com/coursera/courseraoauth2client '  # noqa
                                   'for more information on authorization.\n'
                                   'For further assistance, consult the '
                                   'Coursera Data Exports Guide '
                                   'https://partner.coursera.help/hc/en-us/articles/360021121132.')  # noqa

                logging.error(
                    'Request to {url} with body:\n\t{body}\nreceived response'
                    ':\n\t{text}\n'
                    '{help_string}\n'
                    .format(url=response.url,
                            text=response.text,
                            body=(response.request and response.request.body),
                            help_string=help_string))
                raise

            return response_transformer(response)
        return response_transformer_wrapper
    return response_transform_decorator
70 |
71 |
@requests_response_to_model(
    lambda resp: resp.json()['elements'][0]['slug'])
def lookup_course_slug_by_id(course_id):
    """
    Find the course slug given a course_id

    The decorator extracts `slug` from the first element of the json body.
    """
    course_url = requests.compat.urljoin(COURSE_API, course_id)
    return requests.get(course_url)
79 |
80 |
@requests_response_to_model(
    lambda resp: resp.json()['elements'][0]['id'])
def lookup_course_id_by_slug(course_slug):
    """
    Find the course_id given a course_slug

    The decorator extracts `id` from the first element of the json body.
    """
    return requests.get(
        COURSE_API,
        params={'q': 'slug', 'slug': course_slug})
89 |
90 |
@requests_response_to_model(
    lambda resp: int(resp.json()['elements'][0]['id']))
def lookup_partner_id_by_short_name(partner_short_name):
    """
    Find the partner_id by short name

    The decorator converts the `id` of the first element of the json body
    to an int.
    """
    return requests.get(
        PARTNER_API,
        params={'q': 'shortName', 'shortName': partner_short_name})
99 |
100 |
@requests_response_to_model(
    lambda resp: resp.json()['elements'][0]['shortName'])
def lookup_partner_short_name_by_id(partner_id):
    """
    Find the partner short name given a partner_id

    The decorator extracts `shortName` from the first element of the json
    body.
    """
    partner_url = requests.compat.urljoin(PARTNER_API, str(partner_id))
    return requests.get(partner_url)
108 |
--------------------------------------------------------------------------------
/courseraresearchexports/models/ExportDb.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import csv
16 |
17 | from sqlalchemy import create_engine
18 | from sqlalchemy.engine import reflection
19 |
20 | from courseraresearchexports.models.ContainerInfo import ContainerInfo
21 |
22 |
class ExportDb:
    """
    Wrapper around a SQLAlchemy engine connected to a postgres database
    that contains research export data.
    """
    def __init__(self, host_ip=None, host_port=None, db=None, **kwargs):
        """
        Create the engine for the given host and database.
        :param host_ip:
        :param host_port:
        :param db: database name
        :param kwargs: accepted and ignored
        :raises ValueError: if host_ip, host_port or db is missing/falsy
        """
        if not host_ip or not host_port or not db:
            raise ValueError(
                'Host IP, port and database name must be specified')

        self.host_ip = host_ip
        self.host_port = host_port
        self.db = db

        # Connects as the `postgres` user; no password is part of the url.
        connection_url = "postgresql://{user}@{host}:{port}/{db}".format(
            user='postgres',
            host=self.host_ip,
            port=self.host_port,
            db=self.db)
        self.engine = create_engine(connection_url)

    @classmethod
    def from_container(cls, container_name, docker_client):
        """
        Build an ExportDb from the connection details that ContainerInfo
        reports for `container_name`.
        :param container_name:
        :param docker_client:
        :return:
        """
        info = ContainerInfo.from_container(container_name, docker_client)
        return cls(
            host_ip=info.host_ip,
            host_port=info.host_port,
            db=info.database_name)

    def _inspector(self):
        """
        Fresh reflection inspector bound to this database's engine.
        """
        return reflection.Inspector.from_engine(self.engine)

    def create_view(self, name, sql_text):
        """
        Drops the view `name` if it exists, then creates it from the given
        select statement.
        :param name:
        :param sql_text:
        :return:
        """
        statement_template = """
        DROP VIEW IF EXISTS {name};
        CREATE VIEW {name} AS {sql_text};
        """

        self.engine.execute(
            statement_template.format(name=name, sql_text=sql_text))

    def unload(self, query, output_filename):
        """
        Runs a query and writes a header row plus every result row to a csv
        file.
        :param query:
        :param output_filename:
        :return rowcount: rowcount reported by the executed query
        """
        result = self.engine.execute(query)

        # Read before the rows are consumed below.
        rowcount = result.rowcount

        with open(output_filename, 'wb') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(result.keys())
            for record in result:
                cells = []
                for value in record:
                    # `unicode` is the Python 2 text type; such values are
                    # written as utf8 bytes.
                    if isinstance(value, unicode):
                        value = value.encode('utf8')
                    cells.append(value)
                writer.writerow(cells)

        return rowcount

    def unload_relation(self, relation, output_filename):
        """
        Unload every row of a table or view to a csv file.
        :param relation:
        :param output_filename:
        :return rowcount:
        """
        select_all = 'SELECT * FROM {relation};'.format(relation=relation)
        return self.unload(select_all, output_filename)

    def get_columns(self, table):
        """
        Names of all the columns in a table.
        :param table:
        :return columns:
        """
        column_names = []
        for column in self._inspector().get_columns(table):
            column_names.append(column['name'])
        return column_names

    @property
    def tables(self):
        """
        Names of all tables present on database.
        """
        return self._inspector().get_table_names()

    @property
    def views(self):
        """
        Names of all views present on database.
        """
        return self._inspector().get_view_names()
128 |
--------------------------------------------------------------------------------
/courseraresearchexports/exports/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 | import os
17 | from urlparse import urlparse
18 |
19 | from tqdm import tqdm
20 | import requests
21 |
22 | from courseraresearchexports.constants.api_constants import \
23 | EXPORT_TYPE_CLICKSTREAM, EXPORT_TYPE_TABLES
24 |
25 | from courseraresearchexports.exports import api
26 | from courseraresearchexports.models.ClickstreamDownloadLinksRequest import \
27 | ClickstreamDownloadLinksRequest
28 |
29 |
def download(export_request, dest):
    """
    Download the files of a data export request into `dest`, creating the
    folder if needed.

    Table exports use the request's single download link; clickstream
    exports fetch their links from the API first.
    :param export_request: request whose export_type selects the strategy
    :param dest: destination folder
    :return: list of the downloaded file paths
    :raises ValueError: on an unsupported export type or when no
        clickstream links are available
    """
    try:
        is_table_export = export_request.export_type == EXPORT_TYPE_TABLES
        is_clickstream_export = (
            export_request.export_type == EXPORT_TYPE_CLICKSTREAM)

        _validate(export_request)

        if not os.path.exists(dest):
            logging.info('Creating destination folder: {}'.format(dest))
            os.makedirs(dest)

        if is_table_export:
            return [download_url(export_request.download_link, dest)]

        if not is_clickstream_export:
            raise ValueError('Require export_type is one of {} or {}'.format(
                EXPORT_TYPE_TABLES,
                EXPORT_TYPE_CLICKSTREAM))

        links_request = ClickstreamDownloadLinksRequest.from_args(
            course_id=export_request.course_id,
            partner_id=export_request.partner_id,
            interval=export_request.interval)
        download_links = api.get_clickstream_download_links(links_request)
        if len(download_links) == 0:
            raise ValueError(
                'Clickstream download links not found. This typically '
                'means no data was available for the dates in '
                'the specified interval: {interval}'
                .format(interval=export_request.interval))

        downloaded_files = []
        for link in download_links:
            downloaded_files.append(download_url(link, dest))
        return downloaded_files

    except Exception as err:
        # Log every failure once here, then let the caller handle it.
        logging.error('Download failed.\n{err}'.format(err=err))
        raise
68 |
69 |
def download_url(url, dest_folder):
    """
    Download url to dest_folder/FILENAME, where FILENAME is the last
    part of the url path.

    :param url: location of the file to download
    :param dest_folder: existing folder to write the file into
    :return full_filename: path of the downloaded file
    :raises requests.exceptions.HTTPError: if the server answers with an
        error status
    """
    filename = urlparse(url).path.split('/')[-1]
    full_filename = os.path.join(dest_folder, filename)
    response = requests.get(url, stream=True)
    # Fail before opening the file so an HTTP error page is never saved as
    # if it were the export.
    response.raise_for_status()
    chunk_size = 1024 * 1024

    # Content-Length can be absent (e.g. chunked responses); with
    # total=None tqdm still counts chunks, just without a known total.
    content_length = response.headers.get('Content-length')
    if content_length is None:
        total_chunks = None
    else:
        # Integer ceiling division: a trailing partial chunk counts too.
        total_chunks = (int(content_length) + chunk_size - 1) // chunk_size

    logging.debug('Writing to file: {}'.format(full_filename))

    with open(full_filename, 'wb') as f:
        for data in tqdm(
                iterable=response.iter_content(chunk_size),
                total=total_chunks,
                unit='MB',
                desc=filename):
            f.write(data)
    return full_filename
89 |
90 |
def _validate(export_request):
    """
    Check that an export request can be downloaded.

    A request with a download link always passes. Without one, only
    clickstream exports that are neither pending, in progress nor
    terminated pass.
    :param export_request: request exposing export_type, download_link,
        status and id
    :raises ValueError: if the request is not ready, was terminated, or
        has no download link
    """
    is_clickstream_export = \
        export_request.export_type == EXPORT_TYPE_CLICKSTREAM

    if not export_request.download_link:
        if export_request.status in ['PENDING', 'IN_PROGRESS']:
            logging.error(
                'Export request {} is currently {} and is not ready for '
                'download. Please wait until the request is completed.'
                .format(export_request.id, export_request.status))
            raise ValueError(
                'Export request is not yet ready for download')
        elif export_request.status == 'TERMINATED':
            logging.error(
                'Export request has been TERMINATED. Please contact '
                'data-support@coursera.org if we have not resolved this '
                'within 24 hours.')
            raise ValueError('Export request has been TERMINATED')
        elif is_clickstream_export:
            # We don't fill in download links for clickstream exports
            pass
        else:
            logging.error('Download link was not found.')
            raise ValueError('Download link was not found')
115 |
--------------------------------------------------------------------------------
/courseraresearchexports/commands/db.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import print_function
16 |
17 | import logging
18 |
19 | from tabulate import tabulate
20 |
21 | import courseraresearchexports.db.db as db
22 | from courseraresearchexports.containers import utils
23 |
24 |
def connect(args):
    """
    Connect postgres shell to dockerized database.
    """
    # The docstring above doubles as the subcommand's help text.
    client = utils.docker_client(args.docker_url, args.timeout)
    db.connect(args.container_name, docker_client=client)
31 |
32 |
def list_tables(args):
    """
    List all of the tables present in a dockerized database.
    """
    # The docstring above doubles as the subcommand's help text.
    client = utils.docker_client(args.docker_url, args.timeout)
    table_names = db.get_table_names(
        args.container_name, docker_client=client)
    # One single-column row per table name.
    rows = [[table_name] for table_name in table_names]
    print(tabulate(rows))
40 |
41 |
def list_views(args):
    """
    List all of the views present in a dockerized database.
    """
    # The docstring above doubles as the subcommand's help text.
    client = utils.docker_client(args.docker_url, args.timeout)
    view_names = db.get_view_names(
        args.container_name, docker_client=client)
    # One single-column row per view name.
    rows = [[view_name] for view_name in view_names]
    print(tabulate(rows))
49 |
50 |
def create_view(args):
    """
    Create a view from a sql query.
    """
    # The docstring above doubles as the subcommand's help text.
    d = utils.docker_client(args.docker_url, args.timeout)

    if args.view_name:
        created_view = db.create_registered_view(
            args.container_name, args.view_name, d)
    elif args.sql_file:
        created_view = db.create_view_from_file(
            args.container_name, args.sql_file, d)
    else:
        # Reachable when an option is passed with an empty value; fail
        # clearly instead of with an UnboundLocalError below.
        raise ValueError(
            'Either a non-empty --view_name or --sql_file is required')

    logging.info('Created view {}'.format(created_view))
65 |
66 |
def unload_relation(args):
    """
    Unload a table or view to a CSV file.
    """
    # The docstring above doubles as the subcommand's help text.
    client = utils.docker_client(args.docker_url, args.timeout)
    unloaded_rows = db.unload_relation(
        args.container_name, args.dest, args.relation, client)

    logging.info('Unloaded {} rows'.format(unloaded_rows))
76 |
77 |
def parser(subparsers):
    """Build an argparse argument parser to parse the command line."""

    # create the parser for the db subcommand.
    parser_db = subparsers.add_parser(
        'db',
        help='Tools for interacting with dockerized database',
        parents=[utils.docker_client_arg_parser()])

    db_subparsers = parser_db.add_subparsers()

    def add_command(name, handler):
        """
        Register a db subcommand dispatching to `handler`; its docstring is
        the help text and it takes a `container_name` positional.
        """
        command_parser = db_subparsers.add_parser(name, help=handler.__doc__)
        command_parser.set_defaults(func=handler)
        command_parser.add_argument(
            'container_name',
            help='Name of the container database.')
        return command_parser

    add_command('list_tables', list_tables)
    add_command('list_views', list_views)

    # create_view takes exactly one source for the view definition.
    parser_create_view = add_command('create_view', create_view)
    view_source_group = parser_create_view.add_mutually_exclusive_group(
        required=True)
    view_source_group.add_argument(
        '--view_name',
        help='Name of view')
    view_source_group.add_argument(
        '--sql_file',
        help='SQL file with query.')

    parser_unload = add_command('unload_to_csv', unload_relation)
    parser_unload.add_argument(
        '--dest',
        help='Destination folder.')
    parser_unload.add_argument(
        '--relation',
        help='Table or view to export.')

    add_command('connect', connect)

    return parser_db
144 |
--------------------------------------------------------------------------------
/tests/models/export_request_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright 2016 Coursera
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from courseraresearchexports.constants.api_constants import SCHEMA_NAMES, \
18 | EXPORT_TYPE_TABLES, EXPORT_TYPE_CLICKSTREAM, EXPORT_TYPE_GRADEBOOK
19 | from courseraresearchexports.models.ExportRequest import ExportRequest
20 | from courseraresearchexports.models.ExportRequestWithMetadata import \
21 | ExportRequestWithMetadata
22 | from mock import patch
23 | from nose.tools import raises
24 |
# Shared fixture values for the tests in this module.
fake_course_id = 'fake_course_id'
fake_course_slug = 'fake_course'
fake_partner_id = 1
# Non-integer partner id, used to exercise the ValueError path.
bad_partner_id = 'bad_partner_id'
fake_partner_short_name = 'fake_partner'
fake_export_id = '1'
31 |
32 |
def test_export_request_serialize_to_json():
    # A course-scoped request serializes to a `courseContext` scope.
    serialized = ExportRequest(course_id=fake_course_id).to_json()

    assert serialized == {
        'scope': {
            'typeName': 'courseContext',
            'definition': {
                'courseId': fake_course_id}}}
42 |
43 |
def test_export_request_deserialize_from_json():
    # A `courseContext` scope deserializes to a course-scoped request.
    course_scope = {
        'typeName': 'courseContext',
        'definition': {
            'courseId': fake_course_id}}
    parsed_request = ExportRequest.from_json({'scope': course_scope})

    assert ExportRequest(course_id=fake_course_id) == parsed_request
53 |
54 |
def test_create_from_args():
    # from_args with only a course id equals the direct constructor call.
    built_request = ExportRequest.from_args(course_id=fake_course_id)

    assert ExportRequest(course_id=fake_course_id) == built_request
58 |
59 |
@raises(ValueError)
def test_create_from_args_non_integer_partner_id():
    # The call itself must raise; its result is never needed.
    ExportRequest.from_args(partner_id=bad_partner_id)
63 |
64 |
@patch('courseraresearchexports.models.utils.lookup_course_id_by_slug')
def test_course_id_inference(lookup_course_id_by_slug):
    # With the slug lookup mocked to return the fake course id, a request
    # built from a slug must equal one built from that id directly.
    lookup_course_id_by_slug.return_value = fake_course_id

    inferred_request = ExportRequest.from_args(course_slug=fake_course_slug)
    expected_request = ExportRequest(course_id=fake_course_id)

    assert expected_request == inferred_request
71 |
72 |
@patch('courseraresearchexports.models.utils.'
       'lookup_partner_id_by_short_name')
def test_partner_id_inference(lookup_partner_id_by_short_name):
    # With the short-name lookup mocked to return the fake partner id, a
    # request built from a short name must equal one built from that id.
    lookup_partner_id_by_short_name.return_value = fake_partner_id

    inferred_request = ExportRequest.from_args(
        partner_short_name=fake_partner_short_name)
    expected_request = ExportRequest(partner_id=fake_partner_id)

    assert expected_request == inferred_request
81 |
82 |
def test_scope_id():
    # For a course-scoped request the scope id is the course id itself.
    course_request = ExportRequest(course_id=fake_course_id)
    observed_scope_id = course_request.scope_id

    assert observed_scope_id == fake_course_id
87 |
88 |
def test_schemas():
    # schema_names_display must be None for clickstream and gradebook
    # exports, and 'all' for a tables export requesting every schema.
    def course_request(export_type, **extra):
        return ExportRequest(
            course_id=fake_course_id, export_type=export_type, **extra)

    eventing_request = course_request(EXPORT_TYPE_CLICKSTREAM)
    gradebook_request = course_request(EXPORT_TYPE_GRADEBOOK)
    all_tables_request = course_request(
        EXPORT_TYPE_TABLES, schema_names=SCHEMA_NAMES)

    for request_without_schemas in (eventing_request, gradebook_request):
        assert request_without_schemas.schema_names_display is None
    assert all_tables_request.schema_names_display == 'all'
101 |
102 |
def test_export_request_with_metadata_from_export_request():
    # Wrapping a plain request with metadata must preserve its course id.
    plain_request = ExportRequest.from_args(course_id=fake_course_id)
    wrapped_request = ExportRequestWithMetadata.from_export_request(
        plain_request, id=fake_export_id)

    assert plain_request.course_id == wrapped_request.course_id
110 |
111 |
def test_export_request_with_metadata_serialize_to_json():
    # Serialization must contain the course scope plus the request id.
    course_scope = {
        'typeName': 'courseContext',
        'definition': {'courseId': fake_course_id},
    }
    request_with_id = ExportRequestWithMetadata(
        course_id=fake_course_id, id=fake_export_id)

    serialized = request_with_id.to_json()

    assert serialized == {'scope': course_scope, 'id': fake_export_id}
123 |
124 |
def test_export_request_with_metadata_deserialize_from_json():
    # Parsing a scope plus id must equal the directly constructed object.
    course_scope = {
        'typeName': 'courseContext',
        'definition': {'courseId': fake_course_id},
    }
    payload = {'scope': course_scope, 'id': fake_export_id}

    parsed_request = ExportRequestWithMetadata.from_json(payload)
    expected_request = ExportRequestWithMetadata(
        course_id=fake_course_id, id=fake_export_id)

    assert parsed_request == expected_request
136 |
--------------------------------------------------------------------------------
/courseraresearchexports/db/db.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | import logging
17 | import pkg_resources
18 | import subprocess
19 |
20 | from courseraresearchexports.constants.container_constants import \
21 | POSTGRES_DOCKER_IMAGE
22 | from courseraresearchexports.models.ContainerInfo import ContainerInfo
23 | from courseraresearchexports.models.ExportDb import ExportDb
24 | from courseraresearchexports.constants.db_constants import \
25 | HASHED_USER_ID_COLUMN_TO_SOURCE_TABLE
26 |
27 |
def replace_user_id_placeholders(export_db, sql_text):
    """
    Substitute every inferred user_id placeholder found in a SQL string
    with the column name inferred for it from the export database.
    :param export_db: database handle given to infer_hashed_user_id_columns
    :param sql_text: SQL text that may contain placeholders
    :return sql_text_with_inferred_columns:
    """
    substitutions = infer_hashed_user_id_columns(export_db)

    resolved_sql = sql_text
    for placeholder in substitutions:
        resolved_sql = resolved_sql.replace(
            placeholder, substitutions[placeholder])

    return resolved_sql
41 |
42 |
def infer_hashed_user_id_columns(export_db):
    """
    Build a mapping from each known placeholder to the user_id column
    found in its source table, for the tables present in the database.
    Placeholders whose table is absent, or whose table has no matching
    column, are left out of the result.
    :param export_db: object exposing `tables` and `get_columns(table)`
    :return: dict of placeholder -> inferred column name
    """
    inferred = {}

    for placeholder, source_table in \
            HASHED_USER_ID_COLUMN_TO_SOURCE_TABLE.items():
        if source_table not in export_db.tables:
            continue

        user_id_column = infer_user_id_column(
            export_db.get_columns(source_table))
        if user_id_column:
            inferred[placeholder] = user_id_column

    return inferred
59 |
60 |
def infer_user_id_column(columns):
    """
    Pick the first column whose name ends with 'user_id'.
    :param columns: iterable of column names
    :return: the first matching column name, or None if there is none
    """
    for candidate in columns:
        if candidate.endswith('user_id'):
            return candidate
    return None
69 |
70 |
def connect(container_name, docker_client):
    """
    Open an interactive psql shell against the database of a container by
    running a throwaway, linked postgres container through the docker CLI.
    :param container_name:
    :param docker_client:
    """
    info = ContainerInfo.from_container(container_name, docker_client)

    # Interactive, auto-removed helper container linked to the target.
    docker_command = [
        'docker', 'run', '-it', '--rm',
        '--link', info.name,
        POSTGRES_DOCKER_IMAGE,
    ]
    # psql reaches the target through the link, using its name as host.
    psql_command = [
        'psql',
        '-h', info.name,
        '-d', info.database_name,
        '-U', 'postgres',
    ]

    subprocess.call(docker_command + psql_command, shell=False)
88 |
89 |
def get_table_names(container_name, docker_client):
    """
    List the tables of the database running inside a container.
    :param container_name:
    :param docker_client:
    :return table_names:
    """
    return ExportDb.from_container(container_name, docker_client).tables
100 |
101 |
def get_view_names(container_name, docker_client):
    """
    List the views of the database running inside a container.
    :param container_name:
    :param docker_client:
    :return view_names:
    """
    return ExportDb.from_container(container_name, docker_client).views
112 |
113 |
def unload_relation(container_name, dest, relation, docker_client):
    """
    Dump a table or view to `<dest>/<relation>.csv`, creating the
    destination folder first when it does not exist.
    :param container_name:
    :param dest: folder that receives the csv file
    :param relation: name of the table or view to unload
    :param docker_client:
    :return rowcount: value returned by ExportDb.unload_relation
    """
    destination_missing = not os.path.exists(dest)
    if destination_missing:
        logging.debug('Creating destination folder: {}'.format(dest))
        os.makedirs(dest)

    database = ExportDb.from_container(container_name, docker_client)
    csv_path = os.path.join(dest, '{}.csv'.format(relation))

    return database.unload_relation(relation, csv_path)
131 |
132 |
def create_registered_view(container_name, view_name, docker_client):
    """
    Create a prepackaged view from the `sql/<view_name>.sql` resource that
    ships with this package.
    :param container_name:
    :param view_name: name of the bundled sql file (without extension);
        also used as the name of the created view
    :param docker_client:
    :return view_name:
    """
    export_db = ExportDb.from_container(container_name, docker_client)

    # The resource is looked up relative to the top-level package.
    sql_text = pkg_resources.resource_string(
        __name__.split('.')[0], 'sql/{}.sql'.format(view_name))

    # resource_string returns raw bytes. On Python 3 those cannot be mixed
    # with the text placeholders/column names used by str.replace below, so
    # decode to text first. On Python 2 the value is already `str` and is
    # left untouched.
    if not isinstance(sql_text, str):
        sql_text = sql_text.decode('utf-8')

    sql_text_with_inferred_columns = replace_user_id_placeholders(
        export_db, sql_text)

    export_db.create_view(view_name, sql_text_with_inferred_columns)

    return view_name
152 |
153 |
def create_view_from_file(container_name, sql_file, docker_client):
    """
    Create a view from the contents of a sql file. The view is named after
    the file's base name with its extension removed.
    :param container_name:
    :param sql_file: path to a file containing the view's SQL
    :param docker_client:
    :return view_name:
    """
    database = ExportDb.from_container(container_name, docker_client)

    with open(sql_file, 'r') as sql_handle:
        raw_sql = sql_handle.read()

    file_name = os.path.basename(sql_file)
    view_name, _extension = os.path.splitext(file_name)

    database.create_view(
        view_name, replace_user_id_placeholders(database, raw_sql))

    return view_name
176 |
--------------------------------------------------------------------------------
/courseraresearchexports/commands/containers.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import print_function
16 |
17 | import logging
18 |
19 | from tabulate import tabulate
20 |
21 | from courseraresearchexports.containers import client
22 | from courseraresearchexports.containers import utils
23 |
24 |
def create_container(args):
    """
    Create a container containing a postgres database using an export job id.
    Export job will be downloaded and loaded into dockerized database.
    Automatically starts container.
    """
    # NOTE: the docstring above is also used verbatim as the CLI help and
    # description text for `containers create`, so edit it with care.
    d = utils.docker_client(args.docker_url, args.timeout)

    # Forward only the names the user actually supplied so that the
    # defaults of the client functions apply otherwise.
    kwargs = {}
    if args.container_name:
        kwargs['container_name'] = args.container_name
    if args.database_name:
        kwargs['database_name'] = args.database_name

    if args.export_request_id:
        container_id = client.create_from_export_request_id(
            args.export_request_id, docker_client=d, **kwargs)
    elif args.export_data_folder:
        container_id = client.create_from_folder(
            args.export_data_folder, docker_client=d, **kwargs)
    else:
        # Reached when both sources are empty (e.g. an empty string was
        # passed on the command line). Fail with a clear message instead of
        # an UnboundLocalError on `container_id` below.
        raise ValueError(
            'Either --export_request_id or --export_data_folder '
            'must be provided.')

    logging.info('Container {:.12} ready.'.format(container_id))
47 |
48 |
def list_containers(args):
    """
    List docker containers created with Coursera data exports.
    """
    # NOTE: the docstring above is also used as the CLI help text.
    d = utils.docker_client(args.docker_url, args.timeout)
    containers_info = client.list_all(docker_client=d)

    # Print nothing at all when there are no containers to show.
    if not containers_info:
        return

    header_row = ['Name', 'Container Id', 'Database',
                  'Created', 'Status', 'Host IP', 'Port']
    container_rows = [
        [
            info.name,
            info.short_id,
            info.database_name,
            info.creation_time.strftime('%c'),
            info.status,
            info.host_ip,
            info.host_port,
        ]
        for info in containers_info
    ]

    print(tabulate([header_row] + container_rows, headers='firstrow'))
72 |
73 |
def start_container(args):
    """
    Start a docker container.
    """
    # NOTE: the docstring above is also used as the CLI help text.
    docker = utils.docker_client(args.docker_url, args.timeout)
    client.start(args.container_name, docker_client=docker)
80 |
81 |
def stop_container(args):
    """
    Stop a docker container.
    """
    # NOTE: the docstring above is also used as the CLI help text.
    docker = utils.docker_client(args.docker_url, args.timeout)
    client.stop(args.container_name, docker_client=docker)
88 |
89 |
def remove_container(args):
    """
    Remove a docker container, stop the container
    before removing.
    """
    # NOTE: the docstring above is also used as the CLI help text.
    docker = utils.docker_client(args.docker_url, args.timeout)
    client.remove(args.container_name, docker_client=docker)
97 |
98 |
def parser(subparsers):
    """
    Register the `containers` command and its `create`, `list`, `stop`,
    `start` and `remove` sub-commands.
    :param subparsers: sub-parser collection to add the command to (the
        object returned by `ArgumentParser.add_subparsers()`)
    :return parser_containers: the parser created for `containers`
    """
    # Adjacent string literals are concatenated without any separator, so
    # every fragment except the last must end with an explicit space.
    parser_containers = subparsers.add_parser(
        'containers',
        help='Create docker container from export jobs',
        description='Command line tools for creating a docker container '
        'containing the results of a research export. Please first '
        'authenticate with the OAuth2 client before making requests ('
        'courseraoauth2client config authorize --app manage-research-exports)',
        epilog='Please file bugs on github at: '
        'https://github.com/coursera/courseraresearchexports/issues. If you '
        'would like to contribute to this tool\'s development, check us out '
        'at: https://github.com/coursera/courseraresearchexports',
        parents=[utils.docker_client_arg_parser()])

    containers_subparsers = parser_containers.add_subparsers()

    # containers create
    parser_create = containers_subparsers.add_parser(
        'create',
        help=create_container.__doc__,
        description=create_container.__doc__)
    parser_create.set_defaults(func=create_container)

    # Exactly one data source has to be given: an export request to
    # download, or a folder holding an already downloaded export.
    source_subparser = parser_create.add_mutually_exclusive_group(
        required=True)

    source_subparser.add_argument(
        '--export_request_id',
        help='Export job to download and create containers')
    source_subparser.add_argument(
        '--export_data_folder',
        help='Location of already downloaded export data')

    parser_create.add_argument(
        '--container_name',
        help='Name for docker container.')
    parser_create.add_argument(
        '--database_name',
        help='Name for database inside container.')

    # containers list
    parser_list = containers_subparsers.add_parser(
        'list',
        help=list_containers.__doc__)
    parser_list.set_defaults(func=list_containers)

    # containers stop
    parser_stop = containers_subparsers.add_parser(
        'stop',
        help=stop_container.__doc__)
    parser_stop.add_argument(
        'container_name',
        help='Name of the container to stop.')
    parser_stop.set_defaults(func=stop_container)

    # containers start
    parser_start = containers_subparsers.add_parser(
        'start',
        help=start_container.__doc__)
    parser_start.add_argument(
        'container_name',
        help='Name of the container to start.')
    parser_start.set_defaults(func=start_container)

    # containers remove
    parser_remove = containers_subparsers.add_parser(
        'remove',
        help=remove_container.__doc__)
    parser_remove.add_argument(
        'container_name',
        help='Name of the container to remove.')
    parser_remove.set_defaults(func=remove_container)

    return parser_containers
168 |
--------------------------------------------------------------------------------
/courseraresearchexports/models/ExportRequestWithMetadata.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from datetime import datetime
16 | import time
17 |
18 | from courseraresearchexports.models.ExportRequest import ExportRequest
19 |
20 |
class ExportRequestMetadata:
    """
    Bookkeeping attached to an export request: who created it and the
    moments it was created, started, completed and snapshotted.
    """

    def __init__(self, created_by=None, created_at=None, started_at=None,
                 completed_at=None, snapshot_at=None, **kwargs):
        # Any additional keyword arguments are accepted and ignored.
        self._created_by = created_by
        self._created_at = created_at
        self._started_at = started_at
        self._completed_at = completed_at
        self._snapshot_at = snapshot_at

    def to_json(self):
        """
        Serialize metadata to a json object. Unset (falsy) fields are
        omitted; datetimes are emitted as unix timestamps in milliseconds.
        :return json_metadata:
        """
        json_metadata = {}
        if self._created_by:
            json_metadata['createdBy'] = self._created_by

        timestamp_fields = (
            ('createdAt', self._created_at),
            ('startedAt', self._started_at),
            ('completedAt', self._completed_at),
            ('snapshotAt', self._snapshot_at),
        )
        for json_key, moment in timestamp_fields:
            if moment:
                json_metadata[json_key] = datetime_to_unix_ms(moment)

        return json_metadata

    @classmethod
    def from_json(cls, json_metadata):
        """
        Deserialize ExportRequestMetadata from a json object.
        :param json_metadata: dict using the camelCase keys of to_json
        :return export_request_metadata: ExportRequestMetadata, or None
            when json_metadata is empty or None
        """
        if not json_metadata:
            return None

        kwargs = {}
        created_by = json_metadata.get('createdBy')
        if created_by:
            kwargs['created_by'] = created_by

        # Millisecond timestamps are converted back to datetime objects.
        timestamp_keys = (
            ('createdAt', 'created_at'),
            ('completedAt', 'completed_at'),
            ('startedAt', 'started_at'),
            ('snapshotAt', 'snapshot_at'),
        )
        for json_key, argument_name in timestamp_keys:
            unix_ms = json_metadata.get(json_key)
            if unix_ms:
                kwargs[argument_name] = unix_ms_to_datetime(unix_ms)

        return cls(**kwargs)
80 |
81 |
class ExportRequestWithMetadata(ExportRequest):
    """
    An export request from Coursera's research data export service
    together with what is known about its processing: its id, status,
    download link and timing metadata.
    """

    def __init__(self, course_id=None, partner_id=None, group_id=None,
                 export_type=None, anonymity_level=None,
                 statement_of_purpose=None, schema_names=None,
                 interval=None, ignore_existing=None, id=None,
                 status=None, download_link=None, metadata=None, **kwargs):
        # Additional keyword arguments are accepted but not forwarded.
        request_fields = dict(
            course_id=course_id,
            partner_id=partner_id,
            group_id=group_id,
            export_type=export_type,
            anonymity_level=anonymity_level,
            statement_of_purpose=statement_of_purpose,
            schema_names=schema_names,
            interval=interval,
            ignore_existing=ignore_existing)
        ExportRequest.__init__(self, **request_fields)

        self._id = id
        self._status = status
        self._download_link = download_link
        self._metadata = metadata

    def to_json(self):
        """
        Serialize ExportRequestWithMetadata to a json object: the parent
        serialization extended with every status field that is set.
        :return json_request:
        """
        json_request = ExportRequest.to_json(self)

        simple_fields = (
            ('id', self._id),
            ('status', self._status),
            ('downloadLink', self._download_link),
        )
        for json_key, value in simple_fields:
            if value:
                json_request[json_key] = value

        if self._metadata:
            json_request['metadata'] = self._metadata.to_json()

        return json_request

    @classmethod
    def from_export_request(cls, export_request, id=None, status=None,
                            download_link=None, metadata=None, **kwargs):
        """
        Build an ExportRequestWithMetadata by copying the fields of an
        ExportRequest and attaching the given status information.
        :param export_request: ExportRequest, parent object
        :param id:
        :param status:
        :param download_link:
        :param metadata:
        :param kwargs: accepted but unused
        :return export_request_with_metadata: ExportRequestWithMetadata
        """
        copied_fields = {
            'course_id': export_request._course_id,
            'partner_id': export_request._partner_id,
            'group_id': export_request._group_id,
            'export_type': export_request._export_type,
            'anonymity_level': export_request._anonymity_level,
            'statement_of_purpose': export_request._statement_of_purpose,
            'schema_names': export_request._schema_names,
            'interval': export_request._interval,
            'ignore_existing': export_request._ignore_existing,
        }
        return cls(id=id, status=status, download_link=download_link,
                   metadata=metadata, **copied_fields)

    @classmethod
    def from_json(cls, json_request):
        """
        Deserialize ExportRequestWithMetadata from a json object.
        :param json_request:
        :return export_request: ExportRequestWithMetadata
        """
        base_request = ExportRequest.from_json(json_request)
        parsed_metadata = ExportRequestMetadata.from_json(
            json_request.get('metadata'))

        return cls.from_export_request(
            export_request=base_request,
            id=json_request.get('id'),
            status=json_request.get('status'),
            download_link=json_request.get('downloadLink'),
            metadata=parsed_metadata)

    @classmethod
    def from_response(cls, response):
        """
        Instantiate a list of ExportRequestWithMetadata objects from the
        'elements' of an API call response.
        :param response: object whose json() returns a dict with 'elements'
        :return export_request_with_metadata_list: [ExportRequestWithMetadata]
        """
        export_requests = []
        for element in response.json()['elements']:
            export_requests.append(cls.from_json(element))
        return export_requests

    @property
    def id(self):
        return self._id

    @property
    def status(self):
        return self._status

    @property
    def download_link(self):
        return self._download_link

    @property
    def metadata(self):
        return self._metadata

    @property
    def created_at(self):
        # Falls back to the unix epoch when no creation time is known.
        metadata = self._metadata
        if metadata and metadata._created_at:
            return metadata._created_at
        return datetime.fromtimestamp(0)
202 |
203 |
def datetime_to_unix_ms(dt):
    """
    Convert datetime object to timestamp in milliseconds.
    :param dt: naive datetime, interpreted in local time by time.mktime
    :return: integer number of milliseconds since the unix epoch
    """
    # timetuple() has whole-second resolution, so the sub-second part has to
    # be added back from dt.microsecond. Without it the millisecond
    # component is always lost and a round trip through
    # unix_ms_to_datetime does not return the original value.
    whole_seconds = int(time.mktime(dt.timetuple()))
    return whole_seconds * 1000 + dt.microsecond // 1000
207 |
208 |
def unix_ms_to_datetime(unix_ms):
    """
    Convert timestamp in milliseconds to datetime object.
    :param unix_ms: milliseconds since the unix epoch
    :return: naive local datetime as produced by datetime.fromtimestamp
    """
    seconds_since_epoch = unix_ms / 1000.0
    return datetime.fromtimestamp(seconds_since_epoch)
212 |
--------------------------------------------------------------------------------
/courseraresearchexports/containers/client.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Coursera's tools for managing docker containers configured with a
17 | postgres database.
18 | """
19 |
20 | import logging
21 | import os
22 | import shutil
23 | import time
24 |
25 | from courseraresearchexports import exports
26 | from courseraresearchexports.constants.api_constants import \
27 | EXPORT_TYPE_TABLES
28 | from courseraresearchexports.constants.container_constants import \
29 | COURSERA_DOCKER_LABEL, COURSERA_LOCAL_FOLDER, POSTGRES_DOCKER_IMAGE, \
30 | POSTGRES_INIT_MSG, POSTGRES_READY_MSG
31 | from courseraresearchexports.containers import utils as container_utils
32 | from courseraresearchexports.exports import utils as export_utils
33 | from courseraresearchexports.models.ContainerInfo import ContainerInfo
34 |
35 |
def list_all(docker_client):
    """
    Collect a ContainerInfo for every container carrying the Coursera
    label, whether it is running or not.
    :param docker_client:
    :return containers_info: [ContainerInfo]
    """
    labelled_containers = docker_client.containers(
        all=True, filters={'label': COURSERA_DOCKER_LABEL})

    containers_info = []
    for container in labelled_containers:
        containers_info.append(
            ContainerInfo.from_container(container['Id'], docker_client))
    return containers_info
45 |
46 |
def start(container_name, docker_client):
    """
    Start a docker container containing a research export database. Waits
    until the postgres ready message shows up in the container logs,
    polling every 10 seconds.
    :param container_name:
    :param docker_client:
    :raises RuntimeError: if the container stops running while waiting
    """
    try:
        logging.debug('Starting container {}...'.format(container_name))
        docker_client.start(container_name)

        # poll logs to see if database is ready to accept connections
        while True:
            recent_logs = docker_client.logs(container_name, tail=4)
            if POSTGRES_READY_MSG in recent_logs:
                break

            logging.debug('Polling container for database connection...')
            still_running = container_utils.is_container_running(
                container_name, docker_client)
            if not still_running:
                raise RuntimeError('Container failed to start.')

            time.sleep(10)

        logging.info('Started container {}.'.format(container_name))

    except:
        # Deliberately catches everything: the failure is only logged with
        # the tail of the container log and then re-raised unchanged.
        logging.error(
            """Container failed to start, check log for errors:\n{}"""
            .format(docker_client.logs(container_name, tail=20)))
        raise
73 |
74 |
def stop(container_name, docker_client):
    """
    Stop a docker container through the given docker client.
    :param container_name:
    :param docker_client:
    """
    docker_client.stop(container_name)
80 |
81 |
def remove(container_name, docker_client):
    """
    Remove a container through the given docker client. The container is
    not stopped here first.
    :param container_name:
    :param docker_client:
    """
    docker_client.remove_container(container_name)
87 |
88 |
def initialize(container_name, docker_client):
    """
    Start a docker container for the first time and wait for its
    entrypoint tasks to complete, by polling the container logs every
    10 seconds for the postgres initialization message.
    :param container_name:
    :param docker_client:
    :raises RuntimeError: if the container stops running while waiting
    """
    try:
        logging.info('Initializing container {}...'.format(
            container_name))

        docker_client.start(container_name)
        while True:
            recent_logs = docker_client.logs(container_name, tail=20)
            if POSTGRES_INIT_MSG in recent_logs:
                break

            logging.debug('Polling data for entrypoint initialization...')
            still_running = container_utils.is_container_running(
                container_name, docker_client)
            if not still_running:
                raise RuntimeError('Container initialization failed.')

            time.sleep(10)

        logging.info('Initialized container {}.'.format(container_name))

    except:
        # Deliberately catches everything: the failure is only logged with
        # the tail of the container log and then re-raised unchanged.
        logging.error(
            """Container initialization failed, check log for errors:\n{}"""
            .format(docker_client.logs(container_name, tail=20)))
        logging.error(
            """If error persists, consider restarting your docker engine.""")
        raise
118 |
119 |
def create_from_folder(export_data_folder, docker_client,
                       container_name='coursera-exports',
                       database_name='coursera-exports',
                       database_password=''):
    """
    Create a postgres docker container from a folder holding a Coursera
    research export, load the export into a database inside it and start
    the container.
    :param export_data_folder: folder where export data/scripts is stored
    :param docker_client:
    :param container_name:
    :param database_name:
    :param database_password: when empty, the container is configured with
        the 'trust' host auth method instead of a password
    :return container_id:
    """
    logging.debug('Creating containers from {folder}'.format(
        folder=export_data_folder))

    if database_password:
        environment = {'POSTGRES_PASSWORD': database_password}
    else:
        environment = {'POSTGRES_HOST_AUTH_METHOD': 'trust'}

    # The export folder is mounted read-only; postgres is published on the
    # next port not already used by another listed container.
    export_bind = '{}:/mnt/exportData:ro'.format(export_data_folder)
    host_port = container_utils.get_next_available_port(
        list_all(docker_client))
    host_config = docker_client.create_host_config(
        binds=[export_bind],
        port_bindings={5432: ('127.0.0.1', host_port)})

    container = create_postgres_container(
        docker_client, container_name, database_name,
        {
            'environment': environment,
            'volumes': ['/mnt/exportData'],
            'host_config': host_config,
        })
    container_id = container['Id']

    # copy containers initialization script to entrypoint
    database_setup_script = """
        createdb -U {user} {db}
        cd /mnt/exportData
        psql -e -U {user} -d {db} -f setup.sql
        psql -e -U {user} -d {db} -f load.sql
    """.format(user='postgres', db=database_name)
    setup_archive = container_utils.create_tar_archive(
        database_setup_script, name='init-user-db.sh')

    docker_client.put_archive(
        container_id,  # using a named argument causes NullResource error
        path='/docker-entrypoint-initdb.d/',
        data=setup_archive)

    logging.info('Created container with id: {}'.format(container_id))

    initialize(container_id, docker_client)

    return container_id
175 |
176 |
def create_postgres_container(docker_client, container_name, database_name,
                              create_container_args):
    """
    Create (without starting) a labelled postgres container, fetching the
    image first if it is missing and replacing any existing container
    matched by the name filter.
    :param docker_client:
    :param container_name:
    :param database_name: stored on the container as a label
    :param create_container_args: dict of extra create_container
        arguments; it is updated in place with image, name and labels
    :return: result of docker_client.create_container
    """
    image_available = docker_client.images(name=POSTGRES_DOCKER_IMAGE)
    if not image_available:
        logging.info('Downloading image: {}'.format(POSTGRES_DOCKER_IMAGE))
        docker_client.import_image(image=POSTGRES_DOCKER_IMAGE)

    stale_containers = docker_client.containers(
        all=True, filters={'name': container_name})
    for stale_container in stale_containers:
        logging.info('Removing existing container with name: {}'.format(
            container_name))
        docker_client.stop(stale_container)
        docker_client.remove_container(stale_container)

    create_container_args.update(
        image=POSTGRES_DOCKER_IMAGE,
        name=container_name,
        labels={
            COURSERA_DOCKER_LABEL: None,
            'database_name': database_name,
        })
    return docker_client.create_container(**create_container_args)
196 |
197 |
def create_from_export_request_id(export_request_id, docker_client,
                                  container_name=None,
                                  database_name=None,
                                  database_password=''):
    """
    Create a docker container containing the export data from a given
    export request. Container and database name will be inferred as the
    course slug or partner short name from export_request if not provided.
    :param export_request_id:
    :param docker_client:
    :param container_name:
    :param database_name:
    :param database_password:
    :return container_id:
    :raises ValueError: if the export request is not a tables export
    """
    export_request = exports.api.get(export_request_id)[0]

    if export_request.export_type != EXPORT_TYPE_TABLES:
        # The trailing space matters: adjacent literals are concatenated
        # as-is, which previously produced "supported.Given".
        raise ValueError('Invalid Export Type. (Only tables exports supported. '
                         'Given [{}])'.format(export_request.export_type))

    logging.info('Downloading export {}'.format(export_request_id))
    downloaded_files = export_utils.download(
        export_request, dest=COURSERA_LOCAL_FOLDER)

    # Every downloaded archive is unpacked into one folder named after the
    # request; the archives themselves are deleted once extracted.
    dest = os.path.join(COURSERA_LOCAL_FOLDER, export_request_id)
    for f in downloaded_files:
        container_utils.extract_zip_archive(
            archive=f,
            dest=dest,
            delete_archive=True)

    container_id = create_from_folder(
        export_data_folder=dest,
        docker_client=docker_client,
        database_name=(database_name if database_name
                       else export_request.scope_name),
        container_name=(container_name if container_name
                        else export_request.scope_name),
        database_password=(database_password if database_password
                           else '')
    )

    # The extracted copy is removed once the container has been created.
    shutil.rmtree(dest)

    return container_id
243 |
--------------------------------------------------------------------------------
/courseraresearchexports/models/ExportRequest.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from courseraresearchexports.constants.api_constants import \
16 | ANONYMITY_LEVEL_COORDINATOR, ANONYMITY_LEVEL_ISOLATED, EXPORT_TYPE_TABLES,\
17 | EXPORT_TYPE_CLICKSTREAM, EXPORT_TYPE_GRADEBOOK, SCHEMA_NAMES
18 | from courseraresearchexports.models import utils
19 | import re
20 | import string
21 |
22 |
class ExportRequest:
    """
    Represents a export request for Coursera's research data export
    service and provides methods for serialization.

    A request is scoped by a course id, a partner id or a group id. When
    more than one is set, the course id takes precedence over the partner
    id, which takes precedence over the group id.
    """

    def __init__(self, course_id=None, partner_id=None, group_id=None,
                 export_type=None, anonymity_level=None,
                 statement_of_purpose=None, schema_names=None,
                 interval=None, ignore_existing=None, **kwargs):
        """
        Store the request fields. Unrecognised keyword arguments are accepted
        and ignored so that callers (e.g. `from_args`) can pass through
        helper-only values such as `course_slug`.
        :param course_id: course identifier
        :param partner_id: partner identifier, coerced to int when given
        :param group_id: group identifier
        :param export_type: export type constant
        :param anonymity_level: anonymity level constant
        :param statement_of_purpose: free text purpose of the request
        :param schema_names: iterable of schema names
        :param interval: two element sequence, [start, end]
        :param ignore_existing: flag serialized as `ignoreExisting`
        """
        self._course_id = course_id
        if partner_id is not None:
            self._partner_id = int(partner_id)
        else:
            self._partner_id = partner_id
        self._group_id = group_id
        self._export_type = export_type
        self._anonymity_level = anonymity_level
        self._statement_of_purpose = statement_of_purpose
        self._schema_names = schema_names
        self._interval = interval
        self._ignore_existing = ignore_existing

    def to_json(self):
        """
        Serialize ExportRequest to a dictionary representing a json object.
        No validation is done with the exception that only specification of
        scope is used (course/partner/group). Fields holding a falsy value
        are left out of the result.
        :return json_request:
        """
        json_request = {}

        if self._course_id:
            json_request['scope'] = {
                'typeName': 'courseContext',
                'definition': {
                    'courseId': self._course_id
                }}
        elif self._partner_id:
            json_request['scope'] = {
                'typeName': 'partnerContext',
                'definition': {
                    'partnerId': {
                        'maestroId': self._partner_id
                    }}}
        elif self._group_id:
            json_request['scope'] = {
                'typeName': 'groupContext',
                'definition': {
                    'groupId': self._group_id
                }}
        if self._export_type:
            json_request['exportType'] = self._export_type
        if self._anonymity_level:
            json_request['anonymityLevel'] = self._anonymity_level
        if self._statement_of_purpose:
            json_request['statementOfPurpose'] = self._statement_of_purpose
        if self._schema_names:
            json_request['schemaNames'] = self._schema_names
        if self._interval:
            json_request['interval'] = {
                'start': self._interval[0], 'end': self._interval[1]}
        if self._ignore_existing:
            json_request['ignoreExisting'] = self._ignore_existing

        return json_request

    @classmethod
    def from_args(cls, **kwargs):
        """
        Create a ExportResource object using the parameters required. Performs
        course_id/partner_id inference if possible.
        :param kwargs: constructor arguments plus the optional helper keys
            `course_slug`, `partner_short_name` and `user_id_hashing`
        :return export_request: ExportRequest
        """
        if kwargs.get('course_slug') and not kwargs.get('course_id'):
            kwargs['course_id'] = utils.lookup_course_id_by_slug(
                kwargs['course_slug'])
        elif kwargs.get('partner_short_name') and not kwargs.get('partner_id'):
            kwargs['partner_id'] = utils.lookup_partner_id_by_short_name(
                kwargs['partner_short_name'])

        # Translate the command line vocabulary into an anonymity level; any
        # other value leaves a caller supplied `anonymity_level` untouched.
        if kwargs.get('user_id_hashing'):
            if kwargs['user_id_hashing'] == 'linked':
                kwargs['anonymity_level'] = ANONYMITY_LEVEL_COORDINATOR
            elif kwargs['user_id_hashing'] == 'isolated':
                kwargs['anonymity_level'] = ANONYMITY_LEVEL_ISOLATED

        return cls(**kwargs)

    @classmethod
    def from_json(cls, json_request):
        """
        Deserialize ExportRequest from json object.
        :param json_request: dictionary with at least a `scope` entry
        :return export_request: ExportRequest
        """
        kwargs = {}
        request_scope = json_request['scope']
        request_scope_context = request_scope['typeName']

        if request_scope_context == 'courseContext':
            kwargs['course_id'] = request_scope['definition']['courseId']
        elif request_scope_context == 'partnerContext':
            kwargs['partner_id'] = \
                request_scope['definition']['partnerId']['maestroId']
        elif request_scope_context == 'groupContext':
            kwargs['group_id'] = request_scope['definition']['groupId']

        if json_request.get('interval'):
            kwargs['interval'] = [
                json_request['interval']['start'],
                json_request['interval']['end']
            ]

        return cls(
            export_type=json_request.get('exportType'),
            anonymity_level=json_request.get('anonymityLevel'),
            statement_of_purpose=json_request.get('statementOfPurpose'),
            schema_names=json_request.get('schemaNames'),
            ignore_existing=json_request.get('ignoreExisting'),
            **kwargs)

    @property
    def course_id(self):
        return self._course_id

    @property
    def partner_id(self):
        return self._partner_id

    @property
    def group_id(self):
        return self._group_id

    @property
    def export_type(self):
        return self._export_type

    @property
    def export_type_display(self):
        """
        Display only name of the export type; unrecognised values are
        returned unchanged.
        """
        if self._export_type == EXPORT_TYPE_GRADEBOOK:
            return 'GRADEBOOK'
        elif self._export_type == EXPORT_TYPE_CLICKSTREAM:
            return 'CLICKSTREAM'
        elif self._export_type == EXPORT_TYPE_TABLES:
            return 'TABLES'
        else:
            return self._export_type

    @property
    def anonymity_level(self):
        return self._anonymity_level

    @property
    def formatted_anonymity_level(self):
        """
        Display only name of the anonymity level ('Linked', 'Isolated' or
        'Unknown').
        """
        if self.anonymity_level == ANONYMITY_LEVEL_COORDINATOR:
            return 'Linked'
        elif self.anonymity_level == ANONYMITY_LEVEL_ISOLATED:
            return 'Isolated'
        else:
            return 'Unknown'

    @property
    def statement_of_purpose(self):
        return self._statement_of_purpose

    @property
    def interval(self):
        return self._interval

    @property
    def ignore_existing(self):
        return self._ignore_existing

    @property
    def scope_context(self):
        """
        Context for this ExportRequest, assume that only one identifier for
        partner/course/group is defined.
        """
        if self._course_id:
            return 'COURSE'
        elif self._partner_id:
            return 'PARTNER'
        elif self._group_id:
            return 'GROUP'
        else:
            return None

    @property
    def scope_id(self):
        """
        Identifier for the scope, assume that only one of course/partner/group
        is defined for a valid request.
        :return scope_id:
        """
        return self._course_id or self._partner_id or self._group_id

    @property
    def scope_name(self):
        """
        Human readable name for this scope context. Partner short names for
        partners, but only group ids for groups and course ids for courses(apis
        are not open)
        :return: looked up name, or an id based fallback when the lookup
            fails, or 'UNKNOWN' when no scope is set
        """
        if self._course_id:
            try:
                return utils.lookup_course_slug_by_id(self._course_id)
            except Exception:
                # Best effort: any lookup failure falls back to the course id
                # stripped of punctuation. `Exception` (rather than a bare
                # except) lets KeyboardInterrupt/SystemExit propagate.
                print("couldn't create human readable course name, "
                      "using alphanumeric characters of course_id")
                chars = re.escape(string.punctuation)
                return re.sub(r'[' + chars + ']', '', self._course_id)
        elif self._partner_id:
            try:
                return utils.lookup_partner_short_name_by_id(self._partner_id)
            except Exception:
                print("couldn't create human readable partner name, "
                      "using partner_id")
                return self._partner_id
        elif self._group_id:
            return self._group_id
        else:
            return 'UNKNOWN'

    @property
    def schema_names(self):
        return self._schema_names

    @property
    def schema_names_display(self):
        """
        Display only property for schemas names.
        :return schemas: 'all' when every known schema is requested, a comma
            separated list otherwise, None when no schemas are set
        """
        if self._schema_names:
            if set(self._schema_names) == set(SCHEMA_NAMES):
                return 'all'
            else:
                return ','.join(self._schema_names)
        else:
            return None

    def __eq__(self, other):
        """
        Override for internal equality checks as suggested at:
        http://stackoverflow.com/a/390640
        """
        # The __class__ comparison matters on Python 2, where type() of every
        # old-style instance is the same `instance` type.
        if type(other) is type(self) and other.__class__ is self.__class__:
            return self.__dict__ == other.__dict__
        return False

    def __ne__(self, other):
        """
        Inverse of `__eq__`; Python 2 does not derive `!=` from `__eq__`.
        """
        return not self.__eq__(other)
270 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/courseraresearchexports/commands/jobs.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Coursera
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import print_function
16 |
17 | import json
18 | import logging
19 |
20 | import argparse
21 | from tabulate import tabulate
22 |
23 | from courseraresearchexports.exports import api
24 | from courseraresearchexports.constants.api_constants import \
25 | ANONYMITY_LEVEL_COORDINATOR, EXPORT_TYPE_CLICKSTREAM, \
26 | EXPORT_TYPE_TABLES, SCHEMA_NAMES
27 | from courseraresearchexports.models.ClickstreamDownloadLinksRequest import \
28 | ClickstreamDownloadLinksRequest
29 | from courseraresearchexports.models.ExportRequest import ExportRequest
30 | from courseraresearchexports.exports import utils
31 |
32 |
def request_clickstream(args):
    """
    Create and send an clickstream data export request with Coursera. Only
    available for data coordinators.
    """
    # NOTE: the docstring above doubles as the help/description text of the
    # `clickstream` subcommand, so its wording is left untouched.
    request_fields = dict(
        course_id=args.course_id,
        course_slug=args.course_slug,
        partner_id=args.partner_id,
        partner_short_name=args.partner_short_name,
        group_id=args.group_id,
        anonymity_level=ANONYMITY_LEVEL_COORDINATOR,
        statement_of_purpose=args.purpose,
        export_type=EXPORT_TYPE_CLICKSTREAM,
        interval=args.interval,
        ignore_existing=args.ignore_existing)
    request = ExportRequest.from_args(**request_fields)

    # Only the first element of the api response is used.
    created = api.post(request)[0]

    success_message = 'Successfully created clickstream export request {id}.'
    logging.info(success_message.format(id=created.id))

    json_body = json.dumps(created.to_json(), indent=2)
    logging.debug(
        'Request created with json body:\n{json}'.format(json=json_body))
57 |
58 |
def request_tables(args):
    """
    Create and send a tables data export request with Coursera.
    """
    # NOTE: the docstring above doubles as the help/description text of the
    # `tables` subcommand, so its wording is left untouched.
    request_fields = dict(
        course_id=args.course_id,
        course_slug=args.course_slug,
        partner_id=args.partner_id,
        partner_short_name=args.partner_short_name,
        group_id=args.group_id,
        user_id_hashing=args.user_id_hashing,
        statement_of_purpose=args.purpose,
        export_type=EXPORT_TYPE_TABLES,
        schema_names=args.schemas)
    request = ExportRequest.from_args(**request_fields)

    # Only the first element of the api response is used.
    created = api.post(request)[0]

    success_message = 'Successfully created tables export request {id}.'
    logging.info(success_message.format(id=created.id))

    json_body = json.dumps(created.to_json(), indent=2)
    logging.debug(
        'Request created with json body:\n{json}'.format(json=json_body))
81 |
82 |
def get(args):
    """
    Get the details and status of a data export request using a job id.
    """
    # NOTE: the docstring above doubles as the help/description text of the
    # `get` subcommand, so its wording is left untouched.
    job = api.get(args.id)[0]

    # Rows that are always shown, as [label, value] pairs.
    rows = [
        ['Export Job Id:', job.id],
        ['Export Type:', job.export_type_display],
        ['Status:', job.status],
        ['Scope Context:', job.scope_context],
        ['Scope Id:', job.scope_id],
        ['Scope Name:', job.scope_name],
        ['User id Hashing: ', job.formatted_anonymity_level],
        ['Created:', job.created_at.strftime('%c')]]

    # Optional rows are only shown when the job carries a value for them.
    if job.schema_names:
        rows += [['Schemas:', job.schema_names_display]]

    if job.download_link:
        rows += [['Download Link:', job.download_link]]

    if job.interval:
        rows += [['Interval:', ' to '.join(job.interval)]]

    print(tabulate(rows, tablefmt="plain"))
112 |
113 |
def get_all(args):
    """
    Get the details and status of your data export requests.
    """
    # NOTE: the docstring above doubles as the help/description text of the
    # `get_all` subcommand, so its wording is left untouched.
    def creation_time(job):
        return job.created_at

    header = ['Created', 'Request Id', 'Status', 'Type',
              'User Id Hashing', 'Scope', 'Schemas']

    # One row per request, oldest first.
    rows = [
        [job.created_at.strftime('%Y-%m-%d %H:%M'),
         job.id,
         job.status,
         job.export_type_display,
         job.formatted_anonymity_level,
         job.scope_id,
         job.schema_names_display]
        for job in sorted(api.get_all(), key=creation_time)]

    print(tabulate([header] + rows, headers='firstrow'))
133 |
134 |
def download(args):
    """
    Download a data export job using a request id.
    """
    # NOTE: the docstring above doubles as the help/description text of the
    # `download` subcommand, so its wording is left untouched.
    try:
        job = api.get(args.id)[0]
        target_folder = args.dest
        utils.download(job, target_folder)
    except Exception as error:
        # Log for the user, then re-raise so the caller still sees the
        # original exception.
        message = 'Download failed with exception:\n{}'.format(error)
        logging.error(message)
        raise
146 |
147 |
def get_clickstream_links(args):
    """
    Generate links for clickstream data exports
    """
    links_request = ClickstreamDownloadLinksRequest.from_args(
        course_id=args.course_id,
        course_slug=args.course_slug,
        partner_id=args.partner_id,
        partner_short_name=args.partner_short_name,
        group_id=args.group_id,
        interval=args.interval)

    links = api.get_clickstream_download_links(links_request)

    # TODO: add more descriptive information or option write to text file
    # Each link becomes a single-column row of the printed table.
    single_column_rows = [[link] for link in links]
    print(tabulate(single_column_rows, tablefmt="plain"))
167 |
168 |
def parser(subparsers):
    """
    Register the `jobs` command and all of its subcommands (`request`,
    `get_all`, `get`, `download`, `clickstream_download_links`).
    :param subparsers: subparsers action of the parent argument parser
    :return parser_jobs: the parser created for the `jobs` command
    """
    parser_jobs = subparsers.add_parser(
        'jobs',
        help='Get status of current/completed research export job(s)',
        description='Command line tools for requesting and reviewing the '
        'status of Coursera research data exports. Please first authenticate '
        'with the OAuth2 client before making requests (courseraoauth2client '
        'config authorize --app manage_research_exports).',
        epilog='Please file bugs on github at: '
        'https://github.com/coursera/courseraresearchexports/issues. If you '
        'would like to contribute to this tool\'s development, check us out '
        'at: https://github.com/coursera/courseraresearchexports')

    jobs_subparsers = parser_jobs.add_subparsers()

    create_request_parser(jobs_subparsers)

    # The handler docstrings are reused as help and description text.
    parser_get_all = jobs_subparsers.add_parser(
        'get_all',
        help=get_all.__doc__,
        description=get_all.__doc__)
    parser_get_all.set_defaults(func=get_all)

    parser_get = jobs_subparsers.add_parser(
        'get',
        help=get.__doc__,
        description=get.__doc__)
    parser_get.set_defaults(func=get)

    parser_get.add_argument(
        'id',
        help='Export request ID')

    parser_download = jobs_subparsers.add_parser(
        'download',
        help=download.__doc__,
        description=download.__doc__)
    parser_download.set_defaults(func=download)

    parser_download.add_argument(
        'id',
        help='Export request ID')

    parser_download.add_argument(
        '--dest',
        default='.',
        help='Destination folder')

    parser_clickstream_links = jobs_subparsers.add_parser(
        'clickstream_download_links',
        help='Get download links for completed eventing exports.')
    parser_clickstream_links.set_defaults(func=get_clickstream_links)

    create_scope_subparser(parser_clickstream_links)

    parser_clickstream_links.add_argument(
        '--interval',
        nargs=2,
        metavar=('START', 'END'),
        help='Interval of exported clickstream data, inclusive. '
        '(i.e. 2016-08-01 2016-08-04).')

    return parser_jobs
232 |
233 |
def create_scope_subparser(parser):
    """
    Add the scope options to `parser` as a required, mutually exclusive
    group: exactly one of --course_id, --course_slug, --partner_id,
    --partner_short_name or --group_id must be supplied.
    :param parser: argument parser to extend in place
    """
    scope_subparser = parser.add_mutually_exclusive_group(
        required=True)
    scope_subparser.add_argument(
        '--course_id',
        help='Export rows corresponding to learners within a course according '
        'to the unique id assigned by Coursera.')
    scope_subparser.add_argument(
        '--course_slug',
        help='Export rows corresponding to learners within a course according '
        'to the unique name of your course defined as the part after '
        '/learn in the course url. (e.g. machine-learning for '
        'https://www.coursera.org/learn/machine-learning).')
    scope_subparser.add_argument(
        '--partner_id',
        type=int,
        help='Export rows corresponding to learners within a partner.')
    scope_subparser.add_argument(
        '--partner_short_name',
        help='Export rows corresponding to learners within a partner.')
    scope_subparser.add_argument(
        '--group_id',
        help='Export rows corresponding to learners within a group.')
257 |
258 |
def create_request_parser(subparsers):
    """
    Register the `request` command with its `tables` and `clickstream`
    subcommands. Both subcommands share the scope options and the required
    --purpose option through a common parent parser.
    :param subparsers: subparsers action to which `request` is added
    """
    parser_request = subparsers.add_parser(
        'request',
        help='Create and send a data export request with Coursera.',
        description='Create and send a data export request with Coursera. '
        'Use subcommands to specify the export request type.')
    request_subparsers = parser_request.add_subparsers()

    # common arguments between schema and eventing exports
    # (add_help=False because this parser is only used as a parent)
    request_args_parser = argparse.ArgumentParser(add_help=False)

    create_scope_subparser(request_args_parser)

    request_args_parser.add_argument(
        '--purpose',
        required=True,
        help='Please let us know how you plan to use the '
        'data, what types of research questions you\'re asking, who will '
        'be working with the data primarily, and with whom you plan to '
        'share it.')

    # tables subcommand
    parser_tables = request_subparsers.add_parser(
        'tables',
        help=request_tables.__doc__,
        description=request_tables.__doc__,
        parents=[request_args_parser])
    parser_tables.set_defaults(func=request_tables)

    parser_tables.add_argument(
        '--user_id_hashing',
        choices=['linked', 'isolated'],
        default='isolated',
        help='The level of user_id hashing in the data export. With \'linked\''
        ' user_id hashing, users can be identified between table schemas. '
        'With \'isolated\' user_id hashing, users have independent ids in '
        'different schemas and cannot be linked. Only data coordinators have '
        'access to \'linked\' user_ids to restrict PII.')

    parser_tables.add_argument(
        '--schemas',
        choices=SCHEMA_NAMES,
        nargs='+',
        default=SCHEMA_NAMES,
        help='Data schemas to export. Any combination of: {}. By default this '
        'will be all available schemas.'.format(
            ', '.join(SCHEMA_NAMES)))

    # clickstream subcommand
    parser_clickstream = request_subparsers.add_parser(
        'clickstream',
        help=request_clickstream.__doc__,
        description=request_clickstream.__doc__,
        parents=[request_args_parser])
    parser_clickstream.set_defaults(func=request_clickstream)

    parser_clickstream.add_argument(
        '--interval',
        nargs=2,
        metavar=('START', 'END'),
        help='Interval of clickstream data to be exported '
        '(i.e. 2016-08-01 2016-08-04). By default this will be the past day.')

    parser_clickstream.add_argument(
        '--ignore_existing',
        action='store_true',
        help='If flag is set, we will recompute clickstream data for all dates '
        'in the interval. Otherwise, previously computed days are skipped.')
327 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | courseraresearchexports
2 | =======================
3 |
4 | .. image:: https://travis-ci.org/coursera/courseraresearchexports.svg
5 | :target: https://travis-ci.org/coursera/courseraresearchexports
6 |
7 | This project is a library consisting of a command line interface and a client
8 | for interacting with Coursera's research exports. Up to date documentation
9 | of the data provided by Coursera for research purposes is available in the Partner Resource Center
10 | , `Coursera Data Exports Guide `_.
11 |
12 | Installation
13 | ------------
14 |
15 | To install this package, execute::
16 |
17 | pip install courseraresearchexports
18 |
19 | `pip `_ is a python package manager.
20 |
21 | If you do not have ``pip`` installed on your machine, please follow the
22 | `installation instructions `_ for your platform.
23 |
24 | If you experience issues installing with `pip`, we recommend that you use the
25 | python 2.7 distribution of `Anaconda `_ and try the above
26 | command again, or use a `virtualenv `_
27 | for installation::
28 |
29 | virtualenv venv -p python2.7
30 | source venv/bin/activate
31 | pip install courseraresearchexports
32 |
33 | Note: the ``containers`` subcommand requires ``docker`` to already be installed
34 | on your machine. Please see the `docker installation instructions `_ for platform
35 | specific information.
36 |
37 | Refer to the `Issues`_ section for additional debugging around installation.
38 |
39 | autocomplete
40 | ^^^^^^^^^^^^
41 |
42 | To enable tab autocomplete, please install `argcomplete `_ using
43 | ``pip install argcomplete`` and execute ``activate-global-python-argcomplete``. Open a new shell and
44 | press tab for autocomplete functionality.
45 |
46 | See the argcomplete documentation for more details.
47 |
48 | Setup
49 | -----
50 |
51 | Authorize your application using `courseraoauth2client `_::
52 |
53 | courseraoauth2client config authorize --app manage_research_exports
54 |
55 | To use the ``containers`` functionality, a docker instance must be running.
56 | Please see the docker `getting started guide `_
57 | for installation instructions for your platform.
58 |
59 | Upgrade
60 | -------
61 |
62 | If you have a previously installed version of ``courseraresearchexports``, execute::
63 |
64 | pip install courseraresearchexports --upgrade
65 |
66 | This will upgrade your installation to the newest version.
67 |
68 | Command Line Interface
69 | ----------------------
70 |
71 | The project includes a command line tool. Run::
72 |
73 | courseraresearchexports -h
74 |
75 | for a complete list of features, flags, and documentation. Similarly,
76 | documentation for the subcommands listed below is also available (e.g. for
77 | ``jobs``) by running::
78 |
79 | courseraresearchexports jobs -h
80 |
81 | jobs
82 | ^^^^
83 | Submit a research export request or retrieve the status of pending and
84 | completed export jobs.
85 |
86 | request
87 | ~~~~~~~
88 | Creates a data export job request and returns the export request id. To create a
89 | data export request for all available tables for a course::
90 |
91 | courseraresearchexports jobs request tables --course_id $COURSE_ID \
92 | --purpose "testing data export"
93 |
94 | In order to know your course_id, you can take advantage
95 | of our COURSE API, putting in the appropriate course_slug.
96 |
97 | For example,
98 | if the course_slug is `developer-iot`, you can query the course_id by making the following request in a logged-in browser session::
99 |
100 | https://api.coursera.org/api/onDemandCourses.v1?q=slug&slug=developer-iot
101 |
102 | The response will be a JSON object containing an id field with the value::
103 |
104 | iRl53_BWEeW4_wr--Yv6Aw
105 |
106 | **Note**: The course slug is the part after
107 | ``/learn`` in your course url. For ``https://www.coursera.org/learn/machine-learning``,
108 | the slug is `machine-learning`
109 |
110 | If you have a publicly available course, you can request the export using::
111 |
112 | courseraresearchexports jobs request tables --course_slug $COURSE_SLUG \
113 | --purpose "testing data export"
114 |
115 | Replace ``$COURSE_SLUG`` with your course slug (The course slug is the part after
116 | ``/learn`` in the url. For ``https://www.coursera.org/learn/machine-learning``,
117 | the slug is `machine-learning`).
118 |
119 | If a more limited set of data is required, you can specify which schemas are
120 | included with the export. (e.g. for the demographics and notebooks tables)::
121 |
122 | courseraresearchexports jobs request tables --course_id $COURSE_ID \
123 | --schemas demographics notebooks --purpose "testing data export"
124 |
125 | You can look at all the possible ways to export using::
126 |
127 | courseraresearchexports jobs request tables -h
128 |
129 | **Recommendations**
130 |
131 |
132 | 1. Always request the specific schemas that you need by adding the ``--schemas`` flag when requesting the exports.
133 | For more information on the available tables/schemas, please refer to the
134 | `Coursera Data Exports Guide `_.
135 |
136 | 2. While requesting the exports for all courses in your institution, it is recommended to use the partner level export,
137 | rather than requesting individual course level exports. You can use the command::
138 |
139 | courseraresearchexports jobs request tables --partner_short_name $PARTNER_SHORT_NAME \
140 | --schemas demographics notebooks --purpose "testing data export"
141 |
142 | Your partner_short_name can be found in the University Assets section of your institution setting.
143 |
144 | Note that the above command is only available for publicly available partners.
145 | If you have your partnerID, you can request the export using::
146 |
147 | courseraresearchexports jobs request tables --partner_id $PARTNER_ID \
148 | --schemas demographics notebooks --purpose "testing data export"
149 |
150 | You can find your partner_id using the API in your browser login session::
151 | https://www.coursera.org/api/partners.v1?q=shortName&shortName=$PARTNER_SHORT_NAME
152 |
153 | If you are a data coordinator, you can request that user ids are linked between
154 | domains of the data export::
155 |
156 | courseraresearchexports jobs request tables --course_id $COURSE_ID \
157 | --purpose "testing data export" --user_id_hashing linked
158 |
159 | Data coordinators can also request clickstream exports::
160 |
161 | courseraresearchexports jobs request clickstream --course_id $COURSE_ID \
162 | --interval 2016-09-01 2016-09-02 --purpose "testing data export"
163 |
164 | By default, clickstream exports will cache results for days already exported. To ignore the cache and request exports for the entire date range, pass in the flag ``--ignore_existing``.
165 |
166 | Rate limits
167 | ~~~~~~~~~~~
168 | We have rate limits enabled for the number of exports that can be performed. The underlying export API returns the rate limit error message,
169 | which is printed when the command fails. The error message reflects the reason why you might be rate limited.
170 |
171 | get_all
172 | ~~~~~~~
173 | Lists the details and status of all data export requests that you have made::
174 |
175 | courseraresearchexports jobs get_all
176 |
177 | get
178 | ~~~
179 | Retrieve the details and status of an export request::
180 |
181 | courseraresearchexports jobs get $EXPORT_REQUEST_ID
182 |
183 | download
184 | ~~~~~~~~
185 | Download a completed table or clickstream to your local destination::
186 |
187 | courseraresearchexports jobs download $EXPORT_REQUEST_ID
188 |
189 | clickstream_download_links
190 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
191 | Due to the size of clickstream exports, we persist download links for completed
192 | clickstream export requests on Amazon S3. The clickstream data for each day is
193 | saved into a separate file and download links to these files can be retrieved
194 | by running::
195 |
196 | courseraresearchexports jobs clickstream_download_links --course_id $COURSE_ID
197 |
198 | containers
199 | ^^^^^^^^^^
200 |
201 | create
202 | ~~~~~~
203 | Creates a docker container using the postgres image and loads export data
204 | into a postgres database on the container. To create a docker container
205 | from an export, first ``request`` an export using the ``jobs`` command. Then,
206 | using the ``$EXPORT_REQUEST_ID``, create a docker container with::
207 |
208 | courseraresearchexports containers create --export_request_id $EXPORT_REQUEST_ID
209 |
210 | This will download the data export and load all the data into the database
211 | running on the container. This may take some time depending on the size of
212 | your export. To create a docker container with an already downloaded export
213 | (please decompress the archive first)::
214 |
215 | courseraresearchexports containers create --export_data_folder /path/to/data_export/
216 |
217 | After creation use the ``list`` command to check the status of the
218 | container and view the container name, database name, address and port to
219 | connect to the database. Use the `db connect $CONTAINER_NAME` command to open
220 | a psql shell.
221 |
222 | list
223 | ~~~~
224 | Lists the details of all the containers created by ``courseraresearchexports``::
225 |
226 | courseraresearchexports containers list
227 |
228 | start
229 | ~~~~~
230 | Start a container::
231 |
232 | courseraresearchexports containers start $CONTAINER_NAME
233 |
234 | stop
235 | ~~~~
236 | Stop a container::
237 |
238 | courseraresearchexports containers stop $CONTAINER_NAME
239 |
240 | remove
241 | ~~~~~~
242 | Remove a container::
243 |
244 | courseraresearchexports containers remove $CONTAINER_NAME
245 |
246 | db
247 | ^^
248 |
249 | connect
250 | ~~~~~~~
251 | Open a shell to a postgres database::
252 |
253 | courseraresearchexports db connect $CONTAINER_NAME
254 |
255 | create_view
256 | ~~~~~~~~~~~
257 | Create a view in the postgres database. We are planning to include commonly
258 | used denormalized views as part of this project. To create one of these views
259 | (i.e. for the demographic_survey view)::
260 |
261 | courseraresearchexports db create_view $CONTAINER_NAME --view_name demographic_survey
262 |
263 | If you have your own sql script that you'd like to create as a view run::
264 |
265 | courseraresearchexports db create_view $CONTAINER_NAME --sql_file /path/to/sql/file/new_view.sql
266 |
267 | This will create a view using the name of the file as the name of the view, in this case "new_view".
268 |
269 | Note: as `user_id` columns vary with partner and user id hashing, please refer
270 | to the exports guide for SQL formatting guidelines.
271 |
272 | unload_to_csv
273 | ~~~~~~~~~~~~~
274 | Export a table or view to a csv file. For example, if the `demographic_survey`
275 | was created in the above section, use this command to create a csv::
276 |
277 | courseraresearchexports db unload_to_csv $CONTAINER_NAME --relation demographic_survey --dest /path/to/dest/
278 |
279 | list_tables
280 | ~~~~~~~~~~~
281 | List all the tables present inside a dockerized database::
282 |
283 | courseraresearchexports db list_tables $CONTAINER_NAME
284 |
285 | list_views
286 | ~~~~~~~~~~
287 | List all the views present inside a dockerized database::
288 |
289 | courseraresearchexports db list_views $CONTAINER_NAME
290 |
291 | Using `courseraresearchexports` on a machine without a browser
292 | --------------------------------------------------------------
293 | Sometimes, a browser is not available, making the oauth flow not possible. Commonly, this occurs when users want to automate the data export process by using an external machine.
294 |
295 | To get around this, you may generate the access token initially on a machine with browser access (e.g. your laptop). The access token is serialized in your local file system at `~/.coursera/manage_research_exports_oauth2_cache.pickle`.
296 |
297 | Requests after the first can use the refresh token flow, which does not require a browser. By copying the initial pickled access token to a remote machine, that machine can continue to request updated data.
298 |
299 |
300 |
301 | Bugs / Issues / Feature Requests
302 | --------------------------------
303 |
304 | Please use the GitHub issue tracker to document any bugs or other issues you
305 | encounter while using this tool.
306 |
307 |
308 | Developing / Contributing
309 | -------------------------
310 |
311 | We recommend developing ``courseraresearchexports`` within a python
312 | `virtualenv `_.
313 | To get your environment set up properly, do the following::
314 |
315 | virtualenv venv
316 | source venv/bin/activate
317 | python setup.py develop
318 | pip install -r test_requirements.txt
319 |
320 | Tests
321 | ^^^^^
322 |
323 | To run tests, simply run: ``nosetests``, or ``tox``.
324 |
325 | Code Style
326 | ^^^^^^^^^^
327 |
328 | Code should conform to pep8 style requirements. To check, simply run::
329 |
330 | pep8 courseraresearchexports tests
331 |
332 |
333 | Issues
334 | -------
335 | If you face the following error when installing the psycopg2 package on a Mac::
336 |
337 | ld: library not found for -lssl
338 | clang: error: linker command failed with exit code 1 (use -v to see invocation)
339 | error: command 'gcc' failed with exit status 1
340 |
341 | Install the openssl package if it is not already installed, then set ``LDFLAGS``::
342 |
343 | brew install openssl
344 | export LDFLAGS="-L/usr/local/opt/openssl/lib"
345 | # or, if openssl@3 is installed:
346 | export LDFLAGS=-L/usr/local/opt/openssl@3/lib
347 |
348 |
--------------------------------------------------------------------------------