├── tests ├── __init__.py ├── utils │ ├── __init__.py │ └── utils_tests.py └── models │ ├── __init__.py │ ├── jobs_tests.py │ └── export_request_tests.py ├── courseraresearchexports ├── __init__.py ├── db │ ├── __init__.py │ └── db.py ├── exports │ ├── __init__.py │ ├── api.py │ └── utils.py ├── containers │ ├── __init__.py │ ├── utils.py │ └── client.py ├── constants │ ├── __init__.py │ ├── container_constants.py │ ├── db_constants.py │ └── api_constants.py ├── models │ ├── __init__.py │ ├── ContainerInfo.py │ ├── ClickstreamDownloadLinksRequest.py │ ├── utils.py │ ├── ExportDb.py │ ├── ExportRequestWithMetadata.py │ └── ExportRequest.py ├── commands │ ├── __init__.py │ ├── version.py │ ├── utils.py │ ├── db.py │ ├── containers.py │ └── jobs.py ├── sql │ ├── demographic_survey.sql │ └── enrollments.sql └── main.py ├── MANIFEST.in ├── test_requirements.txt ├── tox.ini ├── .travis.yml ├── .gitignore ├── setup.py ├── LICENSE └── README.rst /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /courseraresearchexports/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include courseraresearchexports/sql * 2 | -------------------------------------------------------------------------------- /test_requirements.txt: 
-------------------------------------------------------------------------------- 1 | mock==1.0.1 2 | nose==1.3.7 3 | pep8==1.6.2 4 | testfixtures==4.1.2 5 | -------------------------------------------------------------------------------- /courseraresearchexports/db/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "db" 3 | ] 4 | 5 | from . import * # noqa 6 | -------------------------------------------------------------------------------- /courseraresearchexports/exports/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "api", 3 | "utils" 4 | ] 5 | 6 | from . import * # noqa 7 | -------------------------------------------------------------------------------- /courseraresearchexports/containers/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "client", 3 | "utils" 4 | ] 5 | 6 | from . import * # noqa 7 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py27 3 | [testenv] 4 | deps = 5 | nose 6 | mock 7 | testfixtures 8 | commands = 9 | nosetests 10 | -------------------------------------------------------------------------------- /courseraresearchexports/constants/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "api_constants", 3 | "db_constants", 4 | "container_constants" 5 | ] 6 | 7 | from . 
import * # noqa 8 | -------------------------------------------------------------------------------- /courseraresearchexports/models/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "ExportRequestWithMetadata", 3 | "ExportRequest", 4 | "ClickstreamDownloadLinksRequest", 5 | "ContainerInfo", 6 | "ExportDb", 7 | "utils" 8 | ] 9 | 10 | from . import * # noqa 11 | -------------------------------------------------------------------------------- /courseraresearchexports/commands/__init__.py: -------------------------------------------------------------------------------- 1 | "Commands and their implementations for Coursera's research export tools." 2 | 3 | __all__ = [ 4 | "version", 5 | "jobs", 6 | "containers", 7 | "db", 8 | "utils" 9 | ] 10 | 11 | from . import * # noqa 12 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | 5 | # command to install dependencies 6 | install: 7 | - "pip install ." 8 | - "pip install -r test_requirements.txt" 9 | 10 | # command to run tests & check style 11 | script: 12 | - nosetests 13 | - pep8 courseraresearchexports tests 14 | -------------------------------------------------------------------------------- /courseraresearchexports/constants/container_constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | COURSERA_DOCKER_LABEL = 'courseraResearchExport' 4 | COURSERA_LOCAL_FOLDER = os.path.expanduser('~/.coursera/exports/') 5 | POSTGRES_DOCKER_IMAGE = 'postgres:9.5' 6 | POSTGRES_INIT_MSG = 'PostgreSQL init process complete; ready for start up.' 
7 | POSTGRES_READY_MSG = 'database system is ready to accept connections' 8 | -------------------------------------------------------------------------------- /courseraresearchexports/constants/db_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Coursera 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | HASHED_USER_ID_COLUMN_TO_SOURCE_TABLE = { 16 | '[partner_user_id]': 'users', 17 | '[demographics_user_id]': 'demographics_answers', 18 | '[feedback_user_id]': 'feedback_course_ratings', 19 | '[assessments_user_id]': 'assessment_actions', 20 | '[peer_assignments_user_id]': 'peer_submissions', 21 | '[discussions_user_id]': 'discussion_answers', 22 | '[programming_assignments_user_id]': 'programming_submissions', 23 | } 24 | -------------------------------------------------------------------------------- /tests/utils/utils_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright 2016 Coursera 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
@patch.object(requests, 'get')
def test_partner_id_lookup(mockget):
    """A partner short name resolves to the id carried by the API payload."""
    # Stub the HTTP layer: requests.get() returns a response object whose
    # .json() yields the canned partners payload defined at module level.
    canned_response = Mock(**{'json.return_value': fake_partner_response})
    mockget.return_value = canned_response

    looked_up_id = utils.lookup_partner_id_by_short_name(
        fake_partner_short_name)

    # The payload carries the id as a string; the lookup is expected to
    # hand back the integer form.
    assert looked_up_id == fake_partner_id
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # IDEA project settings 92 | .idea 93 | 94 | # Ignore DS_STORE 95 | .DS_Store 96 | -------------------------------------------------------------------------------- /courseraresearchexports/constants/api_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Coursera 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
from setuptools import setup


def readme():
    """Return the text of README.rst, used as the package long description.

    The path is relative to the current working directory, so setup.py is
    expected to be run from the repository root.
    """
    with open('README.rst') as f:
        return f.read()


setup(
    name='courseraresearchexports',
    version='0.0.29',
    description='Command line tool for convenient access to '
    'Coursera Research Data Exports.',
    long_description=readme(),
    # The long description is read from README.rst, so it must be declared
    # as reStructuredText; declaring it as markdown makes package indexes
    # render the raw markup incorrectly.
    long_description_content_type='text/x-rst',
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python :: 2.7',
    ],
    keywords='coursera',
    url='https://github.com/coursera/courseraresearchexports',
    author='Chris Liu',
    author_email='cliu@coursera.org',
    license='Apache',
    entry_points={
        'console_scripts': [
            'courseraresearchexports = courseraresearchexports.main:main',
        ],
    },
    packages=['courseraresearchexports',
              'courseraresearchexports.commands',
              'courseraresearchexports.constants',
              'courseraresearchexports.exports',
              'courseraresearchexports.containers',
              'courseraresearchexports.models',
              'courseraresearchexports.db'],
    install_requires=[
        'argcomplete>=1.4.1',
        'courseraoauth2client>=0.0.1',
        'requests>=2.7.0,<2.11',
        'docker-py>=1.2.3',
        'tqdm>=4.8.4',
        'tabulate>=0.7.5',
        'python-dateutil>=2.5.3',
        'SQLAlchemy>=1.0.15',
        'psycopg2>=2.6.2'
    ],
    test_suite='nose.collector',
    tests_require=['nose', 'nose-cover3'],
    # IMPORTANT: This makes MANIFEST.in work. DO NOT USE `package_data`, as
    # it does not work with sdist correctly.
    # See http://flask.pocoo.org/docs/0.11/patterns/distribute/ for details
    include_package_data=True,
    zip_safe=False
)
def command_version(args):
    """Implement the ``version`` subcommand.

    Looks up the installed ``courseraresearchexports`` distribution and
    reports its version. If the running code is not the installed copy (or
    nothing is installed), a hint to install the project is reported instead.

    :param args: parsed command line namespace; only ``args.quiet`` is read.
        When it is a positive count, the bare version string is written to
        stdout; otherwise a descriptive message is logged at INFO level.
    """

    # See http://stackoverflow.com/questions/17583443
    from pkg_resources import get_distribution, DistributionNotFound
    import os.path

    try:
        _dist = get_distribution('courseraresearchexports')
        # Normalize case for Windows systems
        dist_loc = os.path.normcase(_dist.location)
        here = os.path.normcase(__file__)
        if not here.startswith(
                os.path.join(dist_loc, 'courseraresearchexports')):
            # not installed, but there is another version that *is*
            raise DistributionNotFound
    except DistributionNotFound:
        __version__ = 'Please install this project with setup.py'
    else:
        __version__ = _dist.version

    quiet = getattr(args, 'quiet', None)
    if quiet and quiet > 0:
        # The same --quiet flag raises the root logger threshold to WARNING
        # or ERROR (see set_logging_level in commands/utils.py), so an INFO
        # record would be dropped and nothing would be shown. Write the bare
        # version directly instead.
        sys.stdout.write('{}\n'.format(__version__))
    else:
        logging.info("Your {prog}'s version is:\n\t{version}"
                     .format(prog=sys.argv[0], version=__version__))


def parser(subparsers):
    """Register the ``version`` subcommand on an argparse subparsers object.

    :param subparsers: the object returned by
        ``ArgumentParser.add_subparsers()``.
    :return: the parser created for the ``version`` subcommand, with its
        ``func`` default set to :func:`command_version`.
    """

    # create the parser for the version subcommand.
    parser_version = subparsers.add_parser(
        'version',
        help="Output the version of %(prog)s to the console.")
    parser_version.set_defaults(func=command_version)

    return parser_version
class ContainerInfo:
    """
    Represents the relevant information about a docker container used to store
    a database of Coursera Export data.
    """

    def __init__(self, name=None, id=None, host_port=None, host_ip=None,
                 creation_time=None, database_name=None, status=None):
        """
        Store the container attributes as given.

        :param name: container name
        :param id: full container id; ``short_id`` is derived from its
            first 12 characters (``None`` when no id is given)
        :param host_port: host port bound to the container's 5432/tcp
        :param host_ip: host ip the port is published on, if any
        :param creation_time: container creation time
        :param database_name: name of the database held by the container
        :param status: container state as reported by docker
        """
        self.name = name
        self.id = id
        self.short_id = id[:12] if id else None
        self.host_port = host_port
        self.host_ip = host_ip
        self.creation_time = creation_time
        self.status = status
        self.database_name = database_name

    @classmethod
    def from_container(cls, container_name, docker_client):
        """
        Create ContainerInfo using the response from docker-py Client's
        `inspect-container` method.
        :param container_name: name handed to
            ``docker_client.inspect_container``
        :param docker_client: client object exposing ``inspect_container``
        :return container_info: ContainerInfo
        """
        container_dict = docker_client.inspect_container(container_name)
        host_config = container_dict['HostConfig']['PortBindings']
        network_settings = container_dict['NetworkSettings']['Ports']

        assigned_port = int(host_config['5432/tcp'][0]['HostPort'])

        # 'Ports' is empty (or null) while the container is not running.
        # Report "no ip" uniformly as None rather than leaking whatever
        # falsy value docker returned (e.g. an empty dict).
        live_bindings = (network_settings or {}).get('5432/tcp')
        ip_if_running = live_bindings[0]['HostIp'] if live_bindings else None

        return cls(
            name=container_dict['Name'][1:],  # strip the leading '/'
            id=container_dict['Id'],
            creation_time=dateutil.parser.parse(container_dict['Created']),
            database_name=container_dict['Config']['Labels']['database_name'],
            status=container_dict['State']['Status'],
            host_port=assigned_port,
            host_ip=ip_if_running)
@patch('courseraresearchexports.commands.jobs.api.get_all')
def test_get_all(api_get_all):
    """`jobs get_all` calls the API's get_all with no arguments."""
    api_get_all.return_value = []
    no_options = argparse.Namespace()

    jobs.get_all(no_options)

    api_get_all.assert_any_call()


@patch('courseraresearchexports.models.utils.lookup_course_slug_by_id')
@patch('courseraresearchexports.commands.jobs.api.get')
def test_get(api_get, lookup_course_slug_by_id):
    """`jobs get` forwards the requested id to the API."""
    # Stub the slug lookup so no course metadata request is attempted.
    lookup_course_slug_by_id.return_value = fake_course_slug
    api_get.return_value = [
        ExportRequestWithMetadata(course_id=fake_course_id)]

    jobs.get(argparse.Namespace(id=fake_course_id))

    api_get.assert_called_with(fake_course_id)


@patch('courseraresearchexports.commands.jobs.api.post')
def test_request(api_post):
    """`jobs request_tables` posts an export request for the given course."""
    api_post.return_value = [
        ExportRequestWithMetadata(course_id=fake_course_id)]
    # Only the course id is supplied; every other option is left unset.
    args = argparse.Namespace(
        course_id=fake_course_id,
        course_slug=None,
        partner_id=None,
        partner_short_name=None,
        group_id=None,
        export_type=None,
        user_id_hashing=None,
        purpose=None,
        schemas=None)

    jobs.request_tables(args)

    # The API must have been called with exactly one positional argument.
    (export_request,) = api_post.call_args[0]
    assert export_request.course_id == fake_course_id
class ClickstreamDownloadLinksRequest:
    """
    A request for clickstream download links, scoped to either a course or
    a partner and optionally limited to a date interval.
    """

    def __init__(self, course_id=None, partner_id=None, interval=None,
                 **kwargs):
        # Extra keyword arguments are accepted and ignored so that the full
        # set of parsed command line options can be passed straight through.
        self.interval = interval
        self.partner_id = partner_id
        self.course_id = course_id

    @staticmethod
    def from_args(**kwargs):
        """
        Build a ClickstreamDownloadLinksRequest from keyword arguments,
        inferring course_id from course_slug or partner_id from
        partner_short_name when only the latter of each pair is given.
        :param kwargs:
        :return eventing_links_request: ClickstreamDownloadLinksRequest
        """
        lookup_course = (
            kwargs.get('course_slug') and not kwargs.get('course_id'))
        lookup_partner = (
            kwargs.get('partner_short_name') and not kwargs.get('partner_id'))

        if lookup_course:
            slug = kwargs['course_slug']
            kwargs['course_id'] = utils.lookup_course_id_by_slug(slug)
        elif lookup_partner:
            short_name = kwargs['partner_short_name']
            kwargs['partner_id'] = utils.lookup_partner_id_by_short_name(
                short_name)
        elif kwargs.get('group_id'):
            logging.error(
                'Eventing exports by group is not currently supported. '
                'Please see: '
                'https://partner.coursera.help/hc/articles/360021121132'
            )
            raise ValueError('Eventing exports by group is not supported.')

        return ClickstreamDownloadLinksRequest(**kwargs)

    @property
    def scope(self):
        """
        Scope context string in the format the API expects; the course
        takes precedence over the partner. None when neither is set.
        :return scope:
        """
        candidates = (
            ('courseContext', self.course_id),
            ('partnerContext', self.partner_id),
        )
        for context_name, identifier in candidates:
            if identifier:
                return '{}~{}'.format(context_name, identifier)
        return None

    def to_url_params(self):
        """
        Parameters for the POST request; start and end dates are included
        only when an interval was supplied.
        :return:
        """
        params = {'action': 'generateLinks', 'scope': self.scope}
        if not self.interval:
            return params

        params['startDate'] = self.interval[0]
        params['endDate'] = self.interval[1]
        return params
5 | 6 | Columns 7 | coursera_user_id 8 | demographic_survey_submission_dt 9 | demographic_survey_gender 10 | demographic_survey_age 11 | demographic_survey_country_cd_of_birth 12 | demographic_survey_us_postal_code 13 | demographic_survey_spanish_hispanic_or_latino_descent 14 | demographic_survey_race 15 | demographic_survey_highest_level_of_schooling 16 | demographic_survey_currently_enrolled_in_an_educational_program 17 | demographic_survey_level_of_current_educational_program 18 | demographic_survey_subject_area_of_degree 19 | demographic_survey_current_employment_status 20 | demographic_survey_area_of_industry_currently_employed_in 21 | demographic_survey_english_proficiency 22 | demographic_survey_other_languages_spoken 23 | */ 24 | 25 | SELECT 26 | a.[demographics_user_id] 27 | ,MAX(a.submission_ts::DATE) AS demographic_survey_submission_dt 28 | ,MAX(CASE WHEN a.question_id = 11 29 | THEN c.choice_desc END) AS demographic_survey_gender 30 | ,MAX(CASE WHEN a.question_id = 12 31 | THEN DATE_PART('y', CURRENT_DATE) - a.answer_int END) AS demographic_survey_age 32 | ,UPPER(LEFT(MAX(CASE WHEN a.question_id = 13 33 | THEN c.choice_desc END), 2)) AS demographic_survey_country_cd_of_birth 34 | ,MAX(CASE WHEN a.question_id = 15 35 | THEN a.answer_int END) AS demographic_survey_us_postal_code 36 | ,MAX(CASE WHEN a.question_id = 16 37 | THEN c.choice_desc END) AS demographic_survey_spanish_hispanic_or_latino_descent 38 | ,RTRIM(STRING_AGG(CASE WHEN a.question_id = 17 THEN c.choice_desc END, ';')) AS demographic_survey_race 39 | ,MAX(CASE WHEN a.question_id = 18 40 | THEN c.choice_desc END) AS demographic_survey_highest_level_of_schooling 41 | ,MAX(CASE WHEN a.question_id = 19 42 | THEN c.choice_desc END) AS demographic_survey_currently_enrolled_in_an_educational_program 43 | ,MAX(CASE WHEN a.question_id = 20 44 | THEN c.choice_desc END) AS demographic_survey_level_of_current_educational_program 45 | ,RTRIM(STRING_AGG(CASE WHEN a.question_id = 21 46 | THEN c.choice_desc 
END, ';')) AS demographic_survey_subject_area_of_degree 47 | ,MAX(CASE WHEN a.question_id = 22 48 | THEN c.choice_desc END) AS demographic_survey_current_employment_status 49 | ,MAX(CASE WHEN a.question_id = 23 50 | THEN c.choice_desc END) AS demographic_survey_area_of_industry_currently_employed_in 51 | ,MAX(CASE WHEN a.question_id = 24 52 | THEN c.choice_desc END) AS demographic_survey_english_proficiency 53 | ,RTRIM(STRING_AGG(CASE WHEN a.question_id = 25 54 | THEN c.choice_desc END, ';')) AS demographic_survey_other_languages_spoken 55 | FROM demographics_answers a 56 | JOIN demographics_choices c USING (question_id, choice_id) 57 | WHERE a.question_id BETWEEN 11 AND 25 58 | AND a.question_id = c.question_id 59 | AND a.choice_id = c.choice_id 60 | GROUP BY 1 61 | -------------------------------------------------------------------------------- /courseraresearchexports/commands/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Coursera 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def add_logging_parser(main_parser):
    """Add the logging-related options to the main argument parser.

    Registers the mutually exclusive ``--verbose``/``-v`` and
    ``--quiet``/``-q`` counters and the ``--silence-urllib3`` flag, and
    stores :func:`set_logging_level` as the ``setup_logging`` default so
    the caller can apply the parsed options.

    :param main_parser: the top-level ``argparse.ArgumentParser``.
    :return: the mutually exclusive verbosity group.
    """

    main_parser.set_defaults(setup_logging=set_logging_level)

    verbosity_group = main_parser.add_mutually_exclusive_group(required=False)
    verbosity_group.add_argument(
        '--verbose',
        '-v',
        action='count',
        help='Output more verbose logging. Can be specified multiple times.')
    verbosity_group.add_argument(
        '--quiet',
        '-q',
        action='count',
        help='Output less information to the console during operation. Can be '
             'specified multiple times.')

    main_parser.add_argument(
        '--silence-urllib3',
        action='store_true',
        help='Silence urllib3 warnings. See '
        'https://urllib3.readthedocs.org/en/latest/security.html for details.')

    return verbosity_group


def set_logging_level(args):
    """Compute and set the logging levels from the parsed arguments.

    The root logger defaults to INFO. ``--verbose`` lowers it to DEBUG (once)
    or to level 5 (twice or more) and applies the same level to the
    ``sqlalchemy.engine`` logger; ``--quiet`` raises it to WARNING (once) or
    ERROR (twice or more).

    :param args: parsed command line namespace; ``verbose``, ``quiet`` and
        ``silence_urllib3`` are read when present.
    :raises SystemExit: with status 2 when ``verbose`` or ``quiet`` is set
        to a non-positive count.
    """
    logging.basicConfig()
    root_logger = logging.getLogger()
    level = logging.INFO
    # Keep chatty libraries quiet unless verbosity is explicitly requested.
    logging.getLogger('sqlalchemy.engine').setLevel(logging.WARNING)
    logging.getLogger('requests.packages.urllib3').setLevel(logging.WARNING)
    if "verbose" in args and args.verbose is not None:
        logging.getLogger('requests.packages.urllib3').setLevel(0)  # Unset
        if args.verbose > 1:
            level = 5  # "Trace" level
        elif args.verbose > 0:
            level = logging.DEBUG
        else:
            logging.critical("verbose is an unexpected value. {} exiting."
                             .format(args.verbose))
            sys.exit(2)
        logging.getLogger('sqlalchemy.engine').setLevel(level)
    elif "quiet" in args and args.quiet is not None:
        if args.quiet > 1:
            level = logging.ERROR
        elif args.quiet > 0:
            level = logging.WARNING
        else:
            logging.critical("quiet is an unexpected value. {} exiting."
                             .format(args.quiet))
            # Exit as the message announces, mirroring the verbose branch.
            sys.exit(2)
    root_logger.setLevel(level)

    # Tolerate namespaces built without the flag (e.g. in tests).
    if getattr(args, 'silence_urllib3', False):
        # See: https://urllib3.readthedocs.org/en/latest/security.html
        requests.packages.urllib3.disable_warnings()
def main():
    """
    Boots up the command line tool: builds the parser, enables argcomplete
    tab completion, parses the arguments, configures logging and dispatches
    to the function registered by the selected subcommand.

    :return: whatever the subcommand function returns
    :raises SystemExit: with status 1 when the subcommand raises an error
    """
    logging.captureWarnings(True)
    parser = build_parser()

    argcomplete.autocomplete(parser)

    args = parser.parse_args()
    # Configure logging
    args.setup_logging(args)
    # Dispatch into the appropriate subcommand function.
    try:
        return args.func(args)
    except Exception:
        # `Exception` leaves SystemExit and KeyboardInterrupt alone, so an
        # explicit exit or a Ctrl-C is no longer reported as a command
        # failure.
        logging.exception('Problem when running command. Sorry!')
        sys.exit(1)
Many tables log a learner's 4 | interactions in a course, and this view will aggregate key metrics for simple 5 | reporting purposes. 6 | 7 | Columns 8 | coursera_user_id 9 | course_id 10 | commenced_dt 11 | is_enrollment_active 12 | activity_first_dt 13 | activity_last_dt 14 | num_days_active 15 | is_enrollment_completed 16 | completion_dt 17 | was_paid_or_finaid 18 | */ 19 | 20 | /* 21 | Any user that reaches the LEARNER membership role in a course is considered a 22 | commenced enrollment. This excludes those users that pre-enroll in the course, 23 | and then later unenroll before the course starts. 24 | */ 25 | WITH enrollment_commenced AS ( 26 | SELECT 27 | cm.[partner_user_id] 28 | ,course_id 29 | ,MIN(course_membership_ts)::DATE AS commenced_dt 30 | FROM course_memberships AS cm 31 | WHERE 32 | course_membership_role = 'LEARNER' 33 | GROUP BY 1,2 34 | ) 35 | 36 | /* 37 | Learners' progress on course items (e.g. lectures, quizzes, etc.) is 38 | summarized in the course_progress table. Generate their "activity" metrics with 39 | aggregate functions. 40 | */ 41 | ,enrollment_progress AS ( 42 | SELECT 43 | cp.[partner_user_id] 44 | ,course_id 45 | ,MIN(course_progress_ts)::DATE AS activity_first_dt 46 | ,MAX(course_progress_ts)::DATE AS activity_last_dt 47 | ,COUNT(DISTINCT course_progress_ts::DATE) AS num_days_active 48 | FROM course_progress AS cp -- contains 'started' or 'completed' progress 49 | GROUP BY 1,2 50 | ) 51 | 52 | /* 53 | Learners who complete the course are logged by reaching one of two passing 54 | states in the course_grades table. Generate when they first pass.
55 | */ 56 | ,enrollment_completed AS ( 57 | SELECT 58 | cg.[partner_user_id] 59 | ,course_id 60 | ,MIN(course_grade_ts)::DATE AS completion_dt 61 | FROM course_grades AS cg -- contains when the learner reached the highest grade 62 | WHERE 63 | course_passing_state_id IN (1,2) -- 'passed' or 'verified passed' states 64 | GROUP BY 1,2 65 | ) 66 | 67 | /* 68 | Learners can own the course, either by payment or receiving financial aid. 69 | */ 70 | ,enrollment_ownership AS ( 71 | SELECT 72 | uccp.[partner_user_id] 73 | ,course_id 74 | ,was_payment OR was_finaid_grant AS was_paid_or_finaid 75 | FROM users_courses__certificate_payments AS uccp 76 | ) 77 | 78 | /* 79 | Combine all learner-course stats into one final table. 80 | */ 81 | SELECT 82 | ec.[partner_user_id] 83 | ,course_id 84 | ,commenced_dt 85 | ,activity_first_dt IS NOT NULL AS is_enrollment_active 86 | ,activity_first_dt 87 | ,activity_last_dt 88 | ,num_days_active 89 | ,completion_dt IS NOT NULL AS is_enrollment_completed 90 | ,completion_dt 91 | ,COALESCE(was_paid_or_finaid, FALSE) AS was_paid_or_finaid 92 | FROM enrollment_commenced AS ec 93 | LEFT JOIN enrollment_progress 94 | USING ([partner_user_id], course_id) 95 | LEFT JOIN enrollment_completed 96 | USING ([partner_user_id], course_id) 97 | LEFT JOIN enrollment_ownership 98 | USING ([partner_user_id], course_id) 99 | 100 | -------------------------------------------------------------------------------- /courseraresearchexports/exports/api.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Coursera 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
@requests_response_to_model(ExportRequestWithMetadata.from_response)
def get(export_job_id):
    """
    Fetch one data export job from Coursera's Research Export Resource.
    :param export_job_id: id of the export job, joined onto the API url
    :return export_request_with_metadata: [ExportRequestWithMetaData]
    """
    authorizer = oauth2.build_oauth2(
        app=RESEARCH_EXPORTS_APP).build_authorizer()
    job_url = requests.compat.urljoin(RESEARCH_EXPORTS_API, export_job_id)

    return requests.get(url=job_url, auth=authorizer)


@requests_response_to_model(ExportRequestWithMetadata.from_response)
def get_all():
    """
    List the data export job requests created by the user through
    Coursera's Research Exports Resource. Limited to the 100 most recent
    requests.
    :return export_requests: [ExportRequestWithMetaData]
    """
    authorizer = oauth2.build_oauth2(
        app=RESEARCH_EXPORTS_APP).build_authorizer()
    query = {'q': 'my'}

    return requests.get(
        url=RESEARCH_EXPORTS_API, auth=authorizer, params=query)


@requests_response_to_model(ExportRequestWithMetadata.from_response)
def post(export_request):
    """
    Create a data export job by posting the json form of `export_request`.
    :param export_request: object whose `to_json()` gives the request body
    :return export_request_with_metadata: [ExportRequestWithMetadata]
    """
    authorizer = oauth2.build_oauth2(
        app=RESEARCH_EXPORTS_APP).build_authorizer()
    payload = export_request.to_json()

    return requests.post(
        url=RESEARCH_EXPORTS_API, json=payload, auth=authorizer)


@requests_response_to_model(lambda response: response.json())
def get_clickstream_download_links(clickstream_download_links_request):
    """
    Request the download links for clickstream exports in a given scope;
    the decoded json body of the response is returned.
    :param clickstream_download_links_request: ClickstreamDownloadLinksRequest
    """
    authorizer = oauth2.build_oauth2(
        app=RESEARCH_EXPORTS_APP).build_authorizer()
    url_params = clickstream_download_links_request.to_url_params()

    return requests.post(
        url=CLICKSTREAM_API, params=url_params, auth=authorizer)
def extract_zip_archive(archive, dest, delete_archive=True):
    """
    Extracts a zip archive to `dest`
    :param archive: path of the zip archive to extract
    :param dest: directory the archive members are extracted into
    :param delete_archive: delete the archive after extracting
    :return dest: the destination directory
    """
    try:
        logging.debug('Extracting archive to {}'.format(dest))
        with zipfile.ZipFile(archive, 'r') as z:
            z.extractall(dest)
        # Remove only once the archive has been closed again.
        if delete_archive:
            os.remove(archive)
    except Exception:
        logging.error('Error in extracting zip archive {} to {}'.format(
            archive, dest))
        raise

    return dest


def create_tar_archive(str, name='init-user-db.sh'):
    """
    Creates tar archive to load single file as suggested by
    https://gist.github.com/zbyte64/6800eae10ce082bb78f0b7a2cca5cbc2
    :param str: text content of the file (the name shadows the builtin but
        is kept so keyword callers keep working)
    :param name: name of the single member inside the archive
    :return archive_tarstream: BytesIO with the archive, rewound to 0
    """
    archive_tarstream = BytesIO()
    file_data = str.encode('utf8')

    file_info = tarfile.TarInfo(name)
    file_info.size = len(file_data)
    file_info.mtime = time.time()

    # Leaving the `with` block closes the tar file, which writes the
    # end-of-archive blocks into the stream.
    with tarfile.TarFile(fileobj=archive_tarstream, mode='w') as archive_file:
        archive_file.addfile(file_info, BytesIO(file_data))
    archive_tarstream.seek(0)

    return archive_tarstream


def get_next_available_port(containers_info):
    """
    Find next available port to map postgres port to host: one above the
    highest `host_port` already in use, or 5433 when there is none.
    :param containers_info: iterable of objects with a `host_port`
    :return port:
    """
    ports = [container_info.host_port for container_info in containers_info]

    return (max(ports) + 1) if ports else 5433


def is_container_running(container_name, docker_client):
    """
    Check whether container is still running.
    :param container_name:
    :param docker_client: client providing `inspect_container`
    :return isRunning: Boolean
    """
    container_details = docker_client.inspect_container(container_name)

    return container_details['State']['Running']


def docker_client_arg_parser():
    """Builds an argparse parser for docker client connection flags."""
    # The subcommands that talk to docker share these options, so they are
    # centralized here and pulled in through `parents=[...]`.
    docker_parser = argparse.ArgumentParser(add_help=False)
    docker_parser.add_argument(
        '--docker-url',
        help='The url of the docker daemon.')
    docker_parser.add_argument(
        '--timeout',
        type=int,
        default=60,
        help='Set the default timeout when interacting with the docker daemon')
    return docker_parser


def docker_client(docker_url=None, timeout=60):
    """
    Attempts to create a docker client.

     - docker_url: base url for docker
     - timeout: timeout for docker client
     - returns: a docker-py client
    """
    client_kwargs = {'timeout': timeout, 'version': 'auto'}
    # Only pass base_url when one was given so that docker-py keeps its own
    # default otherwise.
    if docker_url:
        client_kwargs['base_url'] = docker_url

    return Client(**client_kwargs)
def requests_response_to_model(response_transformer):
    """
    Creates a decorator that handles HTTP errors in the response of an API
    call and transforms a successful response with `response_transformer`
    :param response_transformer: function(response) -> Any
    :return: decorator for functions that return a `requests` response
    """
    import functools  # local import: only this decorator factory needs it

    def response_transform_decorator(original_func):
        """
        Creates wrapper around a function that returns response
        """
        # wraps() keeps the name and docstring of the decorated function.
        @functools.wraps(original_func)
        def response_transformer_wrapper(*args, **kwargs):
            """
            Log errors and apply transformation in response_transformer
            """
            # Called outside the `try` so the handler below can rely on
            # `response` being bound.
            response = original_func(*args, **kwargs)
            try:
                response.raise_for_status()

            except requests.exceptions.HTTPError:
                help_string = ('Please consult the Coursera Data '
                               'Exports Guide for further assistance: '
                               'https://partner.coursera.help/hc/en-us/articles/360021121132.')  # noqa

                # 403 means the application is not authorized yet; point
                # the user at the authorization command instead.
                if response.status_code == 403:
                    help_string = ('Please authorize this application '
                                   'by running:\n'
                                   '\t$ courseraoauth2client config authorize --app manage_research_exports\n'  # noqa
                                   'See https://github.com/coursera/courseraoauth2client '  # noqa
                                   'for more information on authorization.\n'
                                   'For further assistance, consult the '
                                   'Coursera Data Exports Guide '
                                   'https://partner.coursera.help/hc/en-us/articles/360021121132.')  # noqa

                logging.error(
                    'Request to {url} with body:\n\t{body}\nreceived response'
                    ':\n\t{text}\n'
                    '{help_string}\n'
                    .format(url=response.url,
                            text=response.text,
                            body=(response.request and response.request.body),
                            help_string=help_string))
                raise

            return response_transformer(response)
        return response_transformer_wrapper
    return response_transform_decorator


@requests_response_to_model(
    lambda response: response.json()['elements'][0]['slug'])
def lookup_course_slug_by_id(course_id):
    """
    Find the course slug given a course_id
    """
    return requests.get(requests.compat.urljoin(COURSE_API, course_id))


@requests_response_to_model(
    lambda response: response.json()['elements'][0]['id'])
def lookup_course_id_by_slug(course_slug):
    """
    Find the course_id given a course_slug
    """
    payload = {'q': 'slug', 'slug': course_slug}
    return requests.get(COURSE_API, params=payload)


@requests_response_to_model(
    lambda response: int(response.json()['elements'][0]['id']))
def lookup_partner_id_by_short_name(partner_short_name):
    """
    Find the partner_id (as an int) given a partner short name
    """
    payload = {'q': 'shortName', 'shortName': partner_short_name}
    return requests.get(PARTNER_API, params=payload)


@requests_response_to_model(
    lambda response: response.json()['elements'][0]['shortName'])
def lookup_partner_short_name_by_id(partner_id):
    """
    Find the partner short name given a partner_id
    """
    return requests.get(requests.compat.urljoin(PARTNER_API, str(partner_id)))
class ExportDb:
    """
    Wrapper around a SQLAlchemy engine that points at a postgres database
    containing research export data.
    """
    def __init__(self, host_ip=None, host_port=None, db=None, **kwargs):
        # All three connection settings are mandatory; extra keyword
        # arguments are accepted and ignored.
        if not host_ip or not host_port or not db:
            raise ValueError(
                'Host IP, port and database name must be specified')

        self.host_ip = host_ip
        self.host_port = host_port
        self.db = db

        connection_url = "postgresql://{user}@{host}:{port}/{db}".format(
            user='postgres',
            host=self.host_ip,
            port=self.host_port,
            db=self.db)
        self.engine = create_engine(connection_url)

    @classmethod
    def from_container(cls, container_name, docker_client):
        """
        Build an ExportDb from the connection details that ContainerInfo
        reports for `container_name`.
        :param container_name:
        :param docker_client:
        :return:
        """
        info = ContainerInfo.from_container(container_name, docker_client)
        return cls(
            host_ip=info.host_ip,
            host_port=info.host_port,
            db=info.database_name)

    def create_view(self, name, sql_text):
        """
        Drop the view `name` if it exists and create it again from the
        given select statement.
        :param name:
        :param sql_text:
        :return:
        """
        statement_template = """
            DROP VIEW IF EXISTS {name};
            CREATE VIEW {name} AS {sql_text};
            """

        self.engine.execute(
            statement_template.format(name=name, sql_text=sql_text))

    def unload(self, query, output_filename):
        """
        Run `query` and write a header row plus every result row to a csv
        file.
        :param query:
        :param output_filename:
        :return rowcount:
        """
        result = self.engine.execute(query)
        rowcount = result.rowcount

        with open(output_filename, 'wb') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(result.keys())
            for row in result:
                encoded_row = []
                for col in row:
                    # unicode values are written as utf8 bytes
                    if isinstance(col, unicode):
                        col = col.encode('utf8')
                    encoded_row.append(col)
                writer.writerow(encoded_row)

        return rowcount

    def unload_relation(self, relation, output_filename):
        """
        Unload every row of a table or view to a csv file.
        :param relation:
        :param output_filename:
        :return rowcount:
        """
        select_all = 'SELECT * FROM {relation};'.format(relation=relation)
        return self.unload(select_all, output_filename)

    def get_columns(self, table):
        """
        Column names of `table`, as reported by the SQLAlchemy inspector.
        :param table:
        :return columns:
        """
        inspector = reflection.Inspector.from_engine(self.engine)
        names = []
        for column in inspector.get_columns(table):
            names.append(column['name'])
        return names

    @property
    def tables(self):
        """
        Table names reported by the SQLAlchemy inspector.
        """
        inspector = reflection.Inspector.from_engine(self.engine)
        return inspector.get_table_names()

    @property
    def views(self):
        """
        View names reported by the SQLAlchemy inspector.
        """
        inspector = reflection.Inspector.from_engine(self.engine)
        return inspector.get_view_names()
def download_url(url, dest_folder):
    """
    Download url to dest_folder/FILENAME, where FILENAME is the last
    part of the url path.
    :param url: link to download
    :param dest_folder: existing folder the file is written into
    :return full_filename: path of the downloaded file
    :raises requests.exceptions.HTTPError: when the server answers with an
        error status
    """
    filename = urlparse(url).path.split('/')[-1]
    full_filename = os.path.join(dest_folder, filename)
    response = requests.get(url, stream=True)
    # Fail before touching the disk rather than saving an HTTP error page
    # as if it were the export.
    response.raise_for_status()

    chunk_size = 1024 * 1024
    # The header may be missing; tqdm accepts total=None and then shows
    # progress without an estimate.
    content_length = response.headers.get('Content-length')
    total_chunks = None
    if content_length is not None:
        # Round up so a trailing partial chunk is counted as well.
        total_chunks = (int(content_length) + chunk_size - 1) // chunk_size
    logging.debug('Writing to file: {}'.format(full_filename))

    with open(full_filename, 'wb') as f:
        for data in tqdm(
                iterable=response.iter_content(chunk_size),
                total=total_chunks,
                unit='MB',
                desc=filename):
            f.write(data)
    return full_filename


def _validate(export_request):
    """
    Check that `export_request` can be downloaded. A request with a
    download link always passes; without one only a clickstream export
    that is not pending, in progress or terminated passes.
    :param export_request: request exposing export_type, download_link,
        status and id
    :raises ValueError: when the request cannot be downloaded
    """
    is_clickstream_export = \
        export_request.export_type == EXPORT_TYPE_CLICKSTREAM

    if not export_request.download_link:
        if export_request.status in ['PENDING', 'IN_PROGRESS']:
            logging.error(
                'Export request {} is currently {} and is not ready for '
                'download. Please wait until the request is completed.'
                .format(export_request.id, export_request.status))
            raise ValueError(
                'Export request is not yet ready for download')
        elif export_request.status == 'TERMINATED':
            logging.error(
                'Export request has been TERMINATED. Please contact '
                'data-support@coursera.org if we have not resolved this '
                'within 24 hours.')
            raise ValueError('Export request has been TERMINATED')
        elif is_clickstream_export:
            # We don't fill in download links for clickstream exports
            pass
        else:
            logging.error('Download link was not found.')
            raise ValueError('Download link was not found')
def connect(args):
    """
    Connect postgres shell to dockerized database.
    """
    # The docstring above doubles as the argparse help text of the command.
    client = utils.docker_client(args.docker_url, args.timeout)
    db.connect(args.container_name, docker_client=client)


def list_tables(args):
    """
    List all of the tables present in a dockerized database.
    """
    # One table name per printed row.
    client = utils.docker_client(args.docker_url, args.timeout)
    table_names = db.get_table_names(args.container_name, docker_client=client)
    rows = [[name] for name in table_names]
    print(tabulate(rows))


def list_views(args):
    """
    List all of the views present in a dockerized database.
    """
    # One view name per printed row.
    client = utils.docker_client(args.docker_url, args.timeout)
    view_names = db.get_view_names(args.container_name, docker_client=client)
    rows = [[name] for name in view_names]
    print(tabulate(rows))


def create_view(args):
    """
    Create a view from a sql query.
    """
    client = utils.docker_client(args.docker_url, args.timeout)

    # argparse requires exactly one of --view_name / --sql_file.
    if args.view_name:
        created_view = db.create_registered_view(
            args.container_name, args.view_name, client)
    elif args.sql_file:
        created_view = db.create_view_from_file(
            args.container_name, args.sql_file, client)

    logging.info('Created view {}'.format(created_view))


def unload_relation(args):
    """
    Unload a table or view to a CSV file.
    """
    client = utils.docker_client(args.docker_url, args.timeout)
    rowcount = db.unload_relation(
        args.container_name, args.dest, args.relation, client)

    logging.info('Unloaded {} rows'.format(rowcount))


def parser(subparsers):
    """Register the `db` subcommand and its nested commands on `subparsers`
    and return the `db` parser."""

    # create the parser for the db subcommand.
    parser_db = subparsers.add_parser(
        'db',
        help='Tools for interacting with dockerized database',
        parents=[utils.docker_client_arg_parser()])

    db_subparsers = parser_db.add_subparsers()

    def add_command(name, func):
        # Every db command dispatches to `func`, uses its docstring as help
        # text and takes the container name as first positional argument.
        command_parser = db_subparsers.add_parser(name, help=func.__doc__)
        command_parser.set_defaults(func=func)
        command_parser.add_argument(
            'container_name',
            help='Name of the container database.')
        return command_parser

    add_command('list_tables', list_tables)
    add_command('list_views', list_views)

    parser_create_view = add_command('create_view', create_view)
    view_source_group = parser_create_view.add_mutually_exclusive_group(
        required=True)
    view_source_group.add_argument(
        '--view_name',
        help='Name of view')
    view_source_group.add_argument(
        '--sql_file',
        help='SQL file with query.')

    parser_unload = add_command('unload_to_csv', unload_relation)
    parser_unload.add_argument(
        '--dest',
        help='Destination folder.')
    parser_unload.add_argument(
        '--relation',
        help='Table or view to export.')

    add_command('connect', connect)

    return parser_db
# Fixture values shared by the tests below.
fake_course_id = 'fake_course_id'
fake_course_slug = 'fake_course'
fake_partner_id = 1
bad_partner_id = 'bad_partner_id'
fake_partner_short_name = 'fake_partner'
fake_export_id = '1'


def _course_context_payload(**extra_fields):
    # JSON form of a request scoped to fake_course_id, plus any extra
    # top-level fields.
    payload = {
        'scope': {
            'typeName': 'courseContext',
            'definition': {
                'courseId': fake_course_id}}}
    payload.update(extra_fields)
    return payload


def test_export_request_serialize_to_json():
    request = ExportRequest(course_id=fake_course_id)

    assert request.to_json() == _course_context_payload()


def test_export_request_deserialize_from_json():
    request = ExportRequest.from_json(_course_context_payload())

    assert ExportRequest(course_id=fake_course_id) == request


def test_create_from_args():
    request = ExportRequest.from_args(course_id=fake_course_id)
    assert ExportRequest(course_id=fake_course_id) == request


@raises(ValueError)
def test_create_from_args_non_integer_partner_id():
    # A partner id that is not an integer must be rejected.
    ExportRequest.from_args(partner_id=bad_partner_id)


@patch('courseraresearchexports.models.utils.lookup_course_id_by_slug')
def test_course_id_inference(lookup_course_id_by_slug):
    lookup_course_id_by_slug.return_value = fake_course_id
    request = ExportRequest.from_args(course_slug=fake_course_slug)

    assert ExportRequest(course_id=fake_course_id) == request


@patch('courseraresearchexports.models.utils.'
       'lookup_partner_id_by_short_name')
def test_partner_id_inference(lookup_partner_id_by_short_name):
    lookup_partner_id_by_short_name.return_value = fake_partner_id
    request = ExportRequest.from_args(
        partner_short_name=fake_partner_short_name)

    assert ExportRequest(partner_id=fake_partner_id) == request


def test_scope_id():
    request = ExportRequest(course_id=fake_course_id)

    assert request.scope_id == fake_course_id


def test_schemas():
    # Only a tables export has a schema names display value.
    clickstream = ExportRequest(
        course_id=fake_course_id, export_type=EXPORT_TYPE_CLICKSTREAM)
    gradebook = ExportRequest(
        course_id=fake_course_id, export_type=EXPORT_TYPE_GRADEBOOK)
    all_tables = ExportRequest(
        course_id=fake_course_id, export_type=EXPORT_TYPE_TABLES,
        schema_names=SCHEMA_NAMES)

    assert clickstream.schema_names_display is None
    assert gradebook.schema_names_display is None
    assert all_tables.schema_names_display == 'all'


def test_export_request_with_metadata_from_export_request():
    request = ExportRequest.from_args(course_id=fake_course_id)
    with_metadata = ExportRequestWithMetadata.from_export_request(
        request, id=fake_export_id)

    assert request.course_id == with_metadata.course_id


def test_export_request_with_metadata_serialize_to_json():
    with_metadata = ExportRequestWithMetadata(
        course_id=fake_course_id, id=fake_export_id)

    assert with_metadata.to_json() == _course_context_payload(
        id=fake_export_id)


def test_export_request_with_metadata_deserialize_from_json():
    with_metadata = ExportRequestWithMetadata.from_json(
        _course_context_payload(id=fake_export_id))

    assert with_metadata == ExportRequestWithMetadata(
        course_id=fake_course_id, id=fake_export_id)
14 | 15 | import os 16 | import logging 17 | import pkg_resources 18 | import subprocess 19 | 20 | from courseraresearchexports.constants.container_constants import \ 21 | POSTGRES_DOCKER_IMAGE 22 | from courseraresearchexports.models.ContainerInfo import ContainerInfo 23 | from courseraresearchexports.models.ExportDb import ExportDb 24 | from courseraresearchexports.constants.db_constants import \ 25 | HASHED_USER_ID_COLUMN_TO_SOURCE_TABLE 26 | 27 | 28 | def replace_user_id_placeholders(export_db, sql_text): 29 | """ 30 | Replace placeholders with actual user_id column names 31 | :param export_db: 32 | :param sql_text: 33 | :return sql_text_with_inferred_columns: 34 | """ 35 | hashed_user_id_columns_dict = infer_hashed_user_id_columns(export_db) 36 | 37 | for placeholder, column_name in hashed_user_id_columns_dict.items(): 38 | sql_text = sql_text.replace(placeholder, column_name) 39 | 40 | return sql_text 41 | 42 | 43 | def infer_hashed_user_id_columns(export_db): 44 | """ 45 | Infer hashed_user_id_columns from database using known placeholders 46 | :param export_db: 47 | :return: 48 | """ 49 | hashed_user_id_columns_dict = {} 50 | 51 | for placeholder, table in HASHED_USER_ID_COLUMN_TO_SOURCE_TABLE.items(): 52 | if table in export_db.tables: 53 | columns = export_db.get_columns(table) 54 | inferred_column = infer_user_id_column(columns) 55 | if inferred_column: 56 | hashed_user_id_columns_dict[placeholder] = inferred_column 57 | 58 | return hashed_user_id_columns_dict 59 | 60 | 61 | def infer_user_id_column(columns): 62 | """ 63 | Infer partner_short_name 64 | :param columns: 65 | :return: 66 | """ 67 | return next((column for column in columns 68 | if column.endswith('user_id')), None) 69 | 70 | 71 | def connect(container_name, docker_client): 72 | """ 73 | Create psql shell to container databaise 74 | :param container_name: 75 | :param docker_client: 76 | """ 77 | container_info = ContainerInfo.from_container( 78 | container_name, docker_client) 79 | 80 | 
subprocess.call([ 81 | 'docker', 'run', '-it', '--rm', 82 | '--link', container_info.name, 83 | POSTGRES_DOCKER_IMAGE, 'psql', 84 | '-h', container_info.name, 85 | '-d', container_info.database_name, 86 | '-U', 'postgres' 87 | ], shell=False) 88 | 89 | 90 | def get_table_names(container_name, docker_client): 91 | """ 92 | Returns table names present in containerized database. 93 | :param container_name: 94 | :param docker_client: 95 | :return table_names: 96 | """ 97 | export_db = ExportDb.from_container(container_name, docker_client) 98 | 99 | return export_db.tables 100 | 101 | 102 | def get_view_names(container_name, docker_client): 103 | """ 104 | Returns view names present in containerized database. 105 | :param container_name: 106 | :param docker_client: 107 | :return table_names: 108 | """ 109 | export_db = ExportDb.from_container(container_name, docker_client) 110 | 111 | return export_db.views 112 | 113 | 114 | def unload_relation(container_name, dest, relation, docker_client): 115 | """ 116 | Unloads a table or view to a csv file. 
117 | :param container_name: 118 | :param dest_file: 119 | :param relation: 120 | :param docker_client: 121 | :return: 122 | """ 123 | if not os.path.exists(dest): 124 | logging.debug('Creating destination folder: {}'.format(dest)) 125 | os.makedirs(dest) 126 | 127 | export_db = ExportDb.from_container(container_name, docker_client) 128 | output_filename = os.path.join(dest, '{}.csv'.format(relation)) 129 | rowcount = export_db.unload_relation(relation, output_filename) 130 | return rowcount 131 | 132 | 133 | def create_registered_view(container_name, view_name, docker_client): 134 | """ 135 | Create a prepackaged view 136 | :param container_name: 137 | :param view_name: 138 | :param partner_short_name: 139 | :param docker_client: 140 | :return view_name: 141 | """ 142 | export_db = ExportDb.from_container(container_name, docker_client) 143 | 144 | sql_text = pkg_resources.resource_string( 145 | __name__.split('.')[0], 'sql/{}.sql'.format(view_name)) 146 | sql_text_with_inferred_columns = replace_user_id_placeholders( 147 | export_db, sql_text) 148 | 149 | export_db.create_view(view_name, sql_text_with_inferred_columns) 150 | 151 | return view_name 152 | 153 | 154 | def create_view_from_file(container_name, sql_file, docker_client): 155 | """ 156 | Create a view from a sql file. 
157 | :param container_name: 158 | :param sql_file: 159 | :param partner_short_name: 160 | :param docker_client: 161 | :return view_name: 162 | """ 163 | export_db = ExportDb.from_container(container_name, docker_client) 164 | 165 | with open(sql_file, 'r') as sf: 166 | sql_text = sf.read() 167 | 168 | view_name = os.path.splitext(os.path.basename(sql_file))[0] 169 | 170 | sql_text_with_inferred_columns = replace_user_id_placeholders( 171 | export_db, sql_text) 172 | 173 | export_db.create_view(view_name, sql_text_with_inferred_columns) 174 | 175 | return view_name 176 | -------------------------------------------------------------------------------- /courseraresearchexports/commands/containers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Coursera 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | 17 | import logging 18 | 19 | from tabulate import tabulate 20 | 21 | from courseraresearchexports.containers import client 22 | from courseraresearchexports.containers import utils 23 | 24 | 25 | def create_container(args): 26 | """ 27 | Create a container containing a postgres database using an export job id. 28 | Export job will be downloaded and loaded into dockerized database. 29 | Automatically starts container. 
30 | """ 31 | d = utils.docker_client(args.docker_url, args.timeout) 32 | 33 | kwargs = {} 34 | if args.container_name: 35 | kwargs['container_name'] = args.container_name 36 | if args.database_name: 37 | kwargs['database_name'] = args.database_name 38 | 39 | if args.export_request_id: 40 | container_id = client.create_from_export_request_id( 41 | args.export_request_id, docker_client=d, **kwargs) 42 | elif args.export_data_folder: 43 | container_id = client.create_from_folder( 44 | args.export_data_folder, docker_client=d, **kwargs) 45 | 46 | logging.info('Container {:.12} ready.'.format(container_id)) 47 | 48 | 49 | def list_containers(args): 50 | """ 51 | List docker containers created with Coursera data exports. 52 | """ 53 | d = utils.docker_client(args.docker_url, args.timeout) 54 | containers_info = client.list_all(docker_client=d) 55 | 56 | if containers_info: 57 | containers_info_table = [['Name', 'Container Id', 'Database', 58 | 'Created', 'Status', 'Host IP', 'Port']] 59 | 60 | for container_info in containers_info: 61 | containers_info_table.append([ 62 | container_info.name, 63 | container_info.short_id, 64 | container_info.database_name, 65 | container_info.creation_time.strftime('%c'), 66 | container_info.status, 67 | container_info.host_ip, 68 | container_info.host_port 69 | ]) 70 | 71 | print(tabulate(containers_info_table, headers='firstrow')) 72 | 73 | 74 | def start_container(args): 75 | """ 76 | Start a docker container. 77 | """ 78 | d = utils.docker_client(args.docker_url, args.timeout) 79 | client.start(args.container_name, docker_client=d) 80 | 81 | 82 | def stop_container(args): 83 | """ 84 | Stop a docker container. 85 | """ 86 | d = utils.docker_client(args.docker_url, args.timeout) 87 | client.stop(args.container_name, docker_client=d) 88 | 89 | 90 | def remove_container(args): 91 | """ 92 | Remove a docker container, stop the container 93 | before removing. 
94 | """ 95 | d = utils.docker_client(args.docker_url, args.timeout) 96 | client.remove(args.container_name, docker_client=d) 97 | 98 | 99 | def parser(subparsers): 100 | parser_containers = subparsers.add_parser( 101 | 'containers', 102 | help='Create docker container from export jobs', 103 | description='Command line tools for creating a docker container' 104 | 'containing the results of a research export. Please first ' 105 | 'authenticate with the OAuth2 client before making requests (' 106 | 'courseraoauth2client config authorize --app manage-research-exports)', 107 | epilog='Please file bugs on github at: ' 108 | 'https://github.com/coursera/courseraresearchexports/issues. If you ' 109 | 'would like to contribute to this tool\'s development, check us out ' 110 | 'at: https://github.com/coursera/courseraresarchexports', 111 | parents=[utils.docker_client_arg_parser()]) 112 | 113 | containers_subparsers = parser_containers.add_subparsers() 114 | 115 | parser_create = containers_subparsers.add_parser( 116 | 'create', 117 | help=create_container.__doc__, 118 | description=create_container.__doc__) 119 | parser_create.set_defaults(func=create_container) 120 | 121 | source_subparser = parser_create.add_mutually_exclusive_group( 122 | required=True) 123 | 124 | source_subparser.add_argument( 125 | '--export_request_id', 126 | help='Export job to download and create containers') 127 | source_subparser.add_argument( 128 | '--export_data_folder', 129 | help='Location of already downloaded export data') 130 | 131 | parser_create.add_argument( 132 | '--container_name', 133 | help='Name for docker container.') 134 | parser_create.add_argument( 135 | '--database_name', 136 | help='Name for database inside container.') 137 | 138 | parser_list = containers_subparsers.add_parser( 139 | 'list', 140 | help=list_containers.__doc__) 141 | parser_list.set_defaults(func=list_containers) 142 | 143 | parser_stop = containers_subparsers.add_parser( 144 | 'stop', 145 | 
help=stop_container.__doc__) 146 | parser_stop.add_argument( 147 | 'container_name', 148 | help='Name of the container to stop.') 149 | parser_stop.set_defaults(func=stop_container) 150 | 151 | parser_start = containers_subparsers.add_parser( 152 | 'start', 153 | help=start_container.__doc__) 154 | parser_start.add_argument( 155 | 'container_name', 156 | help='Name of the container to start.') 157 | parser_start.set_defaults(func=start_container) 158 | 159 | parser_remove = containers_subparsers.add_parser( 160 | 'remove', 161 | help=remove_container.__doc__) 162 | parser_remove.add_argument( 163 | 'container_name', 164 | help='Name of the container to remove.') 165 | parser_remove.set_defaults(func=remove_container) 166 | 167 | return parser_containers 168 | -------------------------------------------------------------------------------- /courseraresearchexports/models/ExportRequestWithMetadata.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Coursera 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 

from datetime import datetime
import time

from courseraresearchexports.models.ExportRequest import ExportRequest


class ExportRequestMetadata:
    """Metadata about the internal timings of the export request"""

    def __init__(self, created_by=None, created_at=None, started_at=None,
                 completed_at=None, snapshot_at=None, **kwargs):
        # extra keyword arguments are accepted and ignored
        self._created_by = created_by
        self._created_at = created_at
        self._started_at = started_at
        self._completed_at = completed_at
        self._snapshot_at = snapshot_at

    def to_json(self):
        """
        Serialize metadata to a json object. Unset (falsy) fields are
        omitted; datetimes are written as unix timestamps in milliseconds.
        :return json_metadata:
        """
        json_metadata = {}
        if self._created_by:
            json_metadata['createdBy'] = self._created_by
        if self._created_at:
            json_metadata['createdAt'] = datetime_to_unix_ms(self._created_at)
        if self._started_at:
            json_metadata['startedAt'] = datetime_to_unix_ms(self._started_at)
        if self._completed_at:
            json_metadata['completedAt'] = datetime_to_unix_ms(
                self._completed_at)
        if self._snapshot_at:
            json_metadata['snapshotAt'] = datetime_to_unix_ms(
                self._snapshot_at)

        return json_metadata

    @classmethod
    def from_json(cls, json_metadata):
        """
        Deserialize ExportRequestMetaData from json object.
        :param json_metadata:
        :return export_request_metadata: ExportRequestMetadata, or None when
            json_metadata is empty or None
        """
        if json_metadata:
            kwargs = {}
            if json_metadata.get('createdBy'):
                kwargs['created_by'] = json_metadata['createdBy']
            if json_metadata.get('createdAt'):
                kwargs['created_at'] = unix_ms_to_datetime(
                    json_metadata['createdAt'])
            if json_metadata.get('completedAt'):
                kwargs['completed_at'] = unix_ms_to_datetime(
                    json_metadata['completedAt'])
            if json_metadata.get('startedAt'):
                kwargs['started_at'] = unix_ms_to_datetime(
                    json_metadata['startedAt'])
            if json_metadata.get('snapshotAt'):
                kwargs['snapshot_at'] = unix_ms_to_datetime(
                    json_metadata['snapshotAt'])
            return cls(**kwargs)

        else:
            return None


class ExportRequestWithMetadata(ExportRequest):
    """
    Class representing a export request from Coursera's research data export
    service with metadata about its status.
    """

    def __init__(self, course_id=None, partner_id=None, group_id=None,
                 export_type=None, anonymity_level=None,
                 statement_of_purpose=None, schema_names=None,
                 interval=None, ignore_existing=None, id=None,
                 status=None, download_link=None, metadata=None, **kwargs):
        ExportRequest.__init__(
            self, course_id=course_id, partner_id=partner_id,
            group_id=group_id, export_type=export_type,
            anonymity_level=anonymity_level,
            statement_of_purpose=statement_of_purpose,
            schema_names=schema_names, interval=interval,
            ignore_existing=ignore_existing)
        self._id = id
        self._status = status
        self._download_link = download_link
        self._metadata = metadata

    def to_json(self):
        """
        Serialize ExportRequestWithMetadata to json object
        :return json_request:
        """
        json_request = ExportRequest.to_json(self)

        if self._id:
            json_request['id'] = self._id
        if self._status:
            json_request['status'] = self._status
        if self._download_link:
            json_request['downloadLink'] = self._download_link
        if self._metadata:
            json_request['metadata'] = self._metadata.to_json()

        return json_request

    @classmethod
    def from_export_request(cls, export_request, id=None, status=None,
                            download_link=None, metadata=None, **kwargs):
        """
        Create an object of class ExportRequestWithMetadata from an object of
        class ExportRequest.
        :param export_request: ExportRequest, parent object
        :param id:
        :param status:
        :param download_link:
        :param metadata:
        :param kwargs: accepted and ignored
        :return export_request_with_metadata: ExportRequestWithMetadata
        """
        return cls(
            course_id=export_request._course_id,
            partner_id=export_request._partner_id,
            group_id=export_request._group_id,
            export_type=export_request._export_type,
            anonymity_level=export_request._anonymity_level,
            statement_of_purpose=export_request._statement_of_purpose,
            schema_names=export_request._schema_names,
            interval=export_request._interval,
            ignore_existing=export_request._ignore_existing,
            id=id,
            status=status,
            download_link=download_link,
            metadata=metadata)

    @classmethod
    def from_json(cls, json_request):
        """
        Deserialize ExportRequestWithMetadata from json object.
        :param json_request:
        :return export_request: ExportRequestWithMetadata
        """
        export_request = ExportRequest.from_json(json_request)

        return cls.from_export_request(
            export_request=export_request,
            id=json_request.get('id'),
            status=json_request.get('status'),
            download_link=json_request.get('downloadLink'),
            metadata=ExportRequestMetadata.from_json(
                json_request.get('metadata')))

    @classmethod
    def from_response(cls, response):
        """
        Instantiate a list of ExportRequestWithMeta objects from
        API call response.
        :param response: object whose json() returns a dict with an
            'elements' list
        :return export_request_with_metadata_list: [ExportRequestWithMetadata]
        """
        return [cls.from_json(export_request)
                for export_request in response.json()['elements']]

    @property
    def id(self):
        return self._id

    @property
    def status(self):
        return self._status

    @property
    def download_link(self):
        return self._download_link

    @property
    def metadata(self):
        return self._metadata

    @property
    def created_at(self):
        # falls back to the unix epoch (local time) when no creation time
        # is known, so callers always get a datetime back
        if self._metadata and self._metadata._created_at:
            return self._metadata._created_at
        else:
            return datetime.fromtimestamp(0)


def datetime_to_unix_ms(dt):
    """Convert datetime object to timestamp in milliseconds"""
    # timetuple() carries no sub-second part, so the milliseconds are added
    # back from dt.microsecond; without this the round trip through
    # unix_ms_to_datetime truncates to whole seconds.
    return int(time.mktime(dt.timetuple())) * 1000 + dt.microsecond // 1000


def unix_ms_to_datetime(unix_ms):
    """Convert timestamp in milliseconds to datetime object"""
    return datetime.fromtimestamp(unix_ms / 1000.0)

# --------------------------------------------------------------------------------
# /courseraresearchexports/containers/client.py:
# --------------------------------------------------------------------------------
# Copyright 2016 Coursera
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use
# this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Coursera's tools for managing docker containers configured with a
postgres database.
"""

import logging
import os
import shutil
import time

from courseraresearchexports import exports
from courseraresearchexports.constants.api_constants import \
    EXPORT_TYPE_TABLES
from courseraresearchexports.constants.container_constants import \
    COURSERA_DOCKER_LABEL, COURSERA_LOCAL_FOLDER, POSTGRES_DOCKER_IMAGE, \
    POSTGRES_INIT_MSG, POSTGRES_READY_MSG
from courseraresearchexports.containers import utils as container_utils
from courseraresearchexports.exports import utils as export_utils
from courseraresearchexports.models.ContainerInfo import ContainerInfo


def list_all(docker_client):
    """
    Return all containers that have Coursera label
    :param docker_client:
    :return containers_info: [ContainerInfo]
    """
    return [ContainerInfo.from_container(container['Id'], docker_client)
            for container in docker_client.containers(
                all=True, filters={'label': COURSERA_DOCKER_LABEL})]


def start(container_name, docker_client):
    """
    Start a docker container containing a research export database. Waits
    until POSTGRES_READY_MSG appears in the container logs.
    :param container_name:
    :param docker_client:
    :raises RuntimeError: if the container stops running while waiting
    """
    try:
        logging.debug('Starting container {}...'.format(container_name))
        docker_client.start(container_name)

        # poll logs to see if database is ready to accept connections
        while POSTGRES_READY_MSG not in docker_client.logs(
                container_name, tail=4):

            logging.debug('Polling container for database connection...')
            if not container_utils.is_container_running(
                    container_name, docker_client):
                raise RuntimeError('Container failed to start.')

            time.sleep(10)

        logging.info('Started container {}.'.format(container_name))

    # Exception rather than a bare except, so Ctrl-C / SystemExit are not
    # reported as a container failure
    except Exception:
        logging.error(
            """Container failed to start, check log for errors:\n{}"""
            .format(docker_client.logs(container_name, tail=20)))
        raise


def stop(container_name, docker_client):
    """
    Stops a docker container
    """
    docker_client.stop(container_name)


def remove(container_name, docker_client):
    """
    Remove a stopped container
    """
    docker_client.remove_container(container_name)


def initialize(container_name, docker_client):
    """
    Initialize a docker container. Polls database for completion of
    entrypoint tasks.
    :param container_name:
    :param docker_client:
    :raises RuntimeError: if the container stops running while waiting
    """
    try:
        logging.info('Initializing container {}...'.format(
            container_name))

        docker_client.start(container_name)
        while POSTGRES_INIT_MSG not in docker_client.logs(
                container_name, tail=20):

            logging.debug('Polling data for entrypoint initialization...')
            if not container_utils.is_container_running(container_name,
                                                        docker_client):
                raise RuntimeError('Container initialization failed.')

            time.sleep(10)

        logging.info('Initialized container {}.'.format(container_name))

    # Exception rather than a bare except, so Ctrl-C / SystemExit are not
    # reported as an initialization failure
    except Exception:
        logging.error(
            """Container initialization failed, check log for errors:\n{}"""
            .format(docker_client.logs(container_name, tail=20)))
        logging.error(
            """If error persists, consider restarting your docker engine.""")
        raise


def create_from_folder(export_data_folder, docker_client,
                       container_name='coursera-exports',
                       database_name='coursera-exports',
                       database_password=''):
    """
    Using a folder containing a Coursera research export, create a docker
    container with the export data loaded into a data base and start the
    container
    :param export_data_folder: folder where export data/scripts is stored
    :param docker_client:
    :param container_name:
    :param database_name:
    :param database_password: when empty, the container is configured with
        POSTGRES_HOST_AUTH_METHOD=trust instead of a password
    :return container_id:
    """
    logging.debug('Creating containers from {folder}'.format(
        folder=export_data_folder))

    env = ({'POSTGRES_PASSWORD': database_password} if database_password
           else {'POSTGRES_HOST_AUTH_METHOD': 'trust'})
    create_container_args = {
        'environment': env,
        'volumes': ['/mnt/exportData'],
        'host_config': docker_client.create_host_config(
            # export data is mounted read-only
            binds=['{}:/mnt/exportData:ro'.format(export_data_folder)],
            # postgres is published on the loopback interface only
            port_bindings={
                5432: ('127.0.0.1',
                       container_utils.get_next_available_port(list_all(
                           docker_client)))
            })
    }
    container = create_postgres_container(
        docker_client, container_name, database_name, create_container_args)

    container_id = container['Id']

    # copy containers initialization script to entrypoint
    database_setup_script = """
    createdb -U {user} {db}
    cd /mnt/exportData
    psql -e -U {user} -d {db} -f setup.sql
    psql -e -U {user} -d {db} -f load.sql
    """.format(user='postgres', db=database_name)

    docker_client.put_archive(
        container_id,  # using a named argument causes NullResource error
        path='/docker-entrypoint-initdb.d/',
        data=container_utils.create_tar_archive(
            database_setup_script, name='init-user-db.sh'))

    logging.info('Created container with id: {}'.format(container_id))

    initialize(container_id, docker_client)

    return container_id


def create_postgres_container(docker_client, container_name, database_name,
                              create_container_args):
    """
    Create (but do not start) a postgres container carrying the Coursera
    label. Any existing container matching container_name is stopped and
    removed first.
    :param docker_client:
    :param container_name:
    :param database_name: stored in the 'database_name' container label
    :param create_container_args: extra keyword arguments for
        docker_client.create_container; the dict is not modified
    :return container: result of docker_client.create_container
    """
    if not docker_client.images(name=POSTGRES_DOCKER_IMAGE):
        logging.info('Downloading image: {}'.format(POSTGRES_DOCKER_IMAGE))
        docker_client.import_image(image=POSTGRES_DOCKER_IMAGE)

    for existing_container in docker_client.containers(
            all=True, filters={'name': container_name}):
        logging.info('Removing existing container with name: {}'.format(
            container_name))
        docker_client.stop(existing_container)
        docker_client.remove_container(existing_container)

    # work on a copy so the caller's dict is left untouched
    container_args = dict(create_container_args)
    container_args['image'] = POSTGRES_DOCKER_IMAGE
    container_args['name'] = container_name
    container_args['labels'] = {
        COURSERA_DOCKER_LABEL: None,
        'database_name': database_name
    }
    return docker_client.create_container(**container_args)


def create_from_export_request_id(export_request_id, docker_client,
                                  container_name=None,
                                  database_name=None,
                                  database_password=''):
    """
    Create a docker container containing the export data from a given
    export request. Container and database name will be inferred as the
    course slug or partner short name from export_request if not provided.
    :param export_request_id:
    :param docker_client:
    :param container_name:
    :param database_name:
    :param database_password:
    :return container_id:
    :raises ValueError: if the export request is not a tables export
    """
    export_request = exports.api.get(export_request_id)[0]

    if export_request.export_type != EXPORT_TYPE_TABLES:
        raise ValueError('Invalid Export Type. (Only tables exports supported.'
                         ' Given [{}])'.format(export_request.export_type))

    logging.info('Downloading export {}'.format(export_request_id))
    downloaded_files = export_utils.download(
        export_request, dest=COURSERA_LOCAL_FOLDER)
    dest = os.path.join(COURSERA_LOCAL_FOLDER, export_request_id)
    for f in downloaded_files:
        container_utils.extract_zip_archive(
            archive=f,
            dest=dest,
            delete_archive=True)

    try:
        container_id = create_from_folder(
            export_data_folder=dest,
            docker_client=docker_client,
            database_name=(database_name if database_name
                           else export_request.scope_name),
            container_name=(container_name if container_name
                            else export_request.scope_name),
            database_password=(database_password if database_password
                               else '')
        )
    finally:
        # remove the extracted export even when container creation fails,
        # so a failed run does not leave the data behind
        shutil.rmtree(dest)

    return container_id

# --------------------------------------------------------------------------------
# /courseraresearchexports/models/ExportRequest.py:
# --------------------------------------------------------------------------------
# Copyright 2016 Coursera
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from courseraresearchexports.constants.api_constants import \ 16 | ANONYMITY_LEVEL_COORDINATOR, ANONYMITY_LEVEL_ISOLATED, EXPORT_TYPE_TABLES,\ 17 | EXPORT_TYPE_CLICKSTREAM, EXPORT_TYPE_GRADEBOOK, SCHEMA_NAMES 18 | from courseraresearchexports.models import utils 19 | import re 20 | import string 21 | 22 | 23 | class ExportRequest: 24 | """ 25 | Represents a export request for Coursera's research data export 26 | service and provides methods for serialization. 27 | """ 28 | 29 | def __init__(self, course_id=None, partner_id=None, group_id=None, 30 | export_type=None, anonymity_level=None, 31 | statement_of_purpose=None, schema_names=None, 32 | interval=None, ignore_existing=None, **kwargs): 33 | self._course_id = course_id 34 | if partner_id is not None: 35 | self._partner_id = int(partner_id) 36 | else: 37 | self._partner_id = partner_id 38 | self._group_id = group_id 39 | self._export_type = export_type 40 | self._anonymity_level = anonymity_level 41 | self._statement_of_purpose = statement_of_purpose 42 | self._schema_names = schema_names 43 | self._interval = interval 44 | self._ignore_existing = ignore_existing 45 | 46 | def to_json(self): 47 | """ 48 | Serialize ExportRequest to a dictionary representing a json object. 49 | No validation is done with the exception that only specification of 50 | scope is used (course/partner/group). 
51 | :return json_request: 52 | """ 53 | json_request = {} 54 | 55 | if self._course_id: 56 | json_request['scope'] = { 57 | 'typeName': 'courseContext', 58 | 'definition': { 59 | 'courseId': self._course_id 60 | }} 61 | elif self._partner_id: 62 | json_request['scope'] = { 63 | 'typeName': 'partnerContext', 64 | 'definition': { 65 | 'partnerId': { 66 | 'maestroId': self._partner_id 67 | }}} 68 | elif self._group_id: 69 | json_request['scope'] = { 70 | 'typeName': 'groupContext', 71 | 'definition': { 72 | 'groupId': self._group_id 73 | }} 74 | if self._export_type: 75 | json_request['exportType'] = self._export_type 76 | if self._anonymity_level: 77 | json_request['anonymityLevel'] = self._anonymity_level 78 | if self._statement_of_purpose: 79 | json_request['statementOfPurpose'] = self._statement_of_purpose 80 | if self._schema_names: 81 | json_request['schemaNames'] = self._schema_names 82 | if self._interval: 83 | json_request['interval'] = { 84 | 'start': self._interval[0], 'end': self._interval[1]} 85 | if self._ignore_existing: 86 | json_request['ignoreExisting'] = self._ignore_existing 87 | 88 | return json_request 89 | 90 | @classmethod 91 | def from_args(cls, **kwargs): 92 | """ 93 | Create a ExportResource object using the parameters required. Performs 94 | course_id/partner_id inference if possible. 
95 | :param kwargs: 96 | :return export_request: ExportRequest 97 | """ 98 | if kwargs.get('course_slug') and not kwargs.get('course_id'): 99 | kwargs['course_id'] = utils.lookup_course_id_by_slug( 100 | kwargs['course_slug']) 101 | elif kwargs.get('partner_short_name') and not kwargs.get('partner_id'): 102 | kwargs['partner_id'] = utils.lookup_partner_id_by_short_name( 103 | kwargs['partner_short_name']) 104 | 105 | if kwargs.get('user_id_hashing'): 106 | if kwargs['user_id_hashing'] == 'linked': 107 | kwargs['anonymity_level'] = ANONYMITY_LEVEL_COORDINATOR 108 | elif kwargs['user_id_hashing'] == 'isolated': 109 | kwargs['anonymity_level'] = ANONYMITY_LEVEL_ISOLATED 110 | 111 | return cls(**kwargs) 112 | 113 | @classmethod 114 | def from_json(cls, json_request): 115 | """ 116 | Deserialize ExportRequest from json object. 117 | :param json_request: 118 | :return export_request: ExportRequest 119 | """ 120 | kwargs = {} 121 | request_scope = json_request['scope'] 122 | request_scope_context = request_scope['typeName'] 123 | 124 | if request_scope_context == 'courseContext': 125 | kwargs['course_id'] = request_scope['definition']['courseId'] 126 | elif request_scope_context == 'partnerContext': 127 | kwargs['partner_id'] = \ 128 | request_scope['definition']['partnerId']['maestroId'] 129 | elif request_scope_context == 'groupContext': 130 | kwargs['group_id'] = request_scope['definition']['groupId'] 131 | 132 | if json_request.get('interval'): 133 | kwargs['interval'] = [ 134 | json_request['interval']['start'], 135 | json_request['interval']['end'] 136 | ] 137 | 138 | return cls( 139 | export_type=json_request.get('exportType'), 140 | anonymity_level=json_request.get('anonymityLevel'), 141 | statement_of_purpose=json_request.get('statementOfPurpose'), 142 | schema_names=json_request.get('schemaNames'), 143 | ignore_existing=json_request.get('ignoreExisting'), 144 | **kwargs) 145 | 146 | @property 147 | def course_id(self): 148 | return self._course_id 149 | 150 | 
@property
def partner_id(self):
    """Partner id this request is scoped to (read-only)."""
    return self._partner_id


@property
def export_type(self):
    """Raw export type value of this request (read-only)."""
    return self._export_type


@property
def export_type_display(self):
    """
    Display name for the export type: 'GRADEBOOK', 'CLICKSTREAM' or
    'TABLES' for the known types, otherwise the raw export type value.
    """
    if self._export_type == EXPORT_TYPE_GRADEBOOK:
        return 'GRADEBOOK'
    elif self._export_type == EXPORT_TYPE_CLICKSTREAM:
        return 'CLICKSTREAM'
    elif self._export_type == EXPORT_TYPE_TABLES:
        return 'TABLES'
    else:
        return self._export_type


@property
def anonymity_level(self):
    """Raw anonymity level value of this request (read-only)."""
    return self._anonymity_level


@property
def formatted_anonymity_level(self):
    """
    Display name for the anonymity level: 'Linked', 'Isolated', or
    'Unknown' for any other value.
    """
    if self.anonymity_level == ANONYMITY_LEVEL_COORDINATOR:
        return 'Linked'
    elif self.anonymity_level == ANONYMITY_LEVEL_ISOLATED:
        return 'Isolated'
    else:
        return 'Unknown'


@property
def statement_of_purpose(self):
    """Statement of purpose supplied with this request (read-only)."""
    return self._statement_of_purpose


@property
def interval(self):
    """Interval of this request, or a falsy value when unset."""
    return self._interval


@property
def ignore_existing(self):
    """Value of the ignore-existing flag of this request (read-only)."""
    return self._ignore_existing


@property
def scope_context(self):
    """
    Context for this ExportRequest, assume that only one identifier for
    partner/course/group is defined.
    :return: 'COURSE', 'PARTNER', 'GROUP', or None when no id is set
    """
    if self._course_id:
        return 'COURSE'
    elif self._partner_id:
        return 'PARTNER'
    elif self._group_id:
        return 'GROUP'
    else:
        return None


@property
def scope_id(self):
    """
    Identifier for the scope, assume that only one of course/partner/group
    is defined for a valid request.
    :return scope_id: first truthy value among course, partner, group id
    """
    return self._course_id or self._partner_id or self._group_id


@property
def scope_name(self):
    """
    Human readable name for this scope context.

    Courses resolve to their slug and partners to their short name via the
    lookup helpers. When a lookup fails, a notice is printed and a fallback
    is returned instead: the course id stripped of punctuation, or the raw
    partner id. Groups always resolve to their group id.
    :return: the name, or 'UNKNOWN' when no course/partner/group id is set
    """
    if self._course_id:
        try:
            return utils.lookup_course_slug_by_id(self._course_id)
        # `except Exception` (not a bare except) so that KeyboardInterrupt
        # and SystemExit still propagate while lookups stay best-effort.
        except Exception:
            print("couldn't create human readable course name, "
                  "using alphanumeric characters of course_id")
            chars = re.escape(string.punctuation)
            return re.sub(r'['+chars+']', '', self._course_id)
    elif self._partner_id:
        try:
            return utils.lookup_partner_short_name_by_id(self._partner_id)
        except Exception:
            # The fallback really is the partner id; the message used to
            # claim the course id was being used.
            print("couldn't create human readable partner name, "
                  "using partner_id")
            return self._partner_id
    elif self._group_id:
        return self._group_id
    else:
        return 'UNKNOWN'


@property
def schema_names(self):
    """Schema names requested for this export (read-only)."""
    return self._schema_names


@property
def schema_names_display(self):
    """
    Display only property for schemas names.
    :return schemas: 'all' when every name in SCHEMA_NAMES is requested,
        a comma separated list otherwise, None when no schemas are set
    """
    if self._schema_names:
        if set(self._schema_names) == set(SCHEMA_NAMES):
            return 'all'
        else:
            return ','.join(self._schema_names)
    else:
        return None


def __eq__(self, other):
    """
    Override for internal equality checks as suggested at:
    http://stackoverflow.com/a/390640

    Two requests are equal when they are of exactly the same type and all
    of their instance attributes are equal.
    """
    if type(other) is type(self):
        return self.__dict__ == other.__dict__
    return False


def __ne__(self, other):
    """
    Inverse of __eq__. Python 2 does not derive `!=` from `__eq__`, so
    without this `a != b` would compare by identity.
    """
    return not self.__eq__(other)
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /courseraresearchexports/commands/jobs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Coursera 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
# NOTE: the docstrings of request_clickstream, request_tables, get, get_all
# and download are reused verbatim as argparse help/description text (via
# `__doc__` in the parser builders below), so they are kept short and
# user-facing; implementation notes live in `#` comments instead.


def request_clickstream(args):
    """
    Create and send an clickstream data export request with Coursera. Only
    available for data coordinators.
    """
    # Clickstream requests always use the coordinator anonymity level.
    export_request = ExportRequest.from_args(
        course_id=args.course_id,
        course_slug=args.course_slug,
        partner_id=args.partner_id,
        partner_short_name=args.partner_short_name,
        group_id=args.group_id,
        anonymity_level=ANONYMITY_LEVEL_COORDINATOR,
        statement_of_purpose=args.purpose,
        export_type=EXPORT_TYPE_CLICKSTREAM,
        interval=args.interval,
        ignore_existing=args.ignore_existing)

    # api.post returns a sequence; the first element is the created request.
    export_request_with_metadata = api.post(export_request)[0]

    logging.info('Successfully created clickstream export request {id}.'
                 .format(id=export_request_with_metadata.id))
    logging.debug('Request created with json body:\n{json}'
                  .format(json=json.dumps(
                      export_request_with_metadata.to_json(), indent=2)))


def request_tables(args):
    """
    Create and send a tables data export request with Coursera.
    """
    # The anonymity level is derived from --user_id_hashing by from_args.
    export_request = ExportRequest.from_args(
        course_id=args.course_id,
        course_slug=args.course_slug,
        partner_id=args.partner_id,
        partner_short_name=args.partner_short_name,
        group_id=args.group_id,
        user_id_hashing=args.user_id_hashing,
        statement_of_purpose=args.purpose,
        export_type=EXPORT_TYPE_TABLES,
        schema_names=args.schemas)

    export_request_with_metadata = api.post(export_request)[0]

    logging.info('Successfully created tables export request {id}.'
                 .format(id=export_request_with_metadata.id))
    logging.debug('Request created with json body:\n{json}'
                  .format(json=json.dumps(
                      export_request_with_metadata.to_json(), indent=2)))


def get(args):
    """
    Get the details and status of a data export request using a job id.
    """
    export_request = api.get(args.id)[0]

    export_request_info = [
        ['Export Job Id:', export_request.id],
        ['Export Type:', export_request.export_type_display],
        ['Status:', export_request.status],
        ['Scope Context:', export_request.scope_context],
        ['Scope Id:', export_request.scope_id],
        ['Scope Name:', export_request.scope_name],
        ['User id Hashing: ', export_request.formatted_anonymity_level],
        ['Created:', export_request.created_at.strftime('%c')]]

    # Optional rows are only shown when the request carries the value.
    if export_request.schema_names:
        export_request_info.append(
            ['Schemas:', export_request.schema_names_display])

    if export_request.download_link:
        export_request_info.append(
            ['Download Link:', export_request.download_link])

    if export_request.interval:
        export_request_info.append(
            ['Interval:', ' to '.join(export_request.interval)])

    print(tabulate(export_request_info, tablefmt="plain"))


def get_all(args):
    """
    Get the details and status of your data export requests.
    """
    export_requests = api.get_all()

    # First row is the header (see headers='firstrow' below); the requests
    # follow ordered by creation time.
    export_requests_table = [['Created', 'Request Id', 'Status', 'Type',
                              'User Id Hashing', 'Scope', 'Schemas']]
    for export_request in sorted(export_requests, key=lambda x: x.created_at):
        export_requests_table.append([
            export_request.created_at.strftime('%Y-%m-%d %H:%M'),
            export_request.id,
            export_request.status,
            export_request.export_type_display,
            export_request.formatted_anonymity_level,
            export_request.scope_id,
            export_request.schema_names_display])

    print(tabulate(export_requests_table, headers='firstrow'))


def download(args):
    """
    Download a data export job using a request id.
    """
    try:
        export_request = api.get(args.id)[0]
        dest = args.dest
        utils.download(export_request, dest)
    except Exception as err:
        # Log for the user, then re-raise so the failure is not hidden.
        logging.error('Download failed with exception:\n{}'.format(err))
        raise


def get_clickstream_links(args):
    """
    Generate links for clickstream data exports
    """
    clickstream_links_request = ClickstreamDownloadLinksRequest.from_args(
        course_id=args.course_id,
        course_slug=args.course_slug,
        partner_id=args.partner_id,
        partner_short_name=args.partner_short_name,
        group_id=args.group_id,
        interval=args.interval)

    clickstream_download_links = api.get_clickstream_download_links(
        clickstream_links_request)

    # TODO: add more descriptive information or option write to text file
    # One link per output row.
    print(tabulate(
        [[link] for link in clickstream_download_links],
        tablefmt="plain"))


def parser(subparsers):
    """
    Register the `jobs` command and its subcommands (`request`, `get_all`,
    `get`, `download`, `clickstream_download_links`) on *subparsers*.

    Each subcommand stores its handler function as the `func` default.
    :param subparsers: argparse subparsers action to add `jobs` to
    :return parser_jobs: the parser created for the `jobs` command
    """
    parser_jobs = subparsers.add_parser(
        'jobs',
        help='Get status of current/completed research export job(s)',
        description='Command line tools for requesting and reviewing the '
        'status of Coursera research data exports. Please first authenticate '
        'with the OAuth2 client before making requests (courseraoauth2client '
        'config authorize --app manage-research-exports).',
        epilog='Please file bugs on github at: '
        'https://github.com/coursera/courseraresearchexports/issues. If you '
        'would like to contribute to this tool\'s development, check us out '
        'at: https://github.com/coursera/courseraresearchexports')

    jobs_subparsers = parser_jobs.add_subparsers()

    create_request_parser(jobs_subparsers)

    parser_get_all = jobs_subparsers.add_parser(
        'get_all',
        help=get_all.__doc__,
        description=get_all.__doc__)
    parser_get_all.set_defaults(func=get_all)

    parser_get = jobs_subparsers.add_parser(
        'get',
        help=get.__doc__,
        description=get.__doc__)
    parser_get.set_defaults(func=get)

    parser_get.add_argument(
        'id',
        help='Export request ID')

    parser_download = jobs_subparsers.add_parser(
        'download',
        help=download.__doc__,
        description=download.__doc__)
    parser_download.set_defaults(func=download)

    parser_download.add_argument(
        'id',
        help='Export request ID')

    parser_download.add_argument(
        '--dest',
        default='.',
        help='Destination folder')

    parser_clickstream_links = jobs_subparsers.add_parser(
        'clickstream_download_links',
        help='Get download links for completed eventing exports.')
    parser_clickstream_links.set_defaults(func=get_clickstream_links)

    create_scope_subparser(parser_clickstream_links)

    parser_clickstream_links.add_argument(
        '--interval',
        nargs=2,
        metavar=('START', 'END'),
        help='Interval of exported clickstream data, inclusive. '
        '(i.e. 2016-08-01 2016-08-04).')

    return parser_jobs


def create_scope_subparser(parser):
    """
    Add the scope options to *parser* as a required, mutually exclusive
    group: exactly one of --course_id, --course_slug, --partner_id,
    --partner_short_name or --group_id must be given.
    :param parser: argparse parser to extend
    """
    scope_subparser = parser.add_mutually_exclusive_group(
        required=True)
    scope_subparser.add_argument(
        '--course_id',
        help='Export rows corresponding to learners within a course according '
        'to the unique id assigned by Coursera.')
    scope_subparser.add_argument(
        '--course_slug',
        help='Export rows corresponding to learners within a course according '
        'to the unique name of your course defined as the part after '
        '/learn in the course url. (e.g. machine-learning for '
        'https://www.coursera.org/learn/machine-learning).')
    scope_subparser.add_argument(
        '--partner_id',
        type=int,
        help='Export rows corresponding to learners within a partner.')
    scope_subparser.add_argument(
        '--partner_short_name',
        help='Export rows corresponding to learners within a partner.')
    scope_subparser.add_argument(
        '--group_id',
        help='Export rows corresponding to learners within a group.')


def create_request_parser(subparsers):
    """
    Register the `request` command with its `tables` and `clickstream`
    subcommands on *subparsers*.

    Both subcommands inherit the scope options and the required --purpose
    option from a shared parent parser.
    :param subparsers: argparse subparsers action to add `request` to
    """
    parser_request = subparsers.add_parser(
        'request',
        help='Create and send a data export request with Coursera.',
        description='Create and send a data export request with Coursera. '
        'Use subcommands to specify the export request type.')
    request_subparsers = parser_request.add_subparsers()

    # common arguments between schema and eventing exports
    # (add_help=False so the parent does not clash with the children's -h)
    request_args_parser = argparse.ArgumentParser(add_help=False)

    create_scope_subparser(request_args_parser)

    request_args_parser.add_argument(
        '--purpose',
        required=True,
        help='Please let us know how you plan to use the '
        'data, what types of research questions you\'re asking, who will '
        'be working with the data primarily, and with whom you plan to '
        'share it.')

    # tables subcommand
    parser_tables = request_subparsers.add_parser(
        'tables',
        help=request_tables.__doc__,
        description=request_tables.__doc__,
        parents=[request_args_parser])
    parser_tables.set_defaults(func=request_tables)

    parser_tables.add_argument(
        '--user_id_hashing',
        choices=['linked', 'isolated'],
        default='isolated',
        help='The level of user_id hashing in the data export. With \'linked\''
        ' user_id hashing, users can be identified between table schemas. '
        'With \'isolated\' user_id hashing, users have independent ids in '
        'different schemas and cannot be linked. Only data coordinators have '
        'access to \'linked\' users_ids to restrict PII.')

    parser_tables.add_argument(
        '--schemas',
        choices=SCHEMA_NAMES,
        nargs='+',
        default=SCHEMA_NAMES,
        help='Data schemas to export. Any combination of: {}. By default this '
        'will be all available schemas.'.format(
            ', '.join(SCHEMA_NAMES)))

    # clickstream subcommand
    parser_clickstream = request_subparsers.add_parser(
        'clickstream',
        help=request_clickstream.__doc__,
        description=request_clickstream.__doc__,
        parents=[request_args_parser])
    parser_clickstream.set_defaults(func=request_clickstream)

    parser_clickstream.add_argument(
        '--interval',
        nargs=2,
        metavar=('START', 'END'),
        help='Interval of clickstream data to be exported '
        '(i.e. 2016-08-01 2016-08-04). By default this will be the past day.')

    parser_clickstream.add_argument(
        '--ignore_existing',
        action='store_true',
        help='If flag is set, we will recompute clickstream data for all dates '
        'in the interval. Otherwise, previously computed days are skipped.')
23 | 24 | If you experience issues installing with `pip`, we recommend that you use the 25 | python 2.7 distribution of `Anaconda `_ and try the above 26 | command again, or use a `virtualenv `_ 27 | for installation:: 28 | 29 | virtualenv venv -p python2.7 30 | source venv/bin/activate 31 | pip install courseraresearchexports 32 | 33 | Note: the ``containers`` subcommand requires ``docker`` to already be installed 34 | on your machine. Please see the `docker installation instructions `_ for platform 35 | specific information. 36 | 37 | Refer to `Issues`_ section for additional debugging around installation. 38 | 39 | autocomplete 40 | ^^^^^^^^^^^^ 41 | 42 | To enable tab autocomplete, please install `argcomplete `_ using 43 | ``pip install argcomplete`` and execute ``activate-global-python-argcomplete``. Open a new shell and 44 | press tab for autocomplete functionality. 45 | 46 | See the argcomplete documentation for more details. 47 | 48 | Setup 49 | ----- 50 | 51 | Authorize your application using `courseraoauth2client `_:: 52 | 53 | courseraoauth2client config authorize --app manage_research_exports 54 | 55 | To use the ``containers`` functionality, a docker instance must be running. 56 | Please see the docker `getting started guide `_ 57 | for installation instructions for your platform. 58 | 59 | Upgrade 60 | ------- 61 | 62 | If you have a previously installed version of `courseraresearchexports`, execute:: 63 | 64 | pip install courseraresearchexports --upgrade 65 | 66 | This will upgrade your installation to the newest version. 67 | 68 | Command Line Interface 69 | ---------------------- 70 | 71 | The project includes a command line tool. Run:: 72 | 73 | courseraresearchexports -h 74 | 75 | for a complete list of features, flags, and documentation. Similarly, 76 | documentation for the subcommands listed below is also available (e.g.
for 77 | ``jobs``) by running:: 78 | 79 | courseraresearchexports jobs -h 80 | 81 | jobs 82 | ^^^^ 83 | Submit a research export request or retrieve the status of pending and 84 | completed export jobs. 85 | 86 | request 87 | ~~~~~~~ 88 | Creates a data export job request and returns the export request id. To create a 89 | data export request for all available tables for a course:: 90 | 91 | courseraresearchexports jobs request tables --course_id $COURSE_ID \ 92 | --purpose "testing data export" 93 | 94 | In order to know your course_id, you can take advantage 95 | of our COURSE API, putting in the appropriate course_slug. 96 | 97 | For example, 98 | if the course_slug is `developer-iot`, you can query the course_id by making the request from a logged-in browser session:: 99 | 100 | https://api.coursera.org/api/onDemandCourses.v1?q=slug&slug=developer-iot 101 | 102 | The response will be a JSON object containing an id field with the value:: 103 | 104 | iRl53_BWEeW4_wr--Yv6Aw 105 | 106 | **Note**: The course slug is the part after 107 | ``/learn`` in your course url. For ``https://www.coursera.org/learn/machine-learning``, 108 | the slug is `machine-learning` 109 | 110 | If you have a publicly available course, you can request the export using:: 111 | 112 | courseraresearchexports jobs request tables --course_slug $COURSE_SLUG \ 113 | --purpose "testing data export" 114 | 115 | Replace ``$COURSE_SLUG`` with your course slug (The course slug is the part after 116 | ``/learn`` in the url. For ``https://www.coursera.org/learn/machine-learning``, 117 | the slug is `machine-learning`). 118 | 119 | If a more limited set of data is required, you can specify which schemas are 120 | included with the export. (e.g.
for the demographics and notebooks tables):: 121 | 122 | courseraresearchexports jobs request tables --course_id $COURSE_ID \ 123 | --schemas demographics notebooks --purpose "testing data export" 124 | 125 | You can look at all the possible ways to export using:: 126 | 127 | courseraresearchexports jobs request tables -h 128 | 129 | **Recommendations** 130 | 131 | 132 | 1. Always request the specific schemas that you need by adding the `schemas` while requesting the exports. 133 | For more information on the available tables/schemas, please refer to the 134 | `Coursera Data Exports Guide `_. 135 | 136 | 2. While requesting the exports for all courses in your institution, it is recommended to use the partner level export, 137 | rather than requesting individual course level exports. You can use the command:: 138 | 139 | courseraresearchexports jobs request tables --partner_short_name $PARTNER_SHORT_NAME \ 140 | --schemas demographics notebooks --purpose "testing data export" 141 | 142 | Your partner_short_name can be found in the University Assets section of your institution setting. 143 | 144 | Note that the above command is available for only publicly available partners. 
145 | If you have your partner_id, you can request the export using:: 146 | 147 | courseraresearchexports jobs request tables --partner_id $PARTNER_ID \ 148 | --schemas demographics notebooks --purpose "testing data export" 149 | 150 | You can find your partner_id by calling the API in a logged-in browser session:: 151 | https://www.coursera.org/api/partners.v1?q=shortName&shortName=$PARTNER_SHORT_NAME 152 | 153 | If you are a data coordinator, you can request that user ids are linked between 154 | domains of the data export:: 155 | 156 | courseraresearchexports jobs request tables --course_id $COURSE_ID \ 157 | --purpose "testing data export" --user_id_hashing linked 158 | 159 | Data coordinators can also request clickstream exports:: 160 | 161 | courseraresearchexports jobs request clickstream --course_id $COURSE_ID \ 162 | --interval 2016-09-01 2016-09-02 --purpose "testing data export" 163 | 164 | By default, clickstream exports will cache results for days already exported. To ignore the cache and request exports for the entire date range, pass in the flag ``--ignore_existing``. 165 | 166 | Rate limits 167 | ~~~~~~~~~~~ 168 | We have rate limits enabled for the number of exports that can be performed. The underlying export API returns a rate limit error message, 169 | which is printed when the command fails. The error message explains why you were rate limited.
170 | 171 | get_all 172 | ~~~~~~~ 173 | Lists the details and status of all data export requests that you have made:: 174 | 175 | courseraresearchexports jobs get_all 176 | 177 | get 178 | ~~~ 179 | Retrieve the details and status of an export request:: 180 | 181 | courseraresearchexports jobs get $EXPORT_REQUEST_ID 182 | 183 | download 184 | ~~~~~~~~ 185 | Download a completed table or clickstream to your local destination:: 186 | 187 | courseraresearchexports jobs download $EXPORT_REQUEST_ID 188 | 189 | clickstream_download_links 190 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 191 | Due to the size of clickstream exports, we persist download links for completed 192 | clickstream export requests on Amazon S3. The clickstream data for each day is 193 | saved into a separate file and download links to these files can be retrieved 194 | by running:: 195 | 196 | courseraresearchexports jobs clickstream_download_links --course_id $COURSE_ID 197 | 198 | containers 199 | ^^^^^^^^^^ 200 | 201 | create 202 | ~~~~~~ 203 | Creates a docker container using the postgres image and loads export data 204 | into a postgres database on the container. To create a docker container 205 | from an export, first ``request`` an export using the ``jobs`` command. Then, 206 | using the ``$EXPORT_REQUEST_ID``, create a docker container with:: 207 | 208 | courseraresearchexports containers create --export_request_id $EXPORT_REQUEST_ID 209 | 210 | This will download the data export and load all the data into the database 211 | running on the container. This may take some time depending on the size of 212 | your export. To create a docker container with an already downloaded export 213 | (please decompress the archive first):: 214 | 215 | courseraresearchexports containers create --export_data_folder /path/to/data_export/ 216 | 217 | After creation use the ``list`` command to check the status of the 218 | container and view the container name, database name, address and port to 219 | connect to the database. 
Use the `db connect $CONTAINER_NAME` command to open 220 | a psql shell. 221 | 222 | list 223 | ~~~~ 224 | Lists the details of all the containers created by ``courseraresearchexports``:: 225 | 226 | courseraresearchexports containers list 227 | 228 | start 229 | ~~~~~ 230 | Start a container:: 231 | 232 | courseraresearchexports containers start $CONTAINER_NAME 233 | 234 | stop 235 | ~~~~ 236 | Stop a container:: 237 | 238 | courseraresearchexports containers stop $CONTAINER_NAME 239 | 240 | remove 241 | ~~~~~~ 242 | Remove a container:: 243 | 244 | courseraresearchexports containers remove $CONTAINER_NAME 245 | 246 | db 247 | ^^ 248 | 249 | connect 250 | ~~~~~~~ 251 | Open a shell to a postgres database:: 252 | 253 | courseraresearchexports db connect $CONTAINER_NAME 254 | 255 | create_view 256 | ~~~~~~~~~~~ 257 | Create a view in the postgres database. We are planning to include commonly 258 | used denormalized views as part of this project. To create one of these views 259 | (i.e. for the demographic_survey view):: 260 | 261 | courseraresearchexports db create_view $CONTAINER_NAME --view_name demographic_survey 262 | 263 | If you have your own sql script that you'd like to create as a view run:: 264 | 265 | courseraresearchexports db create_view $CONTAINER_NAME --sql_file /path/to/sql/file/new_view.sql 266 | 267 | This will create a view using the name of the file as the name of the view, in this case "new_view". 268 | 269 | Note: as `user_id` columns vary with partner and user id hashing, please refer 270 | to the exports guide for SQL formatting guidelines. 271 | 272 | unload_to_csv 273 | ~~~~~~~~~~~~~ 274 | Export a table or view to a csv file. 
For example, if the `demographic_survey` view 275 | was created in the above section, use this command to create a csv:: 276 | 277 | courseraresearchexports db unload_to_csv $CONTAINER_NAME --relation demographic_survey --dest /path/to/dest/ 278 | 279 | list_tables 280 | ~~~~~~~~~~~ 281 | List all the tables present inside a dockerized database:: 282 | 283 | courseraresearchexports db list_tables $CONTAINER_NAME 284 | 285 | list_views 286 | ~~~~~~~~~~ 287 | List all the views present inside a dockerized database:: 288 | 289 | courseraresearchexports db list_views $CONTAINER_NAME 290 | 291 | Using `courseraresearchexports` on a machine without a browser 292 | -------------------------------------------------------------- 293 | Sometimes, a browser is not available, making the OAuth flow impossible. Commonly, this occurs when users want to automate the data export process by using an external machine. 294 | 295 | To get around this, you may generate the access token initially on a machine with browser access (e.g. your laptop). The access token is serialized in your local file system at `~/.coursera/manage_research_exports_oauth2_cache.pickle`. 296 | 297 | Requests after the first can use the refresh token flow, which does not require a browser. By copying the initial pickled access token to a remote machine, that machine can continue to request updated data. 298 | 299 | 300 | 301 | Bugs / Issues / Feature Requests 302 | -------------------------------- 303 | 304 | Please use the GitHub issue tracker to document any bugs or other issues you 305 | encounter while using this tool. 306 | 307 | 308 | Developing / Contributing 309 | ------------------------- 310 | 311 | We recommend developing ``courseraresearchexports`` within a Python 312 | `virtualenv `_.
313 | To get your environment set up properly, do the following:: 314 | 315 | virtualenv venv 316 | source venv/bin/activate 317 | python setup.py develop 318 | pip install -r test_requirements.txt 319 | 320 | Tests 321 | ^^^^^ 322 | 323 | To run tests, simply run: ``nosetests``, or ``tox``. 324 | 325 | Code Style 326 | ^^^^^^^^^^ 327 | 328 | Code should conform to pep8 style requirements. To check, simply run:: 329 | 330 | pep8 courseraresearchexports tests 331 | 332 | 333 | Issues 334 | ------- 335 | If you face the following error when installing the psycopg2 package on Mac:: 336 | 337 | ld: library not found for -lssl 338 | clang: error: linker command failed with exit code 1 (use -v to see invocation) 339 | error: command 'gcc' failed with exit status 1 340 | 341 | Install the openssl package if it is not already installed:: 342 | 343 | brew install openssl 344 | export LDFLAGS="-L/usr/local/opt/openssl/lib" 345 | or 346 | export LDFLAGS=-L/usr/local/opt/openssl@3/lib 347 | 348 | --------------------------------------------------------------------------------