├── .github
│   └── workflows
│       ├── check_release.yml
│       ├── linter.yml
│       └── unittest.yml
├── .gitignore
├── .readthedocs.yaml
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── RELEASE.md
├── docs
│   ├── Makefile
│   ├── _static
│   │   ├── .gitkeep
│   │   └── jupyter_telemetry_logo.svg
│   ├── conf.py
│   ├── index.md
│   ├── make.bat
│   └── pages
│       ├── application.md
│       ├── configure.md
│       ├── schemas.md
│       └── user-guide.md
├── jupyter_telemetry
│   ├── __init__.py
│   ├── _categories.py
│   ├── _version.py
│   ├── categories.py
│   ├── conftest.py
│   ├── eventlog.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_allowed_schemas.py
│   │   ├── test_category_filtering.py
│   │   ├── test_eventlog.py
│   │   ├── test_register_schema.py
│   │   ├── test_traits.py
│   │   └── utils.py
│   └── traits.py
├── proposal
│   ├── JEP.md
│   ├── design.md
│   └── press_release.md
├── pyproject.toml
├── setup.cfg
└── setup.py

--------------------------------------------------------------------------------
/.github/workflows/check_release.yml:
--------------------------------------------------------------------------------
 1 | name: Check Release
 2 | on:
 3 |   push:
 4 |     branches: ["master"]
 5 |   pull_request:
 6 |     branches: ["*"]
 7 | 
 8 | jobs:
 9 |   check_release:
10 |     runs-on: ubuntu-latest
11 |     strategy:
12 |       matrix:
13 |         group: [check_release, link_check]
14 |     steps:
15 |       - name: Checkout
16 |         uses: actions/checkout@v2
17 |       - name: Install Python
18 |         uses: actions/setup-python@v2
19 |         with:
20 |           python-version: 3.9
21 |           architecture: "x64"
22 |       - name: Get pip cache dir
23 |         id: pip-cache
24 |         run: |
25 |           echo "::set-output name=dir::$(pip cache dir)"
26 |       - name: Cache pip
27 |         uses: actions/cache@v2
28 |         with:
29 |           path: ${{ steps.pip-cache.outputs.dir }}
30 |           key: ${{ runner.os }}-pip-${{ hashFiles('setup.cfg') }}
31 |           restore-keys: |
32 |             ${{ runner.os }}-pip-
33 |             ${{ runner.os }}-pip-
34 |       - name: Cache checked links
35 |         if: ${{ matrix.group == 'link_check' }}
36 |         uses: actions/cache@v2
37 |         with:
38 |           path: ~/.cache/pytest-link-check
39 |           key: ${{ runner.os }}-linkcheck-${{ hashFiles('**/*.md', '**/*.rst') }}-md-links
40 |           restore-keys: |
41 |             ${{ runner.os }}-linkcheck-
42 |       - name: Upgrade packaging dependencies
43 |         run: |
44 |           pip install --upgrade pip setuptools wheel --user
45 |       - name: Install Dependencies
46 |         run: |
47 |           pip install -e .
48 |       - name: Check Release
49 |         if: ${{ matrix.group == 'check_release' }}
50 |         uses: jupyter-server/jupyter_releaser/.github/actions/check-release@v1
51 |         with:
52 |           token: ${{ secrets.GITHUB_TOKEN }}
53 |       - name: Run Link Check
54 |         if: ${{ matrix.group == 'link_check' }}
55 |         uses: jupyter-server/jupyter_releaser/.github/actions/check-links@v1

--------------------------------------------------------------------------------
/.github/workflows/linter.yml:
--------------------------------------------------------------------------------
 1 | name: Linter
 2 | on:
 3 |   push:
 4 |     branches: '*'
 5 |   pull_request:
 6 |     branches: '*'
 7 | jobs:
 8 |   build:
 9 |     runs-on: ubuntu-latest
10 |     strategy:
11 |       fail-fast: false
12 |     steps:
13 |       - name: Checkout
14 |         uses: actions/checkout@v1
15 |       - name: Install Python 3.8
16 |         uses: actions/setup-python@v1
17 |         with:
18 |           python-version: 3.8
19 |           architecture: 'x64'
20 |       - name: Install the Python dependencies
21 |         run: |
22 |           pip install flake8
23 |           pip install -e .
24 | - name: Run Linter 25 | run: | 26 | python -m flake8 jupyter_telemetry -------------------------------------------------------------------------------- /.github/workflows/unittest.yml: -------------------------------------------------------------------------------- 1 | name: Jupyter Telemetry Unit Tests 2 | on: 3 | push: 4 | branches: "*" 5 | pull_request: 6 | branches: "*" 7 | jobs: 8 | build: 9 | runs-on: ${{ matrix.os }} 10 | strategy: 11 | fail-fast: false 12 | matrix: 13 | os: [ubuntu-latest, macos-latest, windows-latest] 14 | python-version: ["3.6", "3.7", "3.8", "3.9"] 15 | exclude: 16 | - os: macos-latest 17 | python-version: "3.6" 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v1 21 | - name: Install Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | architecture: "x64" 26 | - name: Install the Python dependencies 27 | run: | 28 | pip install -e ".[test]" 29 | - name: Run the tests 30 | run: | 31 | pytest jupyter_telemetry 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .DS_Store 107 | .vscode/ 108 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-20.04 11 | tools: 12 | python: "3.9" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - method: pip 22 | path: . 23 | extra_requirements: 24 | - docs 25 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | 3 | 4 | 5 | 6 | 7 | 8 | ## v0.1.0 9 | 10 | - Ensure unique loggers [#45](https://github.com/jupyter/telemetry/pull/45) 11 | - Allow overriding timestamp of event [#43](https://github.com/jupyter/telemetry/pull/43) 12 | 13 | 14 | ## v0.0.2 15 | 16 | - Fix passing list of logging handlers to EventLog via config files. 17 | Thanks to [@Zsailer](https://github.com/zsailer) via 18 | [#17](https://github.com/jupyter/telemetry/pull/17) 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Project Jupyter 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. 
Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include *.md
3 | include *requirements.txt
4 | include package.json
5 | 
6 | graft docs
7 | graft proposal

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Telemetry
 2 | 
 3 | [![Tests](https://github.com/jupyter/telemetry/workflows/Jupyter%20Telemetry%20Unit%20Tests/badge.svg)](https://github.com/jupyter/telemetry/actions?query=workflow%3A%22Jupyter+Telemetry+Unit+Tests%22)
 4 | [![codecov](https://codecov.io/gh/jupyter/telemetry/branch/master/graph/badge.svg)](https://codecov.io/gh/jupyter/telemetry)
 5 | [![Documentation Status](https://readthedocs.org/projects/jupyter-telemetry/badge/?version=latest)](https://jupyter-telemetry.readthedocs.io/en/latest/?badge=latest)
 6 | 
 7 | *Telemetry for Jupyter Applications and extensions.*
 8 | 
 9 | > Telemetry (təˈlemətrē): the process of recording and transmitting the readings of an instrument. [Oxford Dictionaries]
10 | 
11 | Jupyter Telemetry enables Jupyter Applications (e.g. Jupyter Server, Jupyter Notebook, JupyterLab, JupyterHub, etc.) to record **events**—i.e. actions by application users—and transmit them to remote (or local) destinations as **structured** data. It works with Python's standard `logging` library to handle the transmission of events, allowing users to send events to local files, over the web, etc.
12 | 
13 | ## Install
14 | 
15 | Jupyter's Telemetry library can be installed from PyPI.
16 | ```
17 | pip install jupyter_telemetry
18 | ```
19 | 
20 | ## Basic Usage
21 | 
22 | Telemetry provides a configurable traitlets object, `EventLog`, for structured event-logging in Python. It leverages Python's standard `logging` library for filtering, handling, and recording events. All events are validated (using [jsonschema](https://pypi.org/project/jsonschema/)) against registered [JSON schemas](https://json-schema.org/).
23 | 
24 | Let's look at a basic example of an `EventLog`.
25 | ```python
26 | import logging
27 | from jupyter_telemetry import EventLog
28 | 
29 | 
30 | eventlog = EventLog(
31 |     # Use logging handlers to route where events
32 |     # should be recorded.
33 |     handlers=[
34 |         logging.FileHandler('events.log')
35 |     ],
36 |     # List schemas of events that should be recorded.
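    # (Each entry below is the `$id` URI of a registered schema.)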
37 |     allowed_schemas=[
38 |         'url.to.event.schema'
39 |     ]
40 | )
41 | ```
42 | 
43 | EventLog has two configurable traits:
44 | * `handlers`: a list of Python's `logging` handlers.
45 | * `allowed_schemas`: a list of event schemas to record.
46 | 
47 | Event schemas must be registered with the `EventLog` for events to be recorded. An event schema looks something like:
48 | ```json
49 | {
50 |   "$id": "url.to.event.schema",
51 |   "title": "My Event",
52 |   "description": "All events must have a name property.",
53 |   "type": "object",
54 |   "properties": {
55 |     "name": {
56 |       "title": "Name",
57 |       "description": "Name of event",
58 |       "type": "string"
59 |     }
60 |   },
61 |   "required": ["name"],
62 |   "version": 1
63 | }
64 | ```
65 | Two fields are required:
66 | * `$id`: a valid URI to identify the schema (and possibly fetch it from a remote address).
67 | * `version`: the version of the schema.
68 | 
69 | The other fields follow standard JSON schema structure.
70 | 
71 | Schemas can be registered from a Python `dict` object, a file, or a URL. This example loads the above example schema from a file.
72 | ```python
73 | # Register the schema.
74 | eventlog.register_schema_file('schema.json')
75 | ```
76 | 
77 | Events are recorded using the `record_event` method. This method validates the event data and routes the JSON string to the Python `logging` handlers listed in the `EventLog`.
78 | ```python
79 | # Record an example event.
80 | event = {'name': 'example event'}
81 | eventlog.record_event(
82 |     schema_name='url.to.event.schema',
83 |     version=1,
84 |     event=event
85 | )
86 | ```

--------------------------------------------------------------------------------
/RELEASE.md:
--------------------------------------------------------------------------------
1 | # Release Workflow
2 | 
3 | Releases are made (and automated) using [Jupyter-releaser](https://github.com/jupyter-server/jupyter_releaser).
4 | 
5 | - [ ] Set up a fork of `jupyter-releaser` if you have not yet done so.
6 | - [ ] Run through the release process, targeting this repo and the appropriate branch.

--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

--------------------------------------------------------------------------------
/docs/_static/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jupyter/telemetry/2975fcac06cb3ffa5872258669990f488547e0c1/docs/_static/.gitkeep

--------------------------------------------------------------------------------
/docs/_static/jupyter_telemetry_logo.svg:
--------------------------------------------------------------------------------
[SVG markup omitted: the Jupyter Telemetry logo ("logo.svg", created using Figma 0.90, with the wordmark "telemetry").]

--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 | 
17 | 
18 | # -- Project information -----------------------------------------------------
19 | 
20 | project = 'telemetry'
21 | copyright = '2019, Project Jupyter'
22 | author = 'Project Jupyter'
23 | 
24 | 
25 | # -- General configuration ---------------------------------------------------
26 | 
27 | # Add any Sphinx extension module names here, as strings. They can be
28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
29 | # ones.
30 | extensions = [
31 |     "myst_parser"
32 | ]
33 | 
34 | # Add any paths that contain templates here, relative to this directory.
35 | templates_path = ['_templates']
36 | 
37 | # List of patterns, relative to source directory, that match files and
38 | # directories to ignore when looking for source files.
39 | # This pattern also affects html_static_path and html_extra_path.
40 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
41 | 
42 | 
43 | # -- Options for HTML output -------------------------------------------------
44 | 
45 | # The theme to use for HTML and HTML Help pages. See the documentation for
46 | # a list of builtin themes.
47 | #
48 | html_theme = 'pydata_sphinx_theme'
49 | 
50 | # Add any paths that contain custom static files (such as style sheets) here,
51 | # relative to this directory. They are copied after the builtin static files,
52 | # so a file named "default.css" will overwrite the builtin "default.css".
53 | html_static_path = ['_static']
54 | master_doc = 'index'
55 | html_logo = "_static/jupyter_telemetry_logo.svg"

--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | # Jupyter Telemetry
 2 | 
 3 | **Configurable event-logging for Jupyter applications and extensions.**
 4 | 
 5 | 
 6 | Telemetry provides a configurable traitlets object, EventLog, for structured event-logging in Python. It leverages Python's standard logging library for filtering, handling, and recording events. All events are validated (using jsonschema) against registered JSON schemas.
 7 | 
 8 | The most common way to use Jupyter's telemetry system is to configure the ``EventLog`` objects in Jupyter Applications (e.g. JupyterLab, Jupyter Notebook, JupyterHub). See the page "[](pages/configure.md)".
 9 | 
10 | If you're looking to add telemetry to an application that you're developing, check out the page "[](pages/application.md)".
11 | 
12 | If you're looking for client-side telemetry in Jupyter frontend applications (like JupyterLab), check out the work happening in [jupyterlab-telemetry](https://github.com/jupyterlab/jupyterlab-telemetry)!
13 | 
14 | 
15 | ## Installation
16 | 
17 | Jupyter's Telemetry library can be installed from PyPI.
18 | 
19 | ```
20 | pip install jupyter_telemetry
21 | ```
22 | 
23 | ## Table of Contents
24 | 
25 | ```{toctree}
26 | :maxdepth: 2
27 | 
28 | pages/user-guide
29 | ```

--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd

--------------------------------------------------------------------------------
/docs/pages/application.md:
--------------------------------------------------------------------------------
 1 | # Adding telemetry to an application
 2 | 
 3 | Jupyter Telemetry enables you to log events from your running application. (It's designed to work best with the traitlets `Application` object for simple configuration.) To use telemetry, begin by creating an instance of `EventLog`:
 4 | 
 5 | ```python
 6 | from jupyter_telemetry import EventLog
 7 | 
 8 | class MyApplication:
 9 | 
10 |     def __init__(self):
11 |         ...
12 |         # The arguments below configure the EventLog.
13 |         self.eventlog = EventLog(
14 |             ...
15 |             # Either pass the traits (see below) here,
16 |             # or enable users of your application to configure
17 |             # the EventLog's traits.
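            # For example (hypothetical values, for illustration only):
            #     handlers=[logging.FileHandler('events.log')],
            #     allowed_schemas={'url.to.event.schema': {'allowed_categories': []}},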
18 |         )
19 | ```
20 | 
21 | EventLog has two configurable traits:
22 | 
23 | - `handlers`: a list of Python's logging handlers that handle the recording of incoming events.
24 | - `allowed_schemas`: a dictionary of options for each schema describing what data should be collected.
25 | 
26 | Next, you'll need to register event schemas for your application. You can register schemas using the `register_schema_file` (JSON or YAML format) or `register_schema` methods.
27 | 
28 | 
29 | Once you have an instance of `EventLog` and have registered your schemas, you can use the `record_event` method to log events.
30 | 
31 | 
32 | ```python
33 | # Record an example event.
34 | event = {'name': 'example event'}
35 | self.eventlog.record_event(
36 |     schema_name='url.to.event.schema',
37 |     version=1,
38 |     event=event
39 | )
40 | ```

--------------------------------------------------------------------------------
/docs/pages/configure.md:
--------------------------------------------------------------------------------
 1 | # Using telemetry in Jupyter applications
 2 | 
 3 | Most people will use `jupyter_telemetry` to log event data from Jupyter applications (e.g. JupyterLab, Jupyter Server, JupyterHub, etc.).
 4 | 
 5 | In this case, you'll be able to record events provided by schemas within those applications. To start, you'll need to configure each application's `EventLog` object.
 6 | 
 7 | This usually means two things:
 8 | 
 9 | 1. Define a set of `logging` handlers (from Python's standard library) to tell telemetry where to send your event data (e.g. file, remote storage, etc.).
10 | 2. List the names of events to collect and the properties/categories to collect from each of those events. (See the example below for more details.)
11 | 
12 | Here is an example of a Jupyter configuration file (e.g. `jupyter_config.py`) that demonstrates how to configure an eventlog.
13 | 
14 | ```python
15 | from logging import FileHandler
16 | 
17 | # Log events to a local file on disk.
18 | handler = FileHandler('events.txt')
19 | 
20 | # Explicitly list the types of events
21 | # to record and what properties or what categories
22 | # of data to begin collecting.
23 | allowed_schemas = {
24 |     "uri.to.schema": {
25 |         "allowed_properties": ["name", "email"],
26 |         "allowed_categories": ["category.jupyter.org/user-identifier"]
27 |     }
28 | }
29 | 
30 | c.EventLog.handlers = [handler]
31 | c.EventLog.allowed_schemas = allowed_schemas
32 | ```

--------------------------------------------------------------------------------
/docs/pages/schemas.md:
--------------------------------------------------------------------------------
 1 | # Writing a schema for telemetry
 2 | 
 3 | All schemas should be valid [JSON schema](https://json-schema.org/) and can be written in YAML or JSON.
 4 | 
 5 | At a minimum, a valid Jupyter Telemetry event schema requires the following keys:
 6 | 
 7 | - `$id` : a URI to identify (and possibly locate) the schema.
 8 | - `version` : schema version.
 9 | - `title` : name of the schema.
10 | - `description` : documentation for the schema.
11 | - `properties` : attributes of the event being emitted.
12 | - `required` : list of required properties.
13 | 
14 | Each property should have the following attributes:
15 | 
16 | + `title` : name of the property.
17 | + `description` : documentation for this property.
18 | + `categories` : list of types of data being collected.
19 | 
20 | 
21 | Here is a minimal example of a valid event schema, written in YAML.
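(The same schema can be written as JSON instead; YAML is shown here for readability.)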
22 | 
23 | ```yaml
24 | $id: event.jupyter.org/example-event
25 | version: 1
26 | title: My Event
27 | description: |
28 |   All events must have a name property
29 | type: object
30 | properties:
31 |   thing:
32 |     title: Thing
33 |     categories:
34 |       - category.jupyter.org/unrestricted
35 |     description: A random thing.
36 |   user:
37 |     title: User name
38 |     categories:
39 |       - category.jupyter.org/user-identifier
40 |     description: Name of user who initiated event
41 | required:
42 |   - thing
43 |   - user
44 | ```
45 | 
46 | 
47 | 
48 | # Property Categories
49 | 
50 | Each property can be labelled with a `categories` field. This makes it easier to filter properties based on a category. We recommend that schema authors use valid URIs for these labels, e.g. something like `category.jupyter.org/unrestricted`.
51 | 
52 | Below is a list of common category labels that Jupyter Telemetry recommends using:
53 | 
54 | * `category.jupyter.org/unrestricted`
55 | * `category.jupyter.org/user-identifier`
56 | * `category.jupyter.org/user-identifiable-information`
57 | * `category.jupyter.org/action-timestamp`

--------------------------------------------------------------------------------
/docs/pages/user-guide.md:
--------------------------------------------------------------------------------
 1 | User Guide
 2 | ==========
 3 | 
 4 | ```{toctree}
 5 | :maxdepth: 1
 6 | :caption: "Table of Contents"
 7 | 
 8 | configure
 9 | schemas
10 | application
11 | ```

--------------------------------------------------------------------------------
/jupyter_telemetry/__init__.py:
--------------------------------------------------------------------------------
1 | # Increment this version when the metadata included with each event
2 | # changes.
3 | TELEMETRY_METADATA_VERSION = 1

--------------------------------------------------------------------------------
/jupyter_telemetry/_categories.py:
--------------------------------------------------------------------------------
  1 | from collections import deque
  2 | 
  3 | from jsonschema import Draft7Validator, validators
  4 | from jsonschema.exceptions import ValidationError
  5 | 
  6 | 
  7 | class ExtractCategories(ValidationError):
  8 |     """
  9 |     A special `jsonschema.ValidationError` that carries information about the
 10 |     `categories` keyword, intended to be yielded whenever a `categories` keyword
 11 |     is encountered during `jsonschema` JSON validation.
 12 | 
 13 |     The primary use case for this class is to make use of the JSON validation
 14 |     mechanism implemented by `jsonschema` to extract all categories associated
 15 |     with each property in a JSON instance based on a JSON schema. It is not
 16 |     intended to be used as an actual validation error.
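    See `extract_categories_from_event` below for the helper that drives this
    extraction.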
17 | """ 18 | 19 | def __init__(self, property, categories, *args, **kwargs): 20 | super(ValidationError, self).__init__(*args, **kwargs) 21 | self.property = property 22 | self.categories = categories 23 | 24 | 25 | def extend_with_categories(validator_class): 26 | """ 27 | Extend a `jsonschema.IValidator` class so that it yields a `_ExtractCategories` 28 | whenever a `categories` keyword is encountered during JSON validation 29 | 30 | Parameters 31 | ---------- 32 | validator_class : jsonschema.IValidator 33 | an existing validator class 34 | 35 | Returns 36 | ------- 37 | jsonschema.IValidator 38 | a new `jsonschema.IValidator` class extending the one provided 39 | 40 | Examples 41 | -------- 42 | from jsonschema import Draft7Validator 43 | 44 | 45 | CategoryExtractor = extend_with_categories(Draft7Validator) 46 | """ 47 | validate_properties = validator_class.VALIDATORS["properties"] 48 | 49 | def get_categories(validator, properties, instance, schema): 50 | for property, subschema in properties.items(): 51 | if "categories" in subschema: 52 | yield ExtractCategories(property, subschema["categories"], message=None) 53 | 54 | for error in validate_properties( 55 | validator, properties, instance, schema, 56 | ): 57 | yield error 58 | 59 | return validators.extend( 60 | validator_class, {"properties": get_categories}, 61 | ) 62 | 63 | 64 | JSONSchemaValidator = Draft7Validator 65 | CategoryExtractor = extend_with_categories(JSONSchemaValidator) 66 | 67 | 68 | # Ignore categories under any of these jsonschema keywords 69 | IGNORE_CATEGORIES_SCHEMA_KEYWORDS = { 70 | 'if', 'not', 'anyOf', 'oneOf', 'then', 'else' 71 | } 72 | 73 | 74 | def extract_categories_from_errors(errors): 75 | for e in errors: 76 | if ( 77 | isinstance(e, ExtractCategories) and 78 | not any(p in IGNORE_CATEGORIES_SCHEMA_KEYWORDS 79 | for p in e.absolute_schema_path) 80 | ): 81 | yield e 82 | else: 83 | yield from extract_categories_from_errors(e.context) 84 | 85 | 86 | def extract_categories_from_event(event, schema): 87 | """ 88 | Generate a `dict` of `_ExtractCategories` whose keys are pointers to the properties 89 | 90 | Parameters 91 | ---------- 92 | event : dict 93 | A telemetry event 94 | 95 | schema : dict 96 | A JSON schema 97 | 98 | Returns 99 | ------- 100 | dict 101 | A mapping from properties in the event to their categories. 102 | 103 | In each entry, the key is a pointer to a property in the event 104 | (in the form of a tuple) and the value is a `_ExtractCategories` 105 | containing the categories associated with that property. 106 | """ 107 | return { 108 | tuple(c.absolute_path + deque([c.property])): c 109 | for c in extract_categories_from_errors( 110 | CategoryExtractor(schema).iter_errors(event) 111 | ) 112 | } 113 | 114 | 115 | def filter_categories_from_event(event, schema, allowed_categories, allowed_properties): 116 | """ 117 | Filter properties from an event based on their categories. 118 | 119 | Only whitelisted properties and properties whose categories are allowed are kept. 120 | 121 | Parameters 122 | ---------- 123 | event : dict 124 | The input telemetry event 125 | 126 | schema : dict 127 | A JSON schema that makes use of the the `categories` keyword to 128 | specify what categories are associated with a certain property. 129 | 130 | allowed_categories : set 131 | Specify which categories are allowed 132 | 133 | allowed_properties : set 134 | Whitelist certain top level properties. 
135 | 
136 |         These properties are included in the output event even if their
137 |         categories are not all allowed.
138 | 
139 |     Returns
140 |     -------
141 |     dict
142 |         The output event after category filtering
143 | 
144 |     """
145 |     categories = extract_categories_from_event(event, schema)
146 | 
147 |     # Top-level properties without declared categories are set to null
148 |     for property in event.keys():
149 |         path = (property,)
150 |         if path not in categories:
151 |             event[property] = None
152 | 
153 |     # Allow only properties whose categories are included in allowed_categories
154 |     # and whose top-level parent is included in allowed_properties
155 |     not_allowed = (
156 |         c for p, c in categories.items()
157 |         if not (set(c.categories).issubset(allowed_categories) or
158 |                 p[0] in allowed_properties)
159 |     )
160 | 
161 |     for c in not_allowed:
162 |         # In case both a sub property and its parent, e.g. ['user', 'name'] and
163 |         # ['user'], do not have all the allowed categories and are to be removed,
164 |         # if the parent is removed first then attempting to access
165 |         # the descendent would either return None or raise an IndexError or
166 |         # KeyError. Just skip it.
167 |         try:
168 |             item = deep_get(event, c.absolute_path)
169 |         except IndexError:
170 |             continue
171 |         except KeyError:
172 |             continue
173 | 
174 |         if item is not None:
175 |             item[c.property] = None
176 | 
177 |     return event
178 | 
179 | 
180 | def deep_get(instance, path):
181 |     result = instance
182 |     while result is not None and path:
183 |         result = result[path.popleft()]
184 |     return result

--------------------------------------------------------------------------------
/jupyter_telemetry/_version.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | # Version string must appear intact for tbump versioning
 4 | __version__ = '0.2.0.dev0'
 5 | 
 6 | # Build up version_info tuple for backwards compatibility
 7 | pattern = r'(?P<major>\d+).(?P<minor>\d+).(?P<patch>\d+)(?P<rest>.*)'
 8 | match = re.match(pattern, __version__)
 9 | parts = [int(match[part]) for part in ['major', 'minor', 'patch']]
10 | if match['rest']:
11 |     parts.append(match['rest'])
12 | version_info = tuple(parts)

--------------------------------------------------------------------------------
/jupyter_telemetry/categories.py:
--------------------------------------------------------------------------------
1 | from ._categories import JSONSchemaValidator, filter_categories_from_event  # noqa

--------------------------------------------------------------------------------
/jupyter_telemetry/conftest.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jupyter/telemetry/2975fcac06cb3ffa5872258669990f488547e0c1/jupyter_telemetry/conftest.py

--------------------------------------------------------------------------------
/jupyter_telemetry/eventlog.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Emit structured, discrete events when various actions happen.
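Each event is validated against its registered JSON schema before being emitted.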
3 | """ 4 | import json 5 | import logging 6 | from datetime import datetime 7 | 8 | from pythonjsonlogger import jsonlogger 9 | try: 10 | from ruamel.yaml import YAML 11 | except ImportError as e: 12 | # check for known conda bug that prevents 13 | # pip from installing ruamel.yaml dependency 14 | try: 15 | import ruamel_yaml # noqa 16 | except ImportError: 17 | # nope, regular import error; raise original 18 | raise e 19 | else: 20 | # have conda fork ruamel_yaml, but not ruamel.yaml. 21 | # this is a bug in the ruamel_yaml conda package 22 | # mistakenly identifying itself as ruamel.yaml to pip. 23 | # conda install the 'real' ruamel.yaml to fix 24 | raise ImportError("Missing dependency ruamel.yaml. Try: `conda install ruamel.yaml`") 25 | 26 | from traitlets.config import Configurable, Config 27 | 28 | from .traits import Handlers, SchemaOptions 29 | from . import TELEMETRY_METADATA_VERSION 30 | 31 | from .categories import JSONSchemaValidator, filter_categories_from_event 32 | 33 | yaml = YAML(typ='safe') 34 | 35 | 36 | def _skip_message(record, **kwargs): 37 | """ 38 | Remove 'message' from log record. 39 | It is always emitted with 'null', and we do not want it, 40 | since we are always emitting events only 41 | """ 42 | del record['message'] 43 | return json.dumps(record, **kwargs) 44 | 45 | 46 | class EventLog(Configurable): 47 | """ 48 | Send structured events to a logging sink 49 | """ 50 | handlers = Handlers( 51 | [], 52 | allow_none=True, 53 | help="""A list of logging.Handler instances to send events to. 54 | 55 | When set to None (the default), events are discarded. 56 | """ 57 | ).tag(config=True) 58 | 59 | allowed_schemas = SchemaOptions( 60 | {}, 61 | allow_none=True, 62 | help=""" 63 | Fully qualified names of schemas to record. 64 | 65 | Each schema you want to record must be manually specified. 66 | The default, an empty list, means no events are recorded. 67 | """ 68 | ).tag(config=True) 69 | 70 | def __init__(self, *args, **kwargs): 71 | # We need to initialize the configurable before 72 | # adding the logging handlers. 73 | super().__init__(*args, **kwargs) 74 | # Use a unique name for the logger so that multiple instances of EventLog do not write 75 | # to each other's handlers. 76 | log_name = __name__ + '.' + str(id(self)) 77 | self.log = logging.getLogger(log_name) 78 | # We don't want events to show up in the default logs 79 | self.log.propagate = False 80 | # We will use log.info to emit 81 | self.log.setLevel(logging.INFO) 82 | self.schemas = {} 83 | # Add each handler to the logger and format the handlers. 84 | if self.handlers: 85 | formatter = jsonlogger.JsonFormatter(json_serializer=_skip_message) 86 | for handler in self.handlers: 87 | handler.setFormatter(formatter) 88 | self.log.addHandler(handler) 89 | 90 | def _load_config(self, cfg, section_names=None, traits=None): 91 | """Load EventLog traits from a Config object, patching the 92 | handlers trait in the Config object to avoid deepcopy errors. 93 | """ 94 | my_cfg = self._find_my_config(cfg) 95 | handlers = my_cfg.pop("handlers", []) 96 | 97 | # Turn handlers list into a pickeable function 98 | def get_handlers(): 99 | return handlers 100 | 101 | my_cfg["handlers"] = get_handlers 102 | 103 | # Build a new eventlog config object. 
104 |         eventlog_cfg = Config({"EventLog": my_cfg})
105 |         super(EventLog, self)._load_config(eventlog_cfg, section_names=None, traits=None)
106 | 
107 |     def register_schema_file(self, filename):
108 |         """
109 |         Convenience function for registering a JSON schema from a filepath
110 | 
111 |         Supports both JSON & YAML files.
112 | 
113 |         Parameters
114 |         ----------
115 |         filename: str, path object or file-like object
116 |             Path to the schema file or a file object to register.
117 |         """
118 |         # Just use YAML loader for everything, since all valid JSON is valid YAML
119 | 
120 |         # check if input is a file-like object
121 |         if hasattr(filename, 'read') and hasattr(filename, 'write'):
122 |             self.register_schema(yaml.load(filename))
123 |         else:
124 |             with open(filename) as f:
125 |                 self.register_schema(yaml.load(f))
126 | 
127 |     def register_schema(self, schema):
128 |         """
129 |         Register a given JSON Schema with this event emitter
130 | 
131 |         'version' and '$id' are required fields.
132 |         """
133 |         # Check if our schema itself is valid
134 |         # This throws an exception if it isn't valid
135 |         JSONSchemaValidator.check_schema(schema)
136 | 
137 |         # Check that the properties we require are present
138 |         required_schema_fields = {'$id', 'version', 'properties'}
139 |         for rsf in required_schema_fields:
140 |             if rsf not in schema:
141 |                 raise ValueError(
142 |                     '{} is required in schema specification'.format(rsf)
143 |                 )
144 | 
145 |         if (schema['$id'], schema['version']) in self.schemas:
146 |             raise ValueError(
147 |                 'Schema {} version {} has already been registered.'.format(
148 |                     schema['$id'], schema['version']
149 |                 )
150 |             )
151 | 
152 |         for p, attrs in schema['properties'].items():
153 |             if p.startswith('__'):
154 |                 raise ValueError(
155 |                     'Schema {} has properties beginning with __, which is not allowed'.format(schema['$id'])
156 |                 )
157 | 
158 |             # Validate "categories" property in proposed schema.
159 |             try:
160 |                 cats = attrs['categories']
161 |                 # Categories must be a list.
162 |                 if not isinstance(cats, list):
163 |                     raise ValueError(
164 |                         'The "categories" field in a registered schema must be a list.'
165 |                     )
166 |             except KeyError:
167 |                 raise KeyError(
168 |                     'All properties must have a "categories" field that describes '
169 |                     'the type of data being collected. The "{}" property does not '
170 |                     'have a category field.'.format(p)
171 |                 )
172 | 
173 |         self.schemas[(schema['$id'], schema['version'])] = schema
174 | 
175 |     def get_allowed_properties(self, schema_name):
176 |         """Get the allowed properties for an allowed schema."""
177 |         config = self.allowed_schemas[schema_name]
178 |         try:
179 |             return set(config["allowed_properties"])
180 |         except KeyError:
181 |             return set()
182 | 
183 |     def get_allowed_categories(self, schema_name):
184 |         """
185 |         Return a set of allowed categories for a given schema
186 |         from the EventLog's config.
187 |         """
188 |         config = self.allowed_schemas[schema_name]
189 |         try:
190 |             allowed_categories = config["allowed_categories"]
191 |             allowed_categories.append("unrestricted")
192 |             return set(allowed_categories)
193 |         except KeyError:
194 |             return {"unrestricted"}
195 | 
196 |     def record_event(self, schema_name, version, event, timestamp_override=None):
197 |         """
198 |         Record that an event with the given schema has occurred.
199 | 
200 |         Parameters
201 |         ----------
202 |         schema_name: str
203 |             Name of the schema
204 |         version: str
205 |             The schema version
206 |         event: dict
207 |             The event to record
208 |         timestamp_override: datetime, optional
209 |             Optionally override the event timestamp.
By default it is set to the current timestamp. 210 | 211 | Returns 212 | ------- 213 | dict 214 | The recorded event data 215 | """ 216 | if not (self.handlers and schema_name in self.allowed_schemas): 217 | # if handler isn't set up or schema is not explicitly whitelisted, 218 | # don't do anything 219 | return 220 | 221 | if (schema_name, version) not in self.schemas: 222 | raise ValueError('Schema {schema_name} version {version} not registered'.format( 223 | schema_name=schema_name, version=version 224 | )) 225 | 226 | schema = self.schemas[(schema_name, version)] 227 | 228 | # Validate the event data. 229 | JSONSchemaValidator(schema).validate(event) 230 | 231 | # Generate the empty event capsule. 232 | if timestamp_override is None: 233 | timestamp = datetime.utcnow() 234 | else: 235 | timestamp = timestamp_override 236 | capsule = { 237 | '__timestamp__': timestamp.isoformat() + 'Z', 238 | '__schema__': schema_name, 239 | '__schema_version__': version, 240 | '__metadata_version__': TELEMETRY_METADATA_VERSION, 241 | } 242 | 243 | # Filter properties in the incoming event based on the 244 | # allowed categories and properties from the eventlog config. 245 | allowed_categories = self.get_allowed_categories(schema_name) 246 | allowed_properties = self.get_allowed_properties(schema_name) 247 | 248 | filtered_event = filter_categories_from_event( 249 | event, schema, allowed_categories, allowed_properties 250 | ) 251 | capsule.update(filtered_event) 252 | 253 | self.log.info(capsule) 254 | return capsule 255 | -------------------------------------------------------------------------------- /jupyter_telemetry/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jupyter/telemetry/2975fcac06cb3ffa5872258669990f488547e0c1/jupyter_telemetry/tests/__init__.py -------------------------------------------------------------------------------- /jupyter_telemetry/tests/test_allowed_schemas.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent as _ 2 | from ruamel.yaml import YAML 3 | 4 | from jupyter_telemetry.eventlog import EventLog 5 | 6 | import pytest 7 | 8 | from .utils import get_event_data 9 | 10 | 11 | SCHEMA_ID = "test.event" 12 | VERSION = 1 13 | 14 | 15 | @pytest.fixture 16 | def schema(): 17 | return { 18 | '$id': SCHEMA_ID, 19 | 'title': 'Test Event', 20 | 'version': VERSION, 21 | 'description': 'Test Event.', 22 | 'type': 'object', 23 | 'properties': { 24 | 'nothing-exciting': { 25 | 'description': 'a property with nothing exciting happening', 26 | 'categories': ['unrestricted'], 27 | 'type': 'string' 28 | }, 29 | 'id': { 30 | 'description': 'user ID', 31 | 'categories': ['user-identifier'], 32 | 'type': 'string' 33 | }, 34 | 'email': { 35 | 'description': 'email address', 36 | 'categories': ['user-identifiable-information'], 37 | 'type': 'string' 38 | }, 39 | } 40 | } 41 | 42 | 43 | def test_raised_exception_for_nonlist_categories(): 44 | # Bad schema in yaml form. 
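    # ('categories' below is a bare string rather than a list, so registration
    # is expected to fail.)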
45 | yaml_schema = _("""\ 46 | $id: test.schema 47 | title: Test Event 48 | version: 1 49 | type: object 50 | properties: 51 | test_property: 52 | description: testing a property 53 | categories: user-identifier 54 | type: string 55 | """) 56 | yaml = YAML(typ='safe') 57 | schema = yaml.load(yaml_schema) 58 | 59 | # Register schema with an EventLog 60 | e = EventLog( 61 | allowed_schemas={ 62 | SCHEMA_ID: { 63 | "allowed_categories": ["user-identifier"] 64 | } 65 | }, 66 | ) 67 | 68 | # This schema does not have categories as a list. 69 | with pytest.raises(ValueError) as err: 70 | e.register_schema(schema) 71 | # Verify that the error message is the expected error message. 72 | assert 'must be a list.' in str(err.value) 73 | 74 | 75 | def test_missing_categories_label(): 76 | # Bad schema in yaml form. 77 | yaml_schema = _("""\ 78 | $id: test.schema 79 | title: Test Event 80 | version: 1 81 | type: object 82 | properties: 83 | test_property: 84 | description: testing a property 85 | type: string 86 | """) 87 | yaml = YAML(typ='safe') 88 | schema = yaml.load(yaml_schema) 89 | 90 | # Register schema with an EventLog 91 | e = EventLog( 92 | allowed_schemas={ 93 | SCHEMA_ID: { 94 | "allowed_categories": ["random-category"] 95 | } 96 | } 97 | ) 98 | 99 | # This schema does not have categories as a list. 100 | with pytest.raises(KeyError) as err: 101 | e.register_schema(schema) 102 | # Verify that the error message is the expected error message. 103 | assert 'All properties must have a "categories"' in str(err.value) 104 | 105 | 106 | EVENT_DATA = { 107 | 'nothing-exciting': 'hello, world', 108 | 'id': 'test id', 109 | 'email': 'test@testemail.com', 110 | } 111 | 112 | 113 | @pytest.mark.parametrize( 114 | 'allowed_schemas,expected_output', 115 | [ 116 | ( 117 | # User configuration for allowed_schemas 118 | {SCHEMA_ID: {"allowed_categories": []}}, 119 | # Expected properties in the recorded event 120 | { 121 | 'nothing-exciting': 'hello, world', 122 | 'id': None, 123 | 'email': None, 124 | } 125 | ), 126 | ( 127 | # User configuration for allowed_schemas 128 | {SCHEMA_ID: {"allowed_categories": ["unrestricted"]}}, 129 | # Expected properties in the recorded event 130 | { 131 | 'nothing-exciting': 'hello, world', 132 | 'id': None, 133 | 'email': None, 134 | } 135 | ), 136 | ( 137 | # User configuration for allowed_schemas 138 | {SCHEMA_ID: {"allowed_categories": ["user-identifier"]}}, 139 | # Expected properties in the recorded event 140 | { 141 | 'nothing-exciting': 'hello, world', 142 | 'id': 'test id', 143 | 'email': None, 144 | } 145 | ), 146 | ( 147 | # User configuration for allowed_schemas 148 | {SCHEMA_ID: {"allowed_categories": ["user-identifiable-information"]}}, 149 | # Expected properties in the recorded event 150 | { 151 | 'nothing-exciting': 'hello, world', 152 | 'id': None, 153 | 'email': 'test@testemail.com', 154 | } 155 | ), 156 | ( 157 | # User configuration for allowed_schemas 158 | { 159 | SCHEMA_ID: { 160 | "allowed_categories": [ 161 | "user-identifier", 162 | "user-identifiable-information" 163 | ] 164 | } 165 | }, 166 | # Expected properties in the recorded event 167 | { 168 | 'nothing-exciting': 'hello, world', 169 | 'id': 'test id', 170 | 'email': 'test@testemail.com', 171 | } 172 | ), 173 | ( 174 | # User configuration for allowed_schemas 175 | {SCHEMA_ID: {"allowed_properties": ["id"]}}, 176 | # Expected properties in the recorded event 177 | { 178 | 'nothing-exciting': 'hello, world', 179 | 'id': 'test id', 180 | 'email': None, 181 | } 182 | ), 183 | ( 184 | # User 
configuration for allowed_schemas 185 | { 186 | SCHEMA_ID: { 187 | "allowed_properties": ["id"], 188 | "allowed_categories": ["user-identifiable-information"], 189 | } 190 | }, 191 | # Expected properties in the recorded event 192 | { 193 | 'nothing-exciting': 'hello, world', 194 | 'id': 'test id', 195 | 'email': 'test@testemail.com', 196 | } 197 | ), 198 | ] 199 | ) 200 | def test_allowed_schemas(schema, allowed_schemas, expected_output): 201 | event_data = get_event_data( 202 | EVENT_DATA, 203 | schema, 204 | SCHEMA_ID, 205 | VERSION, 206 | allowed_schemas 207 | ) 208 | 209 | # Verify that *exactly* the right properties are recorded. 210 | assert expected_output == event_data 211 | -------------------------------------------------------------------------------- /jupyter_telemetry/tests/test_category_filtering.py: -------------------------------------------------------------------------------- 1 | from itertools import product 2 | 3 | import pytest 4 | 5 | from .utils import get_event_data 6 | 7 | 8 | SCHEMA_ID = 'test.event' 9 | VERSION = 1 10 | 11 | 12 | NESTED_CATEGORY_SCHEMA = { 13 | '$id': SCHEMA_ID, 14 | 'title': 'Test Event', 15 | 'version': VERSION, 16 | 'description': 'Test Event.', 17 | 'type': 'object', 18 | 'properties': { 19 | 'nothing-exciting': { 20 | 'description': 'a property with nothing exciting happening', 21 | 'categories': ['unrestricted'], 22 | 'type': 'string' 23 | }, 24 | 'user': { 25 | 'description': 'user', 26 | 'categories': ['user-identifier'], 27 | 'type': 'object', 28 | 'properties': { 29 | 'email': { 30 | 'description': 'email address', 31 | 'categories': ['user-identifiable-information'], 32 | 'type': 'string' 33 | }, 34 | 'id': { 35 | 'description': 'user ID', 36 | 'type': 'string' 37 | } 38 | } 39 | } 40 | } 41 | } 42 | 43 | 44 | NESTED_EVENT_DATA = { 45 | 'nothing-exciting': 'hello, world', 46 | 'user': { 47 | 'id': 'test id', 48 | 'email': 'test@testemail.com', 49 | } 50 | } 51 | 52 | 53 | NESTED_CATEGORY_TEST_CASES = [ 54 | ( 55 | # User configuration for allowed_schemas 56 | {SCHEMA_ID: {'allowed_categories': []}}, 57 | # Expected properties in the recorded event 58 | { 59 | 'nothing-exciting': 'hello, world', 60 | 'user': None 61 | } 62 | ), 63 | ( 64 | # User configuration for allowed_schemas 65 | {SCHEMA_ID: {'allowed_categories': ['unrestricted']}}, 66 | # Expected properties in the recorded event 67 | { 68 | 'nothing-exciting': 'hello, world', 69 | 'user': None 70 | } 71 | ), 72 | ( 73 | # User configuration for allowed_schemas 74 | {SCHEMA_ID: {'allowed_categories': ['user-identifier']}}, 75 | # Expected properties in the recorded event 76 | { 77 | 'nothing-exciting': 'hello, world', 78 | 'user': { 79 | 'id': 'test id', 80 | 'email': None 81 | } 82 | } 83 | ), 84 | ( 85 | # User configuration for allowed_schemas 86 | {SCHEMA_ID: {'allowed_categories': ['user-identifiable-information']}}, 87 | # Expected properties in the recorded event 88 | { 89 | 'nothing-exciting': 'hello, world', 90 | 'user': None 91 | } 92 | ), 93 | ( 94 | # User configuration for allowed_schemas 95 | { 96 | SCHEMA_ID: { 97 | 'allowed_categories': [ 98 | 'user-identifier', 99 | 'user-identifiable-information' 100 | ] 101 | } 102 | }, 103 | # Expected properties in the recorded event 104 | { 105 | 'nothing-exciting': 'hello, world', 106 | 'user': { 107 | 'id': 'test id', 108 | 'email': 'test@testemail.com', 109 | } 110 | } 111 | ), 112 | ( 113 | # User configuration for allowed_schemas 114 | {SCHEMA_ID: {'allowed_properties': ['user']}}, 115 | # Expected properties in the 
recorded event 116 | { 117 | 'nothing-exciting': 'hello, world', 118 | 'user': { 119 | 'id': 'test id', 120 | 'email': 'test@testemail.com', 121 | } 122 | } 123 | ), 124 | ] 125 | 126 | 127 | @pytest.mark.parametrize( 128 | 'allowed_schemas,expected_output', NESTED_CATEGORY_TEST_CASES 129 | ) 130 | def test_category_filtering(allowed_schemas, expected_output): 131 | event_data = get_event_data( 132 | NESTED_EVENT_DATA, 133 | NESTED_CATEGORY_SCHEMA, 134 | SCHEMA_ID, 135 | VERSION, 136 | allowed_schemas 137 | ) 138 | 139 | # Verify that *exactly* the right properties are recorded. 140 | assert expected_output == event_data 141 | 142 | 143 | NESTED_CATEGORY_ARRAY_SCHEMA = { 144 | '$id': SCHEMA_ID, 145 | 'title': 'Test Event', 146 | 'version': VERSION, 147 | 'description': 'Test Event.', 148 | 'type': 'object', 149 | 'properties': { 150 | 'nothing-exciting': { 151 | 'description': 'a property with nothing exciting happening', 152 | 'categories': ['unrestricted'], 153 | 'type': 'string' 154 | }, 155 | 'users': { 156 | 'description': 'user', 157 | 'categories': ['user-identifier'], 158 | 'type': 'array', 159 | 'items': { 160 | 'properties': { 161 | 'email': { 162 | 'description': 'email address', 163 | 'categories': ['user-identifiable-information'], 164 | 'type': 'string' 165 | }, 166 | 'id': { 167 | 'description': 'user ID', 168 | 'type': 'string' 169 | } 170 | } 171 | } 172 | } 173 | } 174 | } 175 | 176 | 177 | ARRAY_EVENT_DATA = { 178 | 'nothing-exciting': 'hello, world', 179 | 'users': [ 180 | { 181 | 'id': 'test id 0', 182 | 'email': 'test0@testemail.com', 183 | }, 184 | { 185 | 'id': 'test id 1', 186 | 'email': 'test1@testemail.com', 187 | } 188 | ] 189 | } 190 | 191 | 192 | @pytest.mark.parametrize( 193 | 'allowed_schemas,expected_output', 194 | [ 195 | ( 196 | # User configuration for allowed_schemas 197 | {SCHEMA_ID: {'allowed_categories': []}}, 198 | # Expected properties in the recorded event 199 | { 200 | 'nothing-exciting': 'hello, world', 201 | 'users': None 202 | } 203 | ), 204 | ( 205 | # User configuration for allowed_schemas 206 | {SCHEMA_ID: {'allowed_categories': ['unrestricted']}}, 207 | # Expected properties in the recorded event 208 | { 209 | 'nothing-exciting': 'hello, world', 210 | 'users': None 211 | } 212 | ), 213 | ( 214 | # User configuration for allowed_schemas 215 | {SCHEMA_ID: {'allowed_categories': ['user-identifier']}}, 216 | # Expected properties in the recorded event 217 | { 218 | 'nothing-exciting': 'hello, world', 219 | 'users': [ 220 | { 221 | 'id': 'test id 0', 222 | 'email': None, 223 | }, 224 | { 225 | 'id': 'test id 1', 226 | 'email': None, 227 | } 228 | ] 229 | } 230 | ), 231 | ( 232 | # User configuration for allowed_schemas 233 | {SCHEMA_ID: {'allowed_categories': ['user-identifiable-information']}}, 234 | # Expected properties in the recorded event 235 | { 236 | 'nothing-exciting': 'hello, world', 237 | 'users': None 238 | } 239 | ), 240 | ( 241 | # User configuration for allowed_schemas 242 | { 243 | SCHEMA_ID: { 244 | 'allowed_categories': [ 245 | 'user-identifier', 246 | 'user-identifiable-information' 247 | ] 248 | } 249 | }, 250 | # Expected properties in the recorded event 251 | { 252 | 'nothing-exciting': 'hello, world', 253 | 'users': [ 254 | { 255 | 'id': 'test id 0', 256 | 'email': 'test0@testemail.com', 257 | }, 258 | { 259 | 'id': 'test id 1', 260 | 'email': 'test1@testemail.com', 261 | } 262 | ] 263 | } 264 | ), 265 | ( 266 | # User configuration for allowed_schemas 267 | {SCHEMA_ID: {'allowed_properties': ['users']}}, 268 | # Expected 
properties in the recorded event 269 | { 270 | 'nothing-exciting': 'hello, world', 271 | 'users': [ 272 | { 273 | 'id': 'test id 0', 274 | 'email': 'test0@testemail.com', 275 | }, 276 | { 277 | 'id': 'test id 1', 278 | 'email': 'test1@testemail.com', 279 | } 280 | ] 281 | } 282 | ), 283 | ] 284 | ) 285 | def test_array_category_filtering(allowed_schemas, expected_output): 286 | event_data = get_event_data( 287 | ARRAY_EVENT_DATA, 288 | NESTED_CATEGORY_ARRAY_SCHEMA, 289 | SCHEMA_ID, 290 | VERSION, 291 | allowed_schemas 292 | ) 293 | 294 | # Verify that *exactly* the right properties are recorded. 295 | assert expected_output == event_data 296 | 297 | 298 | ADDITIONAL_PROP_EVENT_DATA = { 299 | 'nothing-exciting': 'hello, world', 300 | 'user': { 301 | 'id': 'test id', 302 | 'email': 'test@testemail.com', 303 | }, 304 | 'extra': 1234 305 | } 306 | 307 | 308 | @pytest.mark.parametrize( 309 | 'allowed_schemas,expected_output', 310 | [ 311 | ( 312 | # User configuration for allowed_schemas 313 | {SCHEMA_ID: {'allowed_categories': []}}, 314 | # Expected properties in the recorded event 315 | { 316 | 'nothing-exciting': 'hello, world', 317 | 'user': None, 318 | 'extra': None 319 | } 320 | ), 321 | ( 322 | # User configuration for allowed_schemas 323 | {SCHEMA_ID: {'allowed_categories': ['unrestricted']}}, 324 | # Expected properties in the recorded event 325 | { 326 | 'nothing-exciting': 'hello, world', 327 | 'user': None, 328 | 'extra': None 329 | } 330 | ), 331 | ( 332 | # User configuration for allowed_schemas 333 | {SCHEMA_ID: {'allowed_categories': ['user-identifier']}}, 334 | # Expected properties in the recorded event 335 | { 336 | 'nothing-exciting': 'hello, world', 337 | 'user': { 338 | 'id': 'test id', 339 | 'email': None 340 | }, 341 | 'extra': None 342 | } 343 | ), 344 | ( 345 | # User configuration for allowed_schemas 346 | {SCHEMA_ID: {'allowed_categories': ['user-identifiable-information']}}, 347 | # Expected properties in the recorded event 348 | { 349 | 'nothing-exciting': 'hello, world', 350 | 'user': None, 351 | 'extra': None 352 | } 353 | ), 354 | ( 355 | # User configuration for allowed_schemas 356 | { 357 | SCHEMA_ID: { 358 | 'allowed_categories': [ 359 | 'user-identifier', 360 | 'user-identifiable-information' 361 | ] 362 | } 363 | }, 364 | # Expected properties in the recorded event 365 | { 366 | 'nothing-exciting': 'hello, world', 367 | 'user': { 368 | 'id': 'test id', 369 | 'email': 'test@testemail.com', 370 | }, 371 | 'extra': None 372 | } 373 | ), 374 | ( 375 | # User configuration for allowed_schemas 376 | {SCHEMA_ID: {'allowed_properties': ['user']}}, 377 | # Expected properties in the recorded event 378 | { 379 | 'nothing-exciting': 'hello, world', 380 | 'user': { 381 | 'id': 'test id', 382 | 'email': 'test@testemail.com', 383 | }, 384 | 'extra': None 385 | } 386 | ), 387 | ] 388 | ) 389 | def test_no_additional_properties(allowed_schemas, expected_output): 390 | event_data = get_event_data( 391 | ADDITIONAL_PROP_EVENT_DATA, 392 | NESTED_CATEGORY_SCHEMA, 393 | SCHEMA_ID, 394 | VERSION, 395 | allowed_schemas 396 | ) 397 | 398 | # Verify that *exactly* the right properties are recorded. 
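    # (Properties censored by category filtering are expected to come back as
    # None rather than being dropped from the event.)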
399 | assert expected_output == event_data 400 | 401 | 402 | NESTED_CATEGORY_SCHEMA_ALLOF = { 403 | '$id': SCHEMA_ID, 404 | 'title': 'Test Event', 405 | 'version': VERSION, 406 | 'description': 'Test Event.', 407 | 'type': 'object', 408 | 'properties': { 409 | 'nothing-exciting': { 410 | 'description': 'a property with nothing exciting happening', 411 | 'categories': ['unrestricted'], 412 | 'type': 'string' 413 | }, 414 | 'user': { 415 | 'description': 'user', 416 | 'categories': ['user-identifier'], 417 | 'type': 'object', 418 | 'allOf': [ 419 | { 420 | 'properties': { 421 | 'email': { 422 | 'description': 'email address', 423 | 'categories': ['user-identifiable-information'], 424 | 'type': 'string' 425 | } 426 | } 427 | }, 428 | { 429 | 'properties': { 430 | 'id': { 431 | 'description': 'user ID', 432 | 'type': 'string' 433 | } 434 | } 435 | } 436 | ] 437 | } 438 | } 439 | } 440 | 441 | 442 | NESTED_CATEGORY_SCHEMA_REF = { 443 | '$id': SCHEMA_ID, 444 | 'title': 'Test Event', 445 | 'version': VERSION, 446 | 'description': 'Test Event.', 447 | 'type': 'object', 448 | 'properties': { 449 | 'nothing-exciting': { 450 | 'description': 'a property with nothing exciting happening', 451 | 'categories': ['unrestricted'], 452 | 'type': 'string' 453 | }, 454 | 'user': { 455 | 'description': 'user', 456 | 'categories': ['user-identifier'], 457 | 'type': 'object', 458 | '$ref': '#/definitions/properties' 459 | } 460 | }, 461 | 'definitions': { 462 | 'properties': { 463 | 'properties': { 464 | 'email': { 465 | 'description': 'email address', 466 | 'categories': ['user-identifiable-information'], 467 | 'type': 'string' 468 | }, 469 | 'id': { 470 | 'description': 'user ID', 471 | 'type': 'string' 472 | } 473 | } 474 | } 475 | } 476 | } 477 | 478 | 479 | @pytest.mark.parametrize( 480 | 'allowed_schemas,expected_output', NESTED_CATEGORY_TEST_CASES 481 | ) 482 | def test_category_filtering_ref(allowed_schemas, expected_output): 483 | event_data = get_event_data( 484 | NESTED_EVENT_DATA, 485 | NESTED_CATEGORY_SCHEMA_REF, 486 | SCHEMA_ID, 487 | VERSION, 488 | allowed_schemas 489 | ) 490 | 491 | # Verify that *exactly* the right properties are recorded. 492 | assert expected_output == event_data 493 | 494 | 495 | @pytest.mark.parametrize( 496 | 'allowed_schemas,expected_output', NESTED_CATEGORY_TEST_CASES 497 | ) 498 | def test_category_filtering_allof(allowed_schemas, expected_output): 499 | event_data = get_event_data( 500 | NESTED_EVENT_DATA, 501 | NESTED_CATEGORY_SCHEMA_ALLOF, 502 | SCHEMA_ID, 503 | VERSION, 504 | allowed_schemas 505 | ) 506 | 507 | # Verify that *exactly* the right properties are recorded. 508 | assert expected_output == event_data 509 | -------------------------------------------------------------------------------- /jupyter_telemetry/tests/test_eventlog.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import logging 3 | from traitlets.config.loader import PyFileConfigLoader 4 | from traitlets import TraitError 5 | 6 | from jupyter_telemetry.eventlog import EventLog 7 | 8 | GOOD_CONFIG = """ 9 | import logging 10 | 11 | c.EventLog.handlers = [ 12 | logging.StreamHandler() 13 | ] 14 | """ 15 | 16 | BAD_CONFIG = """ 17 | import logging 18 | 19 | c.EventLog.handlers = [ 20 | 0 21 | ] 22 | """ 23 | 24 | 25 | def get_config_from_file(path, content): 26 | # Write config file 27 | filename = 'config.py' 28 | config_file = path / filename 29 | config_file.write_text(content) 30 | 31 | # Load written file. 
32 | loader = PyFileConfigLoader(filename, path=str(path)) 33 | cfg = loader.load_config() 34 | return cfg 35 | 36 | 37 | def test_good_config_file(tmp_path): 38 | cfg = get_config_from_file(tmp_path, GOOD_CONFIG) 39 | 40 | # Pass config to EventLog 41 | e = EventLog(config=cfg) 42 | 43 | assert len(e.handlers) > 0 44 | assert isinstance(e.handlers[0], logging.Handler) 45 | 46 | 47 | def test_bad_config_file(tmp_path): 48 | cfg = get_config_from_file(tmp_path, BAD_CONFIG) 49 | 50 | with pytest.raises(TraitError): 51 | EventLog(config=cfg) 52 | -------------------------------------------------------------------------------- /jupyter_telemetry/tests/test_register_schema.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import logging 4 | import tempfile 5 | 6 | import jsonschema 7 | from datetime import datetime, timedelta 8 | import pytest 9 | from ruamel.yaml import YAML 10 | 11 | from jupyter_telemetry.eventlog import EventLog 12 | 13 | 14 | def test_register_invalid_schema(): 15 | """ 16 | Invalid JSON Schemas should fail registration 17 | """ 18 | el = EventLog() 19 | with pytest.raises(jsonschema.SchemaError): 20 | el.register_schema({ 21 | # Totally invalid 22 | 'properties': True 23 | }) 24 | 25 | 26 | def test_missing_required_properties(): 27 | """ 28 | $id and version are required properties in our schemas. 29 | 30 | They aren't required by JSON Schema itself 31 | """ 32 | el = EventLog() 33 | with pytest.raises(ValueError): 34 | el.register_schema({ 35 | 'properties': {} 36 | }) 37 | 38 | with pytest.raises(ValueError): 39 | el.register_schema({ 40 | '$id': 'something', 41 | '$version': 1, # This should have been 'version' 42 | }) 43 | 44 | 45 | def test_reserved_properties(): 46 | """ 47 | User schemas can't have properties starting with __ 48 | 49 | These are reserved 50 | """ 51 | el = EventLog() 52 | with pytest.raises(ValueError): 53 | el.register_schema({ 54 | '$id': 'test/test', 55 | 'version': 1, 56 | 'properties': { 57 | '__fail__': { 58 | 'type': 'string', 59 | 'categories': ['unrestricted'] 60 | }, 61 | }, 62 | }) 63 | 64 | 65 | def test_timestamp_override(): 66 | """ 67 | Simple test for overriding timestamp 68 | """ 69 | schema = { 70 | '$id': 'test/test', 71 | 'version': 1, 72 | 'properties': { 73 | 'something': { 74 | 'type': 'string', 75 | 'categories': ['unrestricted'] 76 | }, 77 | }, 78 | } 79 | 80 | output = io.StringIO() 81 | handler = logging.StreamHandler(output) 82 | el = EventLog(handlers=[handler]) 83 | el.register_schema(schema) 84 | el.allowed_schemas = ['test/test'] 85 | 86 | timestamp_override = datetime.utcnow() - timedelta(days=1) 87 | el.record_event('test/test', 1, { 88 | 'something': 'blah', 89 | }, timestamp_override=timestamp_override) 90 | handler.flush() 91 | 92 | event_capsule = json.loads(output.getvalue()) 93 | 94 | # Cope with python3.12 95 | if "taskName" in event_capsule: 96 | del event_capsule["taskName"] 97 | 98 | assert event_capsule['__timestamp__'] == timestamp_override.isoformat() + 'Z' 99 | 100 | 101 | def test_record_event(): 102 | """ 103 | Simple test for emitting valid events 104 | """ 105 | schema = { 106 | '$id': 'test/test', 107 | 'version': 1, 108 | 'properties': { 109 | 'something': { 110 | 'type': 'string', 111 | 'categories': ['unrestricted'] 112 | }, 113 | }, 114 | } 115 | 116 | output = io.StringIO() 117 | handler = logging.StreamHandler(output) 118 | el = EventLog(handlers=[handler]) 119 | el.register_schema(schema) 120 | el.allowed_schemas = 
['test/test'] 121 | 122 | el.record_event('test/test', 1, { 123 | 'something': 'blah', 124 | }) 125 | handler.flush() 126 | 127 | event_capsule = json.loads(output.getvalue()) 128 | 129 | assert '__timestamp__' in event_capsule 130 | # Remove timestamp from capsule when checking equality, since it is gonna vary 131 | del event_capsule['__timestamp__'] 132 | 133 | # Cope with python3.12 134 | if "taskName" in event_capsule: 135 | del event_capsule["taskName"] 136 | 137 | assert event_capsule == { 138 | '__schema__': 'test/test', 139 | '__schema_version__': 1, 140 | '__metadata_version__': 1, 141 | 'something': 'blah' 142 | } 143 | 144 | 145 | def test_register_schema_file(tmp_path): 146 | """ 147 | Register schema from a file 148 | """ 149 | schema = { 150 | '$id': 'test/test', 151 | 'version': 1, 152 | 'properties': { 153 | 'something': { 154 | 'type': 'string', 155 | 'categories': ['unrestricted'] 156 | }, 157 | }, 158 | } 159 | 160 | el = EventLog() 161 | 162 | yaml = YAML(typ='safe') 163 | 164 | schema_file = tmp_path.joinpath("schema.yml") 165 | yaml.dump(schema, schema_file) 166 | el.register_schema_file(str(schema_file)) 167 | 168 | assert schema in el.schemas.values() 169 | 170 | 171 | def test_register_schema_file_object(tmp_path): 172 | """ 173 | Register schema from a file 174 | """ 175 | schema = { 176 | '$id': 'test/test', 177 | 'version': 1, 178 | 'properties': { 179 | 'something': { 180 | 'type': 'string', 181 | 'categories': ['unrestricted'] 182 | }, 183 | }, 184 | } 185 | 186 | el = EventLog() 187 | 188 | yaml = YAML(typ='safe') 189 | 190 | schema_file = tmp_path.joinpath("schema.yml") 191 | yaml.dump(schema, schema_file) 192 | with open(str(schema_file), 'r') as f: 193 | el.register_schema_file(f) 194 | 195 | assert schema in el.schemas.values() 196 | 197 | 198 | def test_allowed_schemas(): 199 | """ 200 | Events should be emitted only if their schemas are allowed 201 | """ 202 | schema = { 203 | '$id': 'test/test', 204 | 'version': 1, 205 | 'properties': { 206 | 'something': { 207 | 'type': 'string', 208 | 'categories': ['unrestricted'] 209 | }, 210 | }, 211 | } 212 | 213 | output = io.StringIO() 214 | handler = logging.StreamHandler(output) 215 | el = EventLog(handlers=[handler]) 216 | # Just register schema, but do not mark it as allowed 217 | el.register_schema(schema) 218 | 219 | el.record_event('test/test', 1, { 220 | 'something': 'blah', 221 | }) 222 | handler.flush() 223 | 224 | assert output.getvalue() == '' 225 | 226 | 227 | def test_record_event_badschema(): 228 | """ 229 | Fail fast when an event doesn't conform to its schema 230 | """ 231 | schema = { 232 | '$id': 'test/test', 233 | 'version': 1, 234 | 'properties': { 235 | 'something': { 236 | 'type': 'string', 237 | 'categories': ['unrestricted'] 238 | }, 239 | 'status': { 240 | 'enum': ['success', 'failure'], 241 | 'categories': ['unrestricted'] 242 | } 243 | } 244 | } 245 | 246 | el = EventLog(handlers=[logging.NullHandler()]) 247 | el.register_schema(schema) 248 | el.allowed_schemas = ['test/test'] 249 | 250 | with pytest.raises(jsonschema.ValidationError): 251 | el.record_event('test/test', 1, { 252 | 'something': 'blah', 253 | 'status': 'hi' # 'not-in-enum' 254 | }) 255 | 256 | 257 | def test_unique_logger_instances(): 258 | schema0 = { 259 | '$id': 'test/test0', 260 | 'version': 1, 261 | 'properties': { 262 | 'something': { 263 | 'type': 'string', 264 | 'categories': ['unrestricted'] 265 | }, 266 | }, 267 | } 268 | 269 | schema1 = { 270 | '$id': 'test/test1', 271 | 'version': 1, 272 | 'properties': { 
273 | 'something': { 274 | 'type': 'string', 275 | 'categories': ['unrestricted'] 276 | }, 277 | }, 278 | } 279 | 280 | output0 = io.StringIO() 281 | output1 = io.StringIO() 282 | handler0 = logging.StreamHandler(output0) 283 | handler1 = logging.StreamHandler(output1) 284 | 285 | el0 = EventLog(handlers=[handler0]) 286 | el0.register_schema(schema0) 287 | el0.allowed_schemas = ['test/test0'] 288 | 289 | el1 = EventLog(handlers=[handler1]) 290 | el1.register_schema(schema1) 291 | el1.allowed_schemas = ['test/test1'] 292 | 293 | el0.record_event('test/test0', 1, { 294 | 'something': 'blah', 295 | }) 296 | el1.record_event('test/test1', 1, { 297 | 'something': 'blah', 298 | }) 299 | handler0.flush() 300 | handler1.flush() 301 | 302 | event_capsule0 = json.loads(output0.getvalue()) 303 | 304 | assert '__timestamp__' in event_capsule0 305 | # Remove timestamp from capsule when checking equality, since it is gonna vary 306 | del event_capsule0['__timestamp__'] 307 | # Cope with python3.12 308 | if "taskName" in event_capsule0: 309 | del event_capsule0["taskName"] 310 | assert event_capsule0 == { 311 | '__schema__': 'test/test0', 312 | '__schema_version__': 1, 313 | '__metadata_version__': 1, 314 | 'something': 'blah' 315 | } 316 | 317 | event_capsule1 = json.loads(output1.getvalue()) 318 | 319 | assert '__timestamp__' in event_capsule1 320 | # Remove timestamp from capsule when checking equality, since it is gonna vary 321 | del event_capsule1['__timestamp__'] 322 | # Cope with python3.12 323 | if "taskName" in event_capsule1: 324 | del event_capsule1["taskName"] 325 | assert event_capsule1 == { 326 | '__schema__': 'test/test1', 327 | '__schema_version__': 1, 328 | '__metadata_version__': 1, 329 | 'something': 'blah' 330 | } 331 | 332 | 333 | def test_register_duplicate_schemas(): 334 | schema0 = { 335 | '$id': 'test/test', 336 | 'version': 1, 337 | 'properties': { 338 | 'something': { 339 | 'type': 'string', 340 | 'categories': ['unrestricted'] 341 | }, 342 | }, 343 | } 344 | 345 | schema1 = { 346 | '$id': 'test/test', 347 | 'version': 1, 348 | 'properties': { 349 | 'somethingelse': { 350 | 'type': 'string', 351 | 'categories': ['unrestricted'] 352 | }, 353 | }, 354 | } 355 | 356 | el = EventLog() 357 | el.register_schema(schema0) 358 | with pytest.raises(ValueError): 359 | el.register_schema(schema1) 360 | -------------------------------------------------------------------------------- /jupyter_telemetry/tests/test_traits.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pytest 4 | from traitlets import HasTraits, TraitError 5 | 6 | from jupyter_telemetry.traits import Handlers, SchemaOptions 7 | 8 | 9 | class HasHandlers(HasTraits): 10 | handlers = Handlers( 11 | None, 12 | allow_none=True 13 | ) 14 | 15 | 16 | def test_good_handlers_value(): 17 | handlers = [ 18 | logging.NullHandler(), 19 | logging.NullHandler() 20 | ] 21 | obj = HasHandlers( 22 | handlers=handlers 23 | ) 24 | assert obj.handlers == handlers 25 | 26 | 27 | def test_bad_handlers_values(): 28 | handlers = [0, 1] 29 | 30 | with pytest.raises(TraitError): 31 | HasHandlers( 32 | handlers=handlers 33 | ) 34 | 35 | 36 | def test_mixed_handlers_values(): 37 | handlers = [ 38 | logging.NullHandler(), 39 | 1 40 | ] 41 | with pytest.raises(TraitError): 42 | HasHandlers( 43 | handlers=handlers 44 | ) 45 | 46 | 47 | class HasSchemaOptions(HasTraits): 48 | schema_options = SchemaOptions({}, allow_none=True) 49 | 50 | 51 | @pytest.mark.parametrize( 52 | 
"schema_options", 53 | [ 54 | # schema_options can be a list of schema_names. In this case, 55 | # the SchemaOptions trait will turn this list into a dictionary 56 | # with the list items as keys the values as empty dictionaries. 57 | ["schema_name_1", "schema_name_2"], 58 | # Empty nested config are okay. 59 | {"schema_name_1": {}}, 60 | # Nested config with empty values is okay too. 61 | {"schema_name_1": {"allowed_categories": []}}, 62 | # Test complete config for good measure. 63 | {"schema_name_1": {"allowed_categories": ["value"]}}, 64 | # Test multiple values. 65 | {"schema_name_1": {"allowed_categories": ["value"]}, "schema_name_2": {}}, 66 | ] 67 | ) 68 | def test_good_schema_options(schema_options): 69 | obj = HasSchemaOptions(schema_options=schema_options) 70 | assert type(obj.schema_options) == dict 71 | 72 | 73 | @pytest.mark.parametrize( 74 | "schema_options", 75 | [ 76 | # Raise an error if Schema Options has unknown attribute. 77 | {"schema_name_1": {"unknown_attribute": []}}, 78 | # Test multiple values. 79 | { 80 | "schema_name_1": { 81 | "allowed_categories": ["value"] 82 | }, 83 | "schema_name_2": { 84 | "unknown_attribute": [] 85 | } 86 | }, 87 | ] 88 | ) 89 | def test_bad_schema_options(schema_options): 90 | with pytest.raises(TraitError): 91 | HasSchemaOptions(schema_options=schema_options) -------------------------------------------------------------------------------- /jupyter_telemetry/tests/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import io 3 | import json 4 | import logging 5 | 6 | from jupyter_telemetry.eventlog import EventLog 7 | 8 | 9 | def get_event_data(event, schema, schema_id, version, allowed_schemas): 10 | sink = io.StringIO() 11 | 12 | # Create a handler that captures+records events with allowed tags. 13 | handler = logging.StreamHandler(sink) 14 | 15 | e = EventLog( 16 | handlers=[handler], 17 | allowed_schemas=allowed_schemas 18 | ) 19 | e.register_schema(schema) 20 | 21 | # Record event and read output 22 | e.record_event(schema_id, version, deepcopy(event)) 23 | 24 | recorded_event = json.loads(sink.getvalue()) 25 | return {key: value for key, value in recorded_event.items() if not key.startswith('__')} 26 | -------------------------------------------------------------------------------- /jupyter_telemetry/traits.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from traitlets import TraitType, TraitError 4 | 5 | 6 | class Handlers(TraitType): 7 | """A trait that takes a list of logging handlers and converts 8 | it to a callable that returns that list (thus, making this 9 | trait pickleable). 10 | """ 11 | info_text = "a list of logging handlers" 12 | 13 | def validate_elements(self, obj, value): 14 | if len(value) > 0: 15 | # Check that all elements are logging handlers. 16 | for el in value: 17 | if isinstance(el, logging.Handler) is False: 18 | self.element_error(obj) 19 | 20 | def element_error(self, obj): 21 | raise TraitError( 22 | "Elements in the '{}' trait of an {} instance " 23 | "must be Python `logging` handler instances." 24 | .format(self.name, obj.__class__.__name__) 25 | ) 26 | 27 | def validate(self, obj, value): 28 | # If given a callable, call it and set the 29 | # value of this trait to the returned list. 30 | # Verify that the callable returns a list 31 | # of logging handler instances. 
32 | if callable(value): 33 | out = value() 34 | self.validate_elements(obj, out) 35 | return out 36 | # If a list, check its elements to verify 37 | # that each element is a logging handler instance. 38 | elif type(value) == list: 39 | self.validate_elements(obj, value) 40 | return value 41 | else: 42 | self.error(obj, value) 43 | 44 | 45 | class SchemaOptions(TraitType): 46 | """A trait for handling options for recording schemas. 47 | """ 48 | info_text = "either a dictionary with schema options or a list with schema names." 49 | 50 | def validate(self, obj, val): 51 | # If the type is a dictionary. 52 | if type(val) is dict: 53 | for schema_name, data in val.items(): 54 | given_keys = set(data.keys()) 55 | # Compare against keys expected. 56 | allowed_keys = {"allowed_categories", "allowed_properties"} 57 | # There should be no extra keys (anything other than 58 | # allowed_keys) in the schema options. 59 | unknown_keys = given_keys.difference(allowed_keys) 60 | if unknown_keys: 61 | # Throw an error if there are unknown keys. 62 | raise TraitError( 63 | "The schema option, {schema_name}, includes " 64 | "unknown key(s): {unknown_keys}".format( 65 | schema_name=schema_name, 66 | unknown_keys=",".join(unknown_keys) 67 | ) 68 | ) 69 | validated_val = val 70 | # If the type is a list (for backwards compatibility). 71 | elif type(val) is list: 72 | validated_val = {} 73 | for schema_name in val: 74 | validated_val[schema_name] = {} 75 | else: 76 | raise TraitError("SchemaOptions must be of type dict or list.") 77 | return validated_val 78 | -------------------------------------------------------------------------------- /proposal/JEP.md: -------------------------------------------------------------------------------- 1 | # Jupyter telemetry enhancement proposal 2 | 3 | | Item | Value | 4 | |------------|------------------------------------------------------------------------------------------------------------------------------| 5 | | JEP Number | XX | 6 | | Title | Jupyter telemetry | 7 | | Authors | Jaipreet Singh (jaipreet@amazon.com, @jaipreet-s), Zach Sailer (zachsailer@gmail.com, @Zsailer), Yuvi Panda (yuvipanda@gmail.com, @yuvipanda) | 8 | | Status | Draft | 9 | | Type | S - [Standards Track](https://www.python.org/dev/peps/#pep-types-key) JEP | 10 | | Created | 5 June 2019 | 11 | | History | 5 June 2019 | 12 | 13 | ## Problem 14 | 15 | Telemetry data serves a number of purposes for organizations deploying Jupyter: i) operational monitoring, ii) security monitoring / intrusion detection, iii) compliance auditing, and iv) a-posteriori analysis of the platform’s usage and design, i.e. as a UX research tool. This must be done in a transparent and understandable manner for end-users. Jupyter components and extension developers interested in publishing events must be incentivized to follow the system's interface and patterns rather than rolling out their own. 16 | 17 | ### Personas 18 | 19 | * **Jupyter Users** are the primary stakeholder since it is their data. 20 | * **Jupyter extension developers** will be emitting events from both the server-side and the browser-side code. 21 | * **Jupyter operators** handle the infrastructure deployments and are interested in collecting various bits of events. 22 | * **Analysts** consume the final stored data and visualize and interpret it. 23 | 24 | ### User Stories 25 | 26 | 1. As a Jupyter user, I can view and understand the events being collected, so that I have visibility into what information is being collected. 27 | 2. 
As a Jupyter user, I need to opt-in to event collection, so that I can choose what data about my usage is gathered. 28 | 3. As a Jupyter extension developer, I can publish custom events using a well-defined interface, from the browser as well as the server. 29 | 4. As a Jupyter operator, I can configure event sinks for my deployment, so that I have control over where events are sent to. 30 | 5. As a Jupyter operator, I can write a custom event sink for my deployment, so that I can control where event data is stored. 31 | 6. As a Jupyter operator, I can decide what kinds and levels of events are going to be collected and stored. 32 | 7. As a Jupyter operator, I can configure opt-in or opt-out for the users in my deployment. 33 | 8. As an analyst, I can clearly tell what the fields in the data represent so that I can interpret and make decisions from that data. 34 | 35 | ## Proposed Enhancement 36 | 37 | The enhancement spans two categories - general concepts applicable across all Jupyter components, and the implementation of each concept in each Jupyter component. This proposal covers the components required to publish and consume discrete events from the various components of the Jupyter ecosystem. 38 | 39 | **General concepts:** 40 | 41 | * Event schema: The schema for a given event which is respected by all the Jupyter components across the event's lifecycle. 42 | * Event sinks: The "backends" which handle publishing the event to its final storage location. 43 | 44 | **Jupyter components:** 45 | 46 | * Jupyter server 47 | * JupyterLab 48 | * JupyterHub 49 | * Jupyter Notebook (Classic) 50 | 51 | ## Detailed Explanation 52 | 53 | ### Event Schema 54 | 55 | Each event is published conforming to a given [JSON schema](https://json-schema.org/), and this schema is the contract followed by each step in the event lifecycle. These are referred to by all publishers regardless of whether the event comes from the server or the browser. 56 | 57 | Example event schema: 58 | 59 | ```json 60 | { 61 | "name": "org.jupyter.kernel_lifecycle_event", 62 | "version": 1, 63 | "title": "Event emitted when a kernel lifecycle state changes", 64 | "description": "Records each change to a kernel's lifecycle state", 65 | "type": "object", 66 | "args": { 67 | "kernel_name": { 68 | "type": "string", 69 | "description": "Name of the kernel" 70 | }, 71 | "state": { 72 | "type": "string", 73 | "description": "The new state of the kernel" 74 | } 75 | } 76 | } 77 | ``` 78 | 79 | Schema validation is done in the core telemetry framework that routes events from publishers to sinks, so that each event sink does not need to implement schema validation. 80 | 81 | Schemas are used in two places: 82 | * During the emit run-time, for validating events before publishing to the configured event sink. 83 | * The schemas are picked up from the same package or a side-car package for runtime validation to avoid a network call. 84 | * During analysis time, for discovering the available schemas and understanding the fields. 85 | * Schema names are valid URIs. For public event discovery, these are resolvable, and private events could be just `com.mycompany.myevent1`. 86 | 87 | ### Event Sinks 88 | 89 | Event sinks are the backends where events are published. Event sinks can be configured from the browser as well as the server. Custom sinks can be implemented by extending the interface for the given Jupyter component. 
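For a concrete sense of what a sink looks like in the reference `jupyter_telemetry` library in this repository, where sinks are plain Python `logging` handlers, a minimal custom sink can be sketched as follows (the `ListSink` class and the schema contents are illustrative, not part of the proposal):

```python
import logging

from jupyter_telemetry.eventlog import EventLog


class ListSink(logging.Handler):
    """Illustrative sink: collect formatted event records in memory."""
    def __init__(self):
        super().__init__()
        self.events = []

    def emit(self, record):
        self.events.append(self.format(record))


sink = ListSink()
eventlog = EventLog(handlers=[sink])
eventlog.register_schema({
    '$id': 'com.mycompany.myevent1',  # illustrative private schema
    'version': 1,
    'properties': {
        'state': {'type': 'string', 'categories': ['unrestricted']},
    },
})
# Nothing is emitted unless the schema is explicitly allowed.
eventlog.allowed_schemas = ['com.mycompany.myevent1']
eventlog.record_event('com.mycompany.myevent1', 1, {'state': 'restarted'})
```

Because sinks are ordinary `logging` handlers here, the publishing code stays agnostic of the backend, which is the property this section is after.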
See the [Python interface](#python-event-sink-interface) and the [JupyterLab interface](#jupyterlab-event-sink-interface). 91 | 92 | ### Server-side components 93 | 94 | #### Python publisher library 95 | 96 | The Python event publisher library provides extension developers and other internal components an interface to record events. This is agnostic of the event sink back-end and the other deployment configurations. 97 | 98 | ```python 99 | import jupyter_telemetry as telemetry 100 | 101 | telemetry.record_event( 102 | schema_name='org.jupyter.kernel_lifecycle_event', 103 | schema_version='1', 104 | args={ 105 | 'kernel_name': 'python3', 106 | 'state': 'restarted' 107 | } 108 | ) 109 | ``` 110 | 111 | #### Python Event Sink Interface 112 | 113 | The event sink interface: 114 | 115 | ```python 116 | class MyCustomEventSink(JupyterEventSink): 117 | 118 | def consume_event(self, event_data): 119 | pass  # Business logic to publish event into the backend. 120 | ``` 121 | 122 | #### Configuration 123 | 124 | A setting on the Jupyter server allows operators to configure various options of the event sinks such as: 125 | 126 | * The event sink implementations 127 | * The list of event names to whitelist or blacklist 128 | 129 | ```python 130 | c.NotebookApp.telemetry_event_sinks = [ 131 | "mypackage.sinks.MyCustomEventSink", 132 | "mypackage.sinks.AnotherCustomEventSink" 133 | ] 134 | 135 | c.NotebookApp.whitelisted_events = [ 136 | "org.jupyter.someevent" 137 | ] 138 | ``` 139 | 140 | #### Core event router 141 | 142 | The implementation of the `telemetry.record_event` method, which handles: 143 | 144 | * Schema validation 145 | * Adds event metadata such as `creationTime` 146 | * We should be able to add more metadata fields in the future if necessary without clashing with any potential schemas that have been defined in the various uses. 147 | * Routing events to configured sinks 148 | * Filters events based on the configuration 149 | * (Optionally) Aggregation and data cleaning. 150 | 151 | This part should be scalable to handle the volume of events and implement some sort of pub-sub design pattern. 152 | 153 | ```python 154 | def record_event(name, schema_version, args): 155 | # validate schema 156 | # add metadata 157 | # get configured sinks 158 | # send event to sinks 159 | ``` 160 | 161 | #### REST Endpoint 162 | 163 | In addition to the above sub-components, a REST interface is exposed on the Jupyter server to allow remote clients and the frontend to publish events into the server. The interface for this is similar to the [Python publisher library](#python-publisher-library). 164 | 165 | ```json 166 | HTTP PUT /api/telemetry/event 167 | { 168 | "name" : "org.jupyter.kernel_lifecycle_event", 169 | "schema_version" : "1", 170 | "args" : { 171 | "kernel_name": "python3", 172 | "state": "restarted" 173 | } 174 | } 175 | ``` 176 | 177 | #### Open Questions 178 | 179 | 1. Is this work done on the standalone jupyter-server implementation or on the classic jupyter/notebook? 180 | 181 | ### JupyterHub 182 | 183 | JupyterHub would use the same [underlying python library](#python-publisher-library) 184 | as the notebook server, and be configured in the same way. 
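For illustration, shared configuration through the eventlogging library in this repository could look like the sketch below; it mirrors the traitlets config exercised in this repository's tests, and the file name, handler target, and schema name are illustrative:

```python
# jupyterhub_config.py -- the same keys work in a notebook server config file.
import logging

c.EventLog.handlers = [
    # Sink: write newline-delimited JSON events to a local file.
    logging.FileHandler('eventlog.jsonl'),
]
# Nothing is emitted unless its schema is explicitly allowed.
c.EventLog.allowed_schemas = [
    'hub.jupyter.org/server-start',  # illustrative schema name
]
```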
185 | 186 | JupyterHub relies on 187 | [Spawners](https://jupyterhub.readthedocs.io/en/stable/reference/spawners.html), 188 | [Authenticators](https://jupyterhub.readthedocs.io/en/stable/reference/authenticators.html), 189 | [Proxies](https://jupyterhub.readthedocs.io/en/stable/reference/proxy.html) & 190 | [Services](https://jupyterhub.readthedocs.io/en/stable/reference/services.html) 191 | to do most of its work. They would have interesting events to emit, so we should 192 | make sure they have easy ways to do so. 193 | 194 | #### Spawners, Authenticators & Proxies 195 | 196 | Spawners, Authenticators & Proxies run in-process with JupyterHub, and have access to 197 | the JupyterHub `app` object. When they want to emit events, they will: 198 | 199 | 1. Write schemas for events they want to emit 200 | 2. Register their schemas with the eventlogging library 201 | 202 | ```python 203 | # Register every schema file shipped with the package: 204 | for schema_file in glob(os.path.join(here, 'event-schemas', '*.json')): 205 | with open(schema_file) as f: 206 | self.app.event_log.register_schema(json.load(f)) 207 | ``` 208 | 3. Emit events wherever they need. 209 | 210 | ```python 211 | self.app.event_log.emit('kubespawner.hub.jupyter.org/pod-missing', 1, { 212 | 'pod_name': pod.metadata.name, 213 | 'node': pod.metadata.node 214 | }) 215 | ``` 216 | 217 | This would get routed to the appropriate sink if the schema is whitelisted. 218 | 219 | #### Services 220 | 221 | Services run as their own process, and hence do not have access to the JupyterHub 222 | `app` object. They should use the core eventlogging library directly, and admins 223 | should be able to configure it as they would a standalone application. 224 | 225 | #### Authenticated routing service 226 | 227 | Events sent from users' notebook servers or client-side interfaces directly to a 228 | sink are untrusted - they are from user-controlled code & can be anything. It 229 | would be useful to provide a JupyterHub service that can validate that the 230 | users are who they say they are - even though the rest of the event data 231 | should be considered untrusted. 232 | 233 | This would expose a [REST Endpoint](#rest-endpoint) that can receive data as a sink from 234 | other parts of the ecosystem (Notebook, JupyterLab, classic notebook, 235 | other JupyterHub services). It would then add a metadata username field 236 | to each event, based on the JupyterHub authentication information sent 237 | alongside the request. The event will then be sent to the appropriate 238 | sink configured for this service. 239 | 240 | This is also helpful in smaller installations where there's no other 241 | centralized event collection mechanism (like fluentd, stackdriver, etc). 242 | Events can be sent here, and it can route them to an accessible location 243 | (like a database, or the filesystem). 244 | 245 | ### JupyterLab 246 | 247 | There are quite a few analytics frameworks that send events directly from the browser, so the round trip to the server can be avoided in certain deployments. Additionally, JupyterLab also publishes "platform" events which are subscribed to and published to the event sinks. 248 | 249 | All the sub-components defined in the [Jupyter server](#jupyter-server) section are applicable here as well. 250 | 251 | #### JupyterLab publisher library 252 | 253 | An interface to expose to JupyterLab extension developers to be able to publish custom events. This is similar to the existing [jupyterlab-telemetry](https://github.com/jupyterlab/jupyterlab-telemetry/) extension. 
This is similar to [this](https://github.com/jupyterlab/jupyterlab-telemetry/) 254 | 255 | ```typescript 256 | import {recordEvent} from '@jupyterlab/telemetry' 257 | 258 | recordEvent({ 259 | name: 'org.jupyter.jupyterlab_command', 260 | schema_version: '1', 261 | args: { 262 | 'commandName': 'newFile' 263 | } 264 | }) 265 | ``` 266 | 267 | #### JupyterLab Event Sink Interface 268 | 269 | An interface to expose to operators writing their own custom sink (as JupyterLab extensions themselves) and register themselves with the core routing components. The default event sink implementation is to publish to the [Server REST Endpoint](#rest-endpoint). 270 | 271 | ```typescript 272 | import {EventSink, EventData} from '@jupyterlab/telemetry' 273 | 274 | export class MyCustomEventSink implements EventSink { 275 | handle_event(eventData: EventData): Promise { 276 | // Business logic to publish event to backend. 277 | } 278 | } 279 | ``` 280 | 281 | #### JupyterLab Configuration 282 | 283 | The ability to configure the event sinks to publish to as well as events to blacklist or whitelist events. This can likely be accomplished via JupyterLab settings and the JupyterLab dependency management mechanism. 284 | 285 | The JupyterLab telemetry extension provides `Token` that event sink implementations can depend on and register themselves with the core event router. 286 | 287 | #### JupyterLab Core event router 288 | 289 | The implementation of the `@jupyterlab/telemetry/recordEvent` method which handles routing events to the configured sinks and additionally: 290 | 291 | * Schema validation 292 | * Adds event metadata such as `creationTime` 293 | * Filters events based on the configuration 294 | * (Optionally) Aggregation and data cleaning. 295 | 296 | #### User opt-in 297 | 298 | Since JupyterLab is the user facing component, it also contains UX features to give more visibility and transparency to the Jupyter user. In general, every component should make it clear to the user what data is being collected and possible ways to turn it off. 299 | 300 | * UI for opting in or opting out of telemetry data collection 301 | * UI for showing the list of events that are currently being collected. 302 | 303 | 304 | ### Jupyter Notebook (Classic) Frontend 305 | 306 | The proposal for Jupyter Classis is to have a convenience JS library that can be used to pubish events to the server [REST Endpoint](#rest-endpoint). 307 | This ensures that we provide support for Jupyter Classic but can rely on the Jupyter Server to do much of the heavy-lifting by relying on the Core Event Router, Event Sinks, and configuration done at the server level. 308 | 309 | ```javascript 310 | 311 | var telemetry = require('jupyter-telemetry-js') 312 | 313 | telemetry.record_event( 314 | name='org.jupyter.kernel_lifecycle_event', 315 | schema_version='1', 316 | args={ 317 | 'kernel_name': 'python3', 318 | 'state': 'restarted' 319 | } 320 | ) 321 | ``` 322 | 323 | ### Data protection 324 | 325 | (This section needs to be filled out) 326 | 327 | * What guidance do we provide? How do we provide this? 328 | * What building blocks do we provide? 329 | 330 | ## Pros and Cons 331 | 332 | PROS 333 | 334 | * Implementation shares common principles and components across the Jupyter components. 335 | * Decoupling between event publishers and event consumers. 336 | * Flexible configuration for Jupyter operators to configure event sinks and event names. 
337 | 338 | CONS 339 | 340 | * The current proposal does not provide guidance or building blocks for compliance programs such as GDPR 341 | 342 | ## Appendix 343 | 344 | ### Tenets 345 | 346 | (These are taken from [Brian's initial issue](https://github.com/jupyterlab/team-compass/issues/4).) 347 | 348 | There are certainly ethical and legal questions around telemetry systems. To address these, the following tenets of the telemetry system are proposed: 349 | 350 | * **Data protection by design and default.** Jupyter and its telemetry system should come with built-in tools that enable safe and secure deployments with all types of data. See [Art. 25 of GDPR](https://gdpr-info.eu/art-25-gdpr/) for details about this tenet. 351 | * **Make it easy to do the right things by default.** There are many ways to collect and use telemetry data that are illegal and/or irresponsible. Jupyter’s telemetry system should encode best practices and make it easy for operators to be responsible and comply with relevant laws. 352 | * **Choice in balancing tradeoffs.** There are two types of data in JupyterLab: 1) the actual datasets users are working with in notebooks, and 2) telemetry data about the Jupyter users. At times, protecting these two types of data at the same time will require tradeoffs. For example, if a deployment is doing research with sensitive HIPAA or FERPA data, the operators need to closely monitor every action taken by its researchers using JupyterLab to ensure the sensitive data is used appropriately. At the same time, in some jurisdictions (EU) Jupyter users may be protected by local laws (GDPR) about what telemetry data can be recorded, how it can be used, and the terms of that usage. 353 | * **Don't ignore the need for telemetry**. Organizations deploying Jupyter need to collect telemetry for a range of purposes. If we ignore this need, they will route around the project, with potential legal and ethical complications. By being proactive, we can establish best practices and guardrails. 354 | 355 | ### References 356 | 357 | 0. Telemetry [JEP proposal](https://github.com/jupyter/telemetry/blob/master/proposal/JEP.md) 358 | 1. Original write-up by ellisonbg@ [https://github.com/jupyterlab/team-compass/issues/4](https://github.com/jupyterlab/team-compass/issues/4) 359 | 2. Wikimedia [telemetry system](https://m.mediawiki.org/wiki/Extension:EventLogging/Guide) 360 | 3. Initial strawman [design doc](https://github.com/jupyterlab/jupyterlab-telemetry/blob/master/design.md) 361 | 4. [Mozilla Telemetry System](https://wiki.mozilla.org/Telemetry) 362 | -------------------------------------------------------------------------------- /proposal/design.md: -------------------------------------------------------------------------------- 1 | # EventLogging design 2 | 3 | This document supplements `implementations.md` and has sections detailing the 4 | eventlogging design that can be common to various parts of the Jupyter 5 | ecosystem. These two documents will co-evolve - as we think more about 6 | implementation, the design will change, and vice versa. 7 | 8 | ## Why collect data? 9 | 10 | The primary reasons for collecting such data are: 11 | 12 | 1. Better understanding of *usage* of their infrastructure. This 13 | might be for capacity planning, metrics, billing, etc. 14 | 15 | 2. *Auditing* requirements - for security or legal reasons. 
Our 16 | Telemetry work in the Jupyter project is necessary but not 17 | sufficient for this, since it might have more stringent 18 | requirements around secure provenance & anti-tampering. 19 | 20 | 3. UX / UI events from end user behavior. These are often 21 | targeted measurements to help UX designers / developers 22 | determine if particular UX decisions are meeting their 23 | goals. 24 | 25 | 4. *Operational* metrics. Prometheus metrics should be used for 26 | most operational metrics (error rates, percentiles of server 27 | or kernel start times, memory usage, etc). However, some 28 | operational data is much more useful when lossless than 29 | when sampled, such as server start times or contents manager 30 | usage. 31 | 32 | ## Metrics vs Events 33 | 34 | Both Metrics and Events are telemetry, but are fundamentally 35 | different. Katy Farmer [explains it](https://thenewstack.io/what-is-the-difference-between-metrics-and-events/) 36 | thus: 37 | 38 | > I want to keep track of my piggy bank closely. Right now, there’s only one 39 | > metric I care about: total funds. Anyone can put money into my piggy bank, so 40 | > I want to report the total funds at a one-minute interval. This means that 41 | > every minute, my database will receive a data point with the timestamp and 42 | > the amount of total funds in my piggy bank. 43 | > 44 | > Now, I want to track specific events for my piggy bank: deposits and 45 | > withdrawals. When a deposit occurs, my database will receive a data point 46 | > with the “deposit” tag, the timestamp and the amount of the deposit. 47 | > Similarly, when a withdrawal occurs, my database will receive a data point 48 | > with the “withdrawal” tag, the timestamp and the amount of the withdrawal. 49 | > 50 | > Imagine now that this is the same basic idea behind online banking. We could 51 | > add more metadata to add detail to the events, like attaching a user ID to a 52 | > deposit or withdrawal. 53 | 54 | Metrics let us answer questions like 'what is the 99th percentile start time 55 | for our user servers over the last 24 hours?' or 'what is the current rate 56 | of 5xx errors in notebook servers running on machines with GPUs?'. They have 57 | limited cardinality, and are usually aggregated at the source. Usually, 58 | they are pulled into a central location at regular intervals. They 59 | rarely contain any PII, although they might leak some if we are not careful. These 60 | are primarily operational. We already support metrics via the 61 | [prometheus](https://prometheus.io/) protocol in [JupyterHub](https://github.com/jupyterhub/jupyterhub/pull/1581), 62 | [Notebook Server](https://github.com/jupyter/notebook/pull/3490) and 63 | [BinderHub](https://github.com/jupyterhub/binderhub/pull/150). This is 64 | heavily used in a bunch of places - see the public 65 | [grafana instance](https://grafana.mybinder.org/) showing visualizations from metrics 66 | data, and documentation about [what is collected](https://mybinder-sre.readthedocs.io/en/latest/components/metrics.html). 67 | 68 | Events let us answer questions like 'which users opened a notebook named 69 | this in the last 48h?' or 'what JupyterLab commands have been executed 70 | most when running with an IPython kernel'. They have much more information 71 | in them, and do not happen with any regularity. Usually, they are also 72 | 'pushed' to a centralized location, and often contain PII - so they need 73 | to be treated carefully. BinderHub [emits events](https://binderhub.readthedocs.io/en/latest/eventlogging.html) 74 | around repos launched there, and the mybinder.org team has a [very small 75 | pipeline](https://github.com/jupyterhub/mybinder.org-deploy/tree/master/images/analytics-publisher) 76 | that cleans these events and publishes them at [archive.analytics.mybinder.org](https://archive.analytics.mybinder.org/) 77 | for the world to see. 78 | 
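To make the contrast concrete, a single event emitted by the eventlogging library in this repository is one self-describing JSON record along these lines; the capsule fields (`__schema__`, `__timestamp__`, etc.) come from this repository's implementation, while the schema name and payload values are illustrative:

```json
{
    "__schema__": "binderhub.jupyter.org/launch",
    "__schema_version__": 1,
    "__metadata_version__": 1,
    "__timestamp__": "2019-06-05T14:03:52.651250Z",
    "provider": "GitHub",
    "spec": "yuvipanda/hubploy/master"
}
```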
79 | This document focuses primarily on *Events*, and doesn't talk much about metrics. 80 | 81 | ## Stakeholders 82 | 83 | 1. End Users 84 | 85 | Primary stakeholder, since it is their data. They have a right 86 | to know what information is being collected about them. We should 87 | make this the default, and provide automated, easy-to-understand 88 | ways for them to see what is collected about them. 89 | 90 | 2. Operators 91 | 92 | The operators of the infrastructure where various Jupyter components 93 | run are the folks interested in collecting various bits of Events. 94 | They have to: 95 | 96 | a. Explicitly decide what kinds of Events at what level they are going to 97 | be collecting and storing 98 | 99 | b. Configure where these Events need to go. It should be very 100 | easy for them to integrate this with the rest of their infrastructure. 101 | 102 | By default, we should not store any Events unless an operator 103 | explicitly opts into it. 104 | 105 | 3. Developers 106 | 107 | Developers will be emitting Events from various parts of the code. 108 | They should only be concerned about emitting Events, and 109 | not about policy enforcement around what should be kept and 110 | where it should be stored. We should also provide easy 111 | interfaces for them to emit information in various places 112 | (backends, frontends, extensions, kernels, etc). 113 | 114 | 4. Analysts 115 | 116 | These are the folks actually using the event data to make 117 | decisions, and hence the ultimate consumers of all this data. 118 | They should be able to clearly tell what the various fields 119 | in the data represent, and how complete it is. We should also 120 | make the data easily consumable by common 121 | analyst tools - such as pandas, databases, data lakes, etc. 122 | 123 | ## Other systems to study 124 | 125 | We aren't the first group to try to design a unified eventlogging 126 | system that is easy to use, transparent and privacy-preserving 127 | by default. Here are some examples of prior art we can draw 128 | inspiration from. 129 | 130 | * Wikimedia's [EventLogging](https://www.mediawiki.org/wiki/Extension:EventLogging/Guide) 131 | 132 | A simple and versatile system that can scale from the needs 133 | of a small organization running MediaWiki to the 7th largest 134 | website in the world. The [Guide](https://www.mediawiki.org/wiki/Extension:EventLogging/Guide) 135 | lays out the principles behind how things work and why they 136 | work the way they do. The [Operational Information Page](https://wikitech.wikimedia.org/wiki/Analytics/Systems/EventLogging) 137 | shows how this is configured in a large-scale installation. 138 | 139 | Let's take an example case to illustrate this. 140 | 141 | Each eventlogging use case must be documented in a public 142 | schema. [This schema](https://meta.wikimedia.org/wiki/Schema:ServerSideAccountCreation) 143 | documents events collected about account creation. 144 | This is very useful for a variety of stakeholders. 145 | 146 | 1. 
Users can see what information is being collected about 147 | them if they wish. 148 | 149 | 2. Analysts know exactly what each field in their dataset means. 150 | 151 | 3. Operators can use this to perform automatic data purging, 152 | anonymization or other retention policies easily. See 153 | how [wikimedia does it](https://wikitech.wikimedia.org/wiki/Analytics/Systems/EventLogging/Data_retention_and_auto-purging), 154 | to be compliant with GDPR and friends. 155 | 156 | 4. Developers can easily log events that conform to the schema 157 | with standardized libraries that are provided for them, 158 | without having to worry about policy around recording and 159 | retention. See some [sample code](https://www.mediawiki.org/wiki/Extension:EventLogging/Guide#Underlying_technology) 160 | to get a feel for how it works. 161 | 162 | Thanks to Ori Livneh, one of the designers of this system, for conversations that 163 | have influenced how the Jupyter Telemetry system is being designed. 164 | 165 | * Mozilla's Telemetry system 166 | 167 | Firefox runs on a lot of machines, and has a lot of very privacy-conscious 168 | users & developers. Mozilla has a well-thought-out 169 | [data collection policy](https://wiki.mozilla.org/Firefox/Data_Collection). 170 | 171 | There is a [technical overview](https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/start/adding-a-new-probe.html) 172 | of various capabilities available. Their [events](https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/events.html) 173 | system is most similar to what we want here. Similar to the wikimedia example, 174 | every event must have a corresponding schema, and you can see all the 175 | schemas in their [repository](https://dxr.mozilla.org/mozilla-central/source/toolkit/components/telemetry/Events.yaml). 176 | They also provide easy ways for developers to [emit events](https://firefox-source-docs.mozilla.org/toolkit/components/telemetry/telemetry/collection/events.html#the-api) 177 | from the frontend JS. 178 | 179 | There is a lot more information in their [telemetry data portal](https://docs.telemetry.mozilla.org/), 180 | particularly around how analysts can work with this data. 181 | 182 | * Debian 'popularity contest' 183 | 184 | The debian project has an opt-in way to try to map the popularity of 185 | various packages used in end-user systems with the [popularity 186 | contest](https://popcon.debian.org/). It is a purely opt-in system, 187 | and records packages installed in the system and the frequency 188 | of their use. This is [sortof anonymously, sortof securely](https://popcon.debian.org/FAQ) 189 | sent to a centralized server, which then produces useful graphs. 190 | [Ubuntu](https://popcon.ubuntu.com/) and [NeuroDebian](http://neuro.debian.net/popcon/) 191 | run versions of this as well for their own packages. 192 | 193 | This is different from the other systems in being extremely 194 | single purpose, and not particularly secure in terms of user 195 | privacy. This model might be useful for particular things that 196 | need to work across a large swath of the ecosystem - such as 197 | package usage metrics - but is of limited use in Jupyter itself. 198 | 199 | * Homebrew's analytics 200 | 201 | The popular OS X package manager [homebrew](https://brew.sh) 202 | [collects information](https://github.com/Homebrew/brew/blob/master/docs/Analytics.md) about 203 | usage with Google Analytics. 
This is very similar to the Debian Popularity 204 | contest system, except it sends events to a third party (Google Analytics) 205 | instead. You can opt out of it if you wish. 206 | 207 | * Bloomberg? 208 | 209 | Paul Ivanov mentioned that Bloomberg has their own data collection 210 | system around JupyterLab. Would be great to hear more details of that 211 | here. 212 | 213 | * Other organizations 214 | 215 | Everyone operating at scale has some way of doing this kind of analytics 216 | pipeline. Would be great to add more info here! 217 | 218 | ## Design proposal 219 | 220 | 1. *Schema* 221 | 222 | Each event type needs a [JSON Schema](https://json-schema.org/) associated 223 | with it. This schema is versioned to allow analysts, operators 224 | and users to see when new fields are added / removed. The descriptions 225 | should also be clear enough to inform users of what is being collected, 226 | and analysts of what they are actually analyzing. We could also use this 227 | to mark specific fields as PII, which can then be automatically mangled, 228 | anonymized or dropped. 229 | 230 | 2. *EventLogging Python API* 231 | 232 | A simple python API that lets users in serverside code (JupyterHub, 233 | Notebook Server, Kernel, etc) emit events. This will: 234 | 235 | 1. Validate the events to make sure they conform to the schema 236 | they claim to represent. 237 | 2. Look at traitlet configuration to see if the events should be 238 | dropped, and immediately drop them if so. So nothing leaves the 239 | process unless explicitly configured to do so. 240 | 3. Filter / obfuscate / drop PII if so configured. 241 | 4. Wrap the event in an *event capsule* with common information 242 | for all events - timestamp (of sufficient granularity), 243 | schema reference, origin, etc. 244 | 5. Emit the event to a given 'sink'. We should leverage the 245 | ecosystem built around Python Loggers for this, so we can 246 | send events to a wide variety of sources - [files](https://docs.python.org/3/library/logging.handlers.html#filehandler), 247 | [files with automatic rotation](https://docs.python.org/3/library/logging.handlers.html#rotatingfilehandler), 248 | [arbitrary HTTP output](https://docs.python.org/3/library/logging.handlers.html#httphandler), 249 | [kafka](https://pypi.org/project/python-kafka-logger/), 250 | [Google Cloud's Stackdriver](https://pypi.org/project/google-cloud-logging/), 251 | [AWS CloudWatch](https://github.com/kislyuk/watchtower), 252 | [ElasticSearch](https://github.com/kislyuk/watchtower) 253 | and many many more. This should help integrate with whatever 254 | systems the organization is already using. 255 | 256 | This helps us centralize all the processing around event validity, 257 | PII handling and sink configuration. Organizations can then decide 258 | what to do with the events afterwards. 259 | 260 | 3. *EventLogging REST API* 261 | 262 | This is an HTTP endpoint to the Python API, and is a way for frontend 263 | JavaScript and other remote clients to emit events. This is an HTTP 264 | interface, and could exist in many places: 265 | 266 | 1. Inside JupyterHub, and all events can be sent via it. 267 | 2. Inside Jupyter Notebook Server, so it can collect info from 268 | the user running it. The Notebook Server can then send it 269 | someplace. 270 | 3. A standalone service that can receive events from everywhere. 271 | 272 | By separating (2) and (3), we can cater to a variety of scales 273 | and use cases. 274 | 275 | 4. 
*EventLogging JavaScript API* 276 | 277 | This is the equivalent of (2), but in JavaScript. 278 | 279 | It should receive configuration in a similar way as (2) and (3), but be 280 | able to send events to various sinks *directly* instead of being forced to 281 | go through (3). This is very useful in cases where events should be sent 282 | directly to a pre-existing collection service - such as Google Analytics 283 | or mixpanel. Those can be supported as various sinks that plug into this 284 | API, so the code that is emitting the events can remain agnostic to where 285 | they are being sent. 286 | 287 | The default sink can be (3), but we should make sure we implement 288 | at least one more sink to begin with so we don't overfit our API design. 289 | 290 | We should be careful to make sure that these events still conform to 291 | schemas, need to be explicitly turned on in configuration, and follow 292 | all the other expectations we have around eventlogging data. 293 | 294 | 5. *User consent / information UI* 295 | 296 | Every application collecting data should have a way to make it 297 | clear to the user what is being collected, and possibly ways 298 | to turn it off. We could possibly let admins configure opt-in / 299 | opt-out options. 300 | 301 | ## Schema naming recommendations 302 | 303 | Schema naming conventions are very important, and affect multiple stakeholders. 304 | 305 | 1. **Analysts** are affected the most. When looking at event data, 306 | they should have an easy, reliable way to get the JSON schema 307 | referenced there. This schema will have documentation describing 308 | the fields, which should be of immense help in understanding the 309 | data they are working with. 310 | 311 | 2. **Developers** want to avoid cumbersome, hard-to-remember names when 312 | recording events. They might also have private schemas they do 313 | not want to publicly publish. There should also be no central 314 | naming authority for event schemas, as that will slow down 315 | development. They also want their code to be fast, so recording 316 | events should never require a network call to fetch schemas. 317 | 318 | So the goal should be to provide a set of naming recommendations that 319 | can be implemented as a standalone utility for analysts to get the JSON 320 | schema from a given schema name. This could even be deployed as a 321 | small public tool that can resolve public schemas and display them 322 | in a nice readable format, like [this](https://meta.wikimedia.org/wiki/Schema:ServerSideAccountCreation). 323 | 324 | There's lots of prior art here, but we'll steal most of our 325 | recommendations from [go's remote package naming conventions](https://golang.org/cmd/go/#hdr-Remote_import_paths). 326 | 327 | 1. All schema names **must** be valid URIs, with no protocol part. 328 | This is the **only** requirement - these URIs need not actually 329 | resolve to anything. This lets developers get going quickly, 330 | and makes private schemas easy to do. 331 | 332 | 2. `jupyter.org` URIs will be special cased. `jupyter.org/<project>/<schema-name>` 333 | would resolve to: 334 | 335 | a. The github repository `jupyter/<project>` 336 | b. The directory `event-schemas/<schema-name>/` in the project 337 | c. Files inside this directory should be named `v<version>.json`, where 338 | `<version>` is the integer version of the schema being used. All 339 | schema versions must be preserved in the repository. 340 | 341 | 3. 
`lab.jupyter.org`, `hub.jupyter.org` and `ipython.jupyter.org` URIs will 342 | also be special-cased, pointing to projects under the `jupyterlab`, 343 | `jupyterhub` and `ipython` github organizations respectively. 344 | 345 | 4. For arbitrary other public projects, we assume they most likely use a 346 | public version control repository. Here we borrow from go's remote syntax 347 | for vcs repos - looking for a version control system specific suffix in 348 | any part of the path. 349 | 350 | For example, if I want to add eventlogging to the project hosted at 351 | `https://github.com/yuvipanda/hubploy.git`, the recommendation is 352 | that I use URIs of the form `github.com/yuvipanda/hubploy.git/<schema-name>`. 353 | The resolver can then look for the directory `event-schemas/<schema-name>/` 354 | after cloning the repository, and find files in there of the form 355 | `v<version>.json`, same as the `jupyter.org` special case. 356 | 357 | The suggestion is that `jupyter.org` and other special cases are just 358 | shortcuts for expanding into the full git repo URL form. 359 | 360 | 5. If a git repository is not used, the URI is treated as an https endpoint, 361 | and fetched. Different actions are taken based on the `Content-Type` of 362 | the response. 363 | 364 | a. If `application/json` or `application/json+schema`, the response is 365 | assumed to be the actual schema. 366 | 367 | b. If `text/html`, we look for a `<link>` tag that can point us to a 368 | different URI to resolve. We use the standard `rel='alternate'` 369 | attribute, with `type='application/json+schema'`, and the `href` 370 | attribute pointing to another URI. The entire resolution algorithm 371 | is then run on this URI, until a schema is produced. 372 | 373 | This is slightly different than what go does, since they just invented 374 | their own `<meta>` tag. We instead use existing standard `<link>` 375 | tags for the same purpose. 376 | 377 | This lets URLs provide a human-readable version directly with HTML 378 | for user consumption, with a link to a machine-readable version 379 | for computer usage. 380 | 381 | 6. If none of these work, the URI is assumed to be known to the end user. 382 | This might be likely for internal, private schemas that are made available 383 | to specific internal users only. Even for private schemas, ideally developers 384 | will follow the same naming recommendations as specified here - just for 385 | the sake of analysts. However, they might already have other systems 386 | of documentation in place, and we do not enforce any of this. 387 | 388 | A small reference tool that implements schema resolution using these rules 389 | should be produced to see what problems we end up with, and tune the design 390 | accordingly (a rough sketch of such a resolver is included after the open 391 | questions below). 392 | 393 | ## Open questions 394 | 395 | Here's a list of open questions. 396 | 397 | 1. How do we signal strongly that telemetry / events are never sent 398 | to the Jupyter project / 3rd party unless you explicitly configure 399 | it to do so? This is a common meaning of the word 'telemetry' 400 | today, so we need to make sure we communicate clearly what this 401 | is, what this isn't, and what it can be used for. Same applies 402 | to communicating that nothing is collected or emitted anywhere, 403 | despite the possible presence of emission code in the codebase. 404 | 405 | 2. Add yours here! 
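As a rough starting point for the reference resolution tool mentioned under the schema naming recommendations, here is a minimal sketch of the special-case and plain-https rules. Everything about it is a strawman rather than a settled design: the function name, the assumption that special-cased repositories are fetched from GitHub's raw file host, and the use of `requests` are all illustrative, and the VCS-suffix rule (rule 4) is omitted.

```python
import requests  # assumed dependency for this sketch

# Hosts that are shortcuts for GitHub organizations (rules 2 and 3 above).
SPECIAL_HOSTS = {
    'jupyter.org': 'jupyter',
    'lab.jupyter.org': 'jupyterlab',
    'hub.jupyter.org': 'jupyterhub',
    'ipython.jupyter.org': 'ipython',
}


def resolve_schema(name, version):
    """Resolve a schema name (a URI without protocol part) to a JSON schema."""
    host, _, rest = name.partition('/')
    if host in SPECIAL_HOSTS:
        project, _, schema = rest.partition('/')
        # Expand the shortcut into a raw-file URL inside the git repository.
        url = ('https://raw.githubusercontent.com/{org}/{project}/master/'
               'event-schemas/{schema}/v{version}.json').format(
                   org=SPECIAL_HOSTS[host], project=project,
                   schema=schema, version=version)
        return requests.get(url).json()
    # Rule 4 (.git suffixes for arbitrary repositories) is omitted here.
    # Rule 5: treat the name as a plain https endpoint.
    resp = requests.get('https://' + name)
    if 'json' in resp.headers.get('Content-Type', ''):
        return resp.json()  # the response is the schema itself
    # A text/html response would be scanned for
    # <link rel="alternate" type="application/json+schema" href="...">
    # and the algorithm re-run on that URI (rule 5b, not implemented here).
    raise ValueError('Could not resolve schema: {}'.format(name))
```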
405 |
--------------------------------------------------------------------------------
/proposal/press_release.md:
--------------------------------------------------------------------------------
1 | # Jupyter releases a new Telemetry System
2 |
3 | Today the Jupyter Team announced the release of its newest feature—the Jupyter Telemetry System. This new system enables anyone deploying Jupyter products to safely collect structured “event” data from Jupyter applications. This makes it easier to secure Jupyter deployments, detect intrusions, improve user experience, scale resources effectively, and minimize costs for both administrators and users. It achieves all of this while prioritizing user privacy. The result is an improved Jupyter experience for all.
4 |
5 | Jupyter’s telemetry system has been a long time coming. Members of the Jupyter community have been exploring various ways to collect structured events inside Jupyter applications. It’s no easy task. While event data can improve the development and deployment of Jupyter applications, it’s difficult to create an effective, safe, and secure event-logging system.
6 |
7 | This new telemetry system strikes a successful balance between effective event-logging and honoring user privacy. Transparency is a key tenet. Users are always informed when data is being collected. In most cases, users can “opt in” and dictate which events are logged. In stricter Jupyter environments where users cannot dictate the event-logging (e.g. FedRAMP-, FERPA-, HIPAA-, or GDPR-compliant services), users will be informed by clear and transparent messages from Jupyter’s telemetry system.
8 |
9 | While staying user-friendly, the telemetry system is both flexible and extensible. It is highly configurable, integrates seamlessly with Jupyter-based applications, and can be extended to work with non-Jupyter applications. Jupyter extension developers can leverage its well-defined APIs and libraries to publish events to various storage systems. Jupyter administrators can dictate what events are collected and when. It can store code executed on Jupyter kernels, track clicks by users inside JupyterLab, and record which datasets have been opened. For more sensitive or secure Jupyter deployments, it can also track personally identifiable information (PII).
10 |
11 | The system provides the necessary building blocks for telemetry. It includes an extensible event schema, APIs and libraries for Jupyter Server, JupyterLab, JupyterHub and Jupyter Classic, and a flexible configuration system to work with any deployment.
12 |
13 | The Jupyter Team remains committed to providing flexible and extensible building blocks for interactive computing. Tools like the Jupyter Notebook, JupyterLab, JupyterHub, etc. have expanded the world’s interactive computing toolkit. The Jupyter telemetry system offers new building blocks that improve these tools and increase security while protecting users’ privacy. Begin using these building blocks today. Telemetry is available [here](https://github.com/jupyter/telemetry).
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 |
2 |
3 | [build-system]
4 | requires = ["jupyter_packaging~=0.9,<2"]
5 | build-backend = "jupyter_packaging.build_api"
6 |
7 | [tool.check-manifest]
8 | ignore = ["tbump.toml", ".*", "*.yml", "conftest.py"]
9 |
10 | [tool.pytest.ini_options]
11 | addopts = "--doctest-modules"
12 | testpaths = [
13 |     "jupyter_telemetry/"
14 | ]
15 |
16 | [tool.jupyter-releaser]
17 | skip = ["check-links"]
18 |
19 | [tool.tbump.version]
20 | current = "0.2.0.dev0"
21 | regex = '''
22 |     (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)
23 |     ((?P<channel>a|b|rc|.dev)(?P<release>\d+))?
24 | '''
25 |
26 | [tool.tbump.git]
27 | message_template = "Bump to {new_version}"
28 | tag_template = "v{new_version}"
29 |
30 | [[tool.tbump.file]]
31 | src = "jupyter_telemetry/_version.py"
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = jupyter_telemetry
3 | version = attr: jupyter_telemetry._version.__version__
4 | description = Jupyter telemetry library
5 | long_description = file: README.md
6 | long_description_content_type = text/markdown
7 | license_file = LICENSE
8 | author = Jupyter Development Team
9 | author_email = jupyter@googlegroups.com
10 | url = https://jupyter.org
11 | platforms = Linux, Mac OS X, Windows
12 | keywords = jupyter
13 | classifiers =
14 |     Intended Audience :: Developers
15 |     Intended Audience :: System Administrators
16 |     Intended Audience :: Science/Research
17 |     License :: OSI Approved :: BSD License
18 |     Programming Language :: Python
19 |     Programming Language :: Python :: 3.6
20 |     Programming Language :: Python :: 3.7
21 |     Programming Language :: Python :: 3.8
22 |     Programming Language :: Python :: 3.9
23 |
24 | [options]
25 | zip_safe = False
26 | include_package_data = True
27 | packages = find:
28 | python_requires = >=3.6
29 | install_requires =
30 |     jsonschema
31 |     python-json-logger
32 |     traitlets
33 |     ruamel.yaml
34 |
35 | [options.extras_require]
36 | test =
37 |     flake8
38 |     pytest
39 |     pytest-cov
40 |     codecov
41 | docs =
42 |     pydata_sphinx_theme
43 |     myst_parser
44 |
45 | [options.packages.find]
46 | exclude = docs*
47 |
48 | [flake8]
49 | ignore = E, C, W, F401, F403, F811, F841, E402, I100, I101, D400
50 | builtins = c, get_config
51 | exclude =
52 |     .cache,
53 |     .github,
54 |     docs,
55 |     setup.py
56 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | setup()
3 |
--------------------------------------------------------------------------------