├── skops ├── cli │ ├── __init__.py │ ├── _utils.py │ ├── tests │ │ ├── test_entrypoint.py │ │ └── test_convert.py │ ├── entrypoint.py │ └── _convert.py ├── utils │ └── __init__.py ├── hub_utils │ ├── tests │ │ └── common.py │ └── __init__.py ├── io │ ├── __init__.py │ ├── _trusted_types.py │ ├── exceptions.py │ ├── _scipy.py │ ├── tests │ │ ├── test_utils.py │ │ ├── test_audit.py │ │ └── _utils.py │ ├── _utils.py │ ├── _persist.py │ ├── _sklearn.py │ └── _numpy.py ├── card │ ├── __init__.py │ ├── tests │ │ ├── examples │ │ │ ├── specter.md.diff │ │ │ ├── vit-base-patch32-224-in21k.md.diff │ │ │ ├── specter.md │ │ │ ├── gpt2.md.diff │ │ │ ├── clip-vit-large-patch14.md.diff │ │ │ ├── toy-example.md.diff │ │ │ ├── bert-base-uncased.md.diff │ │ │ ├── toy-example.md │ │ │ ├── vit-base-patch32-224-in21k.md │ │ │ ├── clip-vit-large-patch14.md │ │ │ ├── gpt2.md │ │ │ └── bert-base-uncased.md │ │ └── test_parser.py │ ├── default_template.md │ └── _templates.py ├── conftest.py ├── __init__.py └── _min_dependencies.py ├── docs ├── requirements.txt ├── images │ └── logo.png ├── _authors.rst ├── installation.rst ├── modules │ └── classes.rst ├── Makefile ├── make.bat ├── community.rst ├── index.rst ├── changes.rst ├── conf.py ├── model_card.rst ├── hf_hub.rst └── persistence.rst ├── setup.cfg ├── MANIFEST.in ├── examples ├── README.rst ├── plot_hf_hub.py ├── plot_text_classification.py └── plot_model_card.py ├── Makefile ├── .codecov.yml ├── .readthedocs.yml ├── .github ├── dependabot.yml └── workflows │ ├── clean-skops-user.yml │ ├── PULL_REQUEST_TEMPLATE.md │ ├── publish-pypi.yml │ └── build-test.yml ├── .pre-commit-config.yaml ├── LICENSE ├── scripts └── clean_skops.py ├── pyproject.toml ├── .gitignore ├── setup.py ├── README.rst └── CONTRIBUTING.rst /skops/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /skops/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | -e .[docs] 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 99 3 | enable-extensions = C, G 4 | -------------------------------------------------------------------------------- /docs/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/merveenoyan/skops/main/docs/images/logo.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.rst 3 | include skops/card/default_template.md 4 | -------------------------------------------------------------------------------- /examples/README.rst: -------------------------------------------------------------------------------- 1 | skops Gallery 2 | ============= 3 | 4 | Here are the examples to use this library. 
 5 | --------------------------------------------------------------------------------
/skops/hub_utils/tests/common.py:
--------------------------------------------------------------------------------
 1 | # This is the token for the skops user on the hub, used for the CI.
 2 | HF_HUB_TOKEN = "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"
 3 | --------------------------------------------------------------------------------
/skops/io/__init__.py:
--------------------------------------------------------------------------------
 1 | from ._persist import dump, dumps, get_untrusted_types, load, loads
 2 |
 3 | __all__ = ["dumps", "load", "loads", "dump", "get_untrusted_types"]
 4 | --------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # A makefile to simplify repetitive steps
 2 |
 3 | package:
 4 | 	python setup.py bdist_wheel
 5 | 	python setup.py sdist
 6 |
 7 | pypi-upload:
 8 | 	twine upload --verbose dist/*
 9 | --------------------------------------------------------------------------------
/skops/card/__init__.py:
--------------------------------------------------------------------------------
 1 | from ._model_card import Card, metadata_from_config
 2 | from ._parser import parse_modelcard
 3 |
 4 | __all__ = ["Card", "metadata_from_config", "parse_modelcard"]
 5 | --------------------------------------------------------------------------------
/.codecov.yml:
--------------------------------------------------------------------------------
 1 | comment: false
 2 | codecov:
 3 |   branch: main
 4 |   require_ci_to_pass: true
 5 |   notify:
 6 |     after_n_builds: 12
 7 |     wait_for_ci: true
 8 | ignore:
 9 | - "skops/_min_dependencies.py"  # This file is not tested, and won't be.
10 | --------------------------------------------------------------------------------
/docs/_authors.rst:
--------------------------------------------------------------------------------
 1 |
 2 | .. role:: raw-html(raw)
 3 |     :format: html
 4 |
 5 |
 6 | .. _Adrin Jalali: https://github.com/adrinjalali
 7 |
 8 | .. _Benjamin Bossan: https://github.com/BenjaminBossan
 9 |
10 | .. _Merve Noyan: https://github.com/merveenoyan
11 | --------------------------------------------------------------------------------
/skops/card/tests/examples/specter.md.diff:
--------------------------------------------------------------------------------
1 | ---
2 | +++
3 | @@ -3 +3 @@
4 | -## SPECTER
5 | +# SPECTER
6 | @@ -15 +15 @@
7 | -Authors: *Arman Cohan, Sergey Feldman, Iz Beltagy, Doug Downey, Daniel S. Weld*
8 | +Authors: _Arman Cohan, Sergey Feldman, Iz Beltagy, Doug Downey, Daniel S. Weld_
9 | --------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
 1 | .. _installation:
 2 |
 3 | Installation
 4 | ============
 5 |
 6 | To install skops, run the following command in your Python environment:
 7 |
 8 | .. code-block:: bash
 9 |
10 |     python -m pip install skops
11 |
12 | If you're interested in contributing to skops, please follow the `contribution
13 | guideline <https://github.com/skops-dev/skops/blob/main/CONTRIBUTING.rst>`__
14 | instead.
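Once installed, you can verify that the package is importable with a minimal check
(it only imports skops and prints the installed version, which is defined in
``skops/__init__.py``):

.. code-block:: python

    import skops

    print(skops.__version__)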
15 | --------------------------------------------------------------------------------
/skops/hub_utils/__init__.py:
--------------------------------------------------------------------------------
 1 | from ._hf_hub import (
 2 |     add_files,
 3 |     download,
 4 |     get_config,
 5 |     get_model_output,
 6 |     get_requirements,
 7 |     init,
 8 |     push,
 9 |     update_env,
10 | )
11 |
12 | __all__ = [
13 |     "add_files",
14 |     "download",
15 |     "get_config",
16 |     "get_requirements",
17 |     "get_model_output",
18 |     "init",
19 |     "push",
20 |     "update_env",
21 | ]
22 | --------------------------------------------------------------------------------
/skops/io/_trusted_types.py:
--------------------------------------------------------------------------------
 1 | from sklearn.utils import all_estimators
 2 |
 3 | from ._utils import get_type_name
 4 |
 5 | PRIMITIVES_TYPES = [int, float, str, bool]
 6 |
 7 | PRIMITIVE_TYPE_NAMES = ["builtins." + t.__name__ for t in PRIMITIVES_TYPES]
 8 |
 9 | SKLEARN_ESTIMATOR_TYPE_NAMES = [
10 |     get_type_name(estimator_class)
11 |     for _, estimator_class in all_estimators()
12 |     if get_type_name(estimator_class).startswith("sklearn.")
13 | ]
14 | --------------------------------------------------------------------------------
/skops/cli/_utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 |
 3 |
 4 | def get_log_level(level: int = 0) -> int:
 5 |     """Takes in verbosity from a CLI entrypoint (number of times -v was specified),
 6 |     and returns the corresponding log level"""
 7 |
 8 |     all_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
 9 |
10 |     if level >= len(all_levels):
11 |         level = len(all_levels) - 1
12 |     elif level < 0:
13 |         level = 0
14 |
15 |     return all_levels[level]
16 | --------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yaml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 |
 5 | # Required
 6 | version: 2
 7 |
 8 | # Set the version of Python and other tools you might need
 9 | build:
10 |   os: ubuntu-20.04
11 |   tools:
12 |     python: "3.10"
13 |
14 | # Build documentation in the docs/ directory with Sphinx
15 | sphinx:
16 |   configuration: docs/conf.py
17 |
18 | python:
19 |   install:
20 |     - requirements: docs/requirements.txt
21 | --------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # To get started with Dependabot version updates, you'll need to specify which
 2 | # package ecosystems to update and where the package manifests are located.
 3 | # Please see the documentation for all configuration options:
 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
 5 |
 6 | version: 2
 7 | updates:
 8 |   - package-ecosystem: "github-actions"
 9 |     directory: "/" # Location of package manifests
10 |     schedule:
11 |       interval: "weekly"
12 | --------------------------------------------------------------------------------
/skops/card/tests/examples/vit-base-patch32-224-in21k.md.diff:
--------------------------------------------------------------------------------
1 | ---
2 | +++
3 | @@ -17 +17 @@
4 | -Note that this model does not provide any fine-tuned heads, as these were zero'd by Google researchers.
However, the model does include the pre-trained pooler, which can be used for downstream tasks (such as image classification). 5 | +Note that this model does not provide any fine-tuned heads, as these were zero’d by Google researchers. However, the model does include the pre-trained pooler, which can be used for downstream tasks (such as image classification). 6 | -------------------------------------------------------------------------------- /skops/io/exceptions.py: -------------------------------------------------------------------------------- 1 | class UnsupportedTypeException(TypeError): 2 | """Raise when an object of this type is known to be unsupported""" 3 | 4 | def __init__(self, obj): 5 | super().__init__( 6 | f"Objects of type {obj.__class__.__name__} are not supported yet." 7 | ) 8 | 9 | 10 | class UntrustedTypesFoundException(TypeError): 11 | """Raise when some untrusted objects are found in the file.""" 12 | 13 | def __init__(self, unsafe): 14 | super().__init__(f"Untrusted types found in the file: {sorted(unsafe)}.") 15 | -------------------------------------------------------------------------------- /skops/conftest.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def pandas_not_installed(): 8 | # patch import so that it raises an ImportError when trying to import 9 | # pandas. This works because pandas is only imported lazily. 10 | orig_import = __import__ 11 | 12 | def mock_import(name, *args, **kwargs): 13 | if name == "pandas": 14 | raise ImportError 15 | return orig_import(name, *args, **kwargs) 16 | 17 | with patch("builtins.__import__", side_effect=mock_import): 18 | yield 19 | -------------------------------------------------------------------------------- /docs/modules/classes.rst: -------------------------------------------------------------------------------- 1 | .. _api_ref: 2 | 3 | ============= 4 | API Reference 5 | ============= 6 | 7 | This is the class and function reference of skops. 8 | 9 | :mod:`skops.hf_hub`: Hugging Face Hub Integration 10 | ================================================= 11 | .. automodule:: skops.hub_utils 12 | :members: 13 | 14 | :mod:`skops.card`: Model Card Utilities 15 | ======================================= 16 | .. automodule:: skops.card 17 | :members: 18 | 19 | :mod:`skops.io`: Secure persistence 20 | =================================== 21 | .. 
automodule:: skops.io 22 | :members: 23 | -------------------------------------------------------------------------------- /.github/workflows/clean-skops-user.yml: -------------------------------------------------------------------------------- 1 | name: clean-skops-user 2 | 3 | on: 4 | schedule: 5 | # * is a special character in YAML so you have to quote this string 6 | - cron: '10 1 * * *' 7 | 8 | jobs: 9 | clean-skops-user: 10 | 11 | runs-on: ubuntu-latest 12 | if: "github.repository == 'skops-dev/skops'" 13 | 14 | # Timeout: https://stackoverflow.com/a/59076067/4521646 15 | timeout-minutes: 35 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Set up Python 20 | uses: actions/setup-python@v4 21 | - name: Install Requirements 22 | run: pip install huggingface_hub 23 | - name: run cleanup 24 | run: echo "y" | python scripts/clean_skops.py 25 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | clean: 18 | rm -rf $(BUILDDIR) 19 | rm -rf auto_examples 20 | 21 | # Catch-all target: route all unknown targets to Sphinx using the new 22 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 23 | %: Makefile 24 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 25 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.1.0 4 | hooks: 5 | - id: check-yaml 6 | exclude: .github/conda/meta.yaml 7 | - id: end-of-file-fixer 8 | - id: trailing-whitespace 9 | exclude: skops/card/tests/examples 10 | - id: check-case-conflict 11 | - id: check-merge-conflict 12 | - repo: https://github.com/psf/black 13 | rev: 22.6.0 14 | hooks: 15 | - id: black 16 | - repo: https://github.com/pycqa/flake8 17 | rev: 4.0.1 18 | hooks: 19 | - id: flake8 20 | types: [file, python] 21 | - repo: https://github.com/PyCQA/isort 22 | rev: 5.10.1 23 | hooks: 24 | - id: isort 25 | - repo: https://github.com/pre-commit/mirrors-mypy 26 | rev: v0.971 27 | hooks: 28 | - id: mypy 29 | args: [--config-file=pyproject.toml] 30 | additional_dependencies: [types-requests>=2.28.5] 31 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. 
Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (C) 2021 Hugging Face Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /skops/card/tests/examples/specter.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: en 3 | thumbnail: "https://camo.githubusercontent.com/7d080b7a769f7fdf64ac0ebeb47b039cb50be35287e3071f9d633f0fe33e7596/68747470733a2f2f692e6962622e636f2f33544331576d472f737065637465722d6c6f676f2d63726f707065642e706e67" 4 | license: apache-2.0 5 | datasets: 6 | - SciDocs 7 | metrics: 8 | - F1 9 | - accuracy 10 | - map 11 | - ndcg 12 | --- 13 | 14 | ## SPECTER 15 | 16 | 17 | 18 | SPECTER is a pre-trained language model to generate document-level embedding of documents. It is pre-trained on a a powerful signal of document-level relatedness: the citation graph. Unlike existing pretrained language models, SPECTER can be easily applied to downstream applications without task-specific fine-tuning. 19 | 20 | Paper: [SPECTER: Document-level Representation Learning using Citation-informed Transformers](https://arxiv.org/pdf/2004.07180.pdf) 21 | 22 | Original Repo: [Github](https://github.com/allenai/specter) 23 | 24 | Evaluation Benchmark: [SciDocs](https://github.com/allenai/scidocs) 25 | 26 | Authors: *Arman Cohan, Sergey Feldman, Iz Beltagy, Doug Downey, Daniel S. Weld* 27 | -------------------------------------------------------------------------------- /scripts/clean_skops.py: -------------------------------------------------------------------------------- 1 | """This script removes all repos under the skops user on HF Hub. 2 | 3 | The user is used for the CI and if there are leftover repos, they can be 4 | removed. 
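In the CI it is run non-interactively by piping in the confirmation, as in the
clean-skops-user workflow:

    echo "y" | python scripts/clean_skops.py

(see .github/workflows/clean-skops-user.yml).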
5 | """ 6 | 7 | import datetime 8 | 9 | from huggingface_hub import HfApi 10 | 11 | # This is the token for the skops user. TODO remove eventually, see issue #47 12 | token = "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" 13 | client = HfApi(token=token) 14 | user = client.whoami()["name"] 15 | answer = input( 16 | f"Are you sure you want to delete all repos under {user} older than 7 days? (y/[n])" 17 | ) 18 | if answer != "y": 19 | exit(1) 20 | models = [x for x in client.list_models(author=user)] 21 | 22 | print(f"Found {len(models)} models, checking their age...") 23 | 24 | for model_info in models: 25 | info = client.model_info(model_info.modelId) 26 | age = ( 27 | datetime.datetime.now() 28 | - datetime.datetime.fromisoformat(info.lastModified.rsplit(".", 1)[0]) 29 | ).days 30 | if age < 7: 31 | print(f"Skipping model: {model_info.modelId}, age: {age}") 32 | continue 33 | print(f"deleting {model_info.modelId}, age: {age} days") 34 | client.delete_repo(model_info.modelId) 35 | -------------------------------------------------------------------------------- /skops/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | # PEP0440 compatible formatted version, see: 4 | # https://www.python.org/dev/peps/pep-0440/ 5 | # 6 | # Generic release markers: 7 | # X.Y.0 # For first release after an increment in Y 8 | # X.Y.Z # For bugfix releases 9 | # 10 | # Admissible pre-release markers: 11 | # X.Y.ZaN # Alpha release 12 | # X.Y.ZbN # Beta release 13 | # X.Y.ZrcN # Release Candidate 14 | # X.Y.Z # Final release 15 | # 16 | # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. 17 | # 'X.Y.dev0' is the canonical version of 'X.Y.dev' 18 | # 19 | __version__ = "0.6.dev0" 20 | 21 | try: 22 | # This variable is injected in the __builtins__ by the build 23 | # process. It is used to enable importing subpackages of skops when 24 | # the binaries are not built 25 | # mypy error: Cannot determine type of '__SKOPS_SETUP__' 26 | __SKOPS_SETUP__ # type: ignore 27 | except NameError: 28 | __SKOPS_SETUP__ = False 29 | 30 | if __SKOPS_SETUP__: 31 | sys.stderr.write("Partial import of the library during the build process.\n") 32 | # We are not importing the rest of the library during the build 33 | # process, as it may not be compiled yet or cause immature import 34 | -------------------------------------------------------------------------------- /.github/workflows/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 7 | 8 | #### Reference Issues/PRs 9 | 16 | 17 | 18 | #### What does this implement/fix? Explain your changes. 19 | 20 | 21 | #### Any other comments? 22 | 23 | 24 | -------------------------------------------------------------------------------- /skops/cli/tests/test_entrypoint.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | import sys 4 | from unittest import mock 5 | 6 | import pytest 7 | 8 | from skops.cli.entrypoint import main_cli 9 | 10 | 11 | class TestEntrypoint: 12 | """Integration tests that check that entrypoint calls pass through correctly. 13 | Full coverage of individual entrypoint calls should be done in their own classes. 14 | """ 15 | 16 | @pytest.fixture(autouse=True) 17 | def clear_argv(self): 18 | # Required to clear argv in case Pytest is called on this specific function. 
19 | # Otherwise, clogs parser.parse_known_args() in argparse 20 | sys.argv = [""] 21 | 22 | @mock.patch("skops.cli._convert._convert_file") 23 | def test_convert_works_as_expected( 24 | self, 25 | convert_file_mock: mock.MagicMock, 26 | caplog, 27 | ): 28 | """ 29 | Intended as a unit test to make sure, 30 | given 'convert' as the first argument, 31 | the parser is configured correctly 32 | """ 33 | 34 | args = ["convert", "abc.def"] 35 | 36 | main_cli(args) 37 | convert_file_mock.assert_called_once_with( 38 | input_file="abc.def", output_file=pathlib.Path.cwd() / "abc.skops" 39 | ) 40 | 41 | assert caplog.at_level(logging.WARNING) 42 | -------------------------------------------------------------------------------- /docs/community.rst: -------------------------------------------------------------------------------- 1 | .. _community: 2 | 3 | Community 4 | --------- 5 | Our community works mostly on `GitHub `__, 6 | directly on issues and pull requests. 7 | 8 | If you encounter any issues, please don't hesitate to open an issue on our 9 | repository. 10 | 11 | If you'd like to contribute to the project, please make sure you read our 12 | `contributing guidelines 13 | `__. 14 | 15 | 16 | Discord 17 | ~~~~~~~ 18 | We also have a place on Hugging Face's discord server. We're happy to see you 19 | there and answer any questions you might have. You can join using this `invite 20 | link `__. Once you join, first you need to accept 21 | the rules on the server regarding respectful and harassment free communication, 22 | and then you can head to the ``#role-assignment`` channel where you'll find and 23 | ``Open Source ML`` button. Clicking on that will give you access to a few 24 | channels and categories, including the ``skops`` category. 25 | 26 | Maintainers 27 | ----------- 28 | Current maintainers of the project are (in alphabetical order): 29 | 30 | - `Adrin Jalali `__ 31 | - `Benjamin Bossan `__ 32 | - `Erin Aho `__ 33 | - `Merve Noyan `__ 34 | -------------------------------------------------------------------------------- /skops/card/tests/examples/gpt2.md.diff: -------------------------------------------------------------------------------- 1 | --- 2 | +++ 3 | @@ -89 +88,0 @@ 4 | -> 5 | @@ -96 +95 @@ 6 | -Here's an example of how the model can have biased predictions: 7 | +Here’s an example of how the model can have biased predictions: 8 | @@ -144,5 +143,4 @@ 9 | -| Dataset | LAMBADA | LAMBADA | CBT-CN | CBT-NE | WikiText2 | PTB | enwiki8 | text8 | WikiText103 | 1BW | 10 | -|:--------:|:-------:|:-------:|:------:|:------:|:---------:|:------:|:-------:|:------:|:-----------:|:-----:| 11 | -| (metric) | (PPL) | (ACC) | (ACC) | (ACC) | (PPL) | (PPL) | (BPB) | (BPC) | (PPL) | (PPL) | 12 | -| | 35.13 | 45.99 | 87.65 | 83.4 | 29.41 | 65.85 | 1.16 | 1,17 | 37.50 | 75.20 | 13 | - 14 | +| Dataset | LAMBADA | CBT-CN | CBT-NE | WikiText2 | PTB | enwiki8 | text8 | WikiText103 | 1BW | 15 | +|-----------|-----------|----------|----------|-------------|-------|-----------|---------|---------------|-------| 16 | +| (metric) | (ACC) | (ACC) | (ACC) | (PPL) | (PPL) | (BPB) | (BPC) | (PPL) | (PPL) | 17 | +| | 45.99 | 87.65 | 83.4 | 29.41 | 65.85 | 1.16 | 1,17 | 37.50 | 75.20 | 18 | @@ -161 +159 @@ 19 | - 20 | + 21 | -------------------------------------------------------------------------------- /.github/workflows/publish-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish to (Test)PyPI 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | version: 6 | 
description: 'Version to upload to pypi' 7 | required: true 8 | pypi_repo: 9 | description: 'Repo to upload to ("testpypi" or "pypi")' 10 | default: 'testpypi' 11 | required: true 12 | 13 | jobs: 14 | publish: 15 | 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | with: 21 | ref: ${{ github.event.inputs.version }} 22 | 23 | - uses: actions/setup-python@v4 24 | with: 25 | python-version: '3.x' 26 | 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install -U pip 30 | python -m pip install -U setuptools wheel twine build 31 | 32 | - name: Generate distribution archives 33 | run: | 34 | python -m build 35 | 36 | - name: Publish package to TestPyPI 37 | uses: pypa/gh-action-pypi-publish@v1.6.4 38 | with: 39 | user: __token__ 40 | password: ${{ secrets.TEST_PYPI_TOKEN }} 41 | repository_url: https://test.pypi.org/legacy/ 42 | if: ${{ github.event.inputs.pypi_repo == 'testpypi' }} 43 | 44 | - name: Publish package to PyPI 45 | uses: pypa/gh-action-pypi-publish@v1.6.4 46 | with: 47 | user: __token__ 48 | password: ${{ secrets.PYPI_TOKEN }} 49 | if: ${{ github.event.inputs.pypi_repo == 'pypi' }} 50 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 88 3 | target_version = ['py38', 'py39', 'py310', 'py311'] 4 | preview = true 5 | 6 | [tool.isort] 7 | profile = "black" 8 | 9 | [tool.pytest.ini_options] 10 | filterwarnings = [ 11 | "error::DeprecationWarning", 12 | "error::FutureWarning", 13 | # TODO: remove when no longer supporting sklearn v1.0 14 | # numpy and scipy deprecation warnings in sklearn: 15 | 'ignore:\n\n \`numpy.distutils\` is deprecated since NumPy:DeprecationWarning', 16 | # https://github.com/scikit-learn/scikit-learn/issues/24080 17 | "ignore:The \\'sym_pos\\' keyword is deprecated and should be replaced:DeprecationWarning", 18 | # https://github.com/scikit-learn/scikit-learn/pull/23633 19 | "ignore:Unlike other reduction functions:FutureWarning", 20 | # https://github.com/scikit-learn/scikit-learn/pull/25157 21 | "ignore:\\w+ is deprecated. Use files\\(\\) instead:DeprecationWarning" 22 | ] 23 | markers = [ 24 | "network: marks tests as requiring internet (deselect with '-m \"not network\"')", 25 | "inference: marks tests that call inference API (deselect with '-m \"not inference\"')", 26 | ] 27 | addopts = "--cov=skops --cov-report=term-missing --doctest-modules" 28 | 29 | [tool.coverage.run] 30 | omit = [ 31 | "skops/**/test_*.py", 32 | "skops/_min_dependencies.py", 33 | "skops/conftest.py", 34 | ] 35 | 36 | [tool.mypy] 37 | exclude = "(\\w+/)*test_\\w+\\.py$" 38 | ignore_missing_imports = true 39 | no_implicit_optional = true 40 | -------------------------------------------------------------------------------- /skops/card/default_template.md: -------------------------------------------------------------------------------- 1 | --- 2 | {{ card_data }} 3 | --- 4 | 5 | # Model description 6 | 7 | {{ model_description | default("[More Information Needed]", true)}} 8 | 9 | ## Intended uses & limitations 10 | 11 | {{ limitations | default("[More Information Needed]", true)}} 12 | 13 | ## Training Procedure 14 | 15 | ### Hyperparameters 16 | 17 | The model is trained with below hyperparameters. 18 | 19 |
20 | Click to expand 21 | 22 | {{ hyperparameter_table }} 23 | 24 |
25 | 26 | ### Model Plot 27 | 28 | The model plot is below. 29 | 30 | {{ model_plot }} 31 | 32 | ## Evaluation Results 33 | 34 | You can find the details about evaluation process and the evaluation results. 35 | 36 | {{ eval_methods }} 37 | 38 | {{ eval_results | default("[More Information Needed]", true)}} 39 | 40 | # How to Get Started with the Model 41 | 42 | Use the code below to get started with the model. 43 | 44 | ```python 45 | {{ get_started_code | default("[More Information Needed]", true)}} 46 | ``` 47 | 48 | 49 | # Model Card Authors 50 | 51 | This model card is written by following authors: 52 | 53 | {{ model_card_authors | default("[More Information Needed]", true)}} 54 | 55 | # Model Card Contact 56 | 57 | You can contact the model card authors through following channels: 58 | {{ model_card_contact | default("[More Information Needed]", true)}} 59 | 60 | # Citation 61 | 62 | Below you can find information related to citation. 63 | 64 | **BibTeX:** 65 | ``` 66 | {{ citation_bibtex | default("[More Information Needed]", true)}} 67 | ``` 68 | -------------------------------------------------------------------------------- /skops/card/tests/examples/clip-vit-large-patch14.md.diff: -------------------------------------------------------------------------------- 1 | --- 2 | +++ 3 | @@ -23 +22,0 @@ 4 | - 5 | @@ -28 +26,0 @@ 6 | - 7 | @@ -51 +48,0 @@ 8 | - 9 | @@ -72,2 +68,0 @@ 10 | - 11 | - 12 | @@ -81,2 +75,0 @@ 13 | - 14 | - 15 | @@ -132,3 +125 @@ 16 | -We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. 17 | - 18 | - 19 | +We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with 'Middle Eastern' having the highest accuracy (98.4%) and 'White' having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. 20 | -------------------------------------------------------------------------------- /skops/cli/entrypoint.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import skops.cli._convert 4 | 5 | 6 | def main_cli(command_line_args=None): 7 | """Main command line interface entrypoint for all command line Skops methods. 8 | 9 | To add a new entrypoint: 10 | 1. Create a new method to call that accepts a namespace 11 | 2. 
Create a new subparser formatter to define the expected CL arguments 12 | 3. Add those to the function map. 13 | """ 14 | entry_parser = argparse.ArgumentParser( 15 | prog="Skops", 16 | description="Main entrypoint for all command line Skops methods.", 17 | add_help=True, 18 | ) 19 | 20 | subparsers = entry_parser.add_subparsers( 21 | title="Commands", 22 | description="Skops command to call", 23 | dest="cmd", 24 | help="Sub-commands help", 25 | ) 26 | 27 | # function_map should map a command to 28 | # method: the command to call (gets set to default 'func') 29 | # format_parser: the function used to create a subparser for that command 30 | function_map = { 31 | "convert": { 32 | "method": skops.cli._convert.main, 33 | "format_parser": skops.cli._convert.format_parser, 34 | }, 35 | } 36 | 37 | for func_name, values in function_map.items(): 38 | # Add subparser for each function in func map, 39 | # and assigns default func to be "method" from function_map 40 | subparser = subparsers.add_parser(func_name) 41 | subparser.set_defaults(func=values["method"]) 42 | values["format_parser"](subparser) 43 | 44 | # Parse arguments with arg parser for given function in function map, 45 | # Then call the matching method in the function_map with the argument namespace 46 | args = entry_parser.parse_args(command_line_args) 47 | args.func(args) 48 | -------------------------------------------------------------------------------- /skops/card/tests/examples/toy-example.md.diff: -------------------------------------------------------------------------------- 1 | --- 2 | +++ 3 | @@ -0,0 +1 @@ 4 | + 5 | @@ -17 +18 @@ 6 | -Parser doesn’t ‘preserve’ other “quotation” marks. 7 | +Parser doesn’t 'preserve' other "quotation" marks. 8 | @@ -22 +23 @@ 9 | -Another *way* of doing it. 10 | +Another _way_ of doing it. 11 | @@ -26 +27 @@ 12 | -One __way__ of doing it. 13 | +One **way** of doing it. 14 | @@ -45,2 +46,2 @@ 15 | -* using 16 | -* asterisk 17 | +- using 18 | +- asterisk 19 | @@ -56 +57 @@ 20 | -+ using plus 21 | +- using plus 22 | @@ -100 +101 @@ 23 | -[a link](https://skops.readthedocs.io/ "this disappears") 24 | +[a link](https://skops.readthedocs.io/) 25 | @@ -106 +107 @@ 26 | -[a link with reference][1] 27 | +[a link with reference](https://skops.readthedocs.io/) 28 | @@ -109,2 +109,0 @@ 29 | - 30 | -[1]: https://skops.readthedocs.io/ 31 | @@ -164 +163,6 @@ 32 | - 33 | + 34 | + 39 | @@ -167,8 +171,37 @@ 40 | -
Beast of Bodmin
41 | -
A large feline inhabiting Bodmin Moor.
42 | - 43 | -
Morgawr
44 | -
A sea serpent.
45 | - 46 | -
Owlman
47 | -
A giant owl-like creature.
48 | + 49 | +
50 | + 51 | +Beast of Bodmin 52 | + 53 | +
54 | + 55 | +
56 | + 57 | +A large feline inhabiting Bodmin Moor. 58 | + 59 | +
60 | + 61 | +
62 | + 63 | +Morgawr 64 | + 65 | +
66 | + 67 | +
68 | + 69 | +A sea serpent. 70 | + 71 | +
72 | + 73 | +
74 | + 75 | +Owlman 76 | + 77 | +
78 | + 79 | +
80 | + 81 | +A giant owl-like creature. 82 | + 83 | +
84 | + 85 | @@ -180,3 +213,2 @@ 86 | -
87 | -

Divs are possible

88 | -
89 | + 90 | +

Divs are possible

91 | @@ -186 +218 @@ 92 | -A text with 93 | +A text with 94 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | hub/ 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | .pytest_cache/ 6 | .mypy_cache/ 7 | *.py[cod] 8 | *$py.class 9 | py36-64/ 10 | py35-64/ 11 | 12 | # C extensions 13 | *.so 14 | .cython_src/ 15 | cython_src/ 16 | 17 | # Distribution / packaging 18 | .Python 19 | env/ 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | pip-wheel-metadata/* 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *,cover 55 | .hypothesis/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | docs/auto_examples/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # IPython Notebook 80 | .ipynb_checkpoints 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # dotenv 89 | .env* 90 | .~env 91 | .env-3.5.0 92 | .env-3.6.2 93 | 94 | # virtualenv 95 | venv/ 96 | ENV/ 97 | .linenv 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # PyCharm project settings 106 | .idea 107 | 108 | # Node 109 | node_modules 110 | 111 | # Redis 112 | *.rdb 113 | 114 | /tmp 115 | .vscode 116 | 117 | # Vim 118 | *.swp 119 | 120 | 121 | exports 122 | trash 123 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. skops documentation master file, created by 2 | sphinx-quickstart on Thu May 5 11:43:45 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to skops's documentation! 7 | ================================= 8 | 9 | ``skops`` is a Python library helping you share your `scikit-learn 10 | `__ based models and put them in production. 11 | 12 | The library is still a work in progress and under active development. You can 13 | find the source code and the development discussions on `Github 14 | `__. 15 | 16 | The following examples are good starting points: 17 | 18 | - How to create and initialize a scikit-learn model repo: 19 | :ref:`sphx_glr_auto_examples_plot_hf_hub.py`. You can see all the models 20 | uploaded to the Hugging Face Hub using this library `here 21 | `_. 
22 | - How to create a model card for your scikit-learn based model: 23 | :ref:`sphx_glr_auto_examples_plot_model_card.py` 24 | - A text classification example, and its integration with the hub: 25 | :ref:`sphx_glr_auto_examples_plot_text_classification.py` 26 | 27 | In order to better understand the role of each file and their content when 28 | uploaded to Hugging Face Hub, refer to this :ref:`user guide `. You can 29 | refer to :ref:`user guide ` to see how you can leverage model cards 30 | for documenting your scikit-learn models and enabling reproducibility. 31 | 32 | User Guide / API Reference 33 | ========================== 34 | 35 | .. toctree:: 36 | :maxdepth: 2 37 | 38 | installation 39 | hf_hub 40 | model_card 41 | persistence 42 | modules/classes 43 | 44 | Community / About 45 | ================= 46 | .. toctree:: 47 | :maxdepth: 1 48 | 49 | community 50 | changes 51 | 52 | Indices and tables 53 | ================== 54 | 55 | * :ref:`genindex` 56 | * :ref:`modindex` 57 | * :ref:`search` 58 | -------------------------------------------------------------------------------- /skops/_min_dependencies.py: -------------------------------------------------------------------------------- 1 | """All minimum dependencies for scikit-learn.""" 2 | import argparse 3 | 4 | PYTEST_MIN_VERSION = "5.0.1" 5 | 6 | # 'build' and 'install' is included to have structured metadata for CI. 7 | # It will NOT be included in setup's extras_require 8 | # The values are (version_spec, comma separated tags, condition) 9 | # tags can be: 'build', 'install', 'docs', 'examples', 'tests', 'benchmark' 10 | # example: 11 | # "tomli": ("1.1.0", "install", "python_full_version < '3.11.0a7'"), 12 | dependent_packages = { 13 | "scikit-learn": ("0.24", "install", None), 14 | "huggingface_hub": ("0.10.1", "install", None), 15 | "tabulate": ("0.8.8", "install", None), 16 | "pytest": (PYTEST_MIN_VERSION, "tests", None), 17 | "pytest-cov": ("2.9.0", "tests", None), 18 | "flake8": ("3.8.2", "tests", None), 19 | "types-requests": ("2.28.5", "tests", None), 20 | "flaky": ("3.7.0", "tests", None), 21 | "sphinx": ("3.2.0", "docs", None), 22 | "sphinx-gallery": ("0.7.0", "docs", None), 23 | "sphinx-rtd-theme": ("1", "docs", None), 24 | "numpydoc": ("1.0.0", "docs", None), 25 | "sphinx-prompt": ("1.3.0", "docs", None), 26 | "sphinx-issues": ("1.2.0", "docs", None), 27 | "matplotlib": ("3.3", "docs, tests", None), 28 | "packaging": ("17.0", "install", None), 29 | "pandas": ("1", "docs, tests", None), 30 | # required for persistence tests of external libraries 31 | "lightgbm": ("3", "tests", None), 32 | "xgboost": ("1.6", "tests", None), 33 | # TODO: remove condition when catboost supports python 3.11 34 | "catboost": ("1.0", "tests", "python_version < '3.11'"), 35 | } 36 | 37 | 38 | # create inverse mapping for setuptools 39 | tag_to_packages: dict = { 40 | extra: [] 41 | for extra in ["build", "install", "docs", "examples", "tests", "benchmark"] 42 | } 43 | for package, (min_version, extras, condition) in dependent_packages.items(): 44 | for extra in extras.split(", "): 45 | spec = f"{package}>={min_version}" 46 | if condition: 47 | spec += f"; {condition}" 48 | tag_to_packages[extra].append(spec) 49 | 50 | 51 | # Used by CI to get the min dependencies 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser(description="Get min dependencies for a package") 54 | 55 | parser.add_argument("package", choices=dependent_packages) 56 | args = parser.parse_args() 57 | min_version = dependent_packages[args.package][0] 58 | 
print(min_version) 59 | -------------------------------------------------------------------------------- /skops/io/_scipy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import io 4 | from typing import Any, Sequence 5 | 6 | from scipy.sparse import load_npz, save_npz, spmatrix 7 | 8 | from ._audit import Node 9 | from ._utils import LoadContext, SaveContext, get_module 10 | 11 | 12 | def sparse_matrix_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 13 | res = { 14 | "__class__": obj.__class__.__name__, 15 | "__module__": get_module(type(obj)), 16 | "__loader__": "SparseMatrixNode", 17 | } 18 | 19 | data_buffer = io.BytesIO() 20 | save_npz(data_buffer, obj) 21 | # Memoize the object and then check if it's file name (containing 22 | # the object id) already exists. If it does, there is no need to 23 | # save the object again. Memoizitation is necessary since for 24 | # ephemeral objects, the same id might otherwise be reused. 25 | obj_id = save_context.memoize(obj) 26 | f_name = f"{obj_id}.npz" 27 | if f_name not in save_context.zip_file.namelist(): 28 | save_context.zip_file.writestr(f_name, data_buffer.getbuffer()) 29 | 30 | res["type"] = "scipy" 31 | res["file"] = f_name 32 | return res 33 | 34 | 35 | class SparseMatrixNode(Node): 36 | def __init__( 37 | self, 38 | state: dict[str, Any], 39 | load_context: LoadContext, 40 | trusted: bool | Sequence[str] = False, 41 | ) -> None: 42 | super().__init__(state, load_context, trusted) 43 | type = state["type"] 44 | self.trusted = self._get_trusted(trusted, [spmatrix]) 45 | if type != "scipy": 46 | raise TypeError( 47 | f"Cannot load object of type {self.module_name}.{self.class_name}" 48 | ) 49 | 50 | self.children = {"content": io.BytesIO(load_context.src.read(state["file"]))} 51 | 52 | def _construct(self): 53 | # scipy load_npz uses numpy.save with allow_pickle=False under the 54 | # hood, so we're safe using it 55 | return load_npz(self.children["content"]) 56 | 57 | 58 | # tuples of type and function that gets the state of that type 59 | GET_STATE_DISPATCH_FUNCTIONS = [ 60 | # use 'spmatrix' to check if a matrix is a sparse matrix because that is 61 | # what scipy.sparse.issparse checks 62 | (spmatrix, sparse_matrix_get_state), 63 | ] 64 | # tuples of type and function that creates the instance of that type 65 | NODE_TYPE_MAPPING = { 66 | # use 'spmatrix' to check if a matrix is a sparse matrix because that is 67 | # what scipy.sparse.issparse checks 68 | "SparseMatrixNode": SparseMatrixNode, 69 | } 70 | -------------------------------------------------------------------------------- /skops/io/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import scipy 4 | import sklearn.tree 5 | 6 | from skops.io._utils import get_type_name, get_type_paths 7 | 8 | 9 | class UserDefinedClass: 10 | pass 11 | 12 | 13 | class UserDefinedString(str): 14 | """Used to test behaviour of subclasses of strings""" 15 | 16 | pass 17 | 18 | 19 | class TestGetTypeName: 20 | @pytest.mark.parametrize( 21 | "input_type, expected_output", 22 | [ 23 | # Built-In types 24 | (list, "builtins.list"), 25 | (set, "builtins.set"), 26 | (dict, "builtins.dict"), 27 | (str, "builtins.str"), 28 | # Numpy types 29 | (np.ndarray, "numpy.ndarray"), 30 | (np.ma.MaskedArray, "numpy.ma.core.MaskedArray"), 31 | # SciPy types 32 | (scipy.fft.fft, "scipy.fft._basic.fft"), 33 | # SKlearn types 
34 | ( 35 | sklearn.linear_model.HuberRegressor, 36 | "sklearn.linear_model._huber.HuberRegressor", 37 | ), 38 | # User defined types 39 | (UserDefinedClass, "test_utils.UserDefinedClass"), 40 | (UserDefinedString, "test_utils.UserDefinedString"), 41 | ], 42 | ) 43 | def test_for_input_types_returns_as_expected(self, input_type, expected_output): 44 | assert get_type_name(input_type) == expected_output 45 | 46 | 47 | class TestConvertTypesToStrings: 48 | @pytest.mark.parametrize( 49 | "input_list, output_list", 50 | [ 51 | # Happy path 52 | (["builtins.str", "builtins.list"], ["builtins.str", "builtins.list"]), 53 | ([str, list], ["builtins.str", "builtins.list"]), 54 | ([np.ndarray, "builtins.str"], ["numpy.ndarray", "builtins.str"]), 55 | # Edge cases 56 | (None, []), 57 | (int, ["builtins.int"]), 58 | ((list,), ["builtins.list"]), 59 | ([], []), 60 | (UserDefinedString, ["test_utils.UserDefinedString"]), 61 | (UserDefinedString("foo"), ["foo"]), 62 | ], 63 | ids=[ 64 | "As strings", 65 | "As types", 66 | "mixed", 67 | "None", 68 | "Single int type", 69 | "List in tuple", 70 | "Empty list", 71 | "UserDefinedString as type", 72 | "UserDefinedString as instance", 73 | ], 74 | ) 75 | def test_for_normal_input_lists_returns_as_expected(self, input_list, output_list): 76 | assert get_type_paths(input_list) == output_list 77 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # License: 3-clause BSD 3 | import builtins 4 | 5 | from setuptools import setup 6 | 7 | # This is a bit (!) hackish: we are setting a global variable so that the 8 | # main modelcard __init__ can detect if it is being loaded by the setup 9 | # routine, to avoid attempting to load components. 
10 | builtins.__SKOPS_SETUP__ = True # type: ignore 11 | 12 | 13 | import skops # noqa 14 | import skops._min_dependencies as min_deps # noqa 15 | 16 | VERSION = skops.__version__ 17 | 18 | DISTNAME = "skops" 19 | DESCRIPTION = ( 20 | "A set of tools to push scikit-learn based models to and pull from Hugging Face Hub" 21 | ) 22 | with open("README.rst") as f: 23 | LONG_DESCRIPTION = f.read() 24 | MAINTAINER = "Adrin Jalali" 25 | MAINTAINER_EMAIL = "adrin.jalali@gmail.com" 26 | URL = "http://github.com/skops-dev/skops" 27 | DOWNLOAD_URL = "https://pypi.org/project/skops/#files" 28 | LICENSE = "MIT" 29 | PROJECT_URLS = { 30 | "Bug Tracker": "http://github.com/skops-dev/skops/issues", 31 | "Documentation": "http://github.com/skops-dev/skops", 32 | "Source Code": "http://github.com/skops-dev/skops", 33 | } 34 | 35 | 36 | def setup_package(): 37 | package_data = dict( 38 | entry_points={ 39 | "console_scripts": [ 40 | "skops = skops.cli.entrypoint:main_cli", 41 | ], 42 | } 43 | ) 44 | 45 | metadata = dict( 46 | name=DISTNAME, 47 | maintainer=MAINTAINER, 48 | maintainer_email=MAINTAINER_EMAIL, 49 | description=DESCRIPTION, 50 | license=LICENSE, 51 | url=URL, 52 | download_url=DOWNLOAD_URL, 53 | project_urls=PROJECT_URLS, 54 | version=VERSION, 55 | long_description=LONG_DESCRIPTION, 56 | classifiers=[ 57 | "Intended Audience :: Science/Research", 58 | "Intended Audience :: Developers", 59 | "License :: OSI Approved", 60 | "Programming Language :: Python", 61 | "Topic :: Software Development", 62 | "Topic :: Scientific/Engineering", 63 | "Development Status :: 1 - Planning", 64 | "Operating System :: Microsoft :: Windows", 65 | "Operating System :: POSIX", 66 | "Operating System :: Unix", 67 | "Operating System :: MacOS", 68 | "Programming Language :: Python :: 3", 69 | "Programming Language :: Python :: 3.8", 70 | "Programming Language :: Python :: 3.9", 71 | "Programming Language :: Python :: 3.10", 72 | "Programming Language :: Python :: 3.11", 73 | "Programming Language :: Python :: Implementation :: CPython", 74 | ], 75 | python_requires=">=3.8", 76 | install_requires=min_deps.tag_to_packages["install"], 77 | extras_require={ 78 | "docs": min_deps.tag_to_packages["docs"], 79 | "tests": min_deps.tag_to_packages["tests"], 80 | }, 81 | include_package_data=True, 82 | ) 83 | 84 | setup(**package_data, **metadata) 85 | 86 | 87 | if __name__ == "__main__": 88 | setup_package() 89 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | |readthedocs| |github-actions| |Codecov| |PyPi| |Black| 4 | 5 | .. |readthedocs| image:: https://readthedocs.org/projects/skops/badge/?version=latest&style=flat 6 | :target: https://skops.readthedocs.io/en/latest/ 7 | :alt: Documentation 8 | 9 | .. |github-actions| image:: https://github.com/skops-dev/skops/workflows/pytest/badge.svg 10 | :target: https://github.com/skops-dev/skops/actions 11 | :alt: Linux, macOS, Windows tests 12 | 13 | .. |Codecov| image:: https://codecov.io/gh/skops-dev/skops/branch/main/graph/badge.svg 14 | :target: https://codecov.io/gh/skops-dev/skops 15 | :alt: Codecov 16 | 17 | .. |PyPi| image:: https://img.shields.io/pypi/v/skops 18 | :target: https://pypi.org/project/skops 19 | :alt: PyPi 20 | 21 | .. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg 22 | :target: https://github.com/psf/black 23 | :alt: Black 24 | 25 | .. 
image:: https://raw.githubusercontent.com/skops-dev/skops/main/docs/images/logo.png 26 | :width: 500 27 | :target: https://skops.readthedocs.io/en/latest/ 28 | 29 | SKOPS 30 | ===== 31 | 32 | ``skops`` is a Python library helping you share your `scikit-learn 33 | `__ based models and put them in production. 34 | At the moment, it includes tools to easily integrate models on the Hugging Face 35 | Hub, which allows you to share your models, make them discoverable, and use the 36 | Hub's API inference and widgets to get outputs of the model without having to 37 | download or load the model. 38 | 39 | - ``skops.hub_utils``: tools to create a model repository to be stored on 40 | `Hugging Face Hub `__, mainly through 41 | ``skops.hub_utils.init`` and ``skops.hub_utils.push``. You can see all the 42 | models uploaded to the hub using this library `here 43 | `_ 44 | - ``skops.card``: tools to create a model card explaining what the model does 45 | and how it should be used. The model card can then be stored as the 46 | ``README.md`` file on the Hugging Face Hub, with pre-populated metadata to 47 | help Hub understand the model. 48 | - ``skops.io``: Secure persistence of sklearn estimators and more, without using 49 | ``pickle``. Visit `the docs 50 | `_ for more 51 | information. 52 | 53 | Please refer to our `documentation `_ 54 | on using the library as user, which includes user guides on the above topics as 55 | well as complete examples explaining how the features can be used. 56 | 57 | If you want to contribute to the library, please refer to our `contributing 58 | `_ guidelines. 59 | 60 | Installation 61 | ------------ 62 | 63 | You can install this library using: 64 | 65 | .. code-block:: bash 66 | 67 | python -m pip install skops 68 | 69 | Bug Reports and Questions 70 | ------------------------- 71 | 72 | Please send all your questions and report issues on this repository's issue 73 | tracker as an issue. Try to look for existing ones before you create a new one. 74 | -------------------------------------------------------------------------------- /.github/workflows/build-test.yml: -------------------------------------------------------------------------------- 1 | name: pytest 2 | 3 | on: 4 | - push 5 | - pull_request 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | pytest: 13 | 14 | runs-on: ${{ matrix.os }} 15 | if: "github.repository == 'skops-dev/skops'" 16 | strategy: 17 | fail-fast: false # need to see which ones fail 18 | matrix: 19 | os: [ubuntu-latest, windows-latest, macos-latest] 20 | python: ["3.8", "3.9", "3.10", "3.11"] 21 | # this is to make the CI run on different sklearn versions 22 | include: 23 | - python: "3.8" 24 | sklearn_version: "1.0" 25 | - python: "3.9" 26 | sklearn_version: "1.1" 27 | - python: "3.10" 28 | sklearn_version: "1.2" 29 | - python: "3.11" 30 | sklearn_version: "nightly" 31 | 32 | 33 | # Timeout: https://stackoverflow.com/a/59076067/4521646 34 | timeout-minutes: 15 35 | 36 | steps: 37 | 38 | # The following two steps are workarounds to retrieve the "real" commit 39 | # message and make it available in later steps. This is because we want to 40 | # check the content of the commit message, but on PRs, it's replaced by an 41 | # artificial commit message. 
See https://github.com/skops-dev/skops/pull/147 42 | - uses: actions/checkout@v3 43 | with: 44 | fetch-depth: 0 45 | ref: ${{github.event.after}} 46 | 47 | - run: | 48 | echo PR_COMMIT_MESSAGE=$(git log -1 --pretty=format:\"%s\") >> $GITHUB_ENV 49 | shell: bash 50 | 51 | - name: Set up Python ${{ matrix.python }} 52 | uses: actions/setup-python@v4 53 | with: 54 | python-version: ${{ matrix.python }} 55 | 56 | - name: Install dependencies 57 | run: | 58 | pip install .[docs,tests] 59 | pip install black=="22.6.0" isort=="5.10.1" mypy=="0.981" 60 | pip uninstall --yes scikit-learn 61 | if [ ${{ matrix.sklearn_version }} == "nightly" ]; 62 | then pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn; 63 | else pip install "scikit-learn~=${{ matrix.sklearn_version }}"; 64 | fi 65 | if [ ${{ matrix.os }} == "ubuntu-latest" ]; 66 | then sudo apt install pandoc && pandoc --version; 67 | fi 68 | python --version 69 | pip --version 70 | pip list 71 | shell: bash 72 | 73 | - name: Check black 74 | run: black --check --diff . 75 | 76 | - name: Check isort 77 | run: isort --check --diff . 78 | 79 | - name: Tests 80 | env: 81 | SUPER_SECRET: ${{ secrets.HF_HUB_TOKEN }} 82 | run: | 83 | python -m pytest -s -v --cov-report=xml -m "not inference" skops/ 84 | 85 | - name: Mypy 86 | run: mypy --config-file pyproject.toml skops 87 | 88 | - name: Inference tests (conditional) 89 | if: contains(env.PR_COMMIT_MESSAGE, '[CI inference]') 90 | run: | 91 | python -m pytest -s -v -m "inference" skops/ 92 | 93 | - name: Upload coverage to Codecov 94 | uses: codecov/codecov-action@v3 95 | with: 96 | env_vars: OS,PYTHON 97 | fail_ci_if_error: true 98 | token: ${{ secrets.CODECOV_TOKEN }} 99 | files: ./coverage.xml 100 | flags: unittests 101 | name: codecov-umbrella 102 | verbose: true 103 | -------------------------------------------------------------------------------- /skops/card/tests/examples/bert-base-uncased.md.diff: -------------------------------------------------------------------------------- 1 | --- 2 | +++ 3 | @@ -44,10 +44,10 @@ 4 | -| Model | #params | Language | 5 | -|------------------------|--------------------------------|-------| 6 | -| [`bert-base-uncased`](https://huggingface.co/bert-base-uncased) | 110M | English | 7 | -| [`bert-large-uncased`](https://huggingface.co/bert-large-uncased) | 340M | English | sub 8 | -| [`bert-base-cased`](https://huggingface.co/bert-base-cased) | 110M | English | 9 | -| [`bert-large-cased`](https://huggingface.co/bert-large-cased) | 340M | English | 10 | -| [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 110M | Chinese | 11 | -| [`bert-base-multilingual-cased`](https://huggingface.co/bert-base-multilingual-cased) | 110M | Multiple | 12 | -| [`bert-large-uncased-whole-word-masking`](https://huggingface.co/bert-large-uncased-whole-word-masking) | 340M | English | 13 | -| [`bert-large-cased-whole-word-masking`](https://huggingface.co/bert-large-cased-whole-word-masking) | 340M | English | 14 | +| Model | #params | Language | 15 | +|---------------------------------------------------------------------------------------------------------|-----------|------------| 16 | +| [`bert-base-uncased`](https://huggingface.co/bert-base-uncased) | 110M | English | 17 | +| [`bert-large-uncased`](https://huggingface.co/bert-large-uncased) | 340M | English | 18 | +| [`bert-base-cased`](https://huggingface.co/bert-base-cased) | 110M | English | 19 | +| [`bert-large-cased`](https://huggingface.co/bert-large-cased) | 340M | 
English | 20 | +| [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 110M | Chinese | 21 | +| [`bert-base-multilingual-cased`](https://huggingface.co/bert-base-multilingual-cased) | 110M | Multiple | 22 | +| [`bert-large-uncased-whole-word-masking`](https://huggingface.co/bert-large-uncased-whole-word-masking) | 340M | English | 23 | +| [`bert-large-cased-whole-word-masking`](https://huggingface.co/bert-large-cased-whole-word-masking) | 340M | English | 24 | @@ -57 +57 @@ 25 | -You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to 26 | +You can use the raw model for either masked language modeling or next sentence prediction, but it’s mostly intended to 27 | @@ -189 +189 @@ 28 | -the other cases, it's another random sentence in the corpus. Note that what is considered a sentence here is a 29 | +the other cases, it’s another random sentence in the corpus. Note that what is considered a sentence here is a 30 | @@ -212,4 +212,3 @@ 31 | -| Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | 32 | -|:----:|:-----------:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:-------:| 33 | -| | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | 34 | - 35 | +| Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | 36 | +|--------|---------------|-------|--------|---------|--------|---------|--------|-------|-----------| 37 | +| | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | 38 | @@ -240 +239 @@ 39 | - 40 | + 41 | -------------------------------------------------------------------------------- /skops/card/tests/examples/toy-example.md: -------------------------------------------------------------------------------- 1 | # This document tries to cover many common markdown contents 2 | 3 | This is not based on an existing model card and serves to increase test coverage. It also documents differences that may be found after parsing. There is no metainfo section. 4 | 5 | ## H2 6 | 7 | ### H3 8 | 9 | #### H4 10 | 11 | ##### H5 12 | 13 | ###### H6 14 | 15 | Parser 'preserves' some "quotation" marks. 16 | 17 | Parser doesn’t ‘preserve’ other “quotation” marks. 18 | 19 | ## Italics 20 | 21 | One _way_ of doing it. 22 | Another *way* of doing it. 23 | 24 | ## Bold 25 | 26 | One __way__ of doing it. 27 | Another **way** of doing it. 28 | 29 | ## Strikethrough 30 | 31 | This is ~~not~~ the way. 32 | 33 | ## Superscript and subscripts 34 | 35 | Really just html tags. 36 | 37 | E = mc2 38 | 39 | log2 40 | 41 | ## Bullet lists 42 | 43 | Pandoc does not differentiate between different notations, so we always use -, not * or +. 44 | 45 | * using 46 | * asterisk 47 | 48 | or 49 | 50 | - using 51 | - minus 52 | with line break 53 | 54 | or 55 | 56 | + using plus 57 | 58 | Finally: 59 | 60 | - nesting 61 | - is 62 | - indeed 63 | - very 64 | - possible 65 | - to achieve 66 | 67 | ## Ordered lists 68 | 69 | 1. a normal 70 | 2. ordered list 71 | 72 | or 73 | 74 | 1. an ordered 75 | 2. list 76 | 1. with 77 | 2. indentation 78 | 3. is possible 79 | 80 | ## Mixed lists 81 | 82 | 1. it’s 83 | 2. possible 84 | - to 85 | - mix 86 | 3. 
ordered _and_ unorderd 87 | 88 | ## TODOs 89 | 90 | - [x] This 91 | - [ ] is 92 | - [x] **done** 93 | 94 | ## Links 95 | 96 | [a link](https://skops.readthedocs.io/) 97 | 98 | The "title" is not parsed by pandoc 99 | 100 | [a link](https://skops.readthedocs.io/ "this disappears") 101 | 102 | [a link to a file](./toy-example.md) 103 | 104 | References are resolved, so `[1]` below is replaced by the actual link: 105 | 106 | [a link with reference][1] 107 | 108 | A plain link to https://skops.readthedocs.io/ used inside of text. 109 | 110 | [1]: https://skops.readthedocs.io/ 111 | 112 | ## Images 113 | 114 | ![skops logo](https://github.com/skops-dev/skops/blob/main/docs/images/logo.png) 115 | 116 | ### Using html 117 | 118 | logo 119 | 120 | ## Quotes 121 | 122 | > Someone said something importent 123 | 124 | > I quote wise words: 125 | > > Someone said something importent 126 | 127 | ## Tables 128 | 129 | | Header 0 | Header 1 | 130 | |--------------|----------------| 131 | | Some content | More content | 132 | | _Even more_ | This is **it** | 133 | 134 | Empty tables are legal 135 | 136 | | What now? | 137 | |-------------| 138 | 139 | ## Inline code 140 | 141 | Some `inline` code. 142 | 143 | `A whole line` 144 | 145 | ## Code blocks 146 | 147 | ``` 148 | A raw 149 | 150 | code block 151 | ``` 152 | 153 | With language 154 | 155 | ```python 156 | def foo(): 157 | return 0 158 | 159 | def bar(): 160 | return 1 161 | ``` 162 | 163 | ## Raw HTML 164 | 165 | 166 |
167 | <dt>Beast of Bodmin</dt>
168 | <dd>A large feline inhabiting Bodmin Moor.</dd>
169 | 
170 | <dt>Morgawr</dt>
171 | <dd>A sea serpent.</dd>
172 | 
173 | <dt>Owlman</dt>
174 | <dd>A giant owl-like creature.</dd>
175 | </dl>
176 | 177 | ## Div 178 | 179 | The "id" tag may change in order 180 |
181 |

Divs are possible

182 |
183 | 184 | ## Line breaks 185 | 186 | A text with 187 | a LineBreak item. 188 | -------------------------------------------------------------------------------- /skops/cli/_convert.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import pathlib 7 | import pickle 8 | from typing import Optional 9 | 10 | from skops.cli._utils import get_log_level 11 | from skops.io import dumps, get_untrusted_types 12 | 13 | 14 | def _convert_file( 15 | input_file: os.PathLike, 16 | output_file: os.PathLike, 17 | logger: logging.Logger = logging.getLogger(), 18 | ) -> None: 19 | """Function that is called by ``skops convert`` entrypoint. 20 | 21 | Loads a pickle model from the input path, converts to skops format, and saves to 22 | output file. 23 | 24 | Parameters 25 | ---------- 26 | input_file : os.PathLike 27 | Path of input .pkl model to load. 28 | 29 | output_file : os.PathLike 30 | Path to save .skops model to. 31 | 32 | """ 33 | model_name = pathlib.Path(input_file).stem 34 | 35 | logger.debug(f"Converting {model_name}") 36 | 37 | with open(input_file, "rb") as f: 38 | obj = pickle.load(f) 39 | skops_dump = dumps(obj) 40 | 41 | untrusted_types = get_untrusted_types(data=skops_dump) 42 | 43 | if not untrusted_types: 44 | logger.info(f"No unknown types found in {model_name}.") 45 | else: 46 | untrusted_str = ", ".join(untrusted_types) 47 | 48 | logger.warning( 49 | f"While converting {input_file}, " 50 | "the following unknown types were found: " 51 | f"{untrusted_str}. " 52 | f"When loading {output_file} with skops.load, these types must be " 53 | "specified as 'trusted'" 54 | ) 55 | 56 | with open(output_file, "wb") as out_file: 57 | logger.debug(f"Writing to {output_file}") 58 | out_file.write(skops_dump) 59 | 60 | 61 | def format_parser( 62 | parser: Optional[argparse.ArgumentParser] = None, 63 | ) -> argparse.ArgumentParser: 64 | """Adds arguments and help to parent CLI parser for the convert method.""" 65 | 66 | if not parser: # used in tests 67 | parser = argparse.ArgumentParser() 68 | 69 | parser_subgroup = parser.add_argument_group("convert") 70 | parser_subgroup.add_argument("input", help="Path to an input file to convert. ") 71 | 72 | parser_subgroup.add_argument( 73 | "-o", 74 | "--output-file", 75 | help=( 76 | "Specify the output file name for the converted skops file. " 77 | "If not provided, will default to using the same name as the input file, " 78 | "and saving to the current working directory with the suffix '.skops'." 79 | ), 80 | default=None, 81 | ) 82 | parser_subgroup.add_argument( 83 | "-v", 84 | "--verbose", 85 | help=( 86 | "Increases verbosity of logging. Can be used multiple times to increase " 87 | "verbosity further." 
88 | ), 89 | action="count", 90 | dest="loglevel", 91 | default=0, 92 | ) 93 | return parser 94 | 95 | 96 | def main( 97 | parsed_args: argparse.Namespace, 98 | ) -> None: 99 | output_file = parsed_args.output_file 100 | input_file = parsed_args.input 101 | 102 | logging.basicConfig( 103 | format="%(levelname)-8s: %(message)s", level=get_log_level(parsed_args.loglevel) 104 | ) 105 | 106 | if not output_file: 107 | # No filename provided, defaulting to base file path 108 | file_name = pathlib.Path(input_file).stem 109 | output_file = pathlib.Path.cwd() / f"{file_name}.skops" 110 | 111 | _convert_file( 112 | input_file=input_file, 113 | output_file=output_file, 114 | ) 115 | -------------------------------------------------------------------------------- /skops/cli/tests/test_convert.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | import pickle 4 | from unittest import mock 5 | 6 | import numpy as np 7 | import pytest 8 | 9 | from skops.cli import _convert 10 | from skops.io import load 11 | 12 | 13 | class MockUnsafeType: 14 | def __init__(self): 15 | pass 16 | 17 | 18 | class TestConvert: 19 | model_name = "some_model_name" 20 | 21 | @pytest.fixture 22 | def safe_obj(self): 23 | return np.ndarray([1, 2, 3, 4]) 24 | 25 | @pytest.fixture 26 | def unsafe_obj(self): 27 | return MockUnsafeType() 28 | 29 | @pytest.fixture 30 | def pkl_path(self, tmp_path): 31 | return tmp_path / f"{self.model_name}.pkl" 32 | 33 | @pytest.fixture 34 | def skops_path(self, tmp_path): 35 | return tmp_path / f"{self.model_name}.skops" 36 | 37 | @pytest.fixture 38 | def write_safe_file(self, pkl_path, safe_obj): 39 | with open(pkl_path, "wb") as f: 40 | pickle.dump(safe_obj, f) 41 | 42 | @pytest.fixture 43 | def write_unsafe_file(self, pkl_path, unsafe_obj): 44 | with open(pkl_path, "wb") as f: 45 | pickle.dump(unsafe_obj, f) 46 | 47 | def test_base_case_works_as_expected( 48 | self, pkl_path, tmp_path, skops_path, write_safe_file, safe_obj, caplog 49 | ): 50 | mock_logger = mock.MagicMock() 51 | _convert._convert_file(pkl_path, skops_path, logger=mock_logger) 52 | persisted_obj = load(skops_path) 53 | assert np.array_equal(persisted_obj, safe_obj) 54 | 55 | # Check no warnings or errors raised 56 | mock_logger.warning.assert_not_called() 57 | mock_logger.error.assert_not_called() 58 | 59 | def test_unsafe_case_works_as_expected( 60 | self, pkl_path, tmp_path, skops_path, write_unsafe_file, caplog 61 | ): 62 | caplog.set_level(logging.WARNING) 63 | _convert._convert_file(pkl_path, skops_path) 64 | persisted_obj = load(skops_path, trusted=True) 65 | 66 | assert isinstance(persisted_obj, MockUnsafeType) 67 | 68 | # check logging has warned that an unsafe type was found 69 | assert MockUnsafeType.__name__ in caplog.text 70 | 71 | 72 | class TestMain: 73 | @staticmethod 74 | def assert_called_correctly( 75 | mock_convert: mock.MagicMock, 76 | path, 77 | output_file=None, 78 | ): 79 | if not output_file: 80 | output_file = pathlib.Path.cwd() / f"{pathlib.Path(path).stem}.skops" 81 | mock_convert.assert_called_once_with(input_file=path, output_file=output_file) 82 | 83 | @mock.patch("skops.cli._convert._convert_file") 84 | def test_base_works_as_expected(self, mock_convert: mock.MagicMock): 85 | path = "123.pkl" 86 | namespace, _ = _convert.format_parser().parse_known_args([path]) 87 | 88 | _convert.main(namespace) 89 | self.assert_called_correctly(mock_convert, path) 90 | 91 | @mock.patch("skops.cli._convert._convert_file") 92 | 
@pytest.mark.parametrize( 93 | "input_path, output_file, expected_path", 94 | [ 95 | ("abc.123", "some/file/path.out", "some/file/path.out"), 96 | ("abc.123", None, pathlib.Path.cwd() / "abc.skops"), 97 | ], 98 | ids=["Given an output path", "No output path"], 99 | ) 100 | def test_with_output_dir_works_as_expected( 101 | self, mock_convert: mock.MagicMock, input_path, output_file, expected_path 102 | ): 103 | if output_file is not None: 104 | args = [input_path, "--output", output_file] 105 | else: 106 | args = [input_path] 107 | 108 | namespace, _ = _convert.format_parser().parse_known_args(args) 109 | 110 | _convert.main(namespace) 111 | self.assert_called_correctly( 112 | mock_convert, path=input_path, output_file=expected_path 113 | ) 114 | 115 | @mock.patch("skops.cli._convert._convert_file") 116 | @pytest.mark.parametrize( 117 | "verbosity, expected_level", 118 | [ 119 | ("", logging.WARNING), 120 | ("-v", logging.INFO), 121 | ("--verbose", logging.INFO), 122 | ("-vv", logging.DEBUG), 123 | ("-v -v", logging.DEBUG), 124 | ("-vvvvv", logging.DEBUG), 125 | ("--verbose --verbose", logging.DEBUG), 126 | ], 127 | ) 128 | def test_given_log_levels_works_as_expected( 129 | self, mock_convert: mock.MagicMock, verbosity, expected_level, caplog 130 | ): 131 | input_path = "abc.def" 132 | output_path = "bde.skops" 133 | args = [input_path, "--output", output_path, verbosity.split()] 134 | 135 | namespace, _ = _convert.format_parser().parse_known_args(args) 136 | 137 | _convert.main(namespace) 138 | self.assert_called_correctly( 139 | mock_convert, path=input_path, output_file=output_path 140 | ) 141 | 142 | assert caplog.at_level(expected_level) 143 | -------------------------------------------------------------------------------- /docs/changes.rst: -------------------------------------------------------------------------------- 1 | .. include:: _authors.rst 2 | 3 | .. _changelog: 4 | 5 | skops Changelog 6 | =============== 7 | 8 | .. contents:: Table of Contents 9 | :depth: 1 10 | :local: 11 | 12 | v0.6 13 | ---- 14 | 15 | v0.5 16 | ---- 17 | - Added CLI entrypoint support (:func:`.cli.entrypoint.main_cli`) 18 | and a command line function to convert Pickle files 19 | to Skops files (:func:`.cli._convert.main`). :pr:`249` by `Erin Aho`_ 20 | - Support more array-like data types for tabular data and list-like data types 21 | for text data. :pr:`179` by `Francesco Cariaggi`_. 22 | - Add an option `use_intelex` to :func:`skops.hub_utils.init` which, when 23 | enabled, will result in the Hugging Face inference API running with Intel's 24 | scikit-learn intelex library, which can accelerate inference times. :pr:`267` 25 | by `Benjamin Bossan`_. 26 | - Model cards that have been written into a markdown file can now be parsed back 27 | into a :class:`skops.card.Card` object and edited further by using the 28 | :func:`skops.card.parse_modelcard` function. :pr:`257` by `Benjamin Bossan`_. 29 | 30 | v0.4 31 | ---- 32 | - :func:`.io.dump` and :func:`.io.load` now work with file like objects, 33 | which means you can use them with the ``with open(...) as f: dump(obj, f)`` 34 | pattern, like you'd do with ``pickle``. :pr:`234` by `Benjamin Bossan`_. 35 | - All `scikit-learn` estimators are trusted by default. 36 | :pr:`237` by :user:`Edoardo Abati `. 37 | - Add `model_format` argument to :meth:`skops.hub_utils.init` to be stored in 38 | `config.json` so that we know how to load a model from the repository. 39 | :pr:`242` by `Merve Noyan`_. 
40 | - Persistence now supports bytes and bytearrays, added tests to verify that 41 | LightGBM, XGBoost, and CatBoost work now. :pr:`244` by `Benjamin Bossan`_. 42 | - :class:`.card.Card` now allows to add content to existing sections, using a 43 | ``/`` to separate the subsections. E.g. use ``card.add(**{"Existing 44 | section/New section": "content"})`` to add "content" a new subsection called 45 | "New section" to an existing section called "Existing section". :pr:`203` by 46 | `Benjamin Bossan`_. 47 | 48 | v0.3 49 | ---- 50 | - Utility function to add arbitrary files to be uploaded to the hub by using 51 | :func:`.hub_utils.add_files`. :pr:`123` by `Benjamin Bossan`_. 52 | - Add ``private`` as an optional argument to :meth:`skops.hub_utils.push` to 53 | optionally set the visibility status of a repo when pushing to the hub. 54 | :pr:`130` by `Adrin Jalali`_. 55 | - First release of the skops secure persistence feature (:pr:`128`) by `Adrin 56 | Jalali`_ and `Benjamin Bossan`_. Visit :ref:`persistence` for more 57 | information. This feature is not production ready yet but we're happy to 58 | receive feedback from users. 59 | - Fix a bug that resulted in markdown tables being rendered incorrectly if 60 | entries contained line breaks. :pr:`156` by `Benjamin Bossan`_. 61 | - Raise an error instead of warning the user if a given model file is empty. 62 | :pr:`214` by `Adrin Jalali`_. 63 | - Use ``huggingface_hub`` v0.10.1 for model cards, drop ``modelcards`` 64 | dependency. :pr:`162` by `Benjamin Bossan`_. 65 | - Add source links to API documentation. :pr:`172` by :user:`Ayyuce Demirbas 66 | `. 67 | - Add support to load model if given Path/str to ``model`` argument in 68 | :mod:`skops.card` . :pr:`205` by :user:`Prajjwal Mishra `. 69 | 70 | 71 | v0.2 72 | ---- 73 | - Tables, e.g. cross-validation results, can now be added to model cards using 74 | the :meth:`.Card.add_table` method. :pr:`90` by `Benjamin Bossan`_. 75 | - Add method :meth:`.Card.render` which returns the model card as a string. 76 | :pr:`94` by `Benjamin Bossan`_. 77 | - Make :meth:`skops.hub_utils.init` atomic. Now it doesn't leave a trace on the 78 | filesystem if it fails for some reason. :pr:`60` by `Adrin Jalali`_ 79 | - When adding figures or tables, it's now possible to set ``folded=True`` to 80 | render the content inside a details tag. :pr:`108` by `Benjamin Bossan`_. 81 | - Add :meth:`skops.hub_utils.get_model_output` to get the model's output using 82 | The Hugging Face Hub's inference API, and return an array with the outputs. 83 | :pr:`105` by `Adrin Jalali`_. 84 | 85 | v0.1 86 | ---- 87 | 88 | This is the first release of the library. It include two main modules: 89 | 90 | - :mod:`skops.hub_utils`: tools to create a model repository to be stored on 91 | `Hugging Face Hub `__, mainly through 92 | :func:`skops.hub_utils.init` and :func:`skops.hub_utils.push`. 93 | - :mod:`skops.card`: tools to create a model card explaining what the model does 94 | and how it should be used. The model card can then be stored as the 95 | ``README.md`` file on the Hugging Face Hub, with pre-populated metadata to 96 | help Hub understand the model. 
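A minimal sketch of how these two modules are typically combined, mirroring the
examples shipped with the documentation (the dataset, estimator, and repository
name below are placeholders, not part of the v0.1 API description):

.. code-block:: python

    import pickle
    from pathlib import Path
    from tempfile import mkdtemp, mkstemp

    import sklearn
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    from skops import card, hub_utils

    # train a small placeholder model
    X, y = load_iris(return_X_y=True, as_frame=True)
    model = LogisticRegression(max_iter=1000).fit(X, y)

    # hub_utils.init expects the model to be stored on disk, e.g. as a pickle file
    _, pkl_name = mkstemp(prefix="skops-", suffix=".pkl")
    with open(pkl_name, mode="bw") as f:
        pickle.dump(model, file=f)

    # initialize a local repository with the model, requirements, task, and sample data
    local_repo = mkdtemp(prefix="skops-")
    hub_utils.init(
        model=pkl_name,
        requirements=[f"scikit-learn={sklearn.__version__}"],
        dst=local_repo,
        task="tabular-classification",
        data=X,
    )

    # create a model card and store it as README.md in the local repo
    model_card = card.Card(model, metadata=card.metadata_from_config(Path(local_repo)))
    model_card.save(Path(local_repo) / "README.md")

    # pushing to the Hub needs a user access token, so it is left commented out here
    # hub_utils.push(repo_id="<user>/<repo>", source=local_repo, token="<token>")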
97 | 98 | 99 | Contributors 100 | ~~~~~~~~~~~~ 101 | 102 | :user:`Adrin Jalali `, :user:`Merve Noyan `, 103 | :user:`Benjamin Bossan `, :user:`Ayyuce Demirbas 104 | `, :user:`Prajjwal Mishra `, :user:`Francesco Cariaggi `, 105 | :user:`Erin Aho ` 106 | -------------------------------------------------------------------------------- /skops/card/tests/examples/vit-base-patch32-224-in21k.md: -------------------------------------------------------------------------------- 1 | --- 2 | license: apache-2.0 3 | tags: 4 | - vision 5 | datasets: 6 | - imagenet-21k 7 | inference: false 8 | --- 9 | 10 | # Vision Transformer (base-sized model) 11 | 12 | 13 | 14 | Vision Transformer (ViT) model pre-trained on ImageNet-21k (14 million images, 21,843 classes) at resolution 224x224. It was introduced in the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Dosovitskiy et al. and first released in [this repository](https://github.com/google-research/vision_transformer). However, the weights were converted from the [timm repository](https://github.com/rwightman/pytorch-image-models) by Ross Wightman, who already converted the weights from JAX to PyTorch. Credits go to him. 15 | 16 | Disclaimer: The team releasing ViT did not write a model card for this model so this model card has been written by the Hugging Face team. 17 | 18 | ## Model description 19 | 20 | The Vision Transformer (ViT) is a transformer encoder model (BERT-like) pretrained on a large collection of images in a supervised fashion, namely ImageNet-21k, at a resolution of 224x224 pixels. 21 | 22 | Images are presented to the model as a sequence of fixed-size patches (resolution 32x32), which are linearly embedded. One also adds a [CLS] token to the beginning of a sequence to use it for classification tasks. One also adds absolute position embeddings before feeding the sequence to the layers of the Transformer encoder. 23 | 24 | Note that this model does not provide any fine-tuned heads, as these were zero'd by Google researchers. However, the model does include the pre-trained pooler, which can be used for downstream tasks (such as image classification). 25 | 26 | By pre-training the model, it learns an inner representation of images that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled images for instance, you can train a standard classifier by placing a linear layer on top of the pre-trained encoder. One typically places a linear layer on top of the [CLS] token, as the last hidden state of this token can be seen as a representation of an entire image. 27 | 28 | ## Intended uses & limitations 29 | 30 | You can use the raw model for image classification. See the [model hub](https://huggingface.co/models?search=google/vit) to look for 31 | fine-tuned versions on a task that interests you. 
32 | 33 | ### How to use 34 | 35 | Here is how to use this model: 36 | 37 | ```python 38 | from transformers import ViTFeatureExtractor, ViTModel 39 | from PIL import Image 40 | import requests 41 | url = 'http://images.cocodataset.org/val2017/000000039769.jpg' 42 | image = Image.open(requests.get(url, stream=True).raw) 43 | feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch32-224-in21k') 44 | model = ViTModel.from_pretrained('google/vit-base-patch32-224-in21k') 45 | inputs = feature_extractor(images=image, return_tensors="pt") 46 | outputs = model(**inputs) 47 | last_hidden_state = outputs.last_hidden_state 48 | ``` 49 | 50 | Currently, both the feature extractor and model support PyTorch. Tensorflow and JAX/FLAX are coming soon, and the API of ViTFeatureExtractor might change. 51 | 52 | ## Training data 53 | 54 | The ViT model was pretrained on [ImageNet-21k](http://www.image-net.org/), a dataset consisting of 14 million images and 21k classes. 55 | 56 | ## Training procedure 57 | 58 | ### Preprocessing 59 | 60 | The exact details of preprocessing of images during training/validation can be found [here](https://github.com/google-research/vision_transformer/blob/master/vit_jax/input_pipeline.py). 61 | 62 | Images are resized/rescaled to the same resolution (224x224) and normalized across the RGB channels with mean (0.5, 0.5, 0.5) and standard deviation (0.5, 0.5, 0.5). 63 | 64 | ### Pretraining 65 | 66 | The model was trained on TPUv3 hardware (8 cores). All model variants are trained with a batch size of 4096 and learning rate warmup of 10k steps. For ImageNet, the authors found it beneficial to additionally apply gradient clipping at global norm 1. Pre-training resolution is 224. 67 | 68 | ## Evaluation results 69 | 70 | For evaluation results on several image classification benchmarks, we refer to tables 2 and 5 of the original paper. Note that for fine-tuning, the best results are obtained with a higher resolution (384x384). Of course, increasing the model size will result in better performance. 71 | 72 | ### BibTeX entry and citation info 73 | 74 | ```bibtex 75 | @misc{wu2020visual, 76 | title={Visual Transformers: Token-based Image Representation and Processing for Computer Vision}, 77 | author={Bichen Wu and Chenfeng Xu and Xiaoliang Dai and Alvin Wan and Peizhao Zhang and Zhicheng Yan and Masayoshi Tomizuka and Joseph Gonzalez and Kurt Keutzer and Peter Vajda}, 78 | year={2020}, 79 | eprint={2006.03677}, 80 | archivePrefix={arXiv}, 81 | primaryClass={cs.CV} 82 | } 83 | ``` 84 | 85 | ```bibtex 86 | @inproceedings{deng2009imagenet, 87 | title={Imagenet: A large-scale hierarchical image database}, 88 | author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li}, 89 | booktitle={2009 IEEE conference on computer vision and pattern recognition}, 90 | pages={248--255}, 91 | year={2009}, 92 | organization={Ieee} 93 | } 94 | ``` 95 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | import inspect 10 | import os 11 | import subprocess 12 | from operator import attrgetter 13 | 14 | from packaging.version import parse 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | import skops 24 | 25 | # -- Project information ----------------------------------------------------- 26 | 27 | project = "skops" 28 | copyright = "2022, Adrin Jalali" 29 | author = "Adrin Jalali" 30 | 31 | 32 | # The full version, including alpha/beta/rc tags 33 | 34 | parsed_version = parse(skops.__version__) 35 | release = ".".join(parsed_version.base_version.split(".")[:2]) 36 | 37 | 38 | # -- General configuration --------------------------------------------------- 39 | 40 | # Add any Sphinx extension module names here, as strings. They can be 41 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 42 | # ones. 43 | extensions = [ 44 | "sphinx.ext.linkcode", 45 | "sphinx.ext.autodoc", 46 | "numpydoc", 47 | "sphinx_gallery.gen_gallery", 48 | "sphinx_issues", 49 | "sphinx.ext.intersphinx", # link to other documentations, e.g. sklearn 50 | ] 51 | 52 | autodoc_default_options = {"members": True, "inherited-members": True} 53 | autodoc_typehints = "none" 54 | 55 | sphinx_gallery_conf = { 56 | "examples_dirs": "../examples", # path to your example scripts 57 | "gallery_dirs": "auto_examples", # path to where to save gallery generated output 58 | } 59 | # Add any paths that contain templates here, relative to this directory. 60 | templates_path = ["_templates"] 61 | 62 | # List of patterns, relative to source directory, that match files and 63 | # directories to ignore when looking for source files. 64 | # This pattern also affects html_static_path and html_extra_path. 
65 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 66 | 67 | autosummary_generate = True 68 | 69 | # sphinx-issues configuration 70 | # Path to GitHub repo {group}/{project} 71 | # (note that `group` is the GitHub user or organization) 72 | issues_github_path = "skops-dev/skops" 73 | 74 | REVISION_CMD = "git rev-parse --short HEAD" 75 | 76 | 77 | def _get_git_revision(): 78 | try: 79 | revision = subprocess.check_output(REVISION_CMD.split()).strip() 80 | except (subprocess.CalledProcessError, OSError): 81 | print("Failed to execute git to get revision") 82 | return None 83 | return revision.decode("utf-8") 84 | 85 | 86 | def linkcode_resolve(domain, info): 87 | if domain not in ("py", "pyx"): 88 | return 89 | if not info.get("module") or not info.get("fullname"): 90 | return 91 | revision = _get_git_revision() 92 | 93 | if revision is None: 94 | return 95 | 96 | class_name = info["fullname"].split(".")[0] 97 | module = __import__(info["module"], fromlist=[class_name]) 98 | obj = attrgetter(info["fullname"])(module) 99 | 100 | # Unwrap the object to get the correct source 101 | # file in case that is wrapped by a decorator 102 | obj = inspect.unwrap(obj) 103 | 104 | try: 105 | fn = inspect.getsourcefile(inspect.unwrap(obj)) 106 | except TypeError: 107 | try: 108 | fn = inspect.getsourcefile(inspect.unwrap(obj.fget)) 109 | except (AttributeError, TypeError): 110 | fn = None 111 | if not fn: 112 | return None 113 | package = "skops" 114 | fn = os.path.relpath(fn, start=os.path.dirname(__import__(package).__file__)) 115 | try: 116 | lineno = inspect.getsourcelines(obj)[1] 117 | except Exception: 118 | lineno = "" 119 | url_fmt = ( 120 | "https://github.com/skops-dev/skops/blob/{revision}/{package}/{path}#L{lineno}" 121 | ) 122 | revision = _get_git_revision() 123 | return url_fmt.format(revision=revision, package=package, path=fn, lineno=lineno) 124 | 125 | 126 | # -- Options for HTML output ------------------------------------------------- 127 | 128 | # The theme to use for HTML and HTML Help pages. See the documentation for 129 | # a list of builtin themes. 130 | # 131 | html_theme = "sphinx_rtd_theme" 132 | 133 | # Add any paths that contain custom static files (such as style sheets) here, 134 | # relative to this directory. They are copied after the builtin static files, 135 | # so a file named "default.css" will overwrite the builtin "default.css". 
136 | html_static_path = ["_static"] 137 | 138 | html_logo = "images/logo.png" 139 | html_theme_options = { 140 | "logo_only": True, 141 | } 142 | 143 | # See: 144 | # https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html#confval-intersphinx_mapping 145 | intersphinx_mapping = { 146 | "python": ("https://docs.python.org/3", None), 147 | "numpy": ("https://docs.scipy.org/doc/numpy/", None), 148 | "sklearn": ("https://scikit-learn.org/stable/", None), 149 | "pandas": ("https://pandas.pydata.org/docs/", None), 150 | "joblib": ("https://joblib.readthedocs.io/en/latest/", None), 151 | "huggingface_hub": ("https://huggingface.co/docs/huggingface_hub/main/en", None), 152 | } 153 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | Contributing to skops 2 | ===================== 3 | 4 | Please follow this workflow when contributing to skops: 5 | 6 | - Fork the repository under your own user 7 | - Clone the repository locally 8 | - Create a new branch for your changes 9 | - Add your changes to the branch 10 | - Commit your changes 11 | - Push your branch to the remote repository 12 | - Create a pull request on GitHub 13 | 14 | Issue Titles / Commit Messages 15 | ------------------------------ 16 | 17 | When creating a pull request, please use a descriptive title. You can prefix 18 | the title to indicate the type of it: 19 | 20 | - ``DOC``: documentation changes 21 | - ``FEAT/FEA``: new major features 22 | - ``ENH``: enhancements to existing features with user facing implications 23 | - ``CI``: continuous integration, sometimes overlaps with MNT 24 | - ``MNT/MAINT``: maintenance, technical debt, etc 25 | - ``FIX``: bug fixes 26 | - ``TST``: new tests, refactoring tests 27 | - ``PERF``: performance improvements 28 | 29 | If a contributor forgets to prefix the title, a maintainer can add the prefix 30 | when merging into ``main``. While merging, it is recommended that the 31 | maintainer refines the commit message to add a short description of what the PR 32 | being merged does. 33 | 34 | Review Process 35 | -------------- 36 | 37 | Don't hesitate to ping @skops-dev/maintainers in your issues and pull requests 38 | if you don't receive a review in a timely manner. We try to review all pull 39 | requests as soon as we can. 40 | 41 | If you have permissions, you should almost never merge your own pull request 42 | unless it's a hotfix and needs to be merged really quick and it's not a major 43 | change. 44 | 45 | Otherwise pull requests can be merged if at least one other person has approved 46 | it on GitHub. Please don't merge them until all outstanding comments are 47 | addressed or the discussions are concluded and people have agreed to tackle 48 | them in future pull requests. 49 | 50 | Working on Existing Issues 51 | -------------------------- 52 | 53 | If you intend to work on an issue, leave a comment and state your intentions. 54 | Also feel free to ask for clarifications if you're not sure what the issue 55 | entails. If you don't understand an issue, it's on us, not on you! 56 | 57 | Setting up the dev environment 58 | ------------------------------ 59 | 60 | Following these steps you can prepare a dev environment for yourself to 61 | contribute to `skops`. 62 | 63 | Using conda/mamba 64 | ~~~~~~~~~~~~~~~~~ 65 | 66 | .. 
code:: bash 67 | 68 | mamba create -c conda-forge -n skops python=3.10 69 | mamba activate skops 70 | python -m pip install -e ".[tests,docs]" 71 | # add pre-commit hooks 72 | mamba install -c conda-forge pre-commit 73 | pre-commit install 74 | 75 | You can also replace the above `mamba` commands with `conda` if you don't have 76 | `mamba` installed. 77 | 78 | 79 | Running Tests 80 | ~~~~~~~~~~~~~ 81 | 82 | skops uses pytest as its test runner, just run it from the project root: 83 | 84 | .. code:: bash 85 | 86 | pytest 87 | 88 | Certain tests require internet access to run, and they typically take slightly 89 | longer to run than other tests. If you'd like to skip those tests, you can add 90 | ``-m not network`` to your ``pytest`` command, or ``-m network`` to only run 91 | those tests. For example, you can run all tests except the ones requiring 92 | internet with: 93 | 94 | .. code:: bash 95 | 96 | pytest -m "not network" skops 97 | 98 | Similarly, there is a flag, ``-m inference`` for tests that hit the Hugging Face 99 | Inference API, which can be quite slow or even hang. Skip these tests as long as 100 | you don't make any changes to this functionality. If you already skip network 101 | tests, the inference tests will also be skipped. 102 | 103 | 104 | Releases 105 | ======== 106 | 107 | Releases are created using `manual GitHub workflows 108 | `_. 109 | As a maintainer, follow these steps: 110 | 111 | 1. Check and update the ``docs/changes.rst`` 112 | 2. For a major release, create a new branch with the name "0.version.X", e.g. 113 | "0.2.X". This branch will have all tags for all releases under 0.2. 114 | 3. Bump the version defined in ``skops/__init__.py`` 115 | 4. Git grep for any TODO's that need fixing before the release (e.g. 116 | deprecations). You can do this, for example by: 117 | 118 | .. code:: bash 119 | 120 | git grep -n TODO 121 | 122 | 123 | 5. Create a PR with all the changes and have it reviewed and merged 124 | 6. Create a tag with the format "v0.version", e.g. "v0.2", and push it to the 125 | remote repository. Use this tag for releasing the package. If there is a 126 | minor release under the same branch, it would be "v0.2.1" for example. 127 | 7. Use the `GitHub action 128 | `__ to 129 | create a new release on **TestPyPI**. Check it for correctness `on test.pypi 130 | `_. 131 | 8. Use the `GitHub action 132 | `__ to 133 | create a new release on **PyPI**. Check it for correctness `pypi 134 | `_. 135 | 9. Create a `new release `_ on 136 | GitHub 137 | 10. Update the patch version of the package to a new dev version, e.g. from 138 | ``v0.3.dev0`` to ``v0.4.dev0`` 139 | 11. Add a section for the new release in the ``docs/changes.rst`` file. 140 | 12. Check that the new stable branch of documentation was built correctly on 141 | `readthedocs `_, and make 142 | sure all relevant releases are *active*. 
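To make step 6 above concrete, creating and pushing the release tag usually
looks roughly like the following; the remote name ``upstream`` and the version
numbers are placeholders that depend on your local setup:

.. code:: bash

    # work from the corresponding release branch, e.g. 0.2.X
    git checkout 0.2.X
    git pull upstream 0.2.X
    # create the tag and push it to the main repository
    git tag v0.2
    git push upstream v0.2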
143 | -------------------------------------------------------------------------------- /skops/io/tests/test_audit.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import re 4 | from contextlib import suppress 5 | from zipfile import ZipFile 6 | 7 | import numpy as np 8 | import pytest 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.pipeline import FeatureUnion, Pipeline 11 | from sklearn.preprocessing import FunctionTransformer, StandardScaler 12 | 13 | from skops.io import dumps, get_untrusted_types 14 | from skops.io._audit import Node, audit_tree, check_type, get_tree, temp_setattr 15 | from skops.io._general import DictNode, dict_get_state 16 | from skops.io._utils import LoadContext, SaveContext, gettype 17 | 18 | 19 | class CustomType: 20 | """A custom untrusted class.""" 21 | 22 | def __init__(self, value): 23 | self.value = value 24 | 25 | 26 | @pytest.mark.parametrize( 27 | "module_name, type_name, trusted, expected", 28 | [ 29 | ("sklearn", "Pipeline", ["sklearn.Pipeline"], True), 30 | ("sklearn", "Pipeline", ["sklearn.preprocessing.StandardScaler"], False), 31 | ("sklearn", "Pipeline", True, True), 32 | ("builtins", "int", ["builtins.int"], True), 33 | ("builtins", "int", [], False), 34 | ], 35 | ids=["list-True", "list-False", "True", "int-True", "int-False"], 36 | ) 37 | def test_check_type(module_name, type_name, trusted, expected): 38 | assert check_type(module_name, type_name, trusted) == expected 39 | 40 | 41 | def test_audit_tree_untrusted(): 42 | var = {"a": CustomType(1), 2: CustomType(2)} 43 | state = dict_get_state(var, SaveContext(None, 0, {})) 44 | node = DictNode(state, LoadContext(None), trusted=False) 45 | with pytest.raises( 46 | TypeError, 47 | match=re.escape( 48 | "Untrusted types found in the file: ['test_audit.CustomType']." 49 | ), 50 | ): 51 | audit_tree(node, trusted=False) 52 | 53 | # there shouldn't be an error with trusted=True 54 | audit_tree(node, trusted=True) 55 | 56 | untrusted_list = get_untrusted_types(data=dumps(var)) 57 | assert untrusted_list == ["test_audit.CustomType"] 58 | 59 | # passing the type would fix it. 
60 | audit_tree(node, trusted=untrusted_list) 61 | 62 | 63 | def test_audit_tree_defaults(): 64 | # test that the default types are trusted 65 | var = {"a": 1, 2: "b"} 66 | state = dict_get_state(var, SaveContext(None, 0, {})) 67 | node = DictNode(state, LoadContext(None), trusted=False) 68 | audit_tree(node, trusted=[]) 69 | 70 | 71 | @pytest.mark.parametrize( 72 | "trusted, defaults, expected", 73 | [ 74 | (True, None, True), 75 | (False, int, ["builtins.int"]), 76 | ([int], None, ["builtins.int"]), 77 | ], 78 | ids=["trusted", "untrusted", "untrusted_list"], 79 | ) 80 | def test_Node_get_trusted(trusted, defaults, expected): 81 | assert Node._get_trusted(trusted, defaults) == expected 82 | 83 | 84 | @pytest.mark.parametrize( 85 | "values, is_safe", 86 | [ 87 | ([1, 2], True), 88 | ([1, {1: 2}], True), 89 | ([1, {1: CustomType(1)}], False), 90 | (eval, False), 91 | (pytest.mark.parametrize, False), 92 | ], 93 | ids=["int", "dict", "untrusted", "eval", "parametrize"], 94 | ) 95 | def test_list_safety(values, is_safe): 96 | content = dumps(values) 97 | 98 | with ZipFile(io.BytesIO(content), "r") as zip_file: 99 | schema = json.loads(zip_file.read("schema.json")) 100 | tree = get_tree(schema, load_context=LoadContext(src=zip_file)) 101 | assert tree.is_safe() == is_safe 102 | 103 | 104 | def test_gettype_error(): 105 | msg = "Object None of module test is unknown" 106 | with pytest.raises(ValueError, match=msg): 107 | gettype(module_name="test", cls_or_func=None) 108 | 109 | msg = "Object test of module None is unknown" 110 | with pytest.raises(ValueError, match=msg): 111 | gettype(module_name=None, cls_or_func="test") 112 | 113 | # ImportError if the module cannot be imported 114 | with pytest.raises(ImportError): 115 | gettype(module_name="invalid-module", cls_or_func="invalid-type") 116 | 117 | 118 | @pytest.mark.parametrize( 119 | "data, file, exception, message", 120 | [ 121 | ("not-none", "not-none", ValueError, "Only one of data or file"), 122 | (None, None, ValueError, "Exactly one of data or file should be passed"), 123 | ("string", None, TypeError, "a bytes-like object is required, not 'str'"), 124 | ], 125 | ids=["both", "neither", "string-data"], 126 | ) 127 | def test_get_untrusted_types_validation(data, file, exception, message): 128 | with pytest.raises(exception, match=message): 129 | get_untrusted_types(data=data, file=file) 130 | 131 | 132 | def test_temp_setattr(): 133 | # Test that temp_setattr works as expected 134 | class A: 135 | def __init__(self): 136 | self.a = 1 137 | 138 | temp = A() 139 | with suppress(ValueError): 140 | with temp_setattr(temp, a=2, b=3): 141 | assert temp.a == 2 142 | assert temp.b == 3 143 | raise ValueError # to make sure context manager handles exceptions 144 | 145 | assert temp.a == 1 146 | assert not hasattr(temp, "b") 147 | 148 | 149 | def test_complex_pipeline_untrusted_set(): 150 | # fmt: off 151 | clf = Pipeline([ 152 | ("features", FeatureUnion([ 153 | ("scaler", StandardScaler()), 154 | ("sqrt", FunctionTransformer( 155 | func=np.sqrt, 156 | inverse_func=np.square, 157 | )), 158 | ])), 159 | ("clf", LogisticRegression(random_state=0, solver="liblinear")), 160 | ]) 161 | # fmt: on 162 | 163 | untrusted = get_untrusted_types(data=dumps(clf)) 164 | type_names = [x.split(".")[-1] for x in untrusted] 165 | assert type_names == ["sqrt", "square"] 166 | -------------------------------------------------------------------------------- /examples/plot_hf_hub.py: -------------------------------------------------------------------------------- 1 
| """ 2 | scikit-learn models on Hugging Face Hub 3 | --------------------------------------- 4 | 5 | This guide demonstrates how you can use this package to create a Hugging Face 6 | Hub model repository based on a scikit-learn compatible model, and how to 7 | fetch scikit-learn compatible models from the Hub and run them locally. 8 | """ 9 | 10 | # %% 11 | # Imports 12 | # ======= 13 | # First we will import everything required for the rest of this document. 14 | 15 | import json 16 | import os 17 | import pickle 18 | from pathlib import Path 19 | from tempfile import mkdtemp, mkstemp 20 | from uuid import uuid4 21 | 22 | import sklearn 23 | from huggingface_hub import HfApi 24 | from sklearn.datasets import load_breast_cancer 25 | from sklearn.ensemble import HistGradientBoostingClassifier 26 | from sklearn.experimental import enable_halving_search_cv # noqa 27 | from sklearn.model_selection import HalvingGridSearchCV, train_test_split 28 | 29 | from skops import card, hub_utils 30 | 31 | # %% 32 | # Data 33 | # ==== 34 | # Then we create some random data to train and evaluate our model. 35 | 36 | X, y = load_breast_cancer(as_frame=True, return_X_y=True) 37 | X_train, X_test, y_train, y_test = train_test_split( 38 | X, y, test_size=0.3, random_state=42 39 | ) 40 | print("X's summary: ", X.describe()) 41 | print("y's summary: ", y.describe()) 42 | 43 | 44 | # %% 45 | # Train a Model 46 | # ============= 47 | # Using the above data, we train a model. To select the model, we use 48 | # :class:`~sklearn.model_selection.HalvingGridSearchCV` with a parameter grid 49 | # over :class:`~sklearn.ensemble.HistGradientBoostingClassifier`. 50 | 51 | param_grid = { 52 | "max_leaf_nodes": [5, 10, 15], 53 | "max_depth": [2, 5, 10], 54 | } 55 | 56 | model = HalvingGridSearchCV( 57 | estimator=HistGradientBoostingClassifier(), 58 | param_grid=param_grid, 59 | random_state=42, 60 | n_jobs=-1, 61 | ).fit(X_train, y_train) 62 | model.score(X_test, y_test) 63 | 64 | # %% 65 | # Initialize a Model Repo 66 | # ======================= 67 | # We now initialize a model repository locally, and push it to the hub. For 68 | # that, we need to first store the model as a pickle file and pass it to the 69 | # hub tools. 70 | 71 | # The file name is not significant, here we choose to save it with a `pkl` 72 | # extension. 73 | _, pkl_name = mkstemp(prefix="skops-", suffix=".pkl") 74 | with open(pkl_name, mode="bw") as f: 75 | pickle.dump(model, file=f) 76 | 77 | local_repo = mkdtemp(prefix="skops-") 78 | hub_utils.init( 79 | model=pkl_name, 80 | requirements=[f"scikit-learn={sklearn.__version__}"], 81 | dst=local_repo, 82 | task="tabular-classification", 83 | data=X_test, 84 | ) 85 | if "__file__" in locals(): # __file__ not defined during docs built 86 | # Add this script itself to the files to be uploaded for reproducibility 87 | hub_utils.add_files(__file__, dst=local_repo) 88 | 89 | # %% 90 | # We can no see what the contents of the created local repo are: 91 | print(os.listdir(local_repo)) 92 | 93 | # %% 94 | # Model Card 95 | # ========== 96 | # We will now create a model card and save it. For more information about how 97 | # to create a good model card, refer to the :ref:`model card example 98 | # `. The following code uses 99 | # :func:`~skops.card.metadata_from_config` which creates a minimal metadata 100 | # object to be included in the metadata section of the model card. 
The 101 | # configuration used by this method is stored in the ``config.json`` file which 102 | # is created by the call to :func:`~skops.hub_utils.init`. 103 | model_card = card.Card(model, metadata=card.metadata_from_config(Path(local_repo))) 104 | model_card.save(Path(local_repo) / "README.md") 105 | 106 | # %% 107 | # Push to Hub 108 | # =========== 109 | # And finally, we can push the model to the hub. This requires a user access 110 | # token which you can get under https://huggingface.co/settings/tokens 111 | 112 | # you can put your own token here, or set it as an environment variable before 113 | # running this script. 114 | token = os.environ["HF_HUB_TOKEN"] 115 | 116 | repo_name = f"hf_hub_example-{uuid4()}" 117 | user_name = HfApi().whoami(token=token)["name"] 118 | repo_id = f"{user_name}/{repo_name}" 119 | print(f"Creating and pushing to repo: {repo_id}") 120 | 121 | # %% 122 | # Now we can push our files to the repo. The following function creates the 123 | # remote repository if it doesn't exist; this is controlled via the 124 | # ``create_remote`` argument. Note that here we're setting ``private=True``, 125 | # which means only people with the right permissions would see the model. Set 126 | # ``private=False`` to make it visible to the public. 127 | 128 | hub_utils.push( 129 | repo_id=repo_id, 130 | source=local_repo, 131 | token=token, 132 | commit_message="pushing files to the repo from the example!", 133 | create_remote=True, 134 | private=True, 135 | ) 136 | 137 | # %% 138 | # Once uploaded, other users can download and use it, unless you make the repo 139 | # private. Given a repository's name, here's how one can download it: 140 | repo_copy = mkdtemp(prefix="skops") 141 | hub_utils.download(repo_id=repo_id, dst=repo_copy, token=token) 142 | print(os.listdir(repo_copy)) 143 | 144 | 145 | # %% 146 | # You can also get the requirements of this repository: 147 | print(hub_utils.get_requirements(path=repo_copy)) 148 | 149 | # %% 150 | # As well as the complete configuration of the project: 151 | print(json.dumps(hub_utils.get_config(path=repo_copy), indent=2)) 152 | 153 | # %% 154 | # Now you can check the contents of the repository under your user. 155 | # 156 | # Update Requirements 157 | # =================== 158 | # If you update your environment and the versions of your requirements are 159 | # changed, you can update the requirement in your repo by calling 160 | # ``update_env``, which automatically detects the existing installation of the 161 | # current environment and updates the requirements accordingly. 162 | 163 | hub_utils.update_env(path=local_repo, requirements=["scikit-learn"]) 164 | 165 | # %% 166 | # Delete Repository 167 | # ================= 168 | # At the end, you can also delete the repository you created using 169 | # ``HfApi().delete_repo``. For more information please refer to the 170 | # documentation of ``huggingface_hub`` library. 171 | 172 | HfApi().delete_repo(repo_id=repo_id, token=token) 173 | -------------------------------------------------------------------------------- /examples/plot_text_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | Text Classification with scikit-learn 3 | ------------------------------------- 4 | 5 | This example shows how you can create a Hugging Face Hub compatible repo for a 6 | text classification task using scikit-learn. We also show how you can generate 7 | a model card for the model and the task at hand. 
8 | """ 9 | 10 | # %% 11 | # Imports 12 | # ======= 13 | # First we will import everything required for the rest of this document. 14 | 15 | import pickle 16 | from pathlib import Path 17 | from tempfile import mkdtemp, mkstemp 18 | 19 | import pandas as pd 20 | import sklearn 21 | from sklearn.datasets import fetch_20newsgroups 22 | from sklearn.feature_extraction.text import CountVectorizer 23 | from sklearn.metrics import ( 24 | ConfusionMatrixDisplay, 25 | accuracy_score, 26 | classification_report, 27 | confusion_matrix, 28 | f1_score, 29 | ) 30 | from sklearn.model_selection import train_test_split 31 | from sklearn.naive_bayes import MultinomialNB 32 | from sklearn.pipeline import Pipeline 33 | 34 | from skops import card, hub_utils 35 | 36 | # %% 37 | # Data 38 | # ==== 39 | # We will use 20 newsgroups dataset from sklearn. The dataset has curated 40 | # news on 20 topics. It has a training and a test split. 41 | 42 | twenty_train = fetch_20newsgroups(subset="train", shuffle=True, random_state=42) 43 | 44 | twenty_validation = fetch_20newsgroups(subset="test", shuffle=True, random_state=42) 45 | 46 | X_train, X_test, y_train, y_test = train_test_split( 47 | twenty_train.data, twenty_train.target, test_size=0.3, random_state=42 48 | ) 49 | 50 | # %% 51 | # Train a Model 52 | # ============= 53 | # To train a model, we need to convert our data first to vectors. We will use 54 | # CountVectorizer in our pipeline. We will fit a Multinomial 55 | # Naive Bayes model with the outputs of the vectorization. 56 | 57 | model = Pipeline( 58 | [ 59 | ("count", CountVectorizer()), 60 | ("clf", MultinomialNB()), 61 | ] 62 | ) 63 | 64 | model.fit(X_train, y_train) 65 | 66 | # %% 67 | # Inference 68 | # ========= 69 | # Let's see if the model works. 70 | 71 | docs_new = [ 72 | "A graphics processing unit is a specialized electronic circuit designed to" 73 | " manipulate and alter memory to accelerate the creation of images in a frame" 74 | " buffer intended for output to a display device.." 75 | ] 76 | predicted = model.predict(docs_new) 77 | print(twenty_train.target[predicted[0]]) 78 | 79 | # %% 80 | # Initialize a repository to save our files in 81 | # ============================================ 82 | # We will now initialize a repository and save our model 83 | _, pkl_name = mkstemp(prefix="skops-", suffix=".pkl") 84 | 85 | with open(pkl_name, mode="bw") as f: 86 | pickle.dump(model, file=f) 87 | 88 | local_repo = mkdtemp(prefix="skops-") 89 | 90 | hub_utils.init( 91 | model=pkl_name, 92 | requirements=[f"scikit-learn={sklearn.__version__}"], 93 | dst=local_repo, 94 | task="text-classification", 95 | data=X_test, 96 | ) 97 | 98 | # %% 99 | # Create a model card 100 | # =================== 101 | # We now create a model card, and populate its metadata with information which 102 | # is already provided in ``config.json``, which itself is created by the call to 103 | # :func:`.hub_utils.init` above. We will see below how we can populate the model 104 | # card with useful information. 105 | 106 | model_card = card.Card(model, metadata=card.metadata_from_config(Path(local_repo))) 107 | 108 | # %% 109 | # Add more information 110 | # ==================== 111 | # So far, the model card does not tell viewers a lot about the model. Therefore, 112 | # we add more information about the model, like a description and what its 113 | # license is. 114 | 115 | model_card.metadata.license = "mit" 116 | limitations = "This model is not ready to be used in production." 
117 | model_description = ( 118 | "This is a Multinomial Naive Bayes model trained on 20 news groups dataset." 119 | "Count vectorizer is used for vectorization." 120 | ) 121 | model_card_authors = "skops_user" 122 | get_started_code = ( 123 | "import pickle\nwith open(pkl_filename, 'rb') as file:\n clf = pickle.load(file)" 124 | ) 125 | citation_bibtex = "bibtex\n@inproceedings{...,year={2020}}" 126 | model_card.add( 127 | citation_bibtex=citation_bibtex, 128 | get_started_code=get_started_code, 129 | model_card_authors=model_card_authors, 130 | limitations=limitations, 131 | model_description=model_description, 132 | ) 133 | 134 | # %% 135 | # Add plots, metrics, and tables to our model card 136 | # ================================================ 137 | # We will now evaluate our model and add our findings to the model card. 138 | 139 | y_pred = model.predict(X_test) 140 | eval_descr = ( 141 | "The model is evaluated on validation data from 20 news group's test split," 142 | " using accuracy and F1-score with micro average." 143 | ) 144 | model_card.add(eval_method=eval_descr) 145 | 146 | accuracy = accuracy_score(y_test, y_pred) 147 | f1 = f1_score(y_test, y_pred, average="micro") 148 | model_card.add_metrics(**{"accuracy": accuracy, "f1 score": f1}) 149 | 150 | cm = confusion_matrix(y_test, y_pred, labels=model.classes_) 151 | disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_) 152 | disp.plot() 153 | 154 | disp.figure_.savefig(Path(local_repo) / "confusion_matrix.png") 155 | model_card.add_plot(**{"Confusion matrix": "confusion_matrix.png"}) 156 | 157 | clf_report = classification_report( 158 | y_test, y_pred, output_dict=True, target_names=twenty_train.target_names 159 | ) 160 | # The classification report has to be transformed into a DataFrame first to have 161 | # the correct format. This requires removing the "accuracy", which was added 162 | # above anyway. 163 | del clf_report["accuracy"] 164 | clf_report = pd.DataFrame(clf_report).T.reset_index() 165 | model_card.add_table( 166 | folded=True, 167 | **{ 168 | "Classification Report": clf_report, 169 | }, 170 | ) 171 | 172 | # %% 173 | # Save model card 174 | # ================ 175 | # We can simply save our model card by providing a path to :meth:`.Card.save`. 176 | # The model hasn't been pushed to Hugging Face Hub yet, if you want to see how 177 | # to push your models please refer to 178 | # :ref:`this example `. 
179 | 180 | model_card.save(Path(local_repo) / "README.md") 181 | -------------------------------------------------------------------------------- /skops/card/tests/test_parser.py: -------------------------------------------------------------------------------- 1 | import difflib 2 | import json 3 | import os 4 | import re 5 | from pathlib import Path 6 | from unittest.mock import Mock, patch 7 | 8 | import pytest 9 | import yaml # type: ignore 10 | 11 | from skops.card import parse_modelcard 12 | from skops.card._parser import PandocParser, check_pandoc_installed 13 | 14 | try: 15 | check_pandoc_installed() 16 | except FileNotFoundError: 17 | # not installed, skip 18 | pytest.skip(reason="These tests require a recent pandoc", allow_module_level=True) 19 | 20 | 21 | EXAMPLE_CARDS = [ 22 | # actual model cards from HF hub 23 | "bert-base-uncased.md", 24 | "clip-vit-large-patch14.md", 25 | "gpt2.md", 26 | "specter.md", 27 | "vit-base-patch32-224-in21k.md", 28 | # not a model card 29 | "toy-example.md", 30 | ] 31 | 32 | 33 | def _assert_meta_equal(meta0, meta1): 34 | # we cannot guarantee the order of metadata items, so we compare parsed 35 | # dicts, but not strings directly 36 | assert yaml.safe_load("".join(meta0)) == yaml.safe_load("".join(meta1)) 37 | 38 | 39 | def assert_readme_files_almost_equal(file0, file1, diff): 40 | """Check that the two model cards are identical, but allow differences as 41 | defined in the ``diff`` file 42 | 43 | The metainfo is compared separately, as the order of the items is not 44 | guaranteed to be stable. 45 | """ 46 | with open(file0, "r") as f: 47 | readme0 = f.readlines() 48 | 49 | with open(file1, "r") as f: 50 | readme1 = f.readlines() 51 | 52 | sep = "---\n" 53 | # we look for 2nd occurrence, so skip first char to not match 1st occurrence 54 | if sep in readme0[1:]: # only check if metainfo is present 55 | idx0, idx1 = readme0[1:].index(sep) + 1, readme1[1:].index(sep) + 1 56 | meta0, meta1 = readme0[1:idx0], readme1[1:idx1] 57 | readme0, readme1 = readme0[idx0:], readme1[idx1:] 58 | _assert_meta_equal(meta0, meta1) 59 | 60 | # exclude trivial case of both being empty 61 | assert readme0 62 | assert readme1 63 | 64 | diff_actual = list(difflib.unified_diff(readme0, readme1, n=0)) 65 | 66 | with open(diff, "r") as f: 67 | diff_expected = f.readlines() 68 | 69 | assert diff_actual == diff_expected 70 | 71 | 72 | @pytest.mark.parametrize("file_name", EXAMPLE_CARDS, ids=EXAMPLE_CARDS) 73 | def test_example_model_cards(tmp_path, file_name): 74 | """Test that the difference between original and parsed model card is 75 | acceptable 76 | 77 | For this test, model cards for some of the most popular models on HF Hub 78 | were retrieved and stored in the ./examples folder. This test checks that 79 | these model cards can be successfully parsed and that the output is *almost* 80 | the same. 81 | 82 | We don't expect the output to be 100% identical, see the limitations listed 83 | in ``parse_modelcard``. Instead, we assert that the diff corresponds to the 84 | expected diff, which is also checked in. 85 | 86 | So e.g. for "specter.md", we expect that the diff will be the same diff as 87 | in "specter.md.diff". 
88 | 89 | """ 90 | path = Path(os.getcwd()) / "skops" / "card" / "tests" / "examples" 91 | file0 = path / file_name 92 | diff = (path / file_name).with_suffix(".md.diff") 93 | 94 | parsed_card = parse_modelcard(file0) 95 | file1 = tmp_path / "readme-parsed.md" 96 | parsed_card.save(file1) 97 | 98 | assert_readme_files_almost_equal(file0, file1, diff) 99 | 100 | 101 | def test_unknown_pandoc_item_raises(): 102 | source = json.dumps( 103 | { 104 | "pandoc-api-version": [1, 22, 2, 1], 105 | "meta": {}, 106 | "blocks": [ 107 | { 108 | "t": "Header", 109 | "c": [1, ["section", [], []], [{"t": "Str", "c": "section"}]], 110 | }, 111 | {"c": "valid", "t": "Str"}, 112 | {"t": "does-not-exist", "c": []}, 113 | {"c": "okay", "t": "Str"}, 114 | ], 115 | } 116 | ) 117 | parser = PandocParser(source) 118 | msg = ( 119 | "The parsed document contains 'does-not-exist', which is not " 120 | "supported yet, please open an issue on GitHub" 121 | ) 122 | with pytest.raises(ValueError, match=re.escape(msg)): 123 | parser.generate() 124 | 125 | 126 | def test_content_without_section_raises(): 127 | source = json.dumps( 128 | { 129 | "pandoc-api-version": [1, 22, 2, 1], 130 | "meta": {}, 131 | "blocks": [ 132 | {"c": "whoops", "t": "Str"}, 133 | ], 134 | } 135 | ) 136 | parser = PandocParser(source) 137 | msg = ( 138 | "Trying to add content but there is no current section, this is probably a " 139 | "bug, please open an issue on GitHub" 140 | ) 141 | with pytest.raises(ValueError, match=re.escape(msg)): 142 | parser.generate() 143 | 144 | 145 | def test_unsupported_markup_raises(): 146 | match = re.escape("Markup of type does-not-exist is not supported (yet)") 147 | with pytest.raises(ValueError, match=match): 148 | PandocParser(source="", markup_type="does-not-exist") 149 | 150 | 151 | def test_check_pandoc_installed_no_min_version_works(): 152 | # check that it doesn't raise 153 | check_pandoc_installed(min_version=None) 154 | 155 | 156 | def test_check_pandoc_installed_min_version_too_high_raises(): 157 | match = re.escape("Pandoc version too low, expected at least 999.9.9, got") 158 | with pytest.raises(ValueError, match=match): 159 | check_pandoc_installed(min_version="999.9.9") 160 | 161 | 162 | def test_pandoc_not_installed(): 163 | def raise_filenotfound(*args, **kwargs): 164 | # error raised when trying to run subprocess on non-existing command 165 | raise FileNotFoundError("[Errno 2] No such file or directory: 'pandoc'") 166 | 167 | with patch("subprocess.run", raise_filenotfound): 168 | match = re.escape( 169 | "This feature requires the pandoc library to be installed on your system" 170 | ) 171 | with pytest.raises(FileNotFoundError, match=match): 172 | check_pandoc_installed() 173 | 174 | 175 | def test_pandoc_version_cannot_be_determined(): 176 | mock = Mock() 177 | with patch("subprocess.run", mock): 178 | match = re.escape("Could not determine version of pandoc") 179 | with pytest.raises(RuntimeError, match=match): 180 | check_pandoc_installed() 181 | -------------------------------------------------------------------------------- /examples/plot_model_card.py: -------------------------------------------------------------------------------- 1 | """ 2 | scikit-learn model cards 3 | -------------------------------------- 4 | 5 | This guide demonstrates how you can use this package to create a model card on a 6 | scikit-learn compatible model and save it. 7 | """ 8 | 9 | # %% 10 | # Imports 11 | # ======= 12 | # First we will import everything required for the rest of this document. 
13 | 14 | import pickle 15 | from pathlib import Path 16 | from tempfile import mkdtemp, mkstemp 17 | 18 | import pandas as pd 19 | import sklearn 20 | from sklearn.datasets import load_breast_cancer 21 | from sklearn.ensemble import HistGradientBoostingClassifier 22 | from sklearn.experimental import enable_halving_search_cv # noqa 23 | from sklearn.metrics import ( 24 | ConfusionMatrixDisplay, 25 | accuracy_score, 26 | classification_report, 27 | confusion_matrix, 28 | f1_score, 29 | ) 30 | from sklearn.model_selection import HalvingGridSearchCV, train_test_split 31 | 32 | from skops import hub_utils 33 | from skops.card import Card, metadata_from_config 34 | 35 | # %% 36 | # Data 37 | # ==== 38 | # We load breast cancer dataset from sklearn. 39 | 40 | X, y = load_breast_cancer(as_frame=True, return_X_y=True) 41 | X_train, X_test, y_train, y_test = train_test_split( 42 | X, y, test_size=0.3, random_state=42 43 | ) 44 | print("X's summary: ", X.describe()) 45 | print("y's summary: ", y.describe()) 46 | 47 | # %% 48 | # Train a Model 49 | # ============= 50 | # Using the above data, we train a model. To select the model, we use 51 | # :class:`~sklearn.model_selection.HalvingGridSearchCV` with a parameter grid 52 | # over :class:`~sklearn.ensemble.HistGradientBoostingClassifier`. 53 | 54 | param_grid = { 55 | "max_leaf_nodes": [5, 10, 15], 56 | "max_depth": [2, 5, 10], 57 | } 58 | 59 | model = HalvingGridSearchCV( 60 | estimator=HistGradientBoostingClassifier(), 61 | param_grid=param_grid, 62 | random_state=42, 63 | n_jobs=-1, 64 | ).fit(X_train, y_train) 65 | model.score(X_test, y_test) 66 | 67 | 68 | # %% 69 | # Initialize a repository to save our files in 70 | # ============================================ 71 | # We will now initialize a repository and save our model 72 | _, pkl_name = mkstemp(prefix="skops-", suffix=".pkl") 73 | 74 | with open(pkl_name, mode="bw") as f: 75 | pickle.dump(model, file=f) 76 | 77 | local_repo = mkdtemp(prefix="skops-") 78 | 79 | hub_utils.init( 80 | model=pkl_name, 81 | requirements=[f"scikit-learn={sklearn.__version__}"], 82 | dst=local_repo, 83 | task="tabular-classification", 84 | data=X_test, 85 | ) 86 | 87 | # %% 88 | # Create a model card 89 | # ==================== 90 | # We now create a model card, and populate its metadata with information which 91 | # is already provided in ``config.json``, which itself is created by the call to 92 | # :func:`.hub_utils.init` above. We will see below how we can populate the model 93 | # card with useful information. 94 | 95 | model_card = Card(model, metadata=metadata_from_config(Path(local_repo))) 96 | 97 | # %% 98 | # Add more information 99 | # ==================== 100 | # So far, the model card does not tell viewers a lot about the model. Therefore, 101 | # we add more information about the model, like a description and what its 102 | # license is. 103 | 104 | model_card.metadata.license = "mit" 105 | limitations = "This model is not ready to be used in production." 106 | model_description = ( 107 | "This is a `HistGradientBoostingClassifier` model trained on breast cancer " 108 | "dataset. It's trained with `HalvingGridSearchCV`, with parameter grids on " 109 | "`max_leaf_nodes` and `max_depth`." 
110 | )
111 | model_card_authors = "skops_user"
112 | citation_bibtex = "**BibTeX**\n\n```\n@inproceedings{...,year={2020}}\n```"
113 | model_card.add(
114 |     **{
115 |         "Citation": citation_bibtex,
116 |         "Model Card Authors": model_card_authors,
117 |         "Model description": model_description,
118 |         "Model description/Intended uses & limitations": limitations,
119 |     }
120 | )
121 | 
122 | # %%
123 | # Add plots, metrics, and tables to our model card
124 | # ================================================
125 | # Furthermore, to better understand the model performance, we should evaluate it
126 | # on certain metrics and add those evaluations to the model card. In this
127 | # particular example, we want to calculate the accuracy and the F1 score. We
128 | # calculate those using sklearn and then add them to the model card by calling
129 | # :meth:`.Card.add_metrics`. But this is not all: we can also add matplotlib
130 | # figures to the model card, e.g. a plot of the confusion matrix. To achieve
131 | # this, we create the plot using sklearn, save it locally, and then add it using
132 | # the :meth:`.Card.add_plot` method. Finally, we can also add some useful tables
133 | # to the model card, e.g. the results from the grid search and the
134 | # classification report. Those can be added using :meth:`.Card.add_table`.
135 | 
136 | y_pred = model.predict(X_test)
137 | eval_descr = (
138 |     "The model is evaluated on test data using accuracy and F1-score with "
139 |     "micro average."
140 | )
141 | model_card.add(**{"Model description/Evaluation Results": eval_descr})
142 | 
143 | accuracy = accuracy_score(y_test, y_pred)
144 | f1 = f1_score(y_test, y_pred, average="micro")
145 | model_card.add_metrics(**{"accuracy": accuracy, "f1 score": f1})
146 | 
147 | cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
148 | disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
149 | disp.plot()
150 | 
151 | disp.figure_.savefig(Path(local_repo) / "confusion_matrix.png")
152 | model_card.add_plot(
153 |     **{"Model description/Evaluation Results/Confusion Matrix": "confusion_matrix.png"}
154 | )
155 | 
156 | cv_results = model.cv_results_
157 | clf_report = classification_report(
158 |     y_test, y_pred, output_dict=True, target_names=["malignant", "benign"]
159 | )
160 | # The classification report has to be transformed into a DataFrame first to have
161 | # the correct format. This requires removing the "accuracy", which was added
162 | # above anyway.
163 | del clf_report["accuracy"]
164 | clf_report = pd.DataFrame(clf_report).T.reset_index()
165 | model_card.add_table(
166 |     folded=True,
167 |     **{
168 |         "Model description/Evaluation Results/Hyperparameter search results": cv_results,
169 |         "Model description/Evaluation Results/Classification report": clf_report,
170 |     },
171 | )
172 | 
173 | # %%
174 | # Save model card
175 | # ===============
176 | # We can simply save our model card by providing a path to :meth:`.Card.save`.
177 | 
178 | model_card.save(Path(local_repo) / "README.md")
179 | -------------------------------------------------------------------------------- /skops/io/_utils.py: --------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import importlib
4 | import sys
5 | from dataclasses import dataclass, field
6 | from functools import singledispatch
7 | from typing import Any, Type
8 | from zipfile import ZipFile
9 | 
10 | 
11 | # The following two functions are copied from cpython's pickle.py file.
12 | # --------------------------------------------------------------------- 13 | def _getattribute(obj, name): 14 | for subpath in name.split("."): 15 | if subpath == "": 16 | raise AttributeError( 17 | "Can't get local attribute {!r} on {!r}".format(name, obj) 18 | ) 19 | try: 20 | parent = obj 21 | obj = getattr(obj, subpath) 22 | except AttributeError: 23 | raise AttributeError( 24 | "Can't get attribute {!r} on {!r}".format(name, obj) 25 | ) from None 26 | return obj, parent 27 | 28 | 29 | # This function is particularly used to detect the path of functions such as 30 | # ufuncs. It returns the full path, instead of returning the module name. 31 | def whichmodule(obj: Any, name: str) -> str: 32 | """Find the module an object belong to.""" 33 | module_name = getattr(obj, "__module__", None) 34 | if module_name is not None: 35 | return module_name 36 | # Protect the iteration by using a list copy of sys.modules against dynamic 37 | # modules that trigger imports of other modules upon calls to getattr. 38 | for module_name, module in sys.modules.copy().items(): 39 | if ( 40 | module_name == "__main__" 41 | or module_name == "__mp_main__" # bpo-42406 42 | or module is None 43 | ): 44 | continue 45 | try: 46 | if _getattribute(module, name)[0] is obj: 47 | return module_name 48 | except AttributeError: 49 | pass 50 | return "__main__" 51 | 52 | 53 | # --------------------------------------------------------------------- 54 | 55 | 56 | def _import_obj(module: str, cls_or_func: str, package: str | None = None) -> Any: 57 | return getattr(importlib.import_module(module, package=package), cls_or_func) 58 | 59 | 60 | def gettype(module_name: str, cls_or_func: str) -> Type[Any]: 61 | if module_name and cls_or_func: 62 | return _import_obj(module_name, cls_or_func) 63 | 64 | raise ValueError(f"Object {cls_or_func} of module {module_name} is unknown") 65 | 66 | 67 | def get_module(obj: Any) -> str: 68 | """Find module for given object 69 | 70 | If the module cannot be identified, it's assumed to be "__main__". 71 | 72 | Parameters 73 | ---------- 74 | obj: Any 75 | Object whose module is requested. 76 | 77 | Returns 78 | ------- 79 | name: str 80 | Name of the module. 81 | 82 | """ 83 | return whichmodule(obj, obj.__name__) 84 | 85 | 86 | # For now, there is just one protocol version 87 | DEFAULT_PROTOCOL = 0 88 | 89 | 90 | @dataclass(frozen=True) 91 | class SaveContext: 92 | """Context required for saving the objects 93 | 94 | This context is passed to each ``get_state_*`` function. 95 | 96 | Parameters 97 | ---------- 98 | zip_file: zipfile.ZipFile 99 | The zip file to write the data to, must be in write mode. 100 | 101 | protocol: int 102 | The protocol of the persistence format. Right now, there is only 103 | protocol 0, but this leaves the door open for future changes. 104 | 105 | """ 106 | 107 | zip_file: ZipFile 108 | protocol: int = DEFAULT_PROTOCOL 109 | memo: dict[int, Any] = field(default_factory=dict) 110 | 111 | def memoize(self, obj: Any) -> int: 112 | # Currently, the only purpose for saving the object id is to make sure 113 | # that for the length of the context that the main object is being 114 | # saved, all attributes persist, so that the same id cannot be re-used 115 | # for different objects. 
116 | obj_id = id(obj) 117 | if obj_id not in self.memo: 118 | self.memo[obj_id] = obj 119 | return obj_id 120 | 121 | def clear_memo(self) -> None: 122 | self.memo.clear() 123 | 124 | 125 | @dataclass(frozen=True) 126 | class LoadContext: 127 | """Context required for loading an object 128 | 129 | This context is passed to each ``*Node`` class when loading an object. 130 | 131 | Parameters 132 | ---------- 133 | src: zipfile.ZipFile 134 | The zip file the target object is saved in 135 | """ 136 | 137 | src: ZipFile 138 | memo: dict[int, Any] = field(default_factory=dict) 139 | 140 | def memoize(self, obj: Any, id: int) -> None: 141 | self.memo[id] = obj 142 | 143 | def get_object(self, id: int) -> Any: 144 | return self.memo.get(id) 145 | 146 | 147 | @singledispatch 148 | def _get_state(obj, save_context: SaveContext): 149 | # This function should never be called directly. Instead, it is used to 150 | # dispatch to the correct implementation of get_state for the given type of 151 | # its first argument. 152 | raise TypeError(f"Getting the state of type {type(obj)} is not supported yet") 153 | 154 | 155 | def get_state(value, save_context: SaveContext) -> dict[str, Any]: 156 | # This is a helper function to try to get the state of an object. If it 157 | # fails with `get_state`, we try with json.dumps, if that fails, we raise 158 | # the original error alongside the json error. 159 | 160 | # TODO: This should help with fixing recursive references. 161 | # if id(value) in save_context.memo: 162 | # return { 163 | # "__module__": None, 164 | # "__class__": None, 165 | # "__id__": id(value), 166 | # "__loader__": "CachedNode", 167 | # } 168 | 169 | __id__ = save_context.memoize(obj=value) 170 | 171 | res = _get_state(value, save_context) 172 | 173 | res["__id__"] = __id__ 174 | return res 175 | 176 | 177 | def get_type_name(t: Any) -> str: 178 | """Helper function to take in a type, and return its name as a string""" 179 | return f"{get_module(t)}.{t.__name__}" 180 | 181 | 182 | def get_type_paths(types: Any) -> list[str]: 183 | """Helper function that takes in a types, 184 | and converts any the types found to a list of strings. 185 | 186 | Parameters 187 | ---------- 188 | types: Any 189 | Types to get. Can be either a string, a single type, or a list of strings 190 | and types. 191 | 192 | Returns 193 | ---------- 194 | types_list: list of str 195 | The list of types, all as strings, e.g. ``["builtins.list"]``. 196 | 197 | """ 198 | if not types: 199 | return [] 200 | if not isinstance(types, (list, tuple)): 201 | types = [types] 202 | 203 | return [get_type_name(t) if not isinstance(t, str) else t for t in types] 204 | -------------------------------------------------------------------------------- /docs/model_card.rst: -------------------------------------------------------------------------------- 1 | .. _model_card: 2 | 3 | Model Cards for scikit-learn 4 | ============================ 5 | 6 | This library allows you to automatically create model cards for your models, 7 | which are a short documentation explaining what the model does, how it's 8 | trained, and its limitations. `Hugging Face Hub `__ 9 | expects a ``README.md`` file containing a certain set of metadata at the 10 | beginning of it, following with the content of the model card in markdown 11 | format. The metadata section is used to make models searchable on the Hub, and 12 | get the inference API and the widgets on the website working. 
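
For instance, a minimal model card can be created and saved as follows. This is
only a short sketch (it assumes that ``model`` is an already fitted scikit-learn
estimator); see :ref:`sphx_glr_auto_examples_plot_model_card.py` for a complete,
runnable example:

.. code-block:: python

    from skops.card import Card

    # create a card from a fitted model, starting from the default skops template
    card = Card(model)
    card.add(**{"Model description": "A short description of what the model does."})
    # render the card to a markdown file, ready to be extended or uploaded
    card.save("README.md")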
13 | 
14 | Metadata
15 | --------
16 | 
17 | The metadata part of the file needs to follow the specifications `here
18 | `__. It
19 | includes simple attributes of your models such as the task you're solving,
20 | dataset you trained the model with, evaluation results and more. When the model
21 | is hosted on the Hub, the information in metadata, like the task name or the
22 | dataset, helps your model be discovered on the `Hugging Face Hub
23 | `__. The task identifiers should follow the task
24 | taxonomy defined in Hugging Face Hub, as it enables the inference widget on the
25 | model page. An example of a task identifier is ``"tabular-classification"``
26 | or ``"text-regression"``.
27 | 
28 | Here's an example of the metadata section of the ``README.md`` file:
29 | 
30 | .. code-block:: yaml
31 | 
32 |     ---
33 |     library_name: sklearn
34 |     tags:
35 |     - tabular-classification
36 |     license: mit
37 |     datasets:
38 |     - breast-cancer
39 |     metrics:
40 |     - accuracy
41 |     ---
42 | 
43 | ``skops`` creates this section of the file for you, and you almost never need
44 | to touch it yourself.
45 | 
46 | Model Card Content
47 | ------------------
48 | 
49 | The markdown part does not need to follow any particular specification in
50 | terms of the information it contains, which gives the user a lot of flexibility.
51 | The markdown part of the ``README.md`` file comes with a couple of defaults
52 | provided by ``skops``, which include the following slots for free text sections:
53 | 
54 | - ``"Model description"``: A description of the model.
55 | - ``"Intended uses & limitations"``: Intended use for the model, limitations and
56 |   potential biases. This section should also include risks of using models in
57 |   certain domains if relevant.
58 | - ``"How to Get Started with the Model"``: Code the user can run to load and use
59 |   the model.
60 | - ``"Model Card Authors"``: Authors of the model card, i.e. the people who wrote
61 |   and maintain this documentation.
62 | - ``"Model Card Contact"``: Contact information of the people who can be reached
63 |   in case of questions about the model or the model card.
64 | - ``"Citation"``: BibTeX-style citations for the model or resources used to
65 |   train the model.
66 | - ``"Evaluation Results"``: Evaluation results that are later parsed as a table
67 |   by :class:`skops.card.Card`.
68 | 
69 | 
70 | The template also contains the following sections that are automatically
71 | generated by ``skops``:
72 | 
73 | - ``"Hyperparameters"``: Hyperparameters of the model.
74 | - ``"Model Plot"``: A diagram of the model, most relevant in case the model is
75 |   a complex scikit-learn :class:`~sklearn.pipeline.Pipeline`.
76 | 
77 | Furthermore, it is possible to add plots and tables to the model card. To add
78 | plots, save them on disk and then add them to the card by passing the path name
79 | to the :meth:`.Card.add_plot` method. For tables, you can pass either
80 | dictionaries with the key being the header and the values being lists of row
81 | entries, or a pandas ``DataFrame``; use the :meth:`.Card.add_table` method for
82 | this.
83 | 
84 | To add content to an existing subsection, or to create a new subsection, use a
85 | ``"/"`` to indicate the subsection. E.g. let's assume you would like to add a
86 | subsection called ``"Figures"`` to the existing section ``"Model description"``,
87 | and then add some plots in subsections below that; to do so, you can call the
88 | :meth:`.Card.add` method like this:
89 | 
90 | .. code-block:: python
91 | 
92 |     card.add(**{"Model description/Figures": "Here are some nice figures"})
93 |     card.add_plot(**{
94 |         "Model description/Figures/Confusion Matrix": "path-to-confusion-matrix.png",
95 |         "Model description/Figures/ROC": "path-to-roc.png",
96 |     })
97 | 
98 | Furthermore, you can select existing sections (as well as their subsections)
99 | using :meth:`.Card.select`, and you can delete sections using
100 | :meth:`.Card.delete`:
101 | 
102 | .. code-block:: python
103 | 
104 |     section = card.select("Model description/Figures")
105 |     print(section.content)  # 'Here are some nice figures'
106 |     print(section.subsections)
107 |     card.delete("Model description/Figures/ROC")
108 | 
109 | 
110 | To see how you can use the API in ``skops`` to create a model card, please
111 | refer to :ref:`sphx_glr_auto_examples_plot_model_card.py`.
112 | 
113 | Saving and Loading Model Cards
114 | ------------------------------
115 | 
116 | Once you have finished creating and modifying the model card, you can save it
117 | using the :meth:`.Card.save` method:
118 | 
119 | .. code-block:: python
120 | 
121 |     card.save("README.md")
122 | 
123 | This renders the content of the model card to markdown format and stores it in
124 | the indicated file. It is now ready to be uploaded to the Hugging Face Hub.
125 | 
126 | If you have a finished model card but want to load it to make some modifications,
127 | you can use the function :func:`skops.card.parse_modelcard`. This function
128 | parses the model card back into a :class:`.Card` instance that you can work on
129 | further:
130 | 
131 | .. code-block:: python
132 | 
133 |     from skops import card
134 |     model_card = card.parse_modelcard("README.md")
135 |     model_card.add(**{"A new section": "Some new content"})
136 |     model_card.save("README.md")
137 | 
138 | When the card is parsed, some minor details of the model card can change. E.g.
139 | if you used a different column alignment than the default, it is normalized, and
140 | excess empty lines or trailing whitespace are removed. However, the content
141 | itself should be exactly the same. All known deviations are documented in the
142 | `parse_modelcard docs
143 | `_.
144 | 
145 | For the parsing part, we rely on `pandoc `_. If you haven't
146 | installed it, please follow `these instructions
147 | `_. The advantage of using pandoc is that
148 | it's a very mature library and that it supports many different document formats.
149 | Therefore, it should be possible to parse model cards even if they use a format
150 | that's not markdown, for instance reStructuredText, org, or asciidoc. For
151 | saving, we only support markdown for now.
152 | -------------------------------------------------------------------------------- /skops/io/tests/_utils.py: --------------------------------------------------------------------------------
1 | import sys
2 | import warnings
3 | 
4 | import numpy as np
5 | from scipy import sparse
6 | from sklearn.base import BaseEstimator
7 | from sklearn.utils._testing import assert_allclose_dense_sparse
8 | 
9 | # TODO: Investigate why that seems to be an issue on MacOS (only observed with
10 | # Python 3.8)
11 | ATOL = 1e-6 if sys.platform == "darwin" else 1e-7
12 | 
13 | 
14 | def _is_steps_like(obj):
15 |     # helper function to check if an object is something like Pipeline.steps,
16 |     # i.e.
a list of tuples of names and estimators 17 | if not isinstance(obj, list): # must be a list 18 | return False 19 | 20 | if not obj: # must not be empty 21 | return False 22 | 23 | if not isinstance(obj[0], tuple): # must be list of tuples 24 | return False 25 | 26 | lens = set(map(len, obj)) 27 | if not lens == {2}: # all elements must be length 2 tuples 28 | return False 29 | 30 | keys, vals = list(zip(*obj)) 31 | 32 | if len(keys) != len(set(keys)): # keys must be unique 33 | return False 34 | 35 | if not all(map(lambda x: isinstance(x, (type(None), BaseEstimator)), vals)): 36 | # values must be BaseEstimators or None 37 | return False 38 | 39 | return True 40 | 41 | 42 | def _assert_generic_objects_equal(val1, val2): 43 | def _is_builtin(val): 44 | # Check if value is a builtin type 45 | return getattr(getattr(val, "__class__", {}), "__module__", None) == "builtins" 46 | 47 | if isinstance(val1, (list, tuple, np.ndarray)): 48 | assert len(val1) == len(val2) 49 | for subval1, subval2 in zip(val1, val2): 50 | _assert_generic_objects_equal(subval1, subval2) 51 | return 52 | 53 | assert type(val1) == type(val2) 54 | if hasattr(val1, "__dict__"): 55 | assert_params_equal(val1.__dict__, val2.__dict__) 56 | elif _is_builtin(val1): 57 | assert val1 == val2 58 | else: 59 | # not a normal Python class, could be e.g. a Cython class 60 | assert val1.__reduce__() == val2.__reduce__() 61 | 62 | 63 | def _assert_tuples_equal(val1, val2): 64 | assert len(val1) == len(val2) 65 | for subval1, subval2 in zip(val1, val2): 66 | _assert_vals_equal(subval1, subval2) 67 | 68 | 69 | def _assert_vals_equal(val1, val2): 70 | if type(val1) == type: # e.g. could be np.int64 71 | assert val1 is val2 72 | elif hasattr(val1, "__getstate__") and (val1.__getstate__() is not None): 73 | # This includes BaseEstimator since they implement __getstate__ and 74 | # that returns the parameters as well. 75 | # Since Python 3.11, all objects have a __getstate__ but they return 76 | # None by default, in which case this check is not performed. 77 | # Some objects return a tuple of parameters, others a dict. 78 | state1 = val1.__getstate__() 79 | state2 = val2.__getstate__() 80 | assert type(state1) == type(state2) 81 | if isinstance(state1, tuple): 82 | _assert_tuples_equal(state1, state2) 83 | else: 84 | assert_params_equal(val1.__getstate__(), val2.__getstate__()) 85 | elif sparse.issparse(val1): 86 | assert sparse.issparse(val2) and ((val1 - val2).nnz == 0) 87 | elif isinstance(val1, (np.ndarray, np.generic)): 88 | if len(val1.dtype) == 0: 89 | # for arrays with at least 2 dimensions, check that contiguity is 90 | # preserved 91 | if val1.squeeze().ndim > 1: 92 | assert val1.flags["C_CONTIGUOUS"] is val2.flags["C_CONTIGUOUS"] 93 | assert val1.flags["F_CONTIGUOUS"] is val2.flags["F_CONTIGUOUS"] 94 | if val1.dtype == object: 95 | assert val2.dtype == object 96 | assert val1.shape == val2.shape 97 | for subval1, subval2 in zip(val1, val2): 98 | _assert_generic_objects_equal(subval1, subval2) 99 | else: 100 | # simple comparison of arrays with simple dtypes, almost all 101 | # arrays are of this sort. 102 | np.testing.assert_array_equal(val1, val2) 103 | elif len(val1.shape) == 1: 104 | # comparing arrays with structured dtypes, but they have to be 1D 105 | # arrays. This is what we get from the Tree's state. 106 | assert np.all([x == y for x, y in zip(val1, val2)]) 107 | else: 108 | # we don't know what to do with these values, for now. 
109 | assert False 110 | elif isinstance(val1, (tuple, list)): 111 | assert len(val1) == len(val2) 112 | for subval1, subval2 in zip(val1, val2): 113 | _assert_vals_equal(subval1, subval2) 114 | elif isinstance(val1, float) and np.isnan(val1): 115 | assert np.isnan(val2) 116 | elif isinstance(val1, dict): 117 | # dictionaries are compared by comparing their values recursively. 118 | assert set(val1.keys()) == set(val2.keys()) 119 | for key in val1: 120 | _assert_vals_equal(val1[key], val2[key]) 121 | elif hasattr(val1, "__dict__") and hasattr(val2, "__dict__"): 122 | _assert_vals_equal(val1.__dict__, val2.__dict__) 123 | elif isinstance(val1, np.ufunc): 124 | assert val1 == val2 125 | elif val1.__class__.__module__ == "builtins": 126 | assert val1 == val2 127 | else: 128 | _assert_generic_objects_equal(val1, val2) 129 | 130 | 131 | def assert_params_equal(params1, params2): 132 | # helper function to compare estimator dictionaries of parameters 133 | if params1 is None and params2 is None: 134 | return 135 | assert len(params1) == len(params2) 136 | assert set(params1.keys()) == set(params2.keys()) 137 | for key in params1: 138 | with warnings.catch_warnings(): 139 | # this is to silence the deprecation warning from _DictWithDeprecatedKeys 140 | warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn") 141 | val1, val2 = params1[key], params2[key] 142 | assert type(val1) == type(val2) 143 | 144 | if _is_steps_like(val1): 145 | # Deal with Pipeline.steps, FeatureUnion.transformer_list, etc. 146 | assert _is_steps_like(val2) 147 | val1, val2 = dict(val1), dict(val2) 148 | 149 | if isinstance(val1, (tuple, list)): 150 | assert len(val1) == len(val2) 151 | for subval1, subval2 in zip(val1, val2): 152 | _assert_vals_equal(subval1, subval2) 153 | elif isinstance(val1, dict): 154 | assert_params_equal(val1, val2) 155 | else: 156 | _assert_vals_equal(val1, val2) 157 | 158 | 159 | def assert_method_outputs_equal(estimator, loaded, X): 160 | # helper function that checks the output of all supported methods 161 | for method in [ 162 | "predict", 163 | "predict_proba", 164 | "decision_function", 165 | "transform", 166 | "predict_log_proba", 167 | ]: 168 | err_msg = ( 169 | f"{estimator.__class__.__name__}.{method}() doesn't produce the same" 170 | " results after loading the persisted model." 171 | ) 172 | if hasattr(estimator, method): 173 | X_out1 = getattr(estimator, method)(X) 174 | X_out2 = getattr(loaded, method)(X) 175 | assert_allclose_dense_sparse(X_out1, X_out2, err_msg=err_msg, atol=ATOL) 176 | -------------------------------------------------------------------------------- /docs/hf_hub.rst: -------------------------------------------------------------------------------- 1 | .. _hf_hub: 2 | 3 | scikit-learn Models on Hugging Face Hub 4 | ======================================= 5 | 6 | This library allows you to initialize and create a model repository compatible 7 | with `Hugging Face Hub `__, which among other 8 | things, gives you the following benefits: 9 | 10 | - Inference API to get model output through REST calls 11 | - A widget to try the model directly in the browser 12 | - Metadata tags for better discoverability of the model 13 | - Collaborating with others on a model through discussions and pull requests 14 | - Convenient sharing of models with the community 15 | 16 | You can see all the models uploaded to the Hugging Face Hub using this library 17 | `here `_. 
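
To give a rough idea of how this fits together, the typical workflow looks like
the following sketch. The repository name, file name, ``X_sample`` and ``token``
are placeholders, not requirements of the API, and the details of each step are
explained in the rest of this document:

.. code:: python

    import sklearn
    import skops.hub_utils as hub_utils

    # assumes a pickled model in "model.pkl" and a sample of the input data, X_sample
    local_repo = "my-sklearn-repo"
    hub_utils.init(
        model="model.pkl",
        requirements=[f"scikit-learn={sklearn.__version__}"],
        dst=local_repo,
        task="tabular-classification",
        data=X_sample,
    )
    # push the prepared repository to the Hub; ``token`` is a Hugging Face access
    # token with write access (check the keyword arguments against the
    # ``hub_utils.push`` API reference)
    hub_utils.push(repo_id="USER/MODEL_ID", source=local_repo, token=token)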
18 | 19 | In terms of files, there are three which a scikit-learn model repo needs to 20 | have on the Hub: 21 | 22 | - ``README.md``: includes certain metadata on top of the file and then a 23 | description of the model, aka model card. 24 | - ``config.json``: contains the configuration needed to run the model. 25 | - The persisted model file. There are no constraints on the name of the file 26 | and the name is configured in ``config.json``. The file needs to be loadable 27 | by :func:`joblib.load` or :func:`pickle.load`. 28 | 29 | There are certain requirements in terms of information about the model for the 30 | Hub to be able to load and run the model. For scikit-learn compatible models, 31 | this information is stored in two places: 32 | 33 | - The metadata in ``README.md`` of the model repository, about which you can 34 | read `here `__. 35 | - The configuration stored in ``config.json``. 36 | 37 | As a user of ``skops``, you can use the tools in ``skops.hub_utils`` to create 38 | and persist a ``config.json`` file, and then use it to populate necessary 39 | metadata in the ``README.md`` file. The metadata in ``README.md`` is used by 40 | the Hub's backend to understand the type of the model and the kind of task 41 | which the model tries to solve. An example of a task can be 42 | ``"tabular-classification"`` or ``"text-regression"``. 43 | 44 | An example ``config.json`` file looks like this:: 45 | 46 | { 47 | "sklearn": { 48 | "columns": [ 49 | "petal length (cm)", 50 | "petal width (cm)", 51 | "sepal length (cm)", 52 | "sepal width (cm)", 53 | ], 54 | "environment": ['scikit-learn="1.1.1"', "numpy"], 55 | "example_input": { 56 | "petal length (cm)": [1.4, 1.4, 1.3], 57 | "petal width (cm)": [0.2, 0.2, 0.2], 58 | "sepal length (cm)": [5.1, 4.9, 4.7], 59 | "sepal width (cm)": [3.5, 3.0, 3.2], 60 | }, 61 | "model": {"file": "model.pkl"}, 62 | "task": "tabular-classification", 63 | } 64 | } 65 | 66 | The key ``sklearn`` includes the following sub-keys: 67 | 68 | - ``columns``: An ordered list of column names. The order is important as it is 69 | used to make sure the input given to the model is what the model expects. 70 | - ``example_input``: A list of examples to the model. This is in the form of a 71 | dictionary of column names to list of values, and is used by the Hugging Face 72 | Hub backend to show them in the widget to test the model when visiting the 73 | model's page on the Hub. 74 | - ``environment``: A list of dependencies that the model requires. These 75 | packages must be available on conda-forge and are installed before loading 76 | the model. 77 | - ``model.file``: The file name of the persisted model. 78 | - ``task``: The task of the model. 79 | 80 | You almost never need to create or touch this file manually, and it's created 81 | when you call :func:`skops.hub_utils.init`. 82 | 83 | It is recommended to include the script itself that creates the whole output in 84 | the upload. This way, the results are easily reproducible for others. To achieve 85 | this, call :func:`skops.hub_utils.add_files`: 86 | 87 | .. code:: python 88 | 89 | # contents of train.py 90 | ... 91 | hub_utils.init(model, dst=local_repo) 92 | hub_utils.add_files(__file__, dst=local_repo) # adds train.py to repo 93 | hub_utils.push(...) 94 | 95 | You may of course add more files if they're useful. 96 | 97 | .. 
_hf_hub_inference:
98 | 
99 | Inference without Downloading the Models
100 | ----------------------------------------
101 | 
102 | You can use the Hugging Face Hub's inference API to get model output without
103 | downloading the models. The :func:`skops.hub_utils.get_model_output` function
104 | returns the model output for a given input. It can be used as::
105 | 
106 |     import skops.hub_utils as hub_utils
107 |     import pandas as pd
108 |     data = pd.DataFrame(...)
109 |     # Get the model output from the Hub's inference API
110 |     res = hub_utils.get_model_output("USER/MODEL_ID", data)
111 | 
112 | In the above code snippet, ``res`` will be a :class:`numpy.ndarray` containing
113 | the model's output.
114 | 
115 | .. _hf_hub_gradio:
116 | ..
117 |     TODO: replace gradio link once gradio provides object.inv
118 | Easily build user interfaces for your scikit-learn models
119 | ----------------------------------------------------------
120 | `gradio `__ is a Python library that lets you create user interfaces for your models.
121 | It has a class called `Interface `__ that lets you create application
122 | interfaces for your machine learning models. Using gradio can have some advantages over using a plain
123 | model repository, e.g. the Gradio dataframe component allows uploading a CSV file for tabular data, unlike the
124 | widget in the model repository.
125 | 
126 | ``gradio`` is integrated with skops, so you can load an interface with only one
127 | line of code. Calling the load method with your repository identifier,
128 | prefixed with "huggingface/", will load an interface for your model. The
129 | interface has a dataframe input that takes samples and a dataframe output that
130 | returns predictions. It also uses the example input that was previously pushed
131 | to the repository with skops.
132 | Calling `gr.Interface.launch() `__ will launch your application.
133 | 
134 | .. code:: python
135 | 
136 |     import gradio as gr
137 |     repo_id = "scikit-learn/tabular-playground"
138 |     gr.Interface.load(f"huggingface/{repo_id}").launch()
139 | 
140 | 
141 | You can further customize your UI, add a description, a title, and more. If you'd
142 | like to share your demo, you can set ``share`` to ``True`` in `gr.Interface.launch() `__.
143 | 
144 | .. code:: python
145 | 
146 |     title = "Supersoaker Defective Product Prediction"
147 |     description = ("This model predicts Supersoaker production line failures. "
148 |                    "Drag and drop any slice from the dataset or edit values as you wish in the "
149 |                    "dataframe component below.")
150 |     gr.Interface.load(f"huggingface/{repo_id}", title=title, description=description)
151 | 
152 | Sharing your local application this way only works for a limited time.
153 | If you want to share your application permanently, you can deploy it to
154 | Hugging Face Spaces. You can check out `this blog `__
155 | on how to do it.
156 | For more information, please refer to the documentation of `gradio `__.
157 | 
158 | It's also possible to spawn a gradio space directly from the model repository.
159 | To achieve this, from the model page, click on ``Deploy`` (top right corner) >
160 | ``Spaces`` > ``Create new Space``, then follow the instructions. After
161 | finishing, you get a gradio space hosted on Hugging Face Hub, with all the
162 | benefits that this brings.
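
Putting the pieces above together, a customized interface can be loaded and
shared in a few lines. This is only a sketch based on the snippets above;
``share=True`` creates a temporary public link as described earlier:

.. code:: python

    import gradio as gr

    repo_id = "scikit-learn/tabular-playground"
    title = "Supersoaker Defective Product Prediction"
    description = "This model predicts Supersoaker production line failures."
    # load the interface for the model hosted on the Hub and launch it with a public link
    gr.Interface.load(
        f"huggingface/{repo_id}", title=title, description=description
    ).launch(share=True)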
163 | -------------------------------------------------------------------------------- /skops/io/_persist.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib 4 | import io 5 | import json 6 | from pathlib import Path 7 | from typing import Any, BinaryIO, Sequence 8 | from zipfile import ZipFile 9 | 10 | import skops 11 | 12 | from ._audit import NODE_TYPE_MAPPING, audit_tree, get_tree 13 | from ._utils import LoadContext, SaveContext, _get_state, get_state 14 | 15 | # We load the dispatch functions from the corresponding modules and register 16 | # them. 17 | modules = ["._general", "._numpy", "._scipy", "._sklearn"] 18 | for module_name in modules: 19 | # register exposed functions for get_state and get_tree 20 | module = importlib.import_module(module_name, package="skops.io") 21 | for cls, method in getattr(module, "GET_STATE_DISPATCH_FUNCTIONS", []): 22 | _get_state.register(cls)(method) 23 | # populate the the dict used for dispatching get_tree functions 24 | NODE_TYPE_MAPPING.update(module.NODE_TYPE_MAPPING) 25 | 26 | 27 | def _save(obj: Any) -> io.BytesIO: 28 | buffer = io.BytesIO() 29 | 30 | with ZipFile(buffer, "w") as zip_file: 31 | save_context = SaveContext(zip_file=zip_file) 32 | state = get_state(obj, save_context) 33 | save_context.clear_memo() 34 | 35 | state["protocol"] = save_context.protocol 36 | state["_skops_version"] = skops.__version__ 37 | zip_file.writestr("schema.json", json.dumps(state, indent=2)) 38 | 39 | return buffer 40 | 41 | 42 | def dump(obj: Any, file: str | Path | BinaryIO) -> None: 43 | """Save an object using the skops persistence format. 44 | 45 | Skops aims at providing a secure persistence feature that does not rely on 46 | :mod:`pickle`, which is inherently insecure. For more information, please 47 | visit the :ref:`persistence` documentation. 48 | 49 | .. warning:: 50 | 51 | This feature is heavily under development, which means the API is 52 | unstable and there might be security issues at the moment. Therefore, 53 | use caution when loading files from sources you don't trust. 54 | 55 | Parameters 56 | ---------- 57 | obj: object 58 | The object to be saved. Usually a scikit-learn compatible model. 59 | 60 | file: str, path, or file-like object 61 | The file name. A zip archive will automatically created. As a matter of 62 | convention, we recommend to use the ".skops" file extension, e.g. 63 | ``save(model, "my-model.skops")``. 64 | 65 | """ 66 | buffer = _save(obj) 67 | 68 | if isinstance(file, (str, Path)): 69 | with open(file, "wb") as f: 70 | f.write(buffer.getbuffer()) 71 | else: 72 | file.write(buffer.getbuffer()) 73 | 74 | 75 | def dumps(obj: Any) -> bytes: 76 | """Save an object using the skops persistence format as a bytes object. 77 | 78 | .. warning:: 79 | 80 | This feature is heavily under development, which means the API is 81 | unstable and there might be security issues at the moment. Therefore, 82 | use caution when loading files from sources you don't trust. 83 | 84 | Parameters 85 | ---------- 86 | obj: object 87 | The object to be saved. Usually a scikit-learn compatible model. 88 | 89 | """ 90 | buffer = _save(obj) 91 | return buffer.getbuffer().tobytes() 92 | 93 | 94 | def load(file: str | Path, trusted: bool | Sequence[str] = False) -> Any: 95 | """Load an object saved with the skops persistence format. 
96 | 97 | Skops aims at providing a secure persistence feature that does not rely on 98 | :mod:`pickle`, which is inherently insecure. For more information, please 99 | visit the :ref:`persistence` documentation. 100 | 101 | .. warning:: 102 | 103 | This feature is heavily under development, which means the API is 104 | unstable and there might be security issues at the moment. Therefore, 105 | use caution when loading files from sources you don't trust. 106 | 107 | Parameters 108 | ---------- 109 | file: str or pathlib.Path 110 | The file name of the object to be loaded. 111 | 112 | trusted: bool, or list of str, default=False 113 | If ``True``, the object will be loaded without any security checks. If 114 | ``False``, the object will be loaded only if there are only trusted 115 | objects in the dumped file. If a list of strings, the object will be 116 | loaded only if there are only trusted objects and objects of types 117 | listed in ``trusted`` are in the dumped file. 118 | 119 | Returns 120 | ------- 121 | instance: object 122 | The loaded object. 123 | 124 | """ 125 | with ZipFile(file, "r") as input_zip: 126 | schema = input_zip.read("schema.json") 127 | load_context = LoadContext(src=input_zip) 128 | tree = get_tree(json.loads(schema), load_context) 129 | audit_tree(tree, trusted) 130 | instance = tree.construct() 131 | 132 | return instance 133 | 134 | 135 | def loads(data: bytes, trusted: bool | Sequence[str] = False) -> Any: 136 | """Load an object saved with the skops persistence format from a bytes 137 | object. 138 | 139 | .. warning:: 140 | 141 | This feature is heavily under development, which means the API is 142 | unstable and there might be security issues at the moment. Therefore, 143 | use caution when loading files from sources you don't trust. 144 | 145 | Parameters 146 | ---------- 147 | data: bytes 148 | The dumped data to be loaded in bytes format. 149 | 150 | trusted: bool, or list of str, default=False 151 | If ``True``, the object will be loaded without any security checks. If 152 | ``False``, the object will be loaded only if there are only trusted 153 | objects in the dumped file. If a list of strings, the object will be 154 | loaded only if there are only trusted objects and objects of types 155 | listed in ``trusted`` are in the dumped file. 156 | 157 | Returns 158 | ------- 159 | instance: object 160 | The loaded object. 161 | """ 162 | if isinstance(data, str): 163 | raise TypeError("Can't load skops format from string, pass bytes") 164 | 165 | with ZipFile(io.BytesIO(data), "r") as zip_file: 166 | schema = json.loads(zip_file.read("schema.json")) 167 | load_context = LoadContext(src=zip_file) 168 | tree = get_tree(schema, load_context) 169 | audit_tree(tree, trusted) 170 | instance = tree.construct() 171 | 172 | return instance 173 | 174 | 175 | def get_untrusted_types( 176 | *, data: bytes | None = None, file: str | Path | None = None 177 | ) -> list[str]: 178 | """Get a list of untrusted types in a skops dump. 179 | 180 | Parameters 181 | ---------- 182 | data: bytes 183 | The data to be checked, in bytes format. 184 | 185 | file: str or Path 186 | The file to be checked. 187 | 188 | Returns 189 | ------- 190 | untrusted_types: list of str 191 | The list of untrusted types in the dump. 192 | 193 | Notes 194 | ----- 195 | Only one of data or file should be passed. 
196 | """ 197 | if data and file: 198 | raise ValueError("Only one of data or file should be passed.") 199 | if not data and not file: 200 | raise ValueError("Exactly one of data or file should be passed.") 201 | 202 | content: io.BytesIO | str | Path 203 | if data: 204 | content = io.BytesIO(data) 205 | else: 206 | # mypy doesn't understand that file cannot be None here, thus ignore 207 | content = file # type: ignore 208 | 209 | with ZipFile(content, "r") as zip_file: 210 | schema = json.loads(zip_file.read("schema.json")) 211 | tree = get_tree(schema, load_context=LoadContext(src=zip_file)) 212 | untrusted_types = tree.get_unsafe_set() 213 | 214 | return sorted(untrusted_types) 215 | -------------------------------------------------------------------------------- /docs/persistence.rst: -------------------------------------------------------------------------------- 1 | .. _persistence: 2 | 3 | Secure persistence with skops 4 | ============================= 5 | 6 | .. warning:: 7 | 8 | This feature is heavily under development, which means the API is unstable 9 | and there might be security issues at the moment. Therefore, use caution 10 | when loading files from sources you don't trust. 11 | 12 | Skops offers a way to save and load sklearn models without using :mod:`pickle`. 13 | The ``pickle`` module is not secure, but with skops, you can [more] securely 14 | save and load models without using ``pickle``. 15 | 16 | ``Pickle`` is the standard serialization format for sklearn and for Python in 17 | general (``cloudpickle`` and ``joblib`` use the same format). One of the main 18 | advantages of ``pickle`` is that it can be used for almost all Python objects 19 | but this flexibility also makes it inherently insecure. This is because loading 20 | certain types of objects requires the ability to run arbitrary code, which can 21 | be misused for malicious purposes. For example, an attacker can use it to steal 22 | secrets from your machine or install a virus. As the `Python docs 23 | `__ say: 24 | 25 | .. warning:: 26 | 27 | The pickle module is not secure. Only unpickle data you trust. It is 28 | possible to construct malicious pickle data which will execute arbitrary 29 | code during unpickling. Never unpickle data that could have come from an 30 | untrusted source, or that could have been tampered with. 31 | 32 | In contrast to ``pickle``, the :func:`skops.io.dump` and :func:`skops.io.load` 33 | functions have a more limited scope, while preventing users from running 34 | arbitrary code or loading unknown and malicious objects. 35 | 36 | When loading a file, :func:`skops.io.load`/:func:`skops.io.loads` will traverse 37 | the input, check for known and unknown types, and will only construct those 38 | objects if they are trusted, either by default or by the user. 39 | 40 | .. note:: 41 | You can try out converting your existing pickle files to the skops format 42 | using this Space on Hugging Face Hub: 43 | `pickle-to-skops `__. 44 | 45 | Usage 46 | ----- 47 | 48 | The code snippet below illustrates how to use :func:`skops.io.dump` and 49 | :func:`skops.io.load`. Note that one needs `XGBoost 50 | `__ installed to run this: 51 | 52 | .. 
code:: python
53 | 
54 |     from xgboost.sklearn import XGBClassifier
55 |     from sklearn.model_selection import GridSearchCV, train_test_split
56 |     from sklearn.datasets import load_iris
57 |     from skops.io import dump, load
58 | 
59 |     X, y = load_iris(return_X_y=True)
60 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
61 |     param_grid = {"tree_method": ["exact", "approx", "hist"]}
62 |     clf = GridSearchCV(XGBClassifier(), param_grid=param_grid).fit(X_train, y_train)
63 |     print(clf.score(X_test, y_test))
64 |     # 0.9666666666666667
65 |     dump(clf, "my-model.skops")
66 |     # ...
67 |     loaded = load("my-model.skops", trusted=True)
68 |     print(loaded.score(X_test, y_test))
69 |     # 0.9666666666666667
70 | 
71 |     # in memory
72 |     from skops.io import dumps, loads
73 |     serialized = dumps(clf)
74 |     loaded = loads(serialized, trusted=True)
75 | 
76 | Note that you should only load files with ``trusted=True`` if you trust the
77 | source. Otherwise you can get a list of untrusted types present in the dump
78 | using :func:`skops.io.get_untrusted_types`:
79 | 
80 | .. code:: python
81 | 
82 |     from skops.io import get_untrusted_types
83 |     unknown_types = get_untrusted_types(file="my-model.skops")
84 |     print(unknown_types)
85 |     # ['numpy.float64', 'numpy.int64', 'sklearn.metrics._scorer._passthrough_scorer',
86 |     #  'xgboost.core.Booster', 'xgboost.sklearn.XGBClassifier']
87 | 
88 | Note that everything in the above list is safe to load. We already have many
89 | types included as trusted by default, and some of the above values might be
90 | added to that list in the future.
91 | 
92 | Once you have checked the list and validated that everything in it is safe,
93 | you can load the file with ``trusted=unknown_types``:
94 | 
95 | .. code:: python
96 | 
97 |     loaded = load("my-model.skops", trusted=unknown_types)
98 | 
99 | At the moment, we support the vast majority of sklearn estimators. This
100 | includes complex use cases such as :class:`sklearn.pipeline.Pipeline`,
101 | :class:`sklearn.model_selection.GridSearchCV`, classes using objects defined in
102 | Cython such as :class:`sklearn.tree.DecisionTreeClassifier`, and more. If you
103 | discover an sklearn estimator that does not work, please open an issue on the
104 | skops `GitHub page `__ and let us
105 | know.
106 | 
107 | At the moment, ``skops`` cannot persist arbitrary Python code. This means if
108 | you have custom functions (say, a custom function to be used with
109 | :class:`sklearn.preprocessing.FunctionTransformer`), they will not work. However,
110 | most ``numpy`` and ``scipy`` functions should work. Therefore, you can save
111 | objects having references to functions such as ``numpy.sqrt``.
112 | 
113 | Command Line Interface
114 | ######################
115 | 
116 | Skops has a command line interface to convert scikit-learn models persisted with
117 | ``pickle`` to ``skops`` files.
118 | 
119 | To convert a file from the command line, use the ``skops convert`` entrypoint.
120 | 
121 | Below is an example call to convert a file ``my_model.pkl`` to ``my_model.skops``:
122 | 
123 | .. code:: console
124 | 
125 |     skops convert my_model.pkl
126 | 
127 | To convert multiple files, you can use bash commands to iterate the above call.
128 | For example, to convert all ``.pkl`` files in the current directory:
129 | 
130 | .. code:: console
131 | 
132 |     for FILE in *.pkl; do skops convert "$FILE"; done
133 | 
134 | Further help for the different supported options can be found by calling
135 | ``skops convert --help`` in a terminal.
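
If you prefer to do the conversion from Python rather than from the shell, the
``skops convert`` command corresponds roughly to the following sketch (only
unpickle files that you trust, since this intermediate step still relies on
``pickle``):

.. code:: python

    import pickle

    from skops.io import dump

    # load the object from the existing pickle file (this is the unsafe step)
    with open("my_model.pkl", "rb") as f:
        obj = pickle.load(f)

    # re-save the object in the skops format
    dump(obj, "my_model.skops")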
136 | 
137 | 
138 | Supported libraries
139 | -------------------
140 | 
141 | Skops intends to support all of **scikit-learn**, that is, not only its
142 | estimators, but also other classes like cross validation splitters. Furthermore,
143 | most types from **numpy** and **scipy** should be supported, such as (sparse)
144 | arrays, dtypes, random generators, and ufuncs.
145 | 
146 | Apart from this core, we plan to support machine learning libraries commonly
147 | used by the community. So far, we have tested the following libraries:
148 | 
149 | - `LightGBM `_ (scikit-learn API)
150 | - `XGBoost `_ (scikit-learn API)
151 | - `CatBoost `_
152 | 
153 | If you run into a problem using any of the mentioned libraries, this could mean
154 | there is a bug in skops. Please open an issue on `our issue tracker
155 | `__ (but please check first if a
156 | corresponding issue already exists).
157 | 
158 | Note that we do not audit these libraries for security issues.
159 | Therefore, you should only load a skops file containing a model of any of those
160 | libraries if you trust them to be secure. It's not a perfect solution, but it's
161 | still better than trusting pickle files, which anyone can tamper with easily.
162 | 
163 | Roadmap
164 | -------
165 | There needs to be more testing to harden the loader and make sure we don't run
166 | arbitrary code when it's not intended. However, the safety mechanisms already
167 | in place should prevent most cases of abuse.
168 | 
169 | At the moment, persisting and loading arbitrary C extension types is not
170 | possible, unless a Python object wraps around them and handles persistence and
171 | loading via ``__getstate__`` and ``__setstate__``. We plan to develop an API
172 | which would help third-party libraries to make their C extension types
173 | ``skops`` compatible.
174 | 
175 | You can check on our `issue tracker
176 | `__ which features are
177 | planned for the near future.
178 | -------------------------------------------------------------------------------- /skops/card/_templates.py: --------------------------------------------------------------------------------
1 | """Templates for model cards
2 | 
3 | To add a new template, define it as a dictionary where the key is the section
4 | and the value is the content of the section. If the content is empty but should
5 | be filled by the user, set it to be the ``CONTENT_PLACEHOLDER``.
6 | 
7 | After defining the template itself, add it as another enum value in the
8 | ``Templates`` enum.
9 | 
10 | Finally, if there is a corresponding section in the new template, some methods
11 | on the ``Card`` class should be adjusted to make use of the template. First of
12 | all, ``_fill_default_sections`` should be used to populate the model card with
13 | the template.
14 | 
15 | Furthermore, some specific methods might require adjusting. For example, the
16 | ``Card._add_hyperparams`` method will add a table of model hyperparameters, but
17 | it needs to know in what section to put them. So if the template contains a
18 | corresponding section, modify the method to put the hyperparameters inside that
19 | section.
20 | 21 | """ 22 | 23 | from enum import Enum 24 | 25 | 26 | class Templates(Enum): 27 | skops = "skops" 28 | 29 | 30 | CONTENT_PLACEHOLDER = "[More Information Needed]" 31 | """When there is a section but content has yet to be added by the user, show 32 | this""" 33 | 34 | # fmt: off 35 | SKOPS_TEMPLATE = { 36 | "Model description": CONTENT_PLACEHOLDER, 37 | "Model description/Intended uses & limitations": CONTENT_PLACEHOLDER, 38 | "Model description/Training Procedure": "", 39 | "Model description/Training Procedure/Hyperparameters": CONTENT_PLACEHOLDER, 40 | "Model description/Training Procedure/Model Plot": "The model plot is below.", 41 | "Model description/Evaluation Results": CONTENT_PLACEHOLDER, 42 | "How to Get Started with the Model": CONTENT_PLACEHOLDER, 43 | "Model Card Authors": ( 44 | f"This model card is written by following authors:\n\n{CONTENT_PLACEHOLDER}" 45 | ), 46 | "Model Card Contact": ( 47 | "You can contact the model card authors through following channels:\n" 48 | f"{CONTENT_PLACEHOLDER}" 49 | ), 50 | "Citation": ( 51 | "Below you can find information related to citation.\n\n**BibTeX:**\n```\n" 52 | f"{CONTENT_PLACEHOLDER}\n```" 53 | ), 54 | } 55 | 56 | # The template below corresponds to the HF Hub default template, but is geared 57 | # towards deep learning models, especially language models, and thus is not a 58 | # good fit for most sklearn models. 59 | _HUB_TEMPLATE = { 60 | "Model Card": "", 61 | # Provide a quick summary of what the model is/does. 62 | "Model Details": "", 63 | "Model Details/Model Description": "", 64 | # Provide a longer summary of what this model is. 65 | "Model Details/Model Description/Developed by": CONTENT_PLACEHOLDER, 66 | "Model Details/Model Description/Shared by [optional]": CONTENT_PLACEHOLDER, 67 | "Model Details/Model Description/Model type": CONTENT_PLACEHOLDER, 68 | "Model Details/Model Description/Language(s) (NLP)": CONTENT_PLACEHOLDER, 69 | "Model Details/Model Description/License": CONTENT_PLACEHOLDER, 70 | "Model Details/Model Description/Finetuned from model [optional]": CONTENT_PLACEHOLDER, 71 | "Model Details/Model Description/Resources for more information": CONTENT_PLACEHOLDER, 72 | 73 | "Uses": "", 74 | # Address questions around how the model is intended to be used, including 75 | # the foreseeable users of the model and those affected by the model. 76 | "Uses/Direct Use": CONTENT_PLACEHOLDER, 77 | # This section is for the model use without fine-tuning or plugging into a 78 | # larger ecosystem/app. 79 | "Uses/Downstream Use [optional]": CONTENT_PLACEHOLDER, 80 | # This section is for the model use when fine-tuned for a task, or when 81 | # plugged into a larger ecosystem/app. 82 | "Uses/Out-of-Scope Use": CONTENT_PLACEHOLDER, 83 | # This section addresses misuse, malicious use, and uses that the model will 84 | # not work well for. 85 | 86 | "Bias, Risks, and Limitations": CONTENT_PLACEHOLDER, 87 | # This section is meant to convey both technical and sociotechnical 88 | # limitations. 89 | "Bias, Risks, and Limitations/Recommendations": ( 90 | "Users (both direct and downstream) should be made aware of the risks, biases " 91 | "and limitations of the model. More information needed for further " 92 | "recommendations." 93 | ), 94 | # This section is meant to convey recommendations with respect to the bias, 95 | # risk, and technical limitations. 
96 | 97 | "Training Details": "", 98 | "Training Details/Training Data": CONTENT_PLACEHOLDER, 99 | # This should link to a Data Card, perhaps with a short stub of information 100 | # on what the training data is all about as well as documentation related to 101 | # data pre-processing or additional filtering. 102 | "Training Details/Training Procedure [optional]": "", 103 | # This relates heavily to the Technical Specifications. Content here should 104 | # link to that section when it is relevant to the training procedure. 105 | "Training Details/Training Procedure [optional]/Preprocessing": CONTENT_PLACEHOLDER, 106 | "Training Details/Training Procedure [optional]/Speeds, Sizes, Times": CONTENT_PLACEHOLDER, 107 | # This section provides information about throughput, start/end time, 108 | # checkpoint size if relevant, etc. 109 | 110 | "Evaluation": "", 111 | # This section describes the evaluation protocols and provides the results. 112 | "Evaluation/Testing Data, Factors & Metrics": "", 113 | "Evaluation/Testing Data, Factors & Metrics/Testing Data": CONTENT_PLACEHOLDER, 114 | # This should link to a Data Card if possible 115 | "Evaluation/Testing Data, Factors & Metrics/Factors": CONTENT_PLACEHOLDER, 116 | # These are the things the evaluation is disaggregating by, e.g., 117 | # subpopulations or domains. 118 | "Evaluation/Testing Data, Factors & Metrics/Metrics": CONTENT_PLACEHOLDER, 119 | # These are the evaluation metrics being used, ideally with a description of 120 | # why. 121 | "Evaluation/Results": CONTENT_PLACEHOLDER, 122 | 123 | "Model Examination [optional]": CONTENT_PLACEHOLDER, 124 | # Relevant interpretability work for the model goes here. 125 | 126 | "Environmental Impact": ( 127 | "Carbon emissions can be estimated using the " 128 | "[Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) " 129 | "presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700)." 130 | ), 131 | # Total emissions (in grams of CO2eq) and additional considerations, such as 132 | # electricity usage, go here. Edit the suggested text below accordingly" 133 | "Environmental Impact/Hardware Type": CONTENT_PLACEHOLDER, 134 | "Environmental Impact/Hours used": CONTENT_PLACEHOLDER, 135 | "Environmental Impact/Cloud Provider": CONTENT_PLACEHOLDER, 136 | "Environmental Impact/Compute Region": CONTENT_PLACEHOLDER, 137 | "Environmental Impact/Carbon Emitted": CONTENT_PLACEHOLDER, 138 | 139 | "Technical Specifications [optional]": "", 140 | "Technical Specifications [optional]/Model Architecture and Objective": CONTENT_PLACEHOLDER, 141 | "Technical Specifications [optional]/Compute Infrastructure": CONTENT_PLACEHOLDER, 142 | "Technical Specifications [optional]/Compute Infrastructure/Hardware": CONTENT_PLACEHOLDER, 143 | "Technical Specifications [optional]/Compute Infrastructure/Software": CONTENT_PLACEHOLDER, 144 | 145 | "Citation [optional]": "", 146 | # If there is a paper or blog post introducing the model, the APA and Bibtex 147 | # information for that should go in this section. 148 | "Citation [optional]/BibTeX": CONTENT_PLACEHOLDER, 149 | "Citation [optional]/APA": CONTENT_PLACEHOLDER, 150 | 151 | "Glossary [optional]": "", 152 | # If relevant, include terms and calculations in this section that can help 153 | # readers understand the model or model card. 
154 | 155 | "More Information [optional]": CONTENT_PLACEHOLDER, 156 | "Model Card Authors [optional]": CONTENT_PLACEHOLDER, 157 | "Model Card Contact": CONTENT_PLACEHOLDER, 158 | "How to Get Started with the Model": f"""Use the code below to get started with the model. 159 | 160 |
161 | Click to expand 162 | 163 | {CONTENT_PLACEHOLDER} 164 | 165 |
""", 166 | } 167 | # fmt: on 168 | -------------------------------------------------------------------------------- /skops/card/tests/examples/clip-vit-large-patch14.md: -------------------------------------------------------------------------------- 1 | --- 2 | tags: 3 | - vision 4 | widget: 5 | - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png 6 | candidate_labels: playing music, playing sports 7 | example_title: Cat & Dog 8 | --- 9 | 10 | # Model Card: CLIP 11 | 12 | 13 | 14 | Disclaimer: The model card is taken and modified from the official CLIP repository, it can be found [here](https://github.com/openai/CLIP/blob/main/model-card.md). 15 | 16 | ## Model Details 17 | 18 | The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner. It was not developed for general model deployment - to deploy models like CLIP, researchers will first need to carefully study their capabilities in relation to the specific context they’re being deployed within. 19 | 20 | ### Model Date 21 | 22 | January 2021 23 | 24 | ### Model Type 25 | 26 | The base model uses a ViT-L/14 Transformer architecture as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss. 27 | 28 | The original implementation had two variants: one using a ResNet image encoder and the other using a Vision Transformer. This repository has the variant with the Vision Transformer. 29 | 30 | 31 | ### Documents 32 | 33 | - [Blog Post](https://openai.com/blog/clip/) 34 | - [CLIP Paper](https://arxiv.org/abs/2103.00020) 35 | 36 | 37 | ### Use with Transformers 38 | 39 | ```python 40 | from PIL import Image 41 | import requests 42 | 43 | from transformers import CLIPProcessor, CLIPModel 44 | 45 | model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14") 46 | processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14") 47 | 48 | url = "http://images.cocodataset.org/val2017/000000039769.jpg" 49 | image = Image.open(requests.get(url, stream=True).raw) 50 | 51 | inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) 52 | 53 | outputs = model(**inputs) 54 | logits_per_image = outputs.logits_per_image # this is the image-text similarity score 55 | probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities 56 | ``` 57 | 58 | 59 | ## Model Use 60 | 61 | ### Intended Use 62 | 63 | The model is intended as a research output for research communities. We hope that this model will enable researchers to better understand and explore zero-shot, arbitrary image classification. We also hope it can be used for interdisciplinary studies of the potential impact of such models - the CLIP paper includes a discussion of potential downstream impacts to provide an example for this sort of analysis. 64 | 65 | #### Primary intended uses 66 | 67 | The primary intended users of these models are AI researchers. 68 | 69 | We primarily imagine the model will be used by researchers to better understand robustness, generalization, and other capabilities, biases, and constraints of computer vision models. 
70 | 71 | ### Out-of-Scope Use Cases 72 | 73 | **Any** deployed use case of the model - whether commercial or not - is currently out of scope. Non-deployed use cases such as image search in a constrained environment, are also not recommended unless there is thorough in-domain testing of the model with a specific, fixed class taxonomy. This is because our safety assessment demonstrated a high need for task specific testing especially given the variability of CLIP’s performance with different class taxonomies. This makes untested and unconstrained deployment of the model in any use case currently potentially harmful. 74 | 75 | Certain use cases which would fall under the domain of surveillance and facial recognition are always out-of-scope regardless of performance of the model. This is because the use of artificial intelligence for tasks such as these can be premature currently given the lack of testing norms and checks to ensure its fair use. 76 | 77 | Since the model has not been purposefully trained in or evaluated on any languages other than English, its use should be limited to English language use cases. 78 | 79 | 80 | 81 | ## Data 82 | 83 | The model was trained on publicly available image-caption data. This was done through a combination of crawling a handful of websites and using commonly-used pre-existing image datasets such as [YFCC100M](http://projects.dfki.uni-kl.de/yfcc100m/). A large portion of the data comes from our crawling of the internet. This means that the data is more representative of people and societies most connected to the internet which tend to skew towards more developed nations, and younger, male users. 84 | 85 | ### Data Mission Statement 86 | 87 | Our goal with building this dataset was to test out robustness and generalizability in computer vision tasks. As a result, the focus was on gathering large quantities of data from different publicly-available internet data sources. The data was gathered in a mostly non-interventionist manner. However, we only crawled websites that had policies against excessively violent and adult images and allowed us to filter out such content. We do not intend for this dataset to be used as the basis for any commercial or deployed model and will not be releasing the dataset. 88 | 89 | 90 | 91 | ## Performance and Limitations 92 | 93 | ### Performance 94 | 95 | We have evaluated the performance of CLIP on a wide range of benchmarks across a variety of computer vision datasets such as OCR to texture recognition to fine-grained classification. The paper describes model performance on the following datasets: 96 | 97 | - Food101 98 | - CIFAR10 99 | - CIFAR100 100 | - Birdsnap 101 | - SUN397 102 | - Stanford Cars 103 | - FGVC Aircraft 104 | - VOC2007 105 | - DTD 106 | - Oxford-IIIT Pet dataset 107 | - Caltech101 108 | - Flowers102 109 | - MNIST 110 | - SVHN 111 | - IIIT5K 112 | - Hateful Memes 113 | - SST-2 114 | - UCF101 115 | - Kinetics700 116 | - Country211 117 | - CLEVR Counting 118 | - KITTI Distance 119 | - STL-10 120 | - RareAct 121 | - Flickr30 122 | - MSCOCO 123 | - ImageNet 124 | - ImageNet-A 125 | - ImageNet-R 126 | - ImageNet Sketch 127 | - ObjectNet (ImageNet Overlap) 128 | - Youtube-BB 129 | - ImageNet-Vid 130 | 131 | ## Limitations 132 | 133 | CLIP and our analysis of it have a number of limitations. CLIP currently struggles with respect to certain tasks such as fine grained classification and counting objects. 
CLIP also poses issues with regards to fairness and bias which we discuss in the paper and briefly in the next section. Additionally, our approach to testing CLIP also has an important limitation- in many cases we have used linear probes to evaluate the performance of CLIP and there is evidence suggesting that linear probes can underestimate model performance. 134 | 135 | ### Bias and Fairness 136 | 137 | We find that the performance of CLIP - and the specific biases it exhibits - can depend significantly on class design and the choices one makes for categories to include and exclude. We tested the risk of certain kinds of denigration with CLIP by classifying images of people from [Fairface](https://arxiv.org/abs/1908.04913) into crime-related and non-human animal categories. We found significant disparities with respect to race and gender. Additionally, we found that these disparities could shift based on how the classes were constructed. (Details captured in the Broader Impacts Section in the paper). 138 | 139 | We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. 140 | 141 | 142 | 143 | ## Feedback 144 | 145 | ### Where to send questions or comments about the model 146 | 147 | Please use [this Google Form](https://forms.gle/Uv7afRH5dvY34ZEs9) 148 | -------------------------------------------------------------------------------- /skops/card/tests/examples/gpt2.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: en 3 | tags: 4 | - exbert 5 | 6 | license: mit 7 | --- 8 | 9 | # GPT-2 10 | 11 | 12 | 13 | Test the whole generation capabilities here: https://transformer.huggingface.co/doc/gpt2-large 14 | 15 | Pretrained model on English language using a causal language modeling (CLM) objective. It was introduced in 16 | [this paper](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) 17 | and first released at [this page](https://openai.com/blog/better-language-models/). 18 | 19 | Disclaimer: The team releasing GPT-2 also wrote a 20 | [model card](https://github.com/openai/gpt-2/blob/master/model_card.md) for their model. Content from this model card 21 | has been written by the Hugging Face team to complete the information they provided and give specific examples of bias. 22 | 23 | ## Model description 24 | 25 | GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. This 26 | means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots 27 | of publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, 28 | it was trained to guess the next word in sentences. 
29 | 30 | More precisely, inputs are sequences of continuous text of a certain length and the targets are the same sequence, 31 | shifted one token (word or piece of word) to the right. The model uses internally a mask-mechanism to make sure the 32 | predictions for the token `i` only uses the inputs from `1` to `i` but not the future tokens. 33 | 34 | This way, the model learns an inner representation of the English language that can then be used to extract features 35 | useful for downstream tasks. The model is best at what it was pretrained for however, which is generating texts from a 36 | prompt. 37 | 38 | This is the **smallest** version of GPT-2, with 124M parameters. 39 | 40 | **Related Models:** [GPT-Large](https://huggingface.co/gpt2-large), [GPT-Medium](https://huggingface.co/gpt2-medium) and [GPT-XL](https://huggingface.co/gpt2-xl) 41 | 42 | ## Intended uses & limitations 43 | 44 | You can use the raw model for text generation or fine-tune it to a downstream task. See the 45 | [model hub](https://huggingface.co/models?filter=gpt2) to look for fine-tuned versions on a task that interests you. 46 | 47 | ### How to use 48 | 49 | You can use this model directly with a pipeline for text generation. Since the generation relies on some randomness, we 50 | set a seed for reproducibility: 51 | 52 | ```python 53 | >>> from transformers import pipeline, set_seed 54 | >>> generator = pipeline('text-generation', model='gpt2') 55 | >>> set_seed(42) 56 | >>> generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5) 57 | 58 | [{'generated_text': "Hello, I'm a language model, a language for thinking, a language for expressing thoughts."}, 59 | {'generated_text': "Hello, I'm a language model, a compiler, a compiler library, I just want to know how I build this kind of stuff. I don"}, 60 | {'generated_text': "Hello, I'm a language model, and also have more than a few of your own, but I understand that they're going to need some help"}, 61 | {'generated_text': "Hello, I'm a language model, a system model. I want to know my language so that it might be more interesting, more user-friendly"}, 62 | {'generated_text': 'Hello, I\'m a language model, not a language model"\n\nThe concept of "no-tricks" comes in handy later with new'}] 63 | ``` 64 | 65 | Here is how to use this model to get the features of a given text in PyTorch: 66 | 67 | ```python 68 | from transformers import GPT2Tokenizer, GPT2Model 69 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 70 | model = GPT2Model.from_pretrained('gpt2') 71 | text = "Replace me by any text you'd like." 72 | encoded_input = tokenizer(text, return_tensors='pt') 73 | output = model(**encoded_input) 74 | ``` 75 | 76 | and in TensorFlow: 77 | 78 | ```python 79 | from transformers import GPT2Tokenizer, TFGPT2Model 80 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 81 | model = TFGPT2Model.from_pretrained('gpt2') 82 | text = "Replace me by any text you'd like." 83 | encoded_input = tokenizer(text, return_tensors='tf') 84 | output = model(encoded_input) 85 | ``` 86 | 87 | ### Limitations and bias 88 | 89 | The training data used for this model has not been released as a dataset one can browse. We know it contains a lot of 90 | unfiltered content from the internet, which is far from neutral. 
As the openAI team themselves point out in their 91 | [model card](https://github.com/openai/gpt-2/blob/master/model_card.md#out-of-scope-use-cases): 92 | 93 | > Because large-scale language models like GPT-2 do not distinguish fact from fiction, we don’t support use-cases 94 | > that require the generated text to be true. 95 | > 96 | > Additionally, language models like GPT-2 reflect the biases inherent to the systems they were trained on, so we do 97 | > not recommend that they be deployed into systems that interact with humans > unless the deployers first carry out a 98 | > study of biases relevant to the intended use-case. We found no statistically significant difference in gender, race, 99 | > and religious bias probes between 774M and 1.5B, implying all versions of GPT-2 should be approached with similar 100 | > levels of caution around use cases that are sensitive to biases around human attributes. 101 | 102 | Here's an example of how the model can have biased predictions: 103 | 104 | ```python 105 | >>> from transformers import pipeline, set_seed 106 | >>> generator = pipeline('text-generation', model='gpt2') 107 | >>> set_seed(42) 108 | >>> generator("The White man worked as a", max_length=10, num_return_sequences=5) 109 | 110 | [{'generated_text': 'The White man worked as a mannequin for'}, 111 | {'generated_text': 'The White man worked as a maniser of the'}, 112 | {'generated_text': 'The White man worked as a bus conductor by day'}, 113 | {'generated_text': 'The White man worked as a plumber at the'}, 114 | {'generated_text': 'The White man worked as a journalist. He had'}] 115 | 116 | >>> set_seed(42) 117 | >>> generator("The Black man worked as a", max_length=10, num_return_sequences=5) 118 | 119 | [{'generated_text': 'The Black man worked as a man at a restaurant'}, 120 | {'generated_text': 'The Black man worked as a car salesman in a'}, 121 | {'generated_text': 'The Black man worked as a police sergeant at the'}, 122 | {'generated_text': 'The Black man worked as a man-eating monster'}, 123 | {'generated_text': 'The Black man worked as a slave, and was'}] 124 | ``` 125 | 126 | This bias will also affect all fine-tuned versions of this model. 127 | 128 | ## Training data 129 | 130 | The OpenAI team wanted to train this model on a corpus as large as possible. To build it, they scraped all the web 131 | pages from outbound links on Reddit which received at least 3 karma. Note that all Wikipedia pages were removed from 132 | this dataset, so the model was not trained on any part of Wikipedia. The resulting dataset (called WebText) weights 133 | 40GB of texts but has not been publicly released. You can find a list of the top 1,000 domains present in WebText 134 | [here](https://github.com/openai/gpt-2/blob/master/domains.txt). 135 | 136 | ## Training procedure 137 | 138 | ### Preprocessing 139 | 140 | The texts are tokenized using a byte-level version of Byte Pair Encoding (BPE) (for unicode characters) and a 141 | vocabulary size of 50,257. The inputs are sequences of 1024 consecutive tokens. 142 | 143 | The larger model was trained on 256 cloud TPU v3 cores. The training duration was not disclosed, nor were the exact 144 | details of training. 
145 | 146 | ## Evaluation results 147 | 148 | The model achieves the following results without any fine-tuning (zero-shot): 149 | 150 | | Dataset | LAMBADA | LAMBADA | CBT-CN | CBT-NE | WikiText2 | PTB | enwiki8 | text8 | WikiText103 | 1BW | 151 | |:--------:|:-------:|:-------:|:------:|:------:|:---------:|:------:|:-------:|:------:|:-----------:|:-----:| 152 | | (metric) | (PPL) | (ACC) | (ACC) | (ACC) | (PPL) | (PPL) | (BPB) | (BPC) | (PPL) | (PPL) | 153 | | | 35.13 | 45.99 | 87.65 | 83.4 | 29.41 | 65.85 | 1.16 | 1,17 | 37.50 | 75.20 | 154 | 155 | 156 | ### BibTeX entry and citation info 157 | 158 | ```bibtex 159 | @article{radford2019language, 160 | title={Language Models are Unsupervised Multitask Learners}, 161 | author={Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya}, 162 | year={2019} 163 | } 164 | ``` 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /skops/io/_sklearn.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Callable, Sequence, Type 4 | 5 | from sklearn.cluster import Birch 6 | 7 | try: 8 | # TODO: remove once support for sklearn<1.2 is dropped. See #187 9 | from sklearn.covariance._graph_lasso import _DictWithDeprecatedKeys 10 | except ImportError: 11 | _DictWithDeprecatedKeys = None 12 | from sklearn.linear_model._sgd_fast import ( 13 | EpsilonInsensitive, 14 | Hinge, 15 | Huber, 16 | Log, 17 | LossFunction, 18 | ModifiedHuber, 19 | SquaredEpsilonInsensitive, 20 | SquaredHinge, 21 | SquaredLoss, 22 | ) 23 | from sklearn.tree._tree import Tree 24 | 25 | from ._audit import Node, get_tree 26 | from ._general import unsupported_get_state 27 | from ._utils import LoadContext, SaveContext, get_module, get_state, gettype 28 | from .exceptions import UnsupportedTypeException 29 | 30 | ALLOWED_SGD_LOSSES = { 31 | ModifiedHuber, 32 | Hinge, 33 | SquaredHinge, 34 | Log, 35 | SquaredLoss, 36 | Huber, 37 | EpsilonInsensitive, 38 | SquaredEpsilonInsensitive, 39 | } 40 | 41 | UNSUPPORTED_TYPES = {Birch} 42 | 43 | 44 | def reduce_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 45 | # This method is for objects for which we have to use the __reduce__ 46 | # method to get the state. 47 | res = { 48 | "__class__": obj.__class__.__name__, 49 | "__module__": get_module(type(obj)), 50 | } 51 | 52 | # We get the output of __reduce__ and use it to reconstruct the object. 53 | # For security reasons, we don't save the constructor object returned by 54 | # __reduce__, and instead use the pre-defined constructor for the object 55 | # that we know. This avoids having a function such as `eval()` as the 56 | # "constructor", abused by attackers. 57 | # 58 | # We can/should also look into removing __reduce__ from scikit-learn, 59 | # and that is not impossible. Most objects which use this don't really 60 | # need it. 61 | # 62 | # More info on __reduce__: 63 | # https://docs.python.org/3/library/pickle.html#object.__reduce__ 64 | # 65 | # As a good example, this makes Tree object to be serializable. 66 | reduce = obj.__reduce__() 67 | res["__reduce__"] = {} 68 | res["__reduce__"]["args"] = get_state(reduce[1], save_context) 69 | 70 | if len(reduce) == 3: 71 | # reduce includes what's needed for __getstate__ and we don't need to 72 | # call __getstate__ directly. 
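# (Note: per the pickle protocol, __reduce__ returns either a 2-tuple of
# (constructor, args) or a longer tuple whose third element is the state that
# __getstate__/__setstate__ would otherwise handle, which is why reduce[2] is
# used directly on the next line.)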
73 | attrs = reduce[2] 74 | elif hasattr(obj, "__getstate__"): 75 | # since python311 __getstate__ is defined for `object` and might return 76 | # None 77 | attrs = obj.__getstate__() or {} 78 | elif hasattr(obj, "__dict__"): 79 | attrs = obj.__dict__ 80 | else: 81 | attrs = {} 82 | 83 | if not isinstance(attrs, (dict, tuple)): 84 | raise UnsupportedTypeException( 85 | f"Objects of type {res['__class__']} not supported yet" 86 | ) 87 | 88 | res["content"] = get_state(attrs, save_context) 89 | return res 90 | 91 | 92 | class ReduceNode(Node): 93 | def __init__( 94 | self, 95 | state: dict[str, Any], 96 | load_context: LoadContext, 97 | constructor: Type[Any] | Callable[..., Any], 98 | trusted: bool | Sequence[str] = False, 99 | ) -> None: 100 | super().__init__(state, load_context, trusted) 101 | reduce = state["__reduce__"] 102 | self.children = { 103 | "attrs": get_tree(state["content"], load_context), 104 | "args": get_tree(reduce["args"], load_context), 105 | "constructor": constructor, 106 | } 107 | 108 | def _construct(self): 109 | args = self.children["args"].construct() 110 | constructor = self.children["constructor"] 111 | instance = constructor(*args) 112 | attrs = self.children["attrs"].construct() 113 | if not attrs: 114 | # nothing more to do 115 | return instance 116 | 117 | if isinstance(args, tuple) and not hasattr(instance, "__setstate__"): 118 | raise UnsupportedTypeException( 119 | f"Objects of type {constructor} are not supported yet" 120 | ) 121 | 122 | if hasattr(instance, "__setstate__"): 123 | instance.__setstate__(attrs) 124 | elif isinstance(attrs, dict): 125 | instance.__dict__.update(attrs) 126 | else: 127 | # we (probably) got tuple attrs but cannot setstate with them 128 | raise UnsupportedTypeException( 129 | f"Objects of type {constructor} are not supported yet" 130 | ) 131 | 132 | return instance 133 | 134 | 135 | def tree_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 136 | state = reduce_get_state(obj, save_context) 137 | state["__loader__"] = "TreeNode" 138 | return state 139 | 140 | 141 | class TreeNode(ReduceNode): 142 | def __init__( 143 | self, 144 | state: dict[str, Any], 145 | load_context: LoadContext, 146 | trusted: bool | Sequence[str] = False, 147 | ) -> None: 148 | super().__init__(state, load_context, constructor=Tree, trusted=trusted) 149 | self.trusted = self._get_trusted(trusted, [get_module(Tree) + ".Tree"]) 150 | 151 | 152 | def sgd_loss_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 153 | state = reduce_get_state(obj, save_context) 154 | state["__loader__"] = "SGDNode" 155 | return state 156 | 157 | 158 | class SGDNode(ReduceNode): 159 | def __init__( 160 | self, 161 | state: dict[str, Any], 162 | load_context: LoadContext, 163 | trusted: bool | Sequence[str] = False, 164 | ) -> None: 165 | # TODO: make sure trusted here makes sense and used. 166 | super().__init__( 167 | state, 168 | load_context, 169 | constructor=gettype(state["__module__"], state["__class__"]), 170 | trusted=False, 171 | ) 172 | self.trusted = self._get_trusted( 173 | trusted, [get_module(x) + "." + x.__name__ for x in ALLOWED_SGD_LOSSES] 174 | ) 175 | 176 | 177 | # TODO: remove once support for sklearn<1.2 is dropped. 
178 | def _DictWithDeprecatedKeys_get_state( 179 | obj: Any, save_context: SaveContext 180 | ) -> dict[str, Any]: 181 | res = { 182 | "__class__": obj.__class__.__name__, 183 | "__module__": get_module(type(obj)), 184 | "__loader__": "_DictWithDeprecatedKeysNode", 185 | } 186 | content = {} 187 | # explicitly pass a dict object instead of _DictWithDeprecatedKeys and 188 | # later construct a _DictWithDeprecatedKeys object. 189 | content["main"] = get_state(dict(obj), save_context) 190 | content["_deprecated_key_to_new_key"] = get_state( 191 | obj._deprecated_key_to_new_key, save_context 192 | ) 193 | res["content"] = content 194 | return res 195 | 196 | 197 | # TODO: remove once support for sklearn<1.2 is dropped. 198 | class _DictWithDeprecatedKeysNode(Node): 199 | # _DictWithDeprecatedKeys is just a wrapper for dict 200 | def __init__( 201 | self, 202 | state: dict[str, Any], 203 | load_context: LoadContext, 204 | trusted: bool | Sequence[str] = False, 205 | ) -> None: 206 | super().__init__(state, load_context, trusted) 207 | self.trusted = [ 208 | get_module(_DictWithDeprecatedKeysNode) + "._DictWithDeprecatedKeys" 209 | ] 210 | self.children = { 211 | "main": get_tree(state["content"]["main"], load_context), 212 | "_deprecated_key_to_new_key": get_tree( 213 | state["content"]["_deprecated_key_to_new_key"], load_context 214 | ), 215 | } 216 | 217 | def _construct(self): 218 | instance = _DictWithDeprecatedKeys(**self.children["main"].construct()) 219 | instance._deprecated_key_to_new_key = self.children[ 220 | "_deprecated_key_to_new_key" 221 | ].construct() 222 | return instance 223 | 224 | 225 | # tuples of type and function that gets the state of that type 226 | GET_STATE_DISPATCH_FUNCTIONS = [ 227 | (LossFunction, sgd_loss_get_state), 228 | (Tree, tree_get_state), 229 | ] 230 | for type_ in UNSUPPORTED_TYPES: 231 | GET_STATE_DISPATCH_FUNCTIONS.append((type_, unsupported_get_state)) 232 | 233 | # tuples of type and function that creates the instance of that type 234 | NODE_TYPE_MAPPING = { 235 | "SGDNode": SGDNode, 236 | "TreeNode": TreeNode, 237 | } 238 | 239 | # TODO: remove once support for sklearn<1.2 is dropped. 240 | # Starting from sklearn 1.2, _DictWithDeprecatedKeys is removed as it's no 241 | # longer needed for GraphicalLassoCV, see #187. 242 | if _DictWithDeprecatedKeys is not None: 243 | GET_STATE_DISPATCH_FUNCTIONS.append( 244 | (_DictWithDeprecatedKeys, _DictWithDeprecatedKeys_get_state) 245 | ) 246 | NODE_TYPE_MAPPING[ 247 | "_DictWithDeprecatedKeysNode" 248 | ] = _DictWithDeprecatedKeysNode # type: ignore 249 | -------------------------------------------------------------------------------- /skops/io/_numpy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import io 4 | from typing import Any, Sequence 5 | 6 | import numpy as np 7 | 8 | from ._audit import Node, get_tree 9 | from ._utils import LoadContext, SaveContext, get_module, get_state, gettype 10 | from .exceptions import UnsupportedTypeException 11 | 12 | 13 | def ndarray_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 14 | res = { 15 | "__class__": obj.__class__.__name__, 16 | "__module__": get_module(type(obj)), 17 | "__loader__": "NdArrayNode", 18 | } 19 | 20 | try: 21 | # If the dtype is object, np.save should not work with 22 | # allow_pickle=False, therefore we convert them to a list and 23 | # recursively call get_state on it. 
24 | if obj.dtype == object: 25 | obj_serialized = get_state(obj.tolist(), save_context) 26 | res["content"] = obj_serialized["content"] 27 | res["type"] = "json" 28 | res["shape"] = get_state(obj.shape, save_context) 29 | else: 30 | data_buffer = io.BytesIO() 31 | np.save(data_buffer, obj, allow_pickle=False) 32 | # Memoize the object and then check if it's file name (containing 33 | # the object id) already exists. If it does, there is no need to 34 | # save the object again. Memoizitation is necessary since for 35 | # ephemeral objects, the same id might otherwise be reused. 36 | obj_id = save_context.memoize(obj) 37 | f_name = f"{obj_id}.npy" 38 | if f_name not in save_context.zip_file.namelist(): 39 | save_context.zip_file.writestr(f_name, data_buffer.getbuffer()) 40 | res.update(type="numpy", file=f_name) 41 | except ValueError: 42 | # Couldn't save the numpy array with either method 43 | raise UnsupportedTypeException( 44 | f"numpy arrays of dtype {obj.dtype} are not supported yet, please " 45 | "open an issue at https://github.com/skops-dev/skops/issues and " 46 | "report your error" 47 | ) 48 | 49 | return res 50 | 51 | 52 | class NdArrayNode(Node): 53 | def __init__( 54 | self, 55 | state: dict[str, Any], 56 | load_context: LoadContext, 57 | trusted: bool | Sequence[str] = False, 58 | ) -> None: 59 | super().__init__(state, load_context, trusted) 60 | self.type = state["type"] 61 | self.trusted = self._get_trusted(trusted, [np.ndarray]) 62 | if self.type == "numpy": 63 | self.children = { 64 | "content": io.BytesIO(load_context.src.read(state["file"])) 65 | } 66 | elif self.type == "json": 67 | self.children = { 68 | "content": [ # type: ignore 69 | get_tree(o, load_context) for o in state["content"] # type: ignore 70 | ], 71 | "shape": get_tree(state["shape"], load_context), 72 | } 73 | else: 74 | raise ValueError(f"Unknown type {self.type}.") 75 | 76 | def _construct(self): 77 | # Dealing with a regular numpy array, where dtype != object 78 | if self.type == "numpy": 79 | content = np.load(self.children["content"], allow_pickle=False) 80 | if f"{self.module_name}.{self.class_name}" != "numpy.ndarray": 81 | content = gettype(self.module_name, self.class_name)(content) 82 | return content 83 | 84 | if self.type == "json": 85 | # We explicitly set the dtype to "O" since we only save object 86 | # arrays in json. 87 | shape = self.children["shape"].construct() 88 | tmp = [o.construct() for o in self.children["content"]] 89 | 90 | # TODO: this is a hack to get the correct shape of the array. We 91 | # should find _a better way_ to do this. 
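# For 1-d arrays we pre-allocate an empty object array and fill it element by
# element; passing the list straight to np.array(..., dtype="O") could
# otherwise stack equal-length element sequences into an extra dimension
# instead of keeping them as individual objects.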
92 | if len(shape) == 1: 93 | content = np.ndarray(shape=len(tmp), dtype="O") 94 | for i, v in enumerate(tmp): 95 | content[i] = v 96 | else: 97 | content = np.array(tmp, dtype="O") 98 | 99 | return content 100 | 101 | raise ValueError(f"Unknown type for a numpy object: {self.type}.") 102 | 103 | 104 | def maskedarray_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 105 | res = { 106 | "__class__": obj.__class__.__name__, 107 | "__module__": get_module(type(obj)), 108 | "__loader__": "MaskedArrayNode", 109 | "content": { 110 | "data": get_state(obj.data, save_context), 111 | "mask": get_state(obj.mask, save_context), 112 | }, 113 | } 114 | return res 115 | 116 | 117 | class MaskedArrayNode(Node): 118 | def __init__( 119 | self, 120 | state: dict[str, Any], 121 | load_context: LoadContext, 122 | trusted: bool | Sequence[str] = False, 123 | ) -> None: 124 | super().__init__(state, load_context, trusted) 125 | self.trusted = self._get_trusted(trusted, [np.ma.MaskedArray]) 126 | self.children = { 127 | "data": get_tree(state["content"]["data"], load_context), 128 | "mask": get_tree(state["content"]["mask"], load_context), 129 | } 130 | 131 | def _construct(self): 132 | data = self.children["data"].construct() 133 | mask = self.children["mask"].construct() 134 | return np.ma.MaskedArray(data, mask) 135 | 136 | 137 | def random_state_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 138 | content = get_state(obj.get_state(legacy=False), save_context) 139 | res = { 140 | "__class__": obj.__class__.__name__, 141 | "__module__": get_module(type(obj)), 142 | "__loader__": "RandomStateNode", 143 | "content": content, 144 | } 145 | return res 146 | 147 | 148 | class RandomStateNode(Node): 149 | def __init__( 150 | self, 151 | state: dict[str, Any], 152 | load_context: LoadContext, 153 | trusted: bool | Sequence[str] = False, 154 | ) -> None: 155 | super().__init__(state, load_context, trusted) 156 | self.children = {"content": get_tree(state["content"], load_context)} 157 | self.trusted = self._get_trusted(trusted, [np.random.RandomState]) 158 | 159 | def _construct(self): 160 | random_state = gettype(self.module_name, self.class_name)() 161 | random_state.set_state(self.children["content"].construct()) 162 | return random_state 163 | 164 | 165 | def random_generator_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 166 | bit_generator_state = obj.bit_generator.state 167 | res = { 168 | "__class__": obj.__class__.__name__, 169 | "__module__": get_module(type(obj)), 170 | "__loader__": "RandomGeneratorNode", 171 | "content": {"bit_generator": bit_generator_state}, 172 | } 173 | return res 174 | 175 | 176 | class RandomGeneratorNode(Node): 177 | def __init__( 178 | self, 179 | state: dict[str, Any], 180 | load_context: LoadContext, 181 | trusted: bool | Sequence[str] = False, 182 | ) -> None: 183 | super().__init__(state, load_context, trusted) 184 | self.children = {"bit_generator_state": state["content"]["bit_generator"]} 185 | self.trusted = self._get_trusted(trusted, [np.random.Generator]) 186 | 187 | def _construct(self): 188 | # first restore the state of the bit generator 189 | bit_generator = gettype( 190 | "numpy.random", self.children["bit_generator_state"]["bit_generator"] 191 | )() 192 | bit_generator.state = self.children["bit_generator_state"] 193 | 194 | # next create the generator instance 195 | return gettype(self.module_name, self.class_name)(bit_generator=bit_generator) 196 | 197 | 198 | # For numpy.ufunc we need to get the type from 
the type's module, but for other 199 | # functions we get it from objet's module directly. Therefore sett a especial 200 | # get_state method for them here. The load is the same as other functions. 201 | def ufunc_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 202 | res = { 203 | "__class__": obj.__class__.__name__, # ufunc 204 | "__module__": get_module(type(obj)), # numpy 205 | "__loader__": "FunctionNode", 206 | "content": { 207 | "module_path": get_module(obj), 208 | "function": obj.__name__, 209 | }, 210 | } 211 | return res 212 | 213 | 214 | def dtype_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 215 | # we use numpy's internal save mechanism to store the dtype by 216 | # saving/loading an empty array with that dtype. 217 | tmp: np.typing.NDArray = np.ndarray(0, dtype=obj) 218 | res = { 219 | "__class__": "dtype", 220 | "__module__": "numpy", 221 | "__loader__": "DTypeNode", 222 | "content": get_state(tmp, save_context), 223 | } 224 | return res 225 | 226 | 227 | class DTypeNode(Node): 228 | def __init__( 229 | self, 230 | state: dict[str, Any], 231 | load_context: LoadContext, 232 | trusted: bool | Sequence[str] = False, 233 | ) -> None: 234 | super().__init__(state, load_context, trusted) 235 | self.children = {"content": get_tree(state["content"], load_context)} 236 | # TODO: what should we trust? 237 | self.trusted = self._get_trusted(trusted, []) 238 | 239 | def _construct(self): 240 | # we use numpy's internal save mechanism to store the dtype by 241 | # saving/loading an empty array with that dtype. 242 | return self.children["content"].construct().dtype 243 | 244 | 245 | # tuples of type and function that gets the state of that type 246 | GET_STATE_DISPATCH_FUNCTIONS = [ 247 | (np.generic, ndarray_get_state), 248 | (np.ndarray, ndarray_get_state), 249 | (np.ma.MaskedArray, maskedarray_get_state), 250 | (np.ufunc, ufunc_get_state), 251 | (np.dtype, dtype_get_state), 252 | (np.random.RandomState, random_state_get_state), 253 | (np.random.Generator, random_generator_get_state), 254 | ] 255 | # tuples of type and function that creates the instance of that type 256 | NODE_TYPE_MAPPING = { 257 | "NdArrayNode": NdArrayNode, 258 | "MaskedArrayNode": MaskedArrayNode, 259 | "DTypeNode": DTypeNode, 260 | "RandomStateNode": RandomStateNode, 261 | "RandomGeneratorNode": RandomGeneratorNode, 262 | } 263 | -------------------------------------------------------------------------------- /skops/card/tests/examples/bert-base-uncased.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: en 3 | tags: 4 | - exbert 5 | license: apache-2.0 6 | datasets: 7 | - bookcorpus 8 | - wikipedia 9 | --- 10 | 11 | # BERT base model (uncased) 12 | 13 | 14 | 15 | Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in 16 | [this paper](https://arxiv.org/abs/1810.04805) and first released in 17 | [this repository](https://github.com/google-research/bert). This model is uncased: it does not make a difference 18 | between english and English. 19 | 20 | Disclaimer: The team releasing BERT did not write a model card for this model so this model card has been written by 21 | the Hugging Face team. 22 | 23 | ## Model description 24 | 25 | BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. 
This means it 26 | was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of 27 | publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it 28 | was pretrained with two objectives: 29 | 30 | - Masked language modeling (MLM): taking a sentence, the model randomly masks 15% of the words in the input then run 31 | the entire masked sentence through the model and has to predict the masked words. This is different from traditional 32 | recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like 33 | GPT which internally masks the future tokens. It allows the model to learn a bidirectional representation of the 34 | sentence. 35 | - Next sentence prediction (NSP): the models concatenates two masked sentences as inputs during pretraining. Sometimes 36 | they correspond to sentences that were next to each other in the original text, sometimes not. The model then has to 37 | predict if the two sentences were following each other or not. 38 | 39 | This way, the model learns an inner representation of the English language that can then be used to extract features 40 | useful for downstream tasks: if you have a dataset of labeled sentences, for instance, you can train a standard 41 | classifier using the features produced by the BERT model as inputs. 42 | 43 | ## Model variations 44 | 45 | BERT has originally been released in base and large variations, for cased and uncased input text. The uncased models also strips out an accent markers. 46 | Chinese and multilingual uncased and cased versions followed shortly after. 47 | Modified preprocessing with whole word masking has replaced subpiece masking in a following work, with the release of two models. 48 | Other 24 smaller models are released afterward. 49 | 50 | The detailed release history can be found on the [google-research/bert readme](https://github.com/google-research/bert/blob/master/README.md) on github. 51 | 52 | | Model | #params | Language | 53 | |------------------------|--------------------------------|-------| 54 | | [`bert-base-uncased`](https://huggingface.co/bert-base-uncased) | 110M | English | 55 | | [`bert-large-uncased`](https://huggingface.co/bert-large-uncased) | 340M | English | sub 56 | | [`bert-base-cased`](https://huggingface.co/bert-base-cased) | 110M | English | 57 | | [`bert-large-cased`](https://huggingface.co/bert-large-cased) | 340M | English | 58 | | [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 110M | Chinese | 59 | | [`bert-base-multilingual-cased`](https://huggingface.co/bert-base-multilingual-cased) | 110M | Multiple | 60 | | [`bert-large-uncased-whole-word-masking`](https://huggingface.co/bert-large-uncased-whole-word-masking) | 340M | English | 61 | | [`bert-large-cased-whole-word-masking`](https://huggingface.co/bert-large-cased-whole-word-masking) | 340M | English | 62 | 63 | ## Intended uses & limitations 64 | 65 | You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to 66 | be fine-tuned on a downstream task. See the [model hub](https://huggingface.co/models?filter=bert) to look for 67 | fine-tuned versions of a task that interests you. 68 | 69 | Note that this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked) 70 | to make decisions, such as sequence classification, token classification or question answering. 
For tasks such as text 71 | generation you should look at model like GPT2. 72 | 73 | ### How to use 74 | 75 | You can use this model directly with a pipeline for masked language modeling: 76 | 77 | ```python 78 | >>> from transformers import pipeline 79 | >>> unmasker = pipeline('fill-mask', model='bert-base-uncased') 80 | >>> unmasker("Hello I'm a [MASK] model.") 81 | [{'sequence': "[CLS] hello i'm a fashion model. [SEP]", 82 | 'score': 0.1073106899857521, 83 | 'token': 4827, 84 | 'token_str': 'fashion'}, 85 | {'sequence': "[CLS] hello i'm a role model. [SEP]", 86 | 'score': 0.08774490654468536, 87 | 'token': 2535, 88 | 'token_str': 'role'}, 89 | {'sequence': "[CLS] hello i'm a new model. [SEP]", 90 | 'score': 0.05338378623127937, 91 | 'token': 2047, 92 | 'token_str': 'new'}, 93 | {'sequence': "[CLS] hello i'm a super model. [SEP]", 94 | 'score': 0.04667217284440994, 95 | 'token': 3565, 96 | 'token_str': 'super'}, 97 | {'sequence': "[CLS] hello i'm a fine model. [SEP]", 98 | 'score': 0.027095865458250046, 99 | 'token': 2986, 100 | 'token_str': 'fine'}] 101 | ``` 102 | 103 | Here is how to use this model to get the features of a given text in PyTorch: 104 | 105 | ```python 106 | from transformers import BertTokenizer, BertModel 107 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 108 | model = BertModel.from_pretrained("bert-base-uncased") 109 | text = "Replace me by any text you'd like." 110 | encoded_input = tokenizer(text, return_tensors='pt') 111 | output = model(**encoded_input) 112 | ``` 113 | 114 | and in TensorFlow: 115 | 116 | ```python 117 | from transformers import BertTokenizer, TFBertModel 118 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 119 | model = TFBertModel.from_pretrained("bert-base-uncased") 120 | text = "Replace me by any text you'd like." 121 | encoded_input = tokenizer(text, return_tensors='tf') 122 | output = model(encoded_input) 123 | ``` 124 | 125 | ### Limitations and bias 126 | 127 | Even if the training data used for this model could be characterized as fairly neutral, this model can have biased 128 | predictions: 129 | 130 | ```python 131 | >>> from transformers import pipeline 132 | >>> unmasker = pipeline('fill-mask', model='bert-base-uncased') 133 | >>> unmasker("The man worked as a [MASK].") 134 | [{'sequence': '[CLS] the man worked as a carpenter. [SEP]', 135 | 'score': 0.09747550636529922, 136 | 'token': 10533, 137 | 'token_str': 'carpenter'}, 138 | {'sequence': '[CLS] the man worked as a waiter. [SEP]', 139 | 'score': 0.0523831807076931, 140 | 'token': 15610, 141 | 'token_str': 'waiter'}, 142 | {'sequence': '[CLS] the man worked as a barber. [SEP]', 143 | 'score': 0.04962705448269844, 144 | 'token': 13362, 145 | 'token_str': 'barber'}, 146 | {'sequence': '[CLS] the man worked as a mechanic. [SEP]', 147 | 'score': 0.03788609802722931, 148 | 'token': 15893, 149 | 'token_str': 'mechanic'}, 150 | {'sequence': '[CLS] the man worked as a salesman. [SEP]', 151 | 'score': 0.037680890411138535, 152 | 'token': 18968, 153 | 'token_str': 'salesman'}] 154 | >>> unmasker("The woman worked as a [MASK].") 155 | [{'sequence': '[CLS] the woman worked as a nurse. [SEP]', 156 | 'score': 0.21981462836265564, 157 | 'token': 6821, 158 | 'token_str': 'nurse'}, 159 | {'sequence': '[CLS] the woman worked as a waitress. [SEP]', 160 | 'score': 0.1597415804862976, 161 | 'token': 13877, 162 | 'token_str': 'waitress'}, 163 | {'sequence': '[CLS] the woman worked as a maid. 
[SEP]', 164 | 'score': 0.1154729500412941, 165 | 'token': 10850, 166 | 'token_str': 'maid'}, 167 | {'sequence': '[CLS] the woman worked as a prostitute. [SEP]', 168 | 'score': 0.037968918681144714, 169 | 'token': 19215, 170 | 'token_str': 'prostitute'}, 171 | {'sequence': '[CLS] the woman worked as a cook. [SEP]', 172 | 'score': 0.03042375110089779, 173 | 'token': 5660, 174 | 'token_str': 'cook'}] 175 | ``` 176 | 177 | This bias will also affect all fine-tuned versions of this model. 178 | 179 | ## Training data 180 | 181 | The BERT model was pretrained on [BookCorpus](https://yknzhu.wixsite.com/mbweb), a dataset consisting of 11,038 182 | unpublished books and [English Wikipedia](https://en.wikipedia.org/wiki/English_Wikipedia) (excluding lists, tables and 183 | headers). 184 | 185 | ## Training procedure 186 | 187 | ### Preprocessing 188 | 189 | The texts are lowercased and tokenized using WordPiece and a vocabulary size of 30,000. The inputs of the model are 190 | then of the form: 191 | 192 | ``` 193 | [CLS] Sentence A [SEP] Sentence B [SEP] 194 | ``` 195 | 196 | With probability 0.5, sentence A and sentence B correspond to two consecutive sentences in the original corpus, and in 197 | the other cases, it's another random sentence in the corpus. Note that what is considered a sentence here is a 198 | consecutive span of text usually longer than a single sentence. The only constrain is that the result with the two 199 | "sentences" has a combined length of less than 512 tokens. 200 | 201 | The details of the masking procedure for each sentence are the following: 202 | - 15% of the tokens are masked. 203 | - In 80% of the cases, the masked tokens are replaced by `[MASK]`. 204 | - In 10% of the cases, the masked tokens are replaced by a random token (different) from the one they replace. 205 | - In the 10% remaining cases, the masked tokens are left as is. 206 | 207 | ### Pretraining 208 | 209 | The model was trained on 4 cloud TPUs in Pod configuration (16 TPU chips total) for one million steps with a batch size 210 | of 256. The sequence length was limited to 128 tokens for 90% of the steps and 512 for the remaining 10%. The optimizer 211 | used is Adam with a learning rate of 1e-4, \\(\beta_{1} = 0.9\\) and \\(\beta_{2} = 0.999\\), a weight decay of 0.01, 212 | learning rate warmup for 10,000 steps and linear decay of the learning rate after. 
213 | 214 | ## Evaluation results 215 | 216 | When fine-tuned on downstream tasks, this model achieves the following results: 217 | 218 | Glue test results: 219 | 220 | | Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | 221 | |:----:|:-----------:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:-------:| 222 | | | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | 223 | 224 | 225 | ### BibTeX entry and citation info 226 | 227 | ```bibtex 228 | @article{DBLP:journals/corr/abs-1810-04805, 229 | author = {Jacob Devlin and 230 | Ming{-}Wei Chang and 231 | Kenton Lee and 232 | Kristina Toutanova}, 233 | title = {{BERT:} Pre-training of Deep Bidirectional Transformers for Language 234 | Understanding}, 235 | journal = {CoRR}, 236 | volume = {abs/1810.04805}, 237 | year = {2018}, 238 | url = {http://arxiv.org/abs/1810.04805}, 239 | archivePrefix = {arXiv}, 240 | eprint = {1810.04805}, 241 | timestamp = {Tue, 30 Oct 2018 20:39:56 +0100}, 242 | biburl = {https://dblp.org/rec/journals/corr/abs-1810-04805.bib}, 243 | bibsource = {dblp computer science bibliography, https://dblp.org} 244 | } 245 | ``` 246 | 247 | 248 | 249 | 250 | --------------------------------------------------------------------------------