├── skops ├── cli │ ├── __init__.py │ ├── _utils.py │ ├── tests │ │ ├── test_entrypoint.py │ │ └── test_convert.py │ ├── entrypoint.py │ └── _convert.py ├── utils │ └── __init__.py ├── hub_utils │ ├── tests │ │ └── common.py │ └── __init__.py ├── io │ ├── __init__.py │ ├── _trusted_types.py │ ├── exceptions.py │ ├── _scipy.py │ ├── tests │ │ ├── test_utils.py │ │ ├── test_audit.py │ │ └── _utils.py │ ├── _utils.py │ ├── _persist.py │ ├── _sklearn.py │ └── _numpy.py ├── card │ ├── __init__.py │ ├── tests │ │ ├── examples │ │ │ ├── specter.md.diff │ │ │ ├── vit-base-patch32-224-in21k.md.diff │ │ │ ├── specter.md │ │ │ ├── gpt2.md.diff │ │ │ ├── clip-vit-large-patch14.md.diff │ │ │ ├── toy-example.md.diff │ │ │ ├── bert-base-uncased.md.diff │ │ │ ├── toy-example.md │ │ │ ├── vit-base-patch32-224-in21k.md │ │ │ ├── clip-vit-large-patch14.md │ │ │ ├── gpt2.md │ │ │ └── bert-base-uncased.md │ │ └── test_parser.py │ ├── default_template.md │ └── _templates.py ├── conftest.py ├── __init__.py └── _min_dependencies.py ├── docs ├── requirements.txt ├── images │ └── logo.png ├── _authors.rst ├── installation.rst ├── modules │ └── classes.rst ├── Makefile ├── make.bat ├── community.rst ├── index.rst ├── changes.rst ├── conf.py ├── model_card.rst ├── hf_hub.rst └── persistence.rst ├── setup.cfg ├── MANIFEST.in ├── examples ├── README.rst ├── plot_hf_hub.py ├── plot_text_classification.py └── plot_model_card.py ├── Makefile ├── .codecov.yml ├── .readthedocs.yml ├── .github ├── dependabot.yml └── workflows │ ├── clean-skops-user.yml │ ├── PULL_REQUEST_TEMPLATE.md │ ├── publish-pypi.yml │ └── build-test.yml ├── .pre-commit-config.yaml ├── LICENSE ├── scripts └── clean_skops.py ├── pyproject.toml ├── .gitignore ├── setup.py ├── README.rst └── CONTRIBUTING.rst /skops/cli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /skops/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | -e .[docs] 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 99 3 | enable-extensions = C, G 4 | -------------------------------------------------------------------------------- /docs/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/merveenoyan/skops/main/docs/images/logo.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.rst 3 | include skops/card/default_template.md 4 | -------------------------------------------------------------------------------- /examples/README.rst: -------------------------------------------------------------------------------- 1 | skops Gallery 2 | ============= 3 | 4 | Here are the examples to use this library. 
 5 | --------------------------------------------------------------------------------
/skops/hub_utils/tests/common.py:
--------------------------------------------------------------------------------
 1 | # This is the token for the skops user on the hub, used for the CI.
 2 | HF_HUB_TOKEN = "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"
 3 | --------------------------------------------------------------------------------
/skops/io/__init__.py:
--------------------------------------------------------------------------------
 1 | from ._persist import dump, dumps, get_untrusted_types, load, loads
 2 |
 3 | __all__ = ["dumps", "load", "loads", "dump", "get_untrusted_types"]
 4 | --------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # A makefile to simplify repetitive steps
 2 |
 3 | package:
 4 | 	python setup.py bdist_wheel
 5 | 	python setup.py sdist
 6 |
 7 | pypi-upload:
 8 | 	twine upload --verbose dist/*
 9 | --------------------------------------------------------------------------------
/skops/card/__init__.py:
--------------------------------------------------------------------------------
 1 | from ._model_card import Card, metadata_from_config
 2 | from ._parser import parse_modelcard
 3 |
 4 | __all__ = ["Card", "metadata_from_config", "parse_modelcard"]
 5 | --------------------------------------------------------------------------------
/.codecov.yml:
--------------------------------------------------------------------------------
 1 | comment: false
 2 | codecov:
 3 |   branch: main
 4 |   require_ci_to_pass: true
 5 |   notify:
 6 |     after_n_builds: 12
 7 |     wait_for_ci: true
 8 | ignore:
 9 | - "skops/_min_dependencies.py"  # This file is not tested, and won't be.
10 | --------------------------------------------------------------------------------
/docs/_authors.rst:
--------------------------------------------------------------------------------
 1 |
 2 | .. role:: raw-html(raw)
 3 |     :format: html
 4 |
 5 |
 6 | .. _Adrin Jalali: https://github.com/adrinjalali
 7 |
 8 | .. _Benjamin Bossan: https://github.com/BenjaminBossan
 9 |
10 | .. _Merve Noyan: https://github.com/merveenoyan
11 | --------------------------------------------------------------------------------
/skops/card/tests/examples/specter.md.diff:
--------------------------------------------------------------------------------
1 | ---
2 | +++
3 | @@ -3 +3 @@
4 | -## SPECTER
5 | +# SPECTER
6 | @@ -15 +15 @@
7 | -Authors: *Arman Cohan, Sergey Feldman, Iz Beltagy, Doug Downey, Daniel S. Weld*
8 | +Authors: _Arman Cohan, Sergey Feldman, Iz Beltagy, Doug Downey, Daniel S. Weld_
9 | --------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
 1 | .. _installation:
 2 |
 3 | Installation
 4 | ============
 5 |
 6 | To install skops, run the following command in your Python environment:
 7 |
 8 | .. code-block:: bash
 9 |
10 |     python -m pip install skops
11 |
12 | If you're interested in contributing to skops, please follow the `contribution
13 | guideline <https://github.com/skops-dev/skops/blob/main/CONTRIBUTING.rst>`__
14 | instead.
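Once installed, you can verify that the package is importable with a minimal check
(it only imports skops and prints the installed version, which is defined in
``skops/__init__.py``):

.. code-block:: python

    import skops

    print(skops.__version__)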
15 | --------------------------------------------------------------------------------
/skops/hub_utils/__init__.py:
--------------------------------------------------------------------------------
 1 | from ._hf_hub import (
 2 |     add_files,
 3 |     download,
 4 |     get_config,
 5 |     get_model_output,
 6 |     get_requirements,
 7 |     init,
 8 |     push,
 9 |     update_env,
10 | )
11 |
12 | __all__ = [
13 |     "add_files",
14 |     "download",
15 |     "get_config",
16 |     "get_requirements",
17 |     "get_model_output",
18 |     "init",
19 |     "push",
20 |     "update_env",
21 | ]
22 | --------------------------------------------------------------------------------
/skops/io/_trusted_types.py:
--------------------------------------------------------------------------------
 1 | from sklearn.utils import all_estimators
 2 |
 3 | from ._utils import get_type_name
 4 |
 5 | PRIMITIVES_TYPES = [int, float, str, bool]
 6 |
 7 | PRIMITIVE_TYPE_NAMES = ["builtins." + t.__name__ for t in PRIMITIVES_TYPES]
 8 |
 9 | SKLEARN_ESTIMATOR_TYPE_NAMES = [
10 |     get_type_name(estimator_class)
11 |     for _, estimator_class in all_estimators()
12 |     if get_type_name(estimator_class).startswith("sklearn.")
13 | ]
14 | --------------------------------------------------------------------------------
/skops/cli/_utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 |
 3 |
 4 | def get_log_level(level: int = 0) -> int:
 5 |     """Takes in verbosity from a CLI entrypoint (number of times -v was specified),
 6 |     and returns the corresponding log level"""
 7 |
 8 |     all_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
 9 |
10 |     if level >= len(all_levels):
11 |         level = len(all_levels) - 1
12 |     elif level < 0:
13 |         level = 0
14 |
15 |     return all_levels[level]
16 | --------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yaml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 |
 5 | # Required
 6 | version: 2
 7 |
 8 | # Set the version of Python and other tools you might need
 9 | build:
10 |   os: ubuntu-20.04
11 |   tools:
12 |     python: "3.10"
13 |
14 | # Build documentation in the docs/ directory with Sphinx
15 | sphinx:
16 |   configuration: docs/conf.py
17 |
18 | python:
19 |   install:
20 |     - requirements: docs/requirements.txt
21 | --------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # To get started with Dependabot version updates, you'll need to specify which
 2 | # package ecosystems to update and where the package manifests are located.
 3 | # Please see the documentation for all configuration options:
 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
 5 |
 6 | version: 2
 7 | updates:
 8 |   - package-ecosystem: "github-actions"
 9 |     directory: "/" # Location of package manifests
10 |     schedule:
11 |       interval: "weekly"
12 | --------------------------------------------------------------------------------
/skops/card/tests/examples/vit-base-patch32-224-in21k.md.diff:
--------------------------------------------------------------------------------
1 | ---
2 | +++
3 | @@ -17 +17 @@
4 | -Note that this model does not provide any fine-tuned heads, as these were zero'd by Google researchers.
However, the model does include the pre-trained pooler, which can be used for downstream tasks (such as image classification). 5 | +Note that this model does not provide any fine-tuned heads, as these were zero’d by Google researchers. However, the model does include the pre-trained pooler, which can be used for downstream tasks (such as image classification). 6 | -------------------------------------------------------------------------------- /skops/io/exceptions.py: -------------------------------------------------------------------------------- 1 | class UnsupportedTypeException(TypeError): 2 | """Raise when an object of this type is known to be unsupported""" 3 | 4 | def __init__(self, obj): 5 | super().__init__( 6 | f"Objects of type {obj.__class__.__name__} are not supported yet." 7 | ) 8 | 9 | 10 | class UntrustedTypesFoundException(TypeError): 11 | """Raise when some untrusted objects are found in the file.""" 12 | 13 | def __init__(self, unsafe): 14 | super().__init__(f"Untrusted types found in the file: {sorted(unsafe)}.") 15 | -------------------------------------------------------------------------------- /skops/conftest.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def pandas_not_installed(): 8 | # patch import so that it raises an ImportError when trying to import 9 | # pandas. This works because pandas is only imported lazily. 10 | orig_import = __import__ 11 | 12 | def mock_import(name, *args, **kwargs): 13 | if name == "pandas": 14 | raise ImportError 15 | return orig_import(name, *args, **kwargs) 16 | 17 | with patch("builtins.__import__", side_effect=mock_import): 18 | yield 19 | -------------------------------------------------------------------------------- /docs/modules/classes.rst: -------------------------------------------------------------------------------- 1 | .. _api_ref: 2 | 3 | ============= 4 | API Reference 5 | ============= 6 | 7 | This is the class and function reference of skops. 8 | 9 | :mod:`skops.hf_hub`: Hugging Face Hub Integration 10 | ================================================= 11 | .. automodule:: skops.hub_utils 12 | :members: 13 | 14 | :mod:`skops.card`: Model Card Utilities 15 | ======================================= 16 | .. automodule:: skops.card 17 | :members: 18 | 19 | :mod:`skops.io`: Secure persistence 20 | =================================== 21 | .. 
automodule:: skops.io 22 | :members: 23 | -------------------------------------------------------------------------------- /.github/workflows/clean-skops-user.yml: -------------------------------------------------------------------------------- 1 | name: clean-skops-user 2 | 3 | on: 4 | schedule: 5 | # * is a special character in YAML so you have to quote this string 6 | - cron: '10 1 * * *' 7 | 8 | jobs: 9 | clean-skops-user: 10 | 11 | runs-on: ubuntu-latest 12 | if: "github.repository == 'skops-dev/skops'" 13 | 14 | # Timeout: https://stackoverflow.com/a/59076067/4521646 15 | timeout-minutes: 35 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Set up Python 20 | uses: actions/setup-python@v4 21 | - name: Install Requirements 22 | run: pip install huggingface_hub 23 | - name: run cleanup 24 | run: echo "y" | python scripts/clean_skops.py 25 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | clean: 18 | rm -rf $(BUILDDIR) 19 | rm -rf auto_examples 20 | 21 | # Catch-all target: route all unknown targets to Sphinx using the new 22 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 23 | %: Makefile 24 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 25 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.1.0 4 | hooks: 5 | - id: check-yaml 6 | exclude: .github/conda/meta.yaml 7 | - id: end-of-file-fixer 8 | - id: trailing-whitespace 9 | exclude: skops/card/tests/examples 10 | - id: check-case-conflict 11 | - id: check-merge-conflict 12 | - repo: https://github.com/psf/black 13 | rev: 22.6.0 14 | hooks: 15 | - id: black 16 | - repo: https://github.com/pycqa/flake8 17 | rev: 4.0.1 18 | hooks: 19 | - id: flake8 20 | types: [file, python] 21 | - repo: https://github.com/PyCQA/isort 22 | rev: 5.10.1 23 | hooks: 24 | - id: isort 25 | - repo: https://github.com/pre-commit/mirrors-mypy 26 | rev: v0.971 27 | hooks: 28 | - id: mypy 29 | args: [--config-file=pyproject.toml] 30 | additional_dependencies: [types-requests>=2.28.5] 31 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. 
Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (C) 2021 Hugging Face Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /skops/card/tests/examples/specter.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: en 3 | thumbnail: "https://camo.githubusercontent.com/7d080b7a769f7fdf64ac0ebeb47b039cb50be35287e3071f9d633f0fe33e7596/68747470733a2f2f692e6962622e636f2f33544331576d472f737065637465722d6c6f676f2d63726f707065642e706e67" 4 | license: apache-2.0 5 | datasets: 6 | - SciDocs 7 | metrics: 8 | - F1 9 | - accuracy 10 | - map 11 | - ndcg 12 | --- 13 | 14 | ## SPECTER 15 | 16 | 17 | 18 | SPECTER is a pre-trained language model to generate document-level embedding of documents. It is pre-trained on a a powerful signal of document-level relatedness: the citation graph. Unlike existing pretrained language models, SPECTER can be easily applied to downstream applications without task-specific fine-tuning. 19 | 20 | Paper: [SPECTER: Document-level Representation Learning using Citation-informed Transformers](https://arxiv.org/pdf/2004.07180.pdf) 21 | 22 | Original Repo: [Github](https://github.com/allenai/specter) 23 | 24 | Evaluation Benchmark: [SciDocs](https://github.com/allenai/scidocs) 25 | 26 | Authors: *Arman Cohan, Sergey Feldman, Iz Beltagy, Doug Downey, Daniel S. Weld* 27 | -------------------------------------------------------------------------------- /scripts/clean_skops.py: -------------------------------------------------------------------------------- 1 | """This script removes all repos under the skops user on HF Hub. 2 | 3 | The user is used for the CI and if there are leftover repos, they can be 4 | removed. 
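In the CI it is run non-interactively by piping in the confirmation, as in the
clean-skops-user workflow:

    echo "y" | python scripts/clean_skops.py

(see .github/workflows/clean-skops-user.yml).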
5 | """ 6 | 7 | import datetime 8 | 9 | from huggingface_hub import HfApi 10 | 11 | # This is the token for the skops user. TODO remove eventually, see issue #47 12 | token = "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" 13 | client = HfApi(token=token) 14 | user = client.whoami()["name"] 15 | answer = input( 16 | f"Are you sure you want to delete all repos under {user} older than 7 days? (y/[n])" 17 | ) 18 | if answer != "y": 19 | exit(1) 20 | models = [x for x in client.list_models(author=user)] 21 | 22 | print(f"Found {len(models)} models, checking their age...") 23 | 24 | for model_info in models: 25 | info = client.model_info(model_info.modelId) 26 | age = ( 27 | datetime.datetime.now() 28 | - datetime.datetime.fromisoformat(info.lastModified.rsplit(".", 1)[0]) 29 | ).days 30 | if age < 7: 31 | print(f"Skipping model: {model_info.modelId}, age: {age}") 32 | continue 33 | print(f"deleting {model_info.modelId}, age: {age} days") 34 | client.delete_repo(model_info.modelId) 35 | -------------------------------------------------------------------------------- /skops/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | # PEP0440 compatible formatted version, see: 4 | # https://www.python.org/dev/peps/pep-0440/ 5 | # 6 | # Generic release markers: 7 | # X.Y.0 # For first release after an increment in Y 8 | # X.Y.Z # For bugfix releases 9 | # 10 | # Admissible pre-release markers: 11 | # X.Y.ZaN # Alpha release 12 | # X.Y.ZbN # Beta release 13 | # X.Y.ZrcN # Release Candidate 14 | # X.Y.Z # Final release 15 | # 16 | # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. 17 | # 'X.Y.dev0' is the canonical version of 'X.Y.dev' 18 | # 19 | __version__ = "0.6.dev0" 20 | 21 | try: 22 | # This variable is injected in the __builtins__ by the build 23 | # process. It is used to enable importing subpackages of skops when 24 | # the binaries are not built 25 | # mypy error: Cannot determine type of '__SKOPS_SETUP__' 26 | __SKOPS_SETUP__ # type: ignore 27 | except NameError: 28 | __SKOPS_SETUP__ = False 29 | 30 | if __SKOPS_SETUP__: 31 | sys.stderr.write("Partial import of the library during the build process.\n") 32 | # We are not importing the rest of the library during the build 33 | # process, as it may not be compiled yet or cause immature import 34 | -------------------------------------------------------------------------------- /.github/workflows/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 7 | 8 | #### Reference Issues/PRs 9 | 16 | 17 | 18 | #### What does this implement/fix? Explain your changes. 19 | 20 | 21 | #### Any other comments? 22 | 23 | 24 | -------------------------------------------------------------------------------- /skops/cli/tests/test_entrypoint.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | import sys 4 | from unittest import mock 5 | 6 | import pytest 7 | 8 | from skops.cli.entrypoint import main_cli 9 | 10 | 11 | class TestEntrypoint: 12 | """Integration tests that check that entrypoint calls pass through correctly. 13 | Full coverage of individual entrypoint calls should be done in their own classes. 14 | """ 15 | 16 | @pytest.fixture(autouse=True) 17 | def clear_argv(self): 18 | # Required to clear argv in case Pytest is called on this specific function. 
19 | # Otherwise, clogs parser.parse_known_args() in argparse 20 | sys.argv = [""] 21 | 22 | @mock.patch("skops.cli._convert._convert_file") 23 | def test_convert_works_as_expected( 24 | self, 25 | convert_file_mock: mock.MagicMock, 26 | caplog, 27 | ): 28 | """ 29 | Intended as a unit test to make sure, 30 | given 'convert' as the first argument, 31 | the parser is configured correctly 32 | """ 33 | 34 | args = ["convert", "abc.def"] 35 | 36 | main_cli(args) 37 | convert_file_mock.assert_called_once_with( 38 | input_file="abc.def", output_file=pathlib.Path.cwd() / "abc.skops" 39 | ) 40 | 41 | assert caplog.at_level(logging.WARNING) 42 | -------------------------------------------------------------------------------- /docs/community.rst: -------------------------------------------------------------------------------- 1 | .. _community: 2 | 3 | Community 4 | --------- 5 | Our community works mostly on `GitHub `__, 6 | directly on issues and pull requests. 7 | 8 | If you encounter any issues, please don't hesitate to open an issue on our 9 | repository. 10 | 11 | If you'd like to contribute to the project, please make sure you read our 12 | `contributing guidelines 13 | `__. 14 | 15 | 16 | Discord 17 | ~~~~~~~ 18 | We also have a place on Hugging Face's discord server. We're happy to see you 19 | there and answer any questions you might have. You can join using this `invite 20 | link `__. Once you join, first you need to accept 21 | the rules on the server regarding respectful and harassment free communication, 22 | and then you can head to the ``#role-assignment`` channel where you'll find and 23 | ``Open Source ML`` button. Clicking on that will give you access to a few 24 | channels and categories, including the ``skops`` category. 25 | 26 | Maintainers 27 | ----------- 28 | Current maintainers of the project are (in alphabetical order): 29 | 30 | - `Adrin Jalali `__ 31 | - `Benjamin Bossan `__ 32 | - `Erin Aho `__ 33 | - `Merve Noyan `__ 34 | -------------------------------------------------------------------------------- /skops/card/tests/examples/gpt2.md.diff: -------------------------------------------------------------------------------- 1 | --- 2 | +++ 3 | @@ -89 +88,0 @@ 4 | -> 5 | @@ -96 +95 @@ 6 | -Here's an example of how the model can have biased predictions: 7 | +Here’s an example of how the model can have biased predictions: 8 | @@ -144,5 +143,4 @@ 9 | -| Dataset | LAMBADA | LAMBADA | CBT-CN | CBT-NE | WikiText2 | PTB | enwiki8 | text8 | WikiText103 | 1BW | 10 | -|:--------:|:-------:|:-------:|:------:|:------:|:---------:|:------:|:-------:|:------:|:-----------:|:-----:| 11 | -| (metric) | (PPL) | (ACC) | (ACC) | (ACC) | (PPL) | (PPL) | (BPB) | (BPC) | (PPL) | (PPL) | 12 | -| | 35.13 | 45.99 | 87.65 | 83.4 | 29.41 | 65.85 | 1.16 | 1,17 | 37.50 | 75.20 | 13 | - 14 | +| Dataset | LAMBADA | CBT-CN | CBT-NE | WikiText2 | PTB | enwiki8 | text8 | WikiText103 | 1BW | 15 | +|-----------|-----------|----------|----------|-------------|-------|-----------|---------|---------------|-------| 16 | +| (metric) | (ACC) | (ACC) | (ACC) | (PPL) | (PPL) | (BPB) | (BPC) | (PPL) | (PPL) | 17 | +| | 45.99 | 87.65 | 83.4 | 29.41 | 65.85 | 1.16 | 1,17 | 37.50 | 75.20 | 18 | @@ -161 +159 @@ 19 | - 20 | + 21 | -------------------------------------------------------------------------------- /.github/workflows/publish-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish to (Test)PyPI 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | version: 6 | 
description: 'Version to upload to pypi' 7 | required: true 8 | pypi_repo: 9 | description: 'Repo to upload to ("testpypi" or "pypi")' 10 | default: 'testpypi' 11 | required: true 12 | 13 | jobs: 14 | publish: 15 | 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | with: 21 | ref: ${{ github.event.inputs.version }} 22 | 23 | - uses: actions/setup-python@v4 24 | with: 25 | python-version: '3.x' 26 | 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install -U pip 30 | python -m pip install -U setuptools wheel twine build 31 | 32 | - name: Generate distribution archives 33 | run: | 34 | python -m build 35 | 36 | - name: Publish package to TestPyPI 37 | uses: pypa/gh-action-pypi-publish@v1.6.4 38 | with: 39 | user: __token__ 40 | password: ${{ secrets.TEST_PYPI_TOKEN }} 41 | repository_url: https://test.pypi.org/legacy/ 42 | if: ${{ github.event.inputs.pypi_repo == 'testpypi' }} 43 | 44 | - name: Publish package to PyPI 45 | uses: pypa/gh-action-pypi-publish@v1.6.4 46 | with: 47 | user: __token__ 48 | password: ${{ secrets.PYPI_TOKEN }} 49 | if: ${{ github.event.inputs.pypi_repo == 'pypi' }} 50 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 88 3 | target_version = ['py38', 'py39', 'py310', 'py311'] 4 | preview = true 5 | 6 | [tool.isort] 7 | profile = "black" 8 | 9 | [tool.pytest.ini_options] 10 | filterwarnings = [ 11 | "error::DeprecationWarning", 12 | "error::FutureWarning", 13 | # TODO: remove when no longer supporting sklearn v1.0 14 | # numpy and scipy deprecation warnings in sklearn: 15 | 'ignore:\n\n \`numpy.distutils\` is deprecated since NumPy:DeprecationWarning', 16 | # https://github.com/scikit-learn/scikit-learn/issues/24080 17 | "ignore:The \\'sym_pos\\' keyword is deprecated and should be replaced:DeprecationWarning", 18 | # https://github.com/scikit-learn/scikit-learn/pull/23633 19 | "ignore:Unlike other reduction functions:FutureWarning", 20 | # https://github.com/scikit-learn/scikit-learn/pull/25157 21 | "ignore:\\w+ is deprecated. Use files\\(\\) instead:DeprecationWarning" 22 | ] 23 | markers = [ 24 | "network: marks tests as requiring internet (deselect with '-m \"not network\"')", 25 | "inference: marks tests that call inference API (deselect with '-m \"not inference\"')", 26 | ] 27 | addopts = "--cov=skops --cov-report=term-missing --doctest-modules" 28 | 29 | [tool.coverage.run] 30 | omit = [ 31 | "skops/**/test_*.py", 32 | "skops/_min_dependencies.py", 33 | "skops/conftest.py", 34 | ] 35 | 36 | [tool.mypy] 37 | exclude = "(\\w+/)*test_\\w+\\.py$" 38 | ignore_missing_imports = true 39 | no_implicit_optional = true 40 | -------------------------------------------------------------------------------- /skops/card/default_template.md: -------------------------------------------------------------------------------- 1 | --- 2 | {{ card_data }} 3 | --- 4 | 5 | # Model description 6 | 7 | {{ model_description | default("[More Information Needed]", true)}} 8 | 9 | ## Intended uses & limitations 10 | 11 | {{ limitations | default("[More Information Needed]", true)}} 12 | 13 | ## Training Procedure 14 | 15 | ### Hyperparameters 16 | 17 | The model is trained with below hyperparameters. 18 | 19 |
20 | Click to expand 21 | 22 | {{ hyperparameter_table }} 23 | 24 |
25 | 26 | ### Model Plot 27 | 28 | The model plot is below. 29 | 30 | {{ model_plot }} 31 | 32 | ## Evaluation Results 33 | 34 | You can find the details about evaluation process and the evaluation results. 35 | 36 | {{ eval_methods }} 37 | 38 | {{ eval_results | default("[More Information Needed]", true)}} 39 | 40 | # How to Get Started with the Model 41 | 42 | Use the code below to get started with the model. 43 | 44 | ```python 45 | {{ get_started_code | default("[More Information Needed]", true)}} 46 | ``` 47 | 48 | 49 | # Model Card Authors 50 | 51 | This model card is written by following authors: 52 | 53 | {{ model_card_authors | default("[More Information Needed]", true)}} 54 | 55 | # Model Card Contact 56 | 57 | You can contact the model card authors through following channels: 58 | {{ model_card_contact | default("[More Information Needed]", true)}} 59 | 60 | # Citation 61 | 62 | Below you can find information related to citation. 63 | 64 | **BibTeX:** 65 | ``` 66 | {{ citation_bibtex | default("[More Information Needed]", true)}} 67 | ``` 68 | -------------------------------------------------------------------------------- /skops/card/tests/examples/clip-vit-large-patch14.md.diff: -------------------------------------------------------------------------------- 1 | --- 2 | +++ 3 | @@ -23 +22,0 @@ 4 | - 5 | @@ -28 +26,0 @@ 6 | - 7 | @@ -51 +48,0 @@ 8 | - 9 | @@ -72,2 +68,0 @@ 10 | - 11 | - 12 | @@ -81,2 +75,0 @@ 13 | - 14 | - 15 | @@ -132,3 +125 @@ 16 | -We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. 17 | - 18 | - 19 | +We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with 'Middle Eastern' having the highest accuracy (98.4%) and 'White' having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. 20 | -------------------------------------------------------------------------------- /skops/cli/entrypoint.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import skops.cli._convert 4 | 5 | 6 | def main_cli(command_line_args=None): 7 | """Main command line interface entrypoint for all command line Skops methods. 8 | 9 | To add a new entrypoint: 10 | 1. Create a new method to call that accepts a namespace 11 | 2. 
Create a new subparser formatter to define the expected CL arguments 12 | 3. Add those to the function map. 13 | """ 14 | entry_parser = argparse.ArgumentParser( 15 | prog="Skops", 16 | description="Main entrypoint for all command line Skops methods.", 17 | add_help=True, 18 | ) 19 | 20 | subparsers = entry_parser.add_subparsers( 21 | title="Commands", 22 | description="Skops command to call", 23 | dest="cmd", 24 | help="Sub-commands help", 25 | ) 26 | 27 | # function_map should map a command to 28 | # method: the command to call (gets set to default 'func') 29 | # format_parser: the function used to create a subparser for that command 30 | function_map = { 31 | "convert": { 32 | "method": skops.cli._convert.main, 33 | "format_parser": skops.cli._convert.format_parser, 34 | }, 35 | } 36 | 37 | for func_name, values in function_map.items(): 38 | # Add subparser for each function in func map, 39 | # and assigns default func to be "method" from function_map 40 | subparser = subparsers.add_parser(func_name) 41 | subparser.set_defaults(func=values["method"]) 42 | values["format_parser"](subparser) 43 | 44 | # Parse arguments with arg parser for given function in function map, 45 | # Then call the matching method in the function_map with the argument namespace 46 | args = entry_parser.parse_args(command_line_args) 47 | args.func(args) 48 | -------------------------------------------------------------------------------- /skops/card/tests/examples/toy-example.md.diff: -------------------------------------------------------------------------------- 1 | --- 2 | +++ 3 | @@ -0,0 +1 @@ 4 | + 5 | @@ -17 +18 @@ 6 | -Parser doesn’t ‘preserve’ other “quotation” marks. 7 | +Parser doesn’t 'preserve' other "quotation" marks. 8 | @@ -22 +23 @@ 9 | -Another *way* of doing it. 10 | +Another _way_ of doing it. 11 | @@ -26 +27 @@ 12 | -One __way__ of doing it. 13 | +One **way** of doing it. 14 | @@ -45,2 +46,2 @@ 15 | -* using 16 | -* asterisk 17 | +- using 18 | +- asterisk 19 | @@ -56 +57 @@ 20 | -+ using plus 21 | +- using plus 22 | @@ -100 +101 @@ 23 | -[a link](https://skops.readthedocs.io/ "this disappears") 24 | +[a link](https://skops.readthedocs.io/) 25 | @@ -106 +107 @@ 26 | -[a link with reference][1] 27 | +[a link with reference](https://skops.readthedocs.io/) 28 | @@ -109,2 +109,0 @@ 29 | - 30 | -[1]: https://skops.readthedocs.io/ 31 | @@ -164 +163,6 @@ 32 | - 33 | + 34 | + 39 | @@ -167,8 +171,37 @@ 40 | -
Beast of Bodmin
41 | -
A large feline inhabiting Bodmin Moor.
42 | - 43 | -
Morgawr
44 | -
A sea serpent.
45 | - 46 | -
Owlman
47 | -
A giant owl-like creature.
48 | + 49 | +
50 | + 51 | +Beast of Bodmin 52 | + 53 | +
54 | + 55 | +
56 | + 57 | +A large feline inhabiting Bodmin Moor. 58 | + 59 | +
60 | + 61 | +
62 | + 63 | +Morgawr 64 | + 65 | +
66 | + 67 | +
68 | + 69 | +A sea serpent. 70 | + 71 | +
72 | + 73 | +
74 | + 75 | +Owlman 76 | + 77 | +
78 | + 79 | +
80 | + 81 | +A giant owl-like creature. 82 | + 83 | +
84 | + 85 | @@ -180,3 +213,2 @@ 86 | -
87 | -

Divs are possible

88 | -
89 | + 90 | +

Divs are possible

91 | @@ -186 +218 @@ 92 | -A text with 93 | +A text with 94 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | hub/ 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | .pytest_cache/ 6 | .mypy_cache/ 7 | *.py[cod] 8 | *$py.class 9 | py36-64/ 10 | py35-64/ 11 | 12 | # C extensions 13 | *.so 14 | .cython_src/ 15 | cython_src/ 16 | 17 | # Distribution / packaging 18 | .Python 19 | env/ 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | pip-wheel-metadata/* 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *,cover 55 | .hypothesis/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | docs/auto_examples/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # IPython Notebook 80 | .ipynb_checkpoints 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # dotenv 89 | .env* 90 | .~env 91 | .env-3.5.0 92 | .env-3.6.2 93 | 94 | # virtualenv 95 | venv/ 96 | ENV/ 97 | .linenv 98 | 99 | # Spyder project settings 100 | .spyderproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # PyCharm project settings 106 | .idea 107 | 108 | # Node 109 | node_modules 110 | 111 | # Redis 112 | *.rdb 113 | 114 | /tmp 115 | .vscode 116 | 117 | # Vim 118 | *.swp 119 | 120 | 121 | exports 122 | trash 123 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. skops documentation master file, created by 2 | sphinx-quickstart on Thu May 5 11:43:45 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to skops's documentation! 7 | ================================= 8 | 9 | ``skops`` is a Python library helping you share your `scikit-learn 10 | `__ based models and put them in production. 11 | 12 | The library is still a work in progress and under active development. You can 13 | find the source code and the development discussions on `Github 14 | `__. 15 | 16 | The following examples are good starting points: 17 | 18 | - How to create and initialize a scikit-learn model repo: 19 | :ref:`sphx_glr_auto_examples_plot_hf_hub.py`. You can see all the models 20 | uploaded to the Hugging Face Hub using this library `here 21 | `_. 
22 | - How to create a model card for your scikit-learn based model: 23 | :ref:`sphx_glr_auto_examples_plot_model_card.py` 24 | - A text classification example, and its integration with the hub: 25 | :ref:`sphx_glr_auto_examples_plot_text_classification.py` 26 | 27 | In order to better understand the role of each file and their content when 28 | uploaded to Hugging Face Hub, refer to this :ref:`user guide `. You can 29 | refer to :ref:`user guide ` to see how you can leverage model cards 30 | for documenting your scikit-learn models and enabling reproducibility. 31 | 32 | User Guide / API Reference 33 | ========================== 34 | 35 | .. toctree:: 36 | :maxdepth: 2 37 | 38 | installation 39 | hf_hub 40 | model_card 41 | persistence 42 | modules/classes 43 | 44 | Community / About 45 | ================= 46 | .. toctree:: 47 | :maxdepth: 1 48 | 49 | community 50 | changes 51 | 52 | Indices and tables 53 | ================== 54 | 55 | * :ref:`genindex` 56 | * :ref:`modindex` 57 | * :ref:`search` 58 | -------------------------------------------------------------------------------- /skops/_min_dependencies.py: -------------------------------------------------------------------------------- 1 | """All minimum dependencies for scikit-learn.""" 2 | import argparse 3 | 4 | PYTEST_MIN_VERSION = "5.0.1" 5 | 6 | # 'build' and 'install' is included to have structured metadata for CI. 7 | # It will NOT be included in setup's extras_require 8 | # The values are (version_spec, comma separated tags, condition) 9 | # tags can be: 'build', 'install', 'docs', 'examples', 'tests', 'benchmark' 10 | # example: 11 | # "tomli": ("1.1.0", "install", "python_full_version < '3.11.0a7'"), 12 | dependent_packages = { 13 | "scikit-learn": ("0.24", "install", None), 14 | "huggingface_hub": ("0.10.1", "install", None), 15 | "tabulate": ("0.8.8", "install", None), 16 | "pytest": (PYTEST_MIN_VERSION, "tests", None), 17 | "pytest-cov": ("2.9.0", "tests", None), 18 | "flake8": ("3.8.2", "tests", None), 19 | "types-requests": ("2.28.5", "tests", None), 20 | "flaky": ("3.7.0", "tests", None), 21 | "sphinx": ("3.2.0", "docs", None), 22 | "sphinx-gallery": ("0.7.0", "docs", None), 23 | "sphinx-rtd-theme": ("1", "docs", None), 24 | "numpydoc": ("1.0.0", "docs", None), 25 | "sphinx-prompt": ("1.3.0", "docs", None), 26 | "sphinx-issues": ("1.2.0", "docs", None), 27 | "matplotlib": ("3.3", "docs, tests", None), 28 | "packaging": ("17.0", "install", None), 29 | "pandas": ("1", "docs, tests", None), 30 | # required for persistence tests of external libraries 31 | "lightgbm": ("3", "tests", None), 32 | "xgboost": ("1.6", "tests", None), 33 | # TODO: remove condition when catboost supports python 3.11 34 | "catboost": ("1.0", "tests", "python_version < '3.11'"), 35 | } 36 | 37 | 38 | # create inverse mapping for setuptools 39 | tag_to_packages: dict = { 40 | extra: [] 41 | for extra in ["build", "install", "docs", "examples", "tests", "benchmark"] 42 | } 43 | for package, (min_version, extras, condition) in dependent_packages.items(): 44 | for extra in extras.split(", "): 45 | spec = f"{package}>={min_version}" 46 | if condition: 47 | spec += f"; {condition}" 48 | tag_to_packages[extra].append(spec) 49 | 50 | 51 | # Used by CI to get the min dependencies 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser(description="Get min dependencies for a package") 54 | 55 | parser.add_argument("package", choices=dependent_packages) 56 | args = parser.parse_args() 57 | min_version = dependent_packages[args.package][0] 58 | 
print(min_version) 59 | -------------------------------------------------------------------------------- /skops/io/_scipy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import io 4 | from typing import Any, Sequence 5 | 6 | from scipy.sparse import load_npz, save_npz, spmatrix 7 | 8 | from ._audit import Node 9 | from ._utils import LoadContext, SaveContext, get_module 10 | 11 | 12 | def sparse_matrix_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 13 | res = { 14 | "__class__": obj.__class__.__name__, 15 | "__module__": get_module(type(obj)), 16 | "__loader__": "SparseMatrixNode", 17 | } 18 | 19 | data_buffer = io.BytesIO() 20 | save_npz(data_buffer, obj) 21 | # Memoize the object and then check if it's file name (containing 22 | # the object id) already exists. If it does, there is no need to 23 | # save the object again. Memoizitation is necessary since for 24 | # ephemeral objects, the same id might otherwise be reused. 25 | obj_id = save_context.memoize(obj) 26 | f_name = f"{obj_id}.npz" 27 | if f_name not in save_context.zip_file.namelist(): 28 | save_context.zip_file.writestr(f_name, data_buffer.getbuffer()) 29 | 30 | res["type"] = "scipy" 31 | res["file"] = f_name 32 | return res 33 | 34 | 35 | class SparseMatrixNode(Node): 36 | def __init__( 37 | self, 38 | state: dict[str, Any], 39 | load_context: LoadContext, 40 | trusted: bool | Sequence[str] = False, 41 | ) -> None: 42 | super().__init__(state, load_context, trusted) 43 | type = state["type"] 44 | self.trusted = self._get_trusted(trusted, [spmatrix]) 45 | if type != "scipy": 46 | raise TypeError( 47 | f"Cannot load object of type {self.module_name}.{self.class_name}" 48 | ) 49 | 50 | self.children = {"content": io.BytesIO(load_context.src.read(state["file"]))} 51 | 52 | def _construct(self): 53 | # scipy load_npz uses numpy.save with allow_pickle=False under the 54 | # hood, so we're safe using it 55 | return load_npz(self.children["content"]) 56 | 57 | 58 | # tuples of type and function that gets the state of that type 59 | GET_STATE_DISPATCH_FUNCTIONS = [ 60 | # use 'spmatrix' to check if a matrix is a sparse matrix because that is 61 | # what scipy.sparse.issparse checks 62 | (spmatrix, sparse_matrix_get_state), 63 | ] 64 | # tuples of type and function that creates the instance of that type 65 | NODE_TYPE_MAPPING = { 66 | # use 'spmatrix' to check if a matrix is a sparse matrix because that is 67 | # what scipy.sparse.issparse checks 68 | "SparseMatrixNode": SparseMatrixNode, 69 | } 70 | -------------------------------------------------------------------------------- /skops/io/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import scipy 4 | import sklearn.tree 5 | 6 | from skops.io._utils import get_type_name, get_type_paths 7 | 8 | 9 | class UserDefinedClass: 10 | pass 11 | 12 | 13 | class UserDefinedString(str): 14 | """Used to test behaviour of subclasses of strings""" 15 | 16 | pass 17 | 18 | 19 | class TestGetTypeName: 20 | @pytest.mark.parametrize( 21 | "input_type, expected_output", 22 | [ 23 | # Built-In types 24 | (list, "builtins.list"), 25 | (set, "builtins.set"), 26 | (dict, "builtins.dict"), 27 | (str, "builtins.str"), 28 | # Numpy types 29 | (np.ndarray, "numpy.ndarray"), 30 | (np.ma.MaskedArray, "numpy.ma.core.MaskedArray"), 31 | # SciPy types 32 | (scipy.fft.fft, "scipy.fft._basic.fft"), 33 | # SKlearn types 
34 | ( 35 | sklearn.linear_model.HuberRegressor, 36 | "sklearn.linear_model._huber.HuberRegressor", 37 | ), 38 | # User defined types 39 | (UserDefinedClass, "test_utils.UserDefinedClass"), 40 | (UserDefinedString, "test_utils.UserDefinedString"), 41 | ], 42 | ) 43 | def test_for_input_types_returns_as_expected(self, input_type, expected_output): 44 | assert get_type_name(input_type) == expected_output 45 | 46 | 47 | class TestConvertTypesToStrings: 48 | @pytest.mark.parametrize( 49 | "input_list, output_list", 50 | [ 51 | # Happy path 52 | (["builtins.str", "builtins.list"], ["builtins.str", "builtins.list"]), 53 | ([str, list], ["builtins.str", "builtins.list"]), 54 | ([np.ndarray, "builtins.str"], ["numpy.ndarray", "builtins.str"]), 55 | # Edge cases 56 | (None, []), 57 | (int, ["builtins.int"]), 58 | ((list,), ["builtins.list"]), 59 | ([], []), 60 | (UserDefinedString, ["test_utils.UserDefinedString"]), 61 | (UserDefinedString("foo"), ["foo"]), 62 | ], 63 | ids=[ 64 | "As strings", 65 | "As types", 66 | "mixed", 67 | "None", 68 | "Single int type", 69 | "List in tuple", 70 | "Empty list", 71 | "UserDefinedString as type", 72 | "UserDefinedString as instance", 73 | ], 74 | ) 75 | def test_for_normal_input_lists_returns_as_expected(self, input_list, output_list): 76 | assert get_type_paths(input_list) == output_list 77 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # License: 3-clause BSD 3 | import builtins 4 | 5 | from setuptools import setup 6 | 7 | # This is a bit (!) hackish: we are setting a global variable so that the 8 | # main modelcard __init__ can detect if it is being loaded by the setup 9 | # routine, to avoid attempting to load components. 
10 | builtins.__SKOPS_SETUP__ = True # type: ignore 11 | 12 | 13 | import skops # noqa 14 | import skops._min_dependencies as min_deps # noqa 15 | 16 | VERSION = skops.__version__ 17 | 18 | DISTNAME = "skops" 19 | DESCRIPTION = ( 20 | "A set of tools to push scikit-learn based models to and pull from Hugging Face Hub" 21 | ) 22 | with open("README.rst") as f: 23 | LONG_DESCRIPTION = f.read() 24 | MAINTAINER = "Adrin Jalali" 25 | MAINTAINER_EMAIL = "adrin.jalali@gmail.com" 26 | URL = "http://github.com/skops-dev/skops" 27 | DOWNLOAD_URL = "https://pypi.org/project/skops/#files" 28 | LICENSE = "MIT" 29 | PROJECT_URLS = { 30 | "Bug Tracker": "http://github.com/skops-dev/skops/issues", 31 | "Documentation": "http://github.com/skops-dev/skops", 32 | "Source Code": "http://github.com/skops-dev/skops", 33 | } 34 | 35 | 36 | def setup_package(): 37 | package_data = dict( 38 | entry_points={ 39 | "console_scripts": [ 40 | "skops = skops.cli.entrypoint:main_cli", 41 | ], 42 | } 43 | ) 44 | 45 | metadata = dict( 46 | name=DISTNAME, 47 | maintainer=MAINTAINER, 48 | maintainer_email=MAINTAINER_EMAIL, 49 | description=DESCRIPTION, 50 | license=LICENSE, 51 | url=URL, 52 | download_url=DOWNLOAD_URL, 53 | project_urls=PROJECT_URLS, 54 | version=VERSION, 55 | long_description=LONG_DESCRIPTION, 56 | classifiers=[ 57 | "Intended Audience :: Science/Research", 58 | "Intended Audience :: Developers", 59 | "License :: OSI Approved", 60 | "Programming Language :: Python", 61 | "Topic :: Software Development", 62 | "Topic :: Scientific/Engineering", 63 | "Development Status :: 1 - Planning", 64 | "Operating System :: Microsoft :: Windows", 65 | "Operating System :: POSIX", 66 | "Operating System :: Unix", 67 | "Operating System :: MacOS", 68 | "Programming Language :: Python :: 3", 69 | "Programming Language :: Python :: 3.8", 70 | "Programming Language :: Python :: 3.9", 71 | "Programming Language :: Python :: 3.10", 72 | "Programming Language :: Python :: 3.11", 73 | "Programming Language :: Python :: Implementation :: CPython", 74 | ], 75 | python_requires=">=3.8", 76 | install_requires=min_deps.tag_to_packages["install"], 77 | extras_require={ 78 | "docs": min_deps.tag_to_packages["docs"], 79 | "tests": min_deps.tag_to_packages["tests"], 80 | }, 81 | include_package_data=True, 82 | ) 83 | 84 | setup(**package_data, **metadata) 85 | 86 | 87 | if __name__ == "__main__": 88 | setup_package() 89 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | |readthedocs| |github-actions| |Codecov| |PyPi| |Black| 4 | 5 | .. |readthedocs| image:: https://readthedocs.org/projects/skops/badge/?version=latest&style=flat 6 | :target: https://skops.readthedocs.io/en/latest/ 7 | :alt: Documentation 8 | 9 | .. |github-actions| image:: https://github.com/skops-dev/skops/workflows/pytest/badge.svg 10 | :target: https://github.com/skops-dev/skops/actions 11 | :alt: Linux, macOS, Windows tests 12 | 13 | .. |Codecov| image:: https://codecov.io/gh/skops-dev/skops/branch/main/graph/badge.svg 14 | :target: https://codecov.io/gh/skops-dev/skops 15 | :alt: Codecov 16 | 17 | .. |PyPi| image:: https://img.shields.io/pypi/v/skops 18 | :target: https://pypi.org/project/skops 19 | :alt: PyPi 20 | 21 | .. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg 22 | :target: https://github.com/psf/black 23 | :alt: Black 24 | 25 | .. 
image:: https://raw.githubusercontent.com/skops-dev/skops/main/docs/images/logo.png 26 | :width: 500 27 | :target: https://skops.readthedocs.io/en/latest/ 28 | 29 | SKOPS 30 | ===== 31 | 32 | ``skops`` is a Python library helping you share your `scikit-learn 33 | `__ based models and put them in production. 34 | At the moment, it includes tools to easily integrate models on the Hugging Face 35 | Hub, which allows you to share your models, make them discoverable, and use the 36 | Hub's API inference and widgets to get outputs of the model without having to 37 | download or load the model. 38 | 39 | - ``skops.hub_utils``: tools to create a model repository to be stored on 40 | `Hugging Face Hub `__, mainly through 41 | ``skops.hub_utils.init`` and ``skops.hub_utils.push``. You can see all the 42 | models uploaded to the hub using this library `here 43 | `_ 44 | - ``skops.card``: tools to create a model card explaining what the model does 45 | and how it should be used. The model card can then be stored as the 46 | ``README.md`` file on the Hugging Face Hub, with pre-populated metadata to 47 | help Hub understand the model. 48 | - ``skops.io``: Secure persistence of sklearn estimators and more, without using 49 | ``pickle``. Visit `the docs 50 | `_ for more 51 | information. 52 | 53 | Please refer to our `documentation `_ 54 | on using the library as user, which includes user guides on the above topics as 55 | well as complete examples explaining how the features can be used. 56 | 57 | If you want to contribute to the library, please refer to our `contributing 58 | `_ guidelines. 59 | 60 | Installation 61 | ------------ 62 | 63 | You can install this library using: 64 | 65 | .. code-block:: bash 66 | 67 | python -m pip install skops 68 | 69 | Bug Reports and Questions 70 | ------------------------- 71 | 72 | Please send all your questions and report issues on this repository's issue 73 | tracker as an issue. Try to look for existing ones before you create a new one. 74 | -------------------------------------------------------------------------------- /.github/workflows/build-test.yml: -------------------------------------------------------------------------------- 1 | name: pytest 2 | 3 | on: 4 | - push 5 | - pull_request 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | pytest: 13 | 14 | runs-on: ${{ matrix.os }} 15 | if: "github.repository == 'skops-dev/skops'" 16 | strategy: 17 | fail-fast: false # need to see which ones fail 18 | matrix: 19 | os: [ubuntu-latest, windows-latest, macos-latest] 20 | python: ["3.8", "3.9", "3.10", "3.11"] 21 | # this is to make the CI run on different sklearn versions 22 | include: 23 | - python: "3.8" 24 | sklearn_version: "1.0" 25 | - python: "3.9" 26 | sklearn_version: "1.1" 27 | - python: "3.10" 28 | sklearn_version: "1.2" 29 | - python: "3.11" 30 | sklearn_version: "nightly" 31 | 32 | 33 | # Timeout: https://stackoverflow.com/a/59076067/4521646 34 | timeout-minutes: 15 35 | 36 | steps: 37 | 38 | # The following two steps are workarounds to retrieve the "real" commit 39 | # message and make it available in later steps. This is because we want to 40 | # check the content of the commit message, but on PRs, it's replaced by an 41 | # artificial commit message. 
See https://github.com/skops-dev/skops/pull/147 42 | - uses: actions/checkout@v3 43 | with: 44 | fetch-depth: 0 45 | ref: ${{github.event.after}} 46 | 47 | - run: | 48 | echo PR_COMMIT_MESSAGE=$(git log -1 --pretty=format:\"%s\") >> $GITHUB_ENV 49 | shell: bash 50 | 51 | - name: Set up Python ${{ matrix.python }} 52 | uses: actions/setup-python@v4 53 | with: 54 | python-version: ${{ matrix.python }} 55 | 56 | - name: Install dependencies 57 | run: | 58 | pip install .[docs,tests] 59 | pip install black=="22.6.0" isort=="5.10.1" mypy=="0.981" 60 | pip uninstall --yes scikit-learn 61 | if [ ${{ matrix.sklearn_version }} == "nightly" ]; 62 | then pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn; 63 | else pip install "scikit-learn~=${{ matrix.sklearn_version }}"; 64 | fi 65 | if [ ${{ matrix.os }} == "ubuntu-latest" ]; 66 | then sudo apt install pandoc && pandoc --version; 67 | fi 68 | python --version 69 | pip --version 70 | pip list 71 | shell: bash 72 | 73 | - name: Check black 74 | run: black --check --diff . 75 | 76 | - name: Check isort 77 | run: isort --check --diff . 78 | 79 | - name: Tests 80 | env: 81 | SUPER_SECRET: ${{ secrets.HF_HUB_TOKEN }} 82 | run: | 83 | python -m pytest -s -v --cov-report=xml -m "not inference" skops/ 84 | 85 | - name: Mypy 86 | run: mypy --config-file pyproject.toml skops 87 | 88 | - name: Inference tests (conditional) 89 | if: contains(env.PR_COMMIT_MESSAGE, '[CI inference]') 90 | run: | 91 | python -m pytest -s -v -m "inference" skops/ 92 | 93 | - name: Upload coverage to Codecov 94 | uses: codecov/codecov-action@v3 95 | with: 96 | env_vars: OS,PYTHON 97 | fail_ci_if_error: true 98 | token: ${{ secrets.CODECOV_TOKEN }} 99 | files: ./coverage.xml 100 | flags: unittests 101 | name: codecov-umbrella 102 | verbose: true 103 | -------------------------------------------------------------------------------- /skops/card/tests/examples/bert-base-uncased.md.diff: -------------------------------------------------------------------------------- 1 | --- 2 | +++ 3 | @@ -44,10 +44,10 @@ 4 | -| Model | #params | Language | 5 | -|------------------------|--------------------------------|-------| 6 | -| [`bert-base-uncased`](https://huggingface.co/bert-base-uncased) | 110M | English | 7 | -| [`bert-large-uncased`](https://huggingface.co/bert-large-uncased) | 340M | English | sub 8 | -| [`bert-base-cased`](https://huggingface.co/bert-base-cased) | 110M | English | 9 | -| [`bert-large-cased`](https://huggingface.co/bert-large-cased) | 340M | English | 10 | -| [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 110M | Chinese | 11 | -| [`bert-base-multilingual-cased`](https://huggingface.co/bert-base-multilingual-cased) | 110M | Multiple | 12 | -| [`bert-large-uncased-whole-word-masking`](https://huggingface.co/bert-large-uncased-whole-word-masking) | 340M | English | 13 | -| [`bert-large-cased-whole-word-masking`](https://huggingface.co/bert-large-cased-whole-word-masking) | 340M | English | 14 | +| Model | #params | Language | 15 | +|---------------------------------------------------------------------------------------------------------|-----------|------------| 16 | +| [`bert-base-uncased`](https://huggingface.co/bert-base-uncased) | 110M | English | 17 | +| [`bert-large-uncased`](https://huggingface.co/bert-large-uncased) | 340M | English | 18 | +| [`bert-base-cased`](https://huggingface.co/bert-base-cased) | 110M | English | 19 | +| [`bert-large-cased`](https://huggingface.co/bert-large-cased) | 340M | 
English | 20 | +| [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 110M | Chinese | 21 | +| [`bert-base-multilingual-cased`](https://huggingface.co/bert-base-multilingual-cased) | 110M | Multiple | 22 | +| [`bert-large-uncased-whole-word-masking`](https://huggingface.co/bert-large-uncased-whole-word-masking) | 340M | English | 23 | +| [`bert-large-cased-whole-word-masking`](https://huggingface.co/bert-large-cased-whole-word-masking) | 340M | English | 24 | @@ -57 +57 @@ 25 | -You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to 26 | +You can use the raw model for either masked language modeling or next sentence prediction, but it’s mostly intended to 27 | @@ -189 +189 @@ 28 | -the other cases, it's another random sentence in the corpus. Note that what is considered a sentence here is a 29 | +the other cases, it’s another random sentence in the corpus. Note that what is considered a sentence here is a 30 | @@ -212,4 +212,3 @@ 31 | -| Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | 32 | -|:----:|:-----------:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:-------:| 33 | -| | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | 34 | - 35 | +| Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | 36 | +|--------|---------------|-------|--------|---------|--------|---------|--------|-------|-----------| 37 | +| | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | 38 | @@ -240 +239 @@ 39 | - 40 | + 41 | -------------------------------------------------------------------------------- /skops/card/tests/examples/toy-example.md: -------------------------------------------------------------------------------- 1 | # This document tries to cover many common markdown contents 2 | 3 | This is not based on an existing model card and serves to increase test coverage. It also documents differences that may be found after parsing. There is no metainfo section. 4 | 5 | ## H2 6 | 7 | ### H3 8 | 9 | #### H4 10 | 11 | ##### H5 12 | 13 | ###### H6 14 | 15 | Parser 'preserves' some "quotation" marks. 16 | 17 | Parser doesn’t ‘preserve’ other “quotation” marks. 18 | 19 | ## Italics 20 | 21 | One _way_ of doing it. 22 | Another *way* of doing it. 23 | 24 | ## Bold 25 | 26 | One __way__ of doing it. 27 | Another **way** of doing it. 28 | 29 | ## Strikethrough 30 | 31 | This is ~~not~~ the way. 32 | 33 | ## Superscript and subscripts 34 | 35 | Really just html tags. 36 | 37 | E = mc2 38 | 39 | log2 40 | 41 | ## Bullet lists 42 | 43 | Pandoc does not differentiate between different notations, so we always use -, not * or +. 44 | 45 | * using 46 | * asterisk 47 | 48 | or 49 | 50 | - using 51 | - minus 52 | with line break 53 | 54 | or 55 | 56 | + using plus 57 | 58 | Finally: 59 | 60 | - nesting 61 | - is 62 | - indeed 63 | - very 64 | - possible 65 | - to achieve 66 | 67 | ## Ordered lists 68 | 69 | 1. a normal 70 | 2. ordered list 71 | 72 | or 73 | 74 | 1. an ordered 75 | 2. list 76 | 1. with 77 | 2. indentation 78 | 3. is possible 79 | 80 | ## Mixed lists 81 | 82 | 1. it’s 83 | 2. possible 84 | - to 85 | - mix 86 | 3. 
ordered _and_ unorderd 87 | 88 | ## TODOs 89 | 90 | - [x] This 91 | - [ ] is 92 | - [x] **done** 93 | 94 | ## Links 95 | 96 | [a link](https://skops.readthedocs.io/) 97 | 98 | The "title" is not parsed by pandoc 99 | 100 | [a link](https://skops.readthedocs.io/ "this disappears") 101 | 102 | [a link to a file](./toy-example.md) 103 | 104 | References are resolved, so `[1]` below is replaced by the actual link: 105 | 106 | [a link with reference][1] 107 | 108 | A plain link to https://skops.readthedocs.io/ used inside of text. 109 | 110 | [1]: https://skops.readthedocs.io/ 111 | 112 | ## Images 113 | 114 | ![skops logo](https://github.com/skops-dev/skops/blob/main/docs/images/logo.png) 115 | 116 | ### Using html 117 | 118 | logo 119 | 120 | ## Quotes 121 | 122 | > Someone said something importent 123 | 124 | > I quote wise words: 125 | > > Someone said something importent 126 | 127 | ## Tables 128 | 129 | | Header 0 | Header 1 | 130 | |--------------|----------------| 131 | | Some content | More content | 132 | | _Even more_ | This is **it** | 133 | 134 | Empty tables are legal 135 | 136 | | What now? | 137 | |-------------| 138 | 139 | ## Inline code 140 | 141 | Some `inline` code. 142 | 143 | `A whole line` 144 | 145 | ## Code blocks 146 | 147 | ``` 148 | A raw 149 | 150 | code block 151 | ``` 152 | 153 | With language 154 | 155 | ```python 156 | def foo(): 157 | return 0 158 | 159 | def bar(): 160 | return 1 161 | ``` 162 | 163 | ## Raw HTML 164 | 165 | 166 |
167 | <dt>Beast of Bodmin</dt>
168 | <dd>A large feline inhabiting Bodmin Moor.</dd>
169 | 
170 | <dt>Morgawr</dt>
171 | <dd>A sea serpent.</dd>
172 | 
173 | <dt>Owlman</dt>
174 | <dd>A giant owl-like creature.</dd>
175 | </dl>
176 | 177 | ## Div 178 | 179 | The "id" tag may change in order 180 |
181 |

Divs are possible

182 |
183 | 184 | ## Line breaks 185 | 186 | A text with 187 | a LineBreak item. 188 | -------------------------------------------------------------------------------- /skops/cli/_convert.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import pathlib 7 | import pickle 8 | from typing import Optional 9 | 10 | from skops.cli._utils import get_log_level 11 | from skops.io import dumps, get_untrusted_types 12 | 13 | 14 | def _convert_file( 15 | input_file: os.PathLike, 16 | output_file: os.PathLike, 17 | logger: logging.Logger = logging.getLogger(), 18 | ) -> None: 19 | """Function that is called by ``skops convert`` entrypoint. 20 | 21 | Loads a pickle model from the input path, converts to skops format, and saves to 22 | output file. 23 | 24 | Parameters 25 | ---------- 26 | input_file : os.PathLike 27 | Path of input .pkl model to load. 28 | 29 | output_file : os.PathLike 30 | Path to save .skops model to. 31 | 32 | """ 33 | model_name = pathlib.Path(input_file).stem 34 | 35 | logger.debug(f"Converting {model_name}") 36 | 37 | with open(input_file, "rb") as f: 38 | obj = pickle.load(f) 39 | skops_dump = dumps(obj) 40 | 41 | untrusted_types = get_untrusted_types(data=skops_dump) 42 | 43 | if not untrusted_types: 44 | logger.info(f"No unknown types found in {model_name}.") 45 | else: 46 | untrusted_str = ", ".join(untrusted_types) 47 | 48 | logger.warning( 49 | f"While converting {input_file}, " 50 | "the following unknown types were found: " 51 | f"{untrusted_str}. " 52 | f"When loading {output_file} with skops.load, these types must be " 53 | "specified as 'trusted'" 54 | ) 55 | 56 | with open(output_file, "wb") as out_file: 57 | logger.debug(f"Writing to {output_file}") 58 | out_file.write(skops_dump) 59 | 60 | 61 | def format_parser( 62 | parser: Optional[argparse.ArgumentParser] = None, 63 | ) -> argparse.ArgumentParser: 64 | """Adds arguments and help to parent CLI parser for the convert method.""" 65 | 66 | if not parser: # used in tests 67 | parser = argparse.ArgumentParser() 68 | 69 | parser_subgroup = parser.add_argument_group("convert") 70 | parser_subgroup.add_argument("input", help="Path to an input file to convert. ") 71 | 72 | parser_subgroup.add_argument( 73 | "-o", 74 | "--output-file", 75 | help=( 76 | "Specify the output file name for the converted skops file. " 77 | "If not provided, will default to using the same name as the input file, " 78 | "and saving to the current working directory with the suffix '.skops'." 79 | ), 80 | default=None, 81 | ) 82 | parser_subgroup.add_argument( 83 | "-v", 84 | "--verbose", 85 | help=( 86 | "Increases verbosity of logging. Can be used multiple times to increase " 87 | "verbosity further." 
88 | ), 89 | action="count", 90 | dest="loglevel", 91 | default=0, 92 | ) 93 | return parser 94 | 95 | 96 | def main( 97 | parsed_args: argparse.Namespace, 98 | ) -> None: 99 | output_file = parsed_args.output_file 100 | input_file = parsed_args.input 101 | 102 | logging.basicConfig( 103 | format="%(levelname)-8s: %(message)s", level=get_log_level(parsed_args.loglevel) 104 | ) 105 | 106 | if not output_file: 107 | # No filename provided, defaulting to base file path 108 | file_name = pathlib.Path(input_file).stem 109 | output_file = pathlib.Path.cwd() / f"{file_name}.skops" 110 | 111 | _convert_file( 112 | input_file=input_file, 113 | output_file=output_file, 114 | ) 115 | -------------------------------------------------------------------------------- /skops/cli/tests/test_convert.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | import pickle 4 | from unittest import mock 5 | 6 | import numpy as np 7 | import pytest 8 | 9 | from skops.cli import _convert 10 | from skops.io import load 11 | 12 | 13 | class MockUnsafeType: 14 | def __init__(self): 15 | pass 16 | 17 | 18 | class TestConvert: 19 | model_name = "some_model_name" 20 | 21 | @pytest.fixture 22 | def safe_obj(self): 23 | return np.ndarray([1, 2, 3, 4]) 24 | 25 | @pytest.fixture 26 | def unsafe_obj(self): 27 | return MockUnsafeType() 28 | 29 | @pytest.fixture 30 | def pkl_path(self, tmp_path): 31 | return tmp_path / f"{self.model_name}.pkl" 32 | 33 | @pytest.fixture 34 | def skops_path(self, tmp_path): 35 | return tmp_path / f"{self.model_name}.skops" 36 | 37 | @pytest.fixture 38 | def write_safe_file(self, pkl_path, safe_obj): 39 | with open(pkl_path, "wb") as f: 40 | pickle.dump(safe_obj, f) 41 | 42 | @pytest.fixture 43 | def write_unsafe_file(self, pkl_path, unsafe_obj): 44 | with open(pkl_path, "wb") as f: 45 | pickle.dump(unsafe_obj, f) 46 | 47 | def test_base_case_works_as_expected( 48 | self, pkl_path, tmp_path, skops_path, write_safe_file, safe_obj, caplog 49 | ): 50 | mock_logger = mock.MagicMock() 51 | _convert._convert_file(pkl_path, skops_path, logger=mock_logger) 52 | persisted_obj = load(skops_path) 53 | assert np.array_equal(persisted_obj, safe_obj) 54 | 55 | # Check no warnings or errors raised 56 | mock_logger.warning.assert_not_called() 57 | mock_logger.error.assert_not_called() 58 | 59 | def test_unsafe_case_works_as_expected( 60 | self, pkl_path, tmp_path, skops_path, write_unsafe_file, caplog 61 | ): 62 | caplog.set_level(logging.WARNING) 63 | _convert._convert_file(pkl_path, skops_path) 64 | persisted_obj = load(skops_path, trusted=True) 65 | 66 | assert isinstance(persisted_obj, MockUnsafeType) 67 | 68 | # check logging has warned that an unsafe type was found 69 | assert MockUnsafeType.__name__ in caplog.text 70 | 71 | 72 | class TestMain: 73 | @staticmethod 74 | def assert_called_correctly( 75 | mock_convert: mock.MagicMock, 76 | path, 77 | output_file=None, 78 | ): 79 | if not output_file: 80 | output_file = pathlib.Path.cwd() / f"{pathlib.Path(path).stem}.skops" 81 | mock_convert.assert_called_once_with(input_file=path, output_file=output_file) 82 | 83 | @mock.patch("skops.cli._convert._convert_file") 84 | def test_base_works_as_expected(self, mock_convert: mock.MagicMock): 85 | path = "123.pkl" 86 | namespace, _ = _convert.format_parser().parse_known_args([path]) 87 | 88 | _convert.main(namespace) 89 | self.assert_called_correctly(mock_convert, path) 90 | 91 | @mock.patch("skops.cli._convert._convert_file") 92 | 
@pytest.mark.parametrize( 93 | "input_path, output_file, expected_path", 94 | [ 95 | ("abc.123", "some/file/path.out", "some/file/path.out"), 96 | ("abc.123", None, pathlib.Path.cwd() / "abc.skops"), 97 | ], 98 | ids=["Given an output path", "No output path"], 99 | ) 100 | def test_with_output_dir_works_as_expected( 101 | self, mock_convert: mock.MagicMock, input_path, output_file, expected_path 102 | ): 103 | if output_file is not None: 104 | args = [input_path, "--output", output_file] 105 | else: 106 | args = [input_path] 107 | 108 | namespace, _ = _convert.format_parser().parse_known_args(args) 109 | 110 | _convert.main(namespace) 111 | self.assert_called_correctly( 112 | mock_convert, path=input_path, output_file=expected_path 113 | ) 114 | 115 | @mock.patch("skops.cli._convert._convert_file") 116 | @pytest.mark.parametrize( 117 | "verbosity, expected_level", 118 | [ 119 | ("", logging.WARNING), 120 | ("-v", logging.INFO), 121 | ("--verbose", logging.INFO), 122 | ("-vv", logging.DEBUG), 123 | ("-v -v", logging.DEBUG), 124 | ("-vvvvv", logging.DEBUG), 125 | ("--verbose --verbose", logging.DEBUG), 126 | ], 127 | ) 128 | def test_given_log_levels_works_as_expected( 129 | self, mock_convert: mock.MagicMock, verbosity, expected_level, caplog 130 | ): 131 | input_path = "abc.def" 132 | output_path = "bde.skops" 133 | args = [input_path, "--output", output_path, verbosity.split()] 134 | 135 | namespace, _ = _convert.format_parser().parse_known_args(args) 136 | 137 | _convert.main(namespace) 138 | self.assert_called_correctly( 139 | mock_convert, path=input_path, output_file=output_path 140 | ) 141 | 142 | assert caplog.at_level(expected_level) 143 | -------------------------------------------------------------------------------- /docs/changes.rst: -------------------------------------------------------------------------------- 1 | .. include:: _authors.rst 2 | 3 | .. _changelog: 4 | 5 | skops Changelog 6 | =============== 7 | 8 | .. contents:: Table of Contents 9 | :depth: 1 10 | :local: 11 | 12 | v0.6 13 | ---- 14 | 15 | v0.5 16 | ---- 17 | - Added CLI entrypoint support (:func:`.cli.entrypoint.main_cli`) 18 | and a command line function to convert Pickle files 19 | to Skops files (:func:`.cli._convert.main`). :pr:`249` by `Erin Aho`_ 20 | - Support more array-like data types for tabular data and list-like data types 21 | for text data. :pr:`179` by `Francesco Cariaggi`_. 22 | - Add an option `use_intelex` to :func:`skops.hub_utils.init` which, when 23 | enabled, will result in the Hugging Face inference API running with Intel's 24 | scikit-learn intelex library, which can accelerate inference times. :pr:`267` 25 | by `Benjamin Bossan`_. 26 | - Model cards that have been written into a markdown file can now be parsed back 27 | into a :class:`skops.card.Card` object and edited further by using the 28 | :func:`skops.card.parse_modelcard` function. :pr:`257` by `Benjamin Bossan`_. 29 | 30 | v0.4 31 | ---- 32 | - :func:`.io.dump` and :func:`.io.load` now work with file like objects, 33 | which means you can use them with the ``with open(...) as f: dump(obj, f)`` 34 | pattern, like you'd do with ``pickle``. :pr:`234` by `Benjamin Bossan`_. 35 | - All `scikit-learn` estimators are trusted by default. 36 | :pr:`237` by :user:`Edoardo Abati `. 37 | - Add `model_format` argument to :meth:`skops.hub_utils.init` to be stored in 38 | `config.json` so that we know how to load a model from the repository. 39 | :pr:`242` by `Merve Noyan`_. 
40 | - Persistence now supports bytes and bytearrays, added tests to verify that 41 | LightGBM, XGBoost, and CatBoost work now. :pr:`244` by `Benjamin Bossan`_. 42 | - :class:`.card.Card` now allows to add content to existing sections, using a 43 | ``/`` to separate the subsections. E.g. use ``card.add(**{"Existing 44 | section/New section": "content"})`` to add "content" a new subsection called 45 | "New section" to an existing section called "Existing section". :pr:`203` by 46 | `Benjamin Bossan`_. 47 | 48 | v0.3 49 | ---- 50 | - Utility function to add arbitrary files to be uploaded to the hub by using 51 | :func:`.hub_utils.add_files`. :pr:`123` by `Benjamin Bossan`_. 52 | - Add ``private`` as an optional argument to :meth:`skops.hub_utils.push` to 53 | optionally set the visibility status of a repo when pushing to the hub. 54 | :pr:`130` by `Adrin Jalali`_. 55 | - First release of the skops secure persistence feature (:pr:`128`) by `Adrin 56 | Jalali`_ and `Benjamin Bossan`_. Visit :ref:`persistence` for more 57 | information. This feature is not production ready yet but we're happy to 58 | receive feedback from users. 59 | - Fix a bug that resulted in markdown tables being rendered incorrectly if 60 | entries contained line breaks. :pr:`156` by `Benjamin Bossan`_. 61 | - Raise an error instead of warning the user if a given model file is empty. 62 | :pr:`214` by `Adrin Jalali`_. 63 | - Use ``huggingface_hub`` v0.10.1 for model cards, drop ``modelcards`` 64 | dependency. :pr:`162` by `Benjamin Bossan`_. 65 | - Add source links to API documentation. :pr:`172` by :user:`Ayyuce Demirbas 66 | `. 67 | - Add support to load model if given Path/str to ``model`` argument in 68 | :mod:`skops.card` . :pr:`205` by :user:`Prajjwal Mishra `. 69 | 70 | 71 | v0.2 72 | ---- 73 | - Tables, e.g. cross-validation results, can now be added to model cards using 74 | the :meth:`.Card.add_table` method. :pr:`90` by `Benjamin Bossan`_. 75 | - Add method :meth:`.Card.render` which returns the model card as a string. 76 | :pr:`94` by `Benjamin Bossan`_. 77 | - Make :meth:`skops.hub_utils.init` atomic. Now it doesn't leave a trace on the 78 | filesystem if it fails for some reason. :pr:`60` by `Adrin Jalali`_ 79 | - When adding figures or tables, it's now possible to set ``folded=True`` to 80 | render the content inside a details tag. :pr:`108` by `Benjamin Bossan`_. 81 | - Add :meth:`skops.hub_utils.get_model_output` to get the model's output using 82 | The Hugging Face Hub's inference API, and return an array with the outputs. 83 | :pr:`105` by `Adrin Jalali`_. 84 | 85 | v0.1 86 | ---- 87 | 88 | This is the first release of the library. It include two main modules: 89 | 90 | - :mod:`skops.hub_utils`: tools to create a model repository to be stored on 91 | `Hugging Face Hub `__, mainly through 92 | :func:`skops.hub_utils.init` and :func:`skops.hub_utils.push`. 93 | - :mod:`skops.card`: tools to create a model card explaining what the model does 94 | and how it should be used. The model card can then be stored as the 95 | ``README.md`` file on the Hugging Face Hub, with pre-populated metadata to 96 | help Hub understand the model. 
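A minimal sketch of how these two modules are typically combined, mirroring the
examples shipped with the documentation (the dataset, estimator, and repository
name below are placeholders, not part of the v0.1 API description):

.. code-block:: python

    import pickle
    from pathlib import Path
    from tempfile import mkdtemp, mkstemp

    import sklearn
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression

    from skops import card, hub_utils

    # train a small placeholder model
    X, y = load_iris(return_X_y=True, as_frame=True)
    model = LogisticRegression(max_iter=1000).fit(X, y)

    # hub_utils.init expects the model to be stored on disk, e.g. as a pickle file
    _, pkl_name = mkstemp(prefix="skops-", suffix=".pkl")
    with open(pkl_name, mode="bw") as f:
        pickle.dump(model, file=f)

    # initialize a local repository with the model, requirements, task, and sample data
    local_repo = mkdtemp(prefix="skops-")
    hub_utils.init(
        model=pkl_name,
        requirements=[f"scikit-learn={sklearn.__version__}"],
        dst=local_repo,
        task="tabular-classification",
        data=X,
    )

    # create a model card and store it as README.md in the local repo
    model_card = card.Card(model, metadata=card.metadata_from_config(Path(local_repo)))
    model_card.save(Path(local_repo) / "README.md")

    # pushing to the Hub needs a user access token, so it is left commented out here
    # hub_utils.push(repo_id="<user>/<repo>", source=local_repo, token="<token>")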
97 | 98 | 99 | Contributors 100 | ~~~~~~~~~~~~ 101 | 102 | :user:`Adrin Jalali `, :user:`Merve Noyan `, 103 | :user:`Benjamin Bossan `, :user:`Ayyuce Demirbas 104 | `, :user:`Prajjwal Mishra `, :user:`Francesco Cariaggi `, 105 | :user:`Erin Aho ` 106 | -------------------------------------------------------------------------------- /skops/card/tests/examples/vit-base-patch32-224-in21k.md: -------------------------------------------------------------------------------- 1 | --- 2 | license: apache-2.0 3 | tags: 4 | - vision 5 | datasets: 6 | - imagenet-21k 7 | inference: false 8 | --- 9 | 10 | # Vision Transformer (base-sized model) 11 | 12 | 13 | 14 | Vision Transformer (ViT) model pre-trained on ImageNet-21k (14 million images, 21,843 classes) at resolution 224x224. It was introduced in the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Dosovitskiy et al. and first released in [this repository](https://github.com/google-research/vision_transformer). However, the weights were converted from the [timm repository](https://github.com/rwightman/pytorch-image-models) by Ross Wightman, who already converted the weights from JAX to PyTorch. Credits go to him. 15 | 16 | Disclaimer: The team releasing ViT did not write a model card for this model so this model card has been written by the Hugging Face team. 17 | 18 | ## Model description 19 | 20 | The Vision Transformer (ViT) is a transformer encoder model (BERT-like) pretrained on a large collection of images in a supervised fashion, namely ImageNet-21k, at a resolution of 224x224 pixels. 21 | 22 | Images are presented to the model as a sequence of fixed-size patches (resolution 32x32), which are linearly embedded. One also adds a [CLS] token to the beginning of a sequence to use it for classification tasks. One also adds absolute position embeddings before feeding the sequence to the layers of the Transformer encoder. 23 | 24 | Note that this model does not provide any fine-tuned heads, as these were zero'd by Google researchers. However, the model does include the pre-trained pooler, which can be used for downstream tasks (such as image classification). 25 | 26 | By pre-training the model, it learns an inner representation of images that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled images for instance, you can train a standard classifier by placing a linear layer on top of the pre-trained encoder. One typically places a linear layer on top of the [CLS] token, as the last hidden state of this token can be seen as a representation of an entire image. 27 | 28 | ## Intended uses & limitations 29 | 30 | You can use the raw model for image classification. See the [model hub](https://huggingface.co/models?search=google/vit) to look for 31 | fine-tuned versions on a task that interests you. 
32 | 33 | ### How to use 34 | 35 | Here is how to use this model: 36 | 37 | ```python 38 | from transformers import ViTFeatureExtractor, ViTModel 39 | from PIL import Image 40 | import requests 41 | url = 'http://images.cocodataset.org/val2017/000000039769.jpg' 42 | image = Image.open(requests.get(url, stream=True).raw) 43 | feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch32-224-in21k') 44 | model = ViTModel.from_pretrained('google/vit-base-patch32-224-in21k') 45 | inputs = feature_extractor(images=image, return_tensors="pt") 46 | outputs = model(**inputs) 47 | last_hidden_state = outputs.last_hidden_state 48 | ``` 49 | 50 | Currently, both the feature extractor and model support PyTorch. Tensorflow and JAX/FLAX are coming soon, and the API of ViTFeatureExtractor might change. 51 | 52 | ## Training data 53 | 54 | The ViT model was pretrained on [ImageNet-21k](http://www.image-net.org/), a dataset consisting of 14 million images and 21k classes. 55 | 56 | ## Training procedure 57 | 58 | ### Preprocessing 59 | 60 | The exact details of preprocessing of images during training/validation can be found [here](https://github.com/google-research/vision_transformer/blob/master/vit_jax/input_pipeline.py). 61 | 62 | Images are resized/rescaled to the same resolution (224x224) and normalized across the RGB channels with mean (0.5, 0.5, 0.5) and standard deviation (0.5, 0.5, 0.5). 63 | 64 | ### Pretraining 65 | 66 | The model was trained on TPUv3 hardware (8 cores). All model variants are trained with a batch size of 4096 and learning rate warmup of 10k steps. For ImageNet, the authors found it beneficial to additionally apply gradient clipping at global norm 1. Pre-training resolution is 224. 67 | 68 | ## Evaluation results 69 | 70 | For evaluation results on several image classification benchmarks, we refer to tables 2 and 5 of the original paper. Note that for fine-tuning, the best results are obtained with a higher resolution (384x384). Of course, increasing the model size will result in better performance. 71 | 72 | ### BibTeX entry and citation info 73 | 74 | ```bibtex 75 | @misc{wu2020visual, 76 | title={Visual Transformers: Token-based Image Representation and Processing for Computer Vision}, 77 | author={Bichen Wu and Chenfeng Xu and Xiaoliang Dai and Alvin Wan and Peizhao Zhang and Zhicheng Yan and Masayoshi Tomizuka and Joseph Gonzalez and Kurt Keutzer and Peter Vajda}, 78 | year={2020}, 79 | eprint={2006.03677}, 80 | archivePrefix={arXiv}, 81 | primaryClass={cs.CV} 82 | } 83 | ``` 84 | 85 | ```bibtex 86 | @inproceedings{deng2009imagenet, 87 | title={Imagenet: A large-scale hierarchical image database}, 88 | author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li}, 89 | booktitle={2009 IEEE conference on computer vision and pattern recognition}, 90 | pages={248--255}, 91 | year={2009}, 92 | organization={Ieee} 93 | } 94 | ``` 95 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | import inspect 10 | import os 11 | import subprocess 12 | from operator import attrgetter 13 | 14 | from packaging.version import parse 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | import skops 24 | 25 | # -- Project information ----------------------------------------------------- 26 | 27 | project = "skops" 28 | copyright = "2022, Adrin Jalali" 29 | author = "Adrin Jalali" 30 | 31 | 32 | # The full version, including alpha/beta/rc tags 33 | 34 | parsed_version = parse(skops.__version__) 35 | release = ".".join(parsed_version.base_version.split(".")[:2]) 36 | 37 | 38 | # -- General configuration --------------------------------------------------- 39 | 40 | # Add any Sphinx extension module names here, as strings. They can be 41 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 42 | # ones. 43 | extensions = [ 44 | "sphinx.ext.linkcode", 45 | "sphinx.ext.autodoc", 46 | "numpydoc", 47 | "sphinx_gallery.gen_gallery", 48 | "sphinx_issues", 49 | "sphinx.ext.intersphinx", # link to other documentations, e.g. sklearn 50 | ] 51 | 52 | autodoc_default_options = {"members": True, "inherited-members": True} 53 | autodoc_typehints = "none" 54 | 55 | sphinx_gallery_conf = { 56 | "examples_dirs": "../examples", # path to your example scripts 57 | "gallery_dirs": "auto_examples", # path to where to save gallery generated output 58 | } 59 | # Add any paths that contain templates here, relative to this directory. 60 | templates_path = ["_templates"] 61 | 62 | # List of patterns, relative to source directory, that match files and 63 | # directories to ignore when looking for source files. 64 | # This pattern also affects html_static_path and html_extra_path. 
65 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 66 | 67 | autosummary_generate = True 68 | 69 | # sphinx-issues configuration 70 | # Path to GitHub repo {group}/{project} 71 | # (note that `group` is the GitHub user or organization) 72 | issues_github_path = "skops-dev/skops" 73 | 74 | REVISION_CMD = "git rev-parse --short HEAD" 75 | 76 | 77 | def _get_git_revision(): 78 | try: 79 | revision = subprocess.check_output(REVISION_CMD.split()).strip() 80 | except (subprocess.CalledProcessError, OSError): 81 | print("Failed to execute git to get revision") 82 | return None 83 | return revision.decode("utf-8") 84 | 85 | 86 | def linkcode_resolve(domain, info): 87 | if domain not in ("py", "pyx"): 88 | return 89 | if not info.get("module") or not info.get("fullname"): 90 | return 91 | revision = _get_git_revision() 92 | 93 | if revision is None: 94 | return 95 | 96 | class_name = info["fullname"].split(".")[0] 97 | module = __import__(info["module"], fromlist=[class_name]) 98 | obj = attrgetter(info["fullname"])(module) 99 | 100 | # Unwrap the object to get the correct source 101 | # file in case that is wrapped by a decorator 102 | obj = inspect.unwrap(obj) 103 | 104 | try: 105 | fn = inspect.getsourcefile(inspect.unwrap(obj)) 106 | except TypeError: 107 | try: 108 | fn = inspect.getsourcefile(inspect.unwrap(obj.fget)) 109 | except (AttributeError, TypeError): 110 | fn = None 111 | if not fn: 112 | return None 113 | package = "skops" 114 | fn = os.path.relpath(fn, start=os.path.dirname(__import__(package).__file__)) 115 | try: 116 | lineno = inspect.getsourcelines(obj)[1] 117 | except Exception: 118 | lineno = "" 119 | url_fmt = ( 120 | "https://github.com/skops-dev/skops/blob/{revision}/{package}/{path}#L{lineno}" 121 | ) 122 | revision = _get_git_revision() 123 | return url_fmt.format(revision=revision, package=package, path=fn, lineno=lineno) 124 | 125 | 126 | # -- Options for HTML output ------------------------------------------------- 127 | 128 | # The theme to use for HTML and HTML Help pages. See the documentation for 129 | # a list of builtin themes. 130 | # 131 | html_theme = "sphinx_rtd_theme" 132 | 133 | # Add any paths that contain custom static files (such as style sheets) here, 134 | # relative to this directory. They are copied after the builtin static files, 135 | # so a file named "default.css" will overwrite the builtin "default.css". 
136 | html_static_path = ["_static"] 137 | 138 | html_logo = "images/logo.png" 139 | html_theme_options = { 140 | "logo_only": True, 141 | } 142 | 143 | # See: 144 | # https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html#confval-intersphinx_mapping 145 | intersphinx_mapping = { 146 | "python": ("https://docs.python.org/3", None), 147 | "numpy": ("https://docs.scipy.org/doc/numpy/", None), 148 | "sklearn": ("https://scikit-learn.org/stable/", None), 149 | "pandas": ("https://pandas.pydata.org/docs/", None), 150 | "joblib": ("https://joblib.readthedocs.io/en/latest/", None), 151 | "huggingface_hub": ("https://huggingface.co/docs/huggingface_hub/main/en", None), 152 | } 153 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | Contributing to skops 2 | ===================== 3 | 4 | Please follow this workflow when contributing to skops: 5 | 6 | - Fork the repository under your own user 7 | - Clone the repository locally 8 | - Create a new branch for your changes 9 | - Add your changes to the branch 10 | - Commit your changes 11 | - Push your branch to the remote repository 12 | - Create a pull request on GitHub 13 | 14 | Issue Titles / Commit Messages 15 | ------------------------------ 16 | 17 | When creating a pull request, please use a descriptive title. You can prefix 18 | the title to indicate the type of it: 19 | 20 | - ``DOC``: documentation changes 21 | - ``FEAT/FEA``: new major features 22 | - ``ENH``: enhancements to existing features with user facing implications 23 | - ``CI``: continuous integration, sometimes overlaps with MNT 24 | - ``MNT/MAINT``: maintenance, technical debt, etc 25 | - ``FIX``: bug fixes 26 | - ``TST``: new tests, refactoring tests 27 | - ``PERF``: performance improvements 28 | 29 | If a contributor forgets to prefix the title, a maintainer can add the prefix 30 | when merging into ``main``. While merging, it is recommended that the 31 | maintainer refines the commit message to add a short description of what the PR 32 | being merged does. 33 | 34 | Review Process 35 | -------------- 36 | 37 | Don't hesitate to ping @skops-dev/maintainers in your issues and pull requests 38 | if you don't receive a review in a timely manner. We try to review all pull 39 | requests as soon as we can. 40 | 41 | If you have permissions, you should almost never merge your own pull request 42 | unless it's a hotfix and needs to be merged really quick and it's not a major 43 | change. 44 | 45 | Otherwise pull requests can be merged if at least one other person has approved 46 | it on GitHub. Please don't merge them until all outstanding comments are 47 | addressed or the discussions are concluded and people have agreed to tackle 48 | them in future pull requests. 49 | 50 | Working on Existing Issues 51 | -------------------------- 52 | 53 | If you intend to work on an issue, leave a comment and state your intentions. 54 | Also feel free to ask for clarifications if you're not sure what the issue 55 | entails. If you don't understand an issue, it's on us, not on you! 56 | 57 | Setting up the dev environment 58 | ------------------------------ 59 | 60 | Following these steps you can prepare a dev environment for yourself to 61 | contribute to `skops`. 62 | 63 | Using conda/mamba 64 | ~~~~~~~~~~~~~~~~~ 65 | 66 | .. 
code:: bash 67 | 68 | mamba create -c conda-forge -n skops python=3.10 69 | mamba activate skops 70 | python -m pip install -e ".[tests,docs]" 71 | # add pre-commit hooks 72 | mamba install -c conda-forge pre-commit 73 | pre-commit install 74 | 75 | You can also replace the above `mamba` commands with `conda` if you don't have 76 | `mamba` installed. 77 | 78 | 79 | Running Tests 80 | ~~~~~~~~~~~~~ 81 | 82 | skops uses pytest as its test runner, just run it from the project root: 83 | 84 | .. code:: bash 85 | 86 | pytest 87 | 88 | Certain tests require internet access to run, and they typically take slightly 89 | longer to run than other tests. If you'd like to skip those tests, you can add 90 | ``-m not network`` to your ``pytest`` command, or ``-m network`` to only run 91 | those tests. For example, you can run all tests except the ones requiring 92 | internet with: 93 | 94 | .. code:: bash 95 | 96 | pytest -m "not network" skops 97 | 98 | Similarly, there is a flag, ``-m inference`` for tests that hit the Hugging Face 99 | Inference API, which can be quite slow or even hang. Skip these tests as long as 100 | you don't make any changes to this functionality. If you already skip network 101 | tests, the inference tests will also be skipped. 102 | 103 | 104 | Releases 105 | ======== 106 | 107 | Releases are created using `manual GitHub workflows 108 | `_. 109 | As a maintainer, follow these steps: 110 | 111 | 1. Check and update the ``docs/changes.rst`` 112 | 2. For a major release, create a new branch with the name "0.version.X", e.g. 113 | "0.2.X". This branch will have all tags for all releases under 0.2. 114 | 3. Bump the version defined in ``skops/__init__.py`` 115 | 4. Git grep for any TODO's that need fixing before the release (e.g. 116 | deprecations). You can do this, for example by: 117 | 118 | .. code:: bash 119 | 120 | git grep -n TODO 121 | 122 | 123 | 5. Create a PR with all the changes and have it reviewed and merged 124 | 6. Create a tag with the format "v0.version", e.g. "v0.2", and push it to the 125 | remote repository. Use this tag for releasing the package. If there is a 126 | minor release under the same branch, it would be "v0.2.1" for example. 127 | 7. Use the `GitHub action 128 | `__ to 129 | create a new release on **TestPyPI**. Check it for correctness `on test.pypi 130 | `_. 131 | 8. Use the `GitHub action 132 | `__ to 133 | create a new release on **PyPI**. Check it for correctness `pypi 134 | `_. 135 | 9. Create a `new release `_ on 136 | GitHub 137 | 10. Update the patch version of the package to a new dev version, e.g. from 138 | ``v0.3.dev0`` to ``v0.4.dev0`` 139 | 11. Add a section for the new release in the ``docs/changes.rst`` file. 140 | 12. Check that the new stable branch of documentation was built correctly on 141 | `readthedocs `_, and make 142 | sure all relevant releases are *active*. 
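To make step 6 above concrete, creating and pushing the release tag usually
looks roughly like the following; the remote name ``upstream`` and the version
numbers are placeholders that depend on your local setup:

.. code:: bash

    # work from the corresponding release branch, e.g. 0.2.X
    git checkout 0.2.X
    git pull upstream 0.2.X
    # create the tag and push it to the main repository
    git tag v0.2
    git push upstream v0.2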
143 | -------------------------------------------------------------------------------- /skops/io/tests/test_audit.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import re 4 | from contextlib import suppress 5 | from zipfile import ZipFile 6 | 7 | import numpy as np 8 | import pytest 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.pipeline import FeatureUnion, Pipeline 11 | from sklearn.preprocessing import FunctionTransformer, StandardScaler 12 | 13 | from skops.io import dumps, get_untrusted_types 14 | from skops.io._audit import Node, audit_tree, check_type, get_tree, temp_setattr 15 | from skops.io._general import DictNode, dict_get_state 16 | from skops.io._utils import LoadContext, SaveContext, gettype 17 | 18 | 19 | class CustomType: 20 | """A custom untrusted class.""" 21 | 22 | def __init__(self, value): 23 | self.value = value 24 | 25 | 26 | @pytest.mark.parametrize( 27 | "module_name, type_name, trusted, expected", 28 | [ 29 | ("sklearn", "Pipeline", ["sklearn.Pipeline"], True), 30 | ("sklearn", "Pipeline", ["sklearn.preprocessing.StandardScaler"], False), 31 | ("sklearn", "Pipeline", True, True), 32 | ("builtins", "int", ["builtins.int"], True), 33 | ("builtins", "int", [], False), 34 | ], 35 | ids=["list-True", "list-False", "True", "int-True", "int-False"], 36 | ) 37 | def test_check_type(module_name, type_name, trusted, expected): 38 | assert check_type(module_name, type_name, trusted) == expected 39 | 40 | 41 | def test_audit_tree_untrusted(): 42 | var = {"a": CustomType(1), 2: CustomType(2)} 43 | state = dict_get_state(var, SaveContext(None, 0, {})) 44 | node = DictNode(state, LoadContext(None), trusted=False) 45 | with pytest.raises( 46 | TypeError, 47 | match=re.escape( 48 | "Untrusted types found in the file: ['test_audit.CustomType']." 49 | ), 50 | ): 51 | audit_tree(node, trusted=False) 52 | 53 | # there shouldn't be an error with trusted=True 54 | audit_tree(node, trusted=True) 55 | 56 | untrusted_list = get_untrusted_types(data=dumps(var)) 57 | assert untrusted_list == ["test_audit.CustomType"] 58 | 59 | # passing the type would fix it. 
60 | audit_tree(node, trusted=untrusted_list) 61 | 62 | 63 | def test_audit_tree_defaults(): 64 | # test that the default types are trusted 65 | var = {"a": 1, 2: "b"} 66 | state = dict_get_state(var, SaveContext(None, 0, {})) 67 | node = DictNode(state, LoadContext(None), trusted=False) 68 | audit_tree(node, trusted=[]) 69 | 70 | 71 | @pytest.mark.parametrize( 72 | "trusted, defaults, expected", 73 | [ 74 | (True, None, True), 75 | (False, int, ["builtins.int"]), 76 | ([int], None, ["builtins.int"]), 77 | ], 78 | ids=["trusted", "untrusted", "untrusted_list"], 79 | ) 80 | def test_Node_get_trusted(trusted, defaults, expected): 81 | assert Node._get_trusted(trusted, defaults) == expected 82 | 83 | 84 | @pytest.mark.parametrize( 85 | "values, is_safe", 86 | [ 87 | ([1, 2], True), 88 | ([1, {1: 2}], True), 89 | ([1, {1: CustomType(1)}], False), 90 | (eval, False), 91 | (pytest.mark.parametrize, False), 92 | ], 93 | ids=["int", "dict", "untrusted", "eval", "parametrize"], 94 | ) 95 | def test_list_safety(values, is_safe): 96 | content = dumps(values) 97 | 98 | with ZipFile(io.BytesIO(content), "r") as zip_file: 99 | schema = json.loads(zip_file.read("schema.json")) 100 | tree = get_tree(schema, load_context=LoadContext(src=zip_file)) 101 | assert tree.is_safe() == is_safe 102 | 103 | 104 | def test_gettype_error(): 105 | msg = "Object None of module test is unknown" 106 | with pytest.raises(ValueError, match=msg): 107 | gettype(module_name="test", cls_or_func=None) 108 | 109 | msg = "Object test of module None is unknown" 110 | with pytest.raises(ValueError, match=msg): 111 | gettype(module_name=None, cls_or_func="test") 112 | 113 | # ImportError if the module cannot be imported 114 | with pytest.raises(ImportError): 115 | gettype(module_name="invalid-module", cls_or_func="invalid-type") 116 | 117 | 118 | @pytest.mark.parametrize( 119 | "data, file, exception, message", 120 | [ 121 | ("not-none", "not-none", ValueError, "Only one of data or file"), 122 | (None, None, ValueError, "Exactly one of data or file should be passed"), 123 | ("string", None, TypeError, "a bytes-like object is required, not 'str'"), 124 | ], 125 | ids=["both", "neither", "string-data"], 126 | ) 127 | def test_get_untrusted_types_validation(data, file, exception, message): 128 | with pytest.raises(exception, match=message): 129 | get_untrusted_types(data=data, file=file) 130 | 131 | 132 | def test_temp_setattr(): 133 | # Test that temp_setattr works as expected 134 | class A: 135 | def __init__(self): 136 | self.a = 1 137 | 138 | temp = A() 139 | with suppress(ValueError): 140 | with temp_setattr(temp, a=2, b=3): 141 | assert temp.a == 2 142 | assert temp.b == 3 143 | raise ValueError # to make sure context manager handles exceptions 144 | 145 | assert temp.a == 1 146 | assert not hasattr(temp, "b") 147 | 148 | 149 | def test_complex_pipeline_untrusted_set(): 150 | # fmt: off 151 | clf = Pipeline([ 152 | ("features", FeatureUnion([ 153 | ("scaler", StandardScaler()), 154 | ("sqrt", FunctionTransformer( 155 | func=np.sqrt, 156 | inverse_func=np.square, 157 | )), 158 | ])), 159 | ("clf", LogisticRegression(random_state=0, solver="liblinear")), 160 | ]) 161 | # fmt: on 162 | 163 | untrusted = get_untrusted_types(data=dumps(clf)) 164 | type_names = [x.split(".")[-1] for x in untrusted] 165 | assert type_names == ["sqrt", "square"] 166 | -------------------------------------------------------------------------------- /examples/plot_hf_hub.py: -------------------------------------------------------------------------------- 1 
| """ 2 | scikit-learn models on Hugging Face Hub 3 | --------------------------------------- 4 | 5 | This guide demonstrates how you can use this package to create a Hugging Face 6 | Hub model repository based on a scikit-learn compatible model, and how to 7 | fetch scikit-learn compatible models from the Hub and run them locally. 8 | """ 9 | 10 | # %% 11 | # Imports 12 | # ======= 13 | # First we will import everything required for the rest of this document. 14 | 15 | import json 16 | import os 17 | import pickle 18 | from pathlib import Path 19 | from tempfile import mkdtemp, mkstemp 20 | from uuid import uuid4 21 | 22 | import sklearn 23 | from huggingface_hub import HfApi 24 | from sklearn.datasets import load_breast_cancer 25 | from sklearn.ensemble import HistGradientBoostingClassifier 26 | from sklearn.experimental import enable_halving_search_cv # noqa 27 | from sklearn.model_selection import HalvingGridSearchCV, train_test_split 28 | 29 | from skops import card, hub_utils 30 | 31 | # %% 32 | # Data 33 | # ==== 34 | # Then we create some random data to train and evaluate our model. 35 | 36 | X, y = load_breast_cancer(as_frame=True, return_X_y=True) 37 | X_train, X_test, y_train, y_test = train_test_split( 38 | X, y, test_size=0.3, random_state=42 39 | ) 40 | print("X's summary: ", X.describe()) 41 | print("y's summary: ", y.describe()) 42 | 43 | 44 | # %% 45 | # Train a Model 46 | # ============= 47 | # Using the above data, we train a model. To select the model, we use 48 | # :class:`~sklearn.model_selection.HalvingGridSearchCV` with a parameter grid 49 | # over :class:`~sklearn.ensemble.HistGradientBoostingClassifier`. 50 | 51 | param_grid = { 52 | "max_leaf_nodes": [5, 10, 15], 53 | "max_depth": [2, 5, 10], 54 | } 55 | 56 | model = HalvingGridSearchCV( 57 | estimator=HistGradientBoostingClassifier(), 58 | param_grid=param_grid, 59 | random_state=42, 60 | n_jobs=-1, 61 | ).fit(X_train, y_train) 62 | model.score(X_test, y_test) 63 | 64 | # %% 65 | # Initialize a Model Repo 66 | # ======================= 67 | # We now initialize a model repository locally, and push it to the hub. For 68 | # that, we need to first store the model as a pickle file and pass it to the 69 | # hub tools. 70 | 71 | # The file name is not significant, here we choose to save it with a `pkl` 72 | # extension. 73 | _, pkl_name = mkstemp(prefix="skops-", suffix=".pkl") 74 | with open(pkl_name, mode="bw") as f: 75 | pickle.dump(model, file=f) 76 | 77 | local_repo = mkdtemp(prefix="skops-") 78 | hub_utils.init( 79 | model=pkl_name, 80 | requirements=[f"scikit-learn={sklearn.__version__}"], 81 | dst=local_repo, 82 | task="tabular-classification", 83 | data=X_test, 84 | ) 85 | if "__file__" in locals(): # __file__ not defined during docs built 86 | # Add this script itself to the files to be uploaded for reproducibility 87 | hub_utils.add_files(__file__, dst=local_repo) 88 | 89 | # %% 90 | # We can no see what the contents of the created local repo are: 91 | print(os.listdir(local_repo)) 92 | 93 | # %% 94 | # Model Card 95 | # ========== 96 | # We will now create a model card and save it. For more information about how 97 | # to create a good model card, refer to the :ref:`model card example 98 | # `. The following code uses 99 | # :func:`~skops.card.metadata_from_config` which creates a minimal metadata 100 | # object to be included in the metadata section of the model card. 
The 101 | # configuration used by this method is stored in the ``config.json`` file which 102 | # is created by the call to :func:`~skops.hub_utils.init`. 103 | model_card = card.Card(model, metadata=card.metadata_from_config(Path(local_repo))) 104 | model_card.save(Path(local_repo) / "README.md") 105 | 106 | # %% 107 | # Push to Hub 108 | # =========== 109 | # And finally, we can push the model to the hub. This requires a user access 110 | # token which you can get under https://huggingface.co/settings/tokens 111 | 112 | # you can put your own token here, or set it as an environment variable before 113 | # running this script. 114 | token = os.environ["HF_HUB_TOKEN"] 115 | 116 | repo_name = f"hf_hub_example-{uuid4()}" 117 | user_name = HfApi().whoami(token=token)["name"] 118 | repo_id = f"{user_name}/{repo_name}" 119 | print(f"Creating and pushing to repo: {repo_id}") 120 | 121 | # %% 122 | # Now we can push our files to the repo. The following function creates the 123 | # remote repository if it doesn't exist; this is controlled via the 124 | # ``create_remote`` argument. Note that here we're setting ``private=True``, 125 | # which means only people with the right permissions would see the model. Set 126 | # ``private=False`` to make it visible to the public. 127 | 128 | hub_utils.push( 129 | repo_id=repo_id, 130 | source=local_repo, 131 | token=token, 132 | commit_message="pushing files to the repo from the example!", 133 | create_remote=True, 134 | private=True, 135 | ) 136 | 137 | # %% 138 | # Once uploaded, other users can download and use it, unless you make the repo 139 | # private. Given a repository's name, here's how one can download it: 140 | repo_copy = mkdtemp(prefix="skops") 141 | hub_utils.download(repo_id=repo_id, dst=repo_copy, token=token) 142 | print(os.listdir(repo_copy)) 143 | 144 | 145 | # %% 146 | # You can also get the requirements of this repository: 147 | print(hub_utils.get_requirements(path=repo_copy)) 148 | 149 | # %% 150 | # As well as the complete configuration of the project: 151 | print(json.dumps(hub_utils.get_config(path=repo_copy), indent=2)) 152 | 153 | # %% 154 | # Now you can check the contents of the repository under your user. 155 | # 156 | # Update Requirements 157 | # =================== 158 | # If you update your environment and the versions of your requirements are 159 | # changed, you can update the requirement in your repo by calling 160 | # ``update_env``, which automatically detects the existing installation of the 161 | # current environment and updates the requirements accordingly. 162 | 163 | hub_utils.update_env(path=local_repo, requirements=["scikit-learn"]) 164 | 165 | # %% 166 | # Delete Repository 167 | # ================= 168 | # At the end, you can also delete the repository you created using 169 | # ``HfApi().delete_repo``. For more information please refer to the 170 | # documentation of ``huggingface_hub`` library. 171 | 172 | HfApi().delete_repo(repo_id=repo_id, token=token) 173 | -------------------------------------------------------------------------------- /examples/plot_text_classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | Text Classification with scikit-learn 3 | ------------------------------------- 4 | 5 | This example shows how you can create a Hugging Face Hub compatible repo for a 6 | text classification task using scikit-learn. We also show how you can generate 7 | a model card for the model and the task at hand. 
8 | """ 9 | 10 | # %% 11 | # Imports 12 | # ======= 13 | # First we will import everything required for the rest of this document. 14 | 15 | import pickle 16 | from pathlib import Path 17 | from tempfile import mkdtemp, mkstemp 18 | 19 | import pandas as pd 20 | import sklearn 21 | from sklearn.datasets import fetch_20newsgroups 22 | from sklearn.feature_extraction.text import CountVectorizer 23 | from sklearn.metrics import ( 24 | ConfusionMatrixDisplay, 25 | accuracy_score, 26 | classification_report, 27 | confusion_matrix, 28 | f1_score, 29 | ) 30 | from sklearn.model_selection import train_test_split 31 | from sklearn.naive_bayes import MultinomialNB 32 | from sklearn.pipeline import Pipeline 33 | 34 | from skops import card, hub_utils 35 | 36 | # %% 37 | # Data 38 | # ==== 39 | # We will use 20 newsgroups dataset from sklearn. The dataset has curated 40 | # news on 20 topics. It has a training and a test split. 41 | 42 | twenty_train = fetch_20newsgroups(subset="train", shuffle=True, random_state=42) 43 | 44 | twenty_validation = fetch_20newsgroups(subset="test", shuffle=True, random_state=42) 45 | 46 | X_train, X_test, y_train, y_test = train_test_split( 47 | twenty_train.data, twenty_train.target, test_size=0.3, random_state=42 48 | ) 49 | 50 | # %% 51 | # Train a Model 52 | # ============= 53 | # To train a model, we need to convert our data first to vectors. We will use 54 | # CountVectorizer in our pipeline. We will fit a Multinomial 55 | # Naive Bayes model with the outputs of the vectorization. 56 | 57 | model = Pipeline( 58 | [ 59 | ("count", CountVectorizer()), 60 | ("clf", MultinomialNB()), 61 | ] 62 | ) 63 | 64 | model.fit(X_train, y_train) 65 | 66 | # %% 67 | # Inference 68 | # ========= 69 | # Let's see if the model works. 70 | 71 | docs_new = [ 72 | "A graphics processing unit is a specialized electronic circuit designed to" 73 | " manipulate and alter memory to accelerate the creation of images in a frame" 74 | " buffer intended for output to a display device.." 75 | ] 76 | predicted = model.predict(docs_new) 77 | print(twenty_train.target[predicted[0]]) 78 | 79 | # %% 80 | # Initialize a repository to save our files in 81 | # ============================================ 82 | # We will now initialize a repository and save our model 83 | _, pkl_name = mkstemp(prefix="skops-", suffix=".pkl") 84 | 85 | with open(pkl_name, mode="bw") as f: 86 | pickle.dump(model, file=f) 87 | 88 | local_repo = mkdtemp(prefix="skops-") 89 | 90 | hub_utils.init( 91 | model=pkl_name, 92 | requirements=[f"scikit-learn={sklearn.__version__}"], 93 | dst=local_repo, 94 | task="text-classification", 95 | data=X_test, 96 | ) 97 | 98 | # %% 99 | # Create a model card 100 | # =================== 101 | # We now create a model card, and populate its metadata with information which 102 | # is already provided in ``config.json``, which itself is created by the call to 103 | # :func:`.hub_utils.init` above. We will see below how we can populate the model 104 | # card with useful information. 105 | 106 | model_card = card.Card(model, metadata=card.metadata_from_config(Path(local_repo))) 107 | 108 | # %% 109 | # Add more information 110 | # ==================== 111 | # So far, the model card does not tell viewers a lot about the model. Therefore, 112 | # we add more information about the model, like a description and what its 113 | # license is. 114 | 115 | model_card.metadata.license = "mit" 116 | limitations = "This model is not ready to be used in production." 
117 | model_description = ( 118 | "This is a Multinomial Naive Bayes model trained on 20 news groups dataset." 119 | "Count vectorizer is used for vectorization." 120 | ) 121 | model_card_authors = "skops_user" 122 | get_started_code = ( 123 | "import pickle\nwith open(pkl_filename, 'rb') as file:\n clf = pickle.load(file)" 124 | ) 125 | citation_bibtex = "bibtex\n@inproceedings{...,year={2020}}" 126 | model_card.add( 127 | citation_bibtex=citation_bibtex, 128 | get_started_code=get_started_code, 129 | model_card_authors=model_card_authors, 130 | limitations=limitations, 131 | model_description=model_description, 132 | ) 133 | 134 | # %% 135 | # Add plots, metrics, and tables to our model card 136 | # ================================================ 137 | # We will now evaluate our model and add our findings to the model card. 138 | 139 | y_pred = model.predict(X_test) 140 | eval_descr = ( 141 | "The model is evaluated on validation data from 20 news group's test split," 142 | " using accuracy and F1-score with micro average." 143 | ) 144 | model_card.add(eval_method=eval_descr) 145 | 146 | accuracy = accuracy_score(y_test, y_pred) 147 | f1 = f1_score(y_test, y_pred, average="micro") 148 | model_card.add_metrics(**{"accuracy": accuracy, "f1 score": f1}) 149 | 150 | cm = confusion_matrix(y_test, y_pred, labels=model.classes_) 151 | disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_) 152 | disp.plot() 153 | 154 | disp.figure_.savefig(Path(local_repo) / "confusion_matrix.png") 155 | model_card.add_plot(**{"Confusion matrix": "confusion_matrix.png"}) 156 | 157 | clf_report = classification_report( 158 | y_test, y_pred, output_dict=True, target_names=twenty_train.target_names 159 | ) 160 | # The classification report has to be transformed into a DataFrame first to have 161 | # the correct format. This requires removing the "accuracy", which was added 162 | # above anyway. 163 | del clf_report["accuracy"] 164 | clf_report = pd.DataFrame(clf_report).T.reset_index() 165 | model_card.add_table( 166 | folded=True, 167 | **{ 168 | "Classification Report": clf_report, 169 | }, 170 | ) 171 | 172 | # %% 173 | # Save model card 174 | # ================ 175 | # We can simply save our model card by providing a path to :meth:`.Card.save`. 176 | # The model hasn't been pushed to Hugging Face Hub yet, if you want to see how 177 | # to push your models please refer to 178 | # :ref:`this example `. 
179 | 180 | model_card.save(Path(local_repo) / "README.md") 181 | -------------------------------------------------------------------------------- /skops/card/tests/test_parser.py: -------------------------------------------------------------------------------- 1 | import difflib 2 | import json 3 | import os 4 | import re 5 | from pathlib import Path 6 | from unittest.mock import Mock, patch 7 | 8 | import pytest 9 | import yaml # type: ignore 10 | 11 | from skops.card import parse_modelcard 12 | from skops.card._parser import PandocParser, check_pandoc_installed 13 | 14 | try: 15 | check_pandoc_installed() 16 | except FileNotFoundError: 17 | # not installed, skip 18 | pytest.skip(reason="These tests require a recent pandoc", allow_module_level=True) 19 | 20 | 21 | EXAMPLE_CARDS = [ 22 | # actual model cards from HF hub 23 | "bert-base-uncased.md", 24 | "clip-vit-large-patch14.md", 25 | "gpt2.md", 26 | "specter.md", 27 | "vit-base-patch32-224-in21k.md", 28 | # not a model card 29 | "toy-example.md", 30 | ] 31 | 32 | 33 | def _assert_meta_equal(meta0, meta1): 34 | # we cannot guarantee the order of metadata items, so we compare parsed 35 | # dicts, but not strings directly 36 | assert yaml.safe_load("".join(meta0)) == yaml.safe_load("".join(meta1)) 37 | 38 | 39 | def assert_readme_files_almost_equal(file0, file1, diff): 40 | """Check that the two model cards are identical, but allow differences as 41 | defined in the ``diff`` file 42 | 43 | The metainfo is compared separately, as the order of the items is not 44 | guaranteed to be stable. 45 | """ 46 | with open(file0, "r") as f: 47 | readme0 = f.readlines() 48 | 49 | with open(file1, "r") as f: 50 | readme1 = f.readlines() 51 | 52 | sep = "---\n" 53 | # we look for 2nd occurrence, so skip first char to not match 1st occurrence 54 | if sep in readme0[1:]: # only check if metainfo is present 55 | idx0, idx1 = readme0[1:].index(sep) + 1, readme1[1:].index(sep) + 1 56 | meta0, meta1 = readme0[1:idx0], readme1[1:idx1] 57 | readme0, readme1 = readme0[idx0:], readme1[idx1:] 58 | _assert_meta_equal(meta0, meta1) 59 | 60 | # exclude trivial case of both being empty 61 | assert readme0 62 | assert readme1 63 | 64 | diff_actual = list(difflib.unified_diff(readme0, readme1, n=0)) 65 | 66 | with open(diff, "r") as f: 67 | diff_expected = f.readlines() 68 | 69 | assert diff_actual == diff_expected 70 | 71 | 72 | @pytest.mark.parametrize("file_name", EXAMPLE_CARDS, ids=EXAMPLE_CARDS) 73 | def test_example_model_cards(tmp_path, file_name): 74 | """Test that the difference between original and parsed model card is 75 | acceptable 76 | 77 | For this test, model cards for some of the most popular models on HF Hub 78 | were retrieved and stored in the ./examples folder. This test checks that 79 | these model cards can be successfully parsed and that the output is *almost* 80 | the same. 81 | 82 | We don't expect the output to be 100% identical, see the limitations listed 83 | in ``parse_modelcard``. Instead, we assert that the diff corresponds to the 84 | expected diff, which is also checked in. 85 | 86 | So e.g. for "specter.md", we expect that the diff will be the same diff as 87 | in "specter.md.diff". 
88 | 89 | """ 90 | path = Path(os.getcwd()) / "skops" / "card" / "tests" / "examples" 91 | file0 = path / file_name 92 | diff = (path / file_name).with_suffix(".md.diff") 93 | 94 | parsed_card = parse_modelcard(file0) 95 | file1 = tmp_path / "readme-parsed.md" 96 | parsed_card.save(file1) 97 | 98 | assert_readme_files_almost_equal(file0, file1, diff) 99 | 100 | 101 | def test_unknown_pandoc_item_raises(): 102 | source = json.dumps( 103 | { 104 | "pandoc-api-version": [1, 22, 2, 1], 105 | "meta": {}, 106 | "blocks": [ 107 | { 108 | "t": "Header", 109 | "c": [1, ["section", [], []], [{"t": "Str", "c": "section"}]], 110 | }, 111 | {"c": "valid", "t": "Str"}, 112 | {"t": "does-not-exist", "c": []}, 113 | {"c": "okay", "t": "Str"}, 114 | ], 115 | } 116 | ) 117 | parser = PandocParser(source) 118 | msg = ( 119 | "The parsed document contains 'does-not-exist', which is not " 120 | "supported yet, please open an issue on GitHub" 121 | ) 122 | with pytest.raises(ValueError, match=re.escape(msg)): 123 | parser.generate() 124 | 125 | 126 | def test_content_without_section_raises(): 127 | source = json.dumps( 128 | { 129 | "pandoc-api-version": [1, 22, 2, 1], 130 | "meta": {}, 131 | "blocks": [ 132 | {"c": "whoops", "t": "Str"}, 133 | ], 134 | } 135 | ) 136 | parser = PandocParser(source) 137 | msg = ( 138 | "Trying to add content but there is no current section, this is probably a " 139 | "bug, please open an issue on GitHub" 140 | ) 141 | with pytest.raises(ValueError, match=re.escape(msg)): 142 | parser.generate() 143 | 144 | 145 | def test_unsupported_markup_raises(): 146 | match = re.escape("Markup of type does-not-exist is not supported (yet)") 147 | with pytest.raises(ValueError, match=match): 148 | PandocParser(source="", markup_type="does-not-exist") 149 | 150 | 151 | def test_check_pandoc_installed_no_min_version_works(): 152 | # check that it doesn't raise 153 | check_pandoc_installed(min_version=None) 154 | 155 | 156 | def test_check_pandoc_installed_min_version_too_high_raises(): 157 | match = re.escape("Pandoc version too low, expected at least 999.9.9, got") 158 | with pytest.raises(ValueError, match=match): 159 | check_pandoc_installed(min_version="999.9.9") 160 | 161 | 162 | def test_pandoc_not_installed(): 163 | def raise_filenotfound(*args, **kwargs): 164 | # error raised when trying to run subprocess on non-existing command 165 | raise FileNotFoundError("[Errno 2] No such file or directory: 'pandoc'") 166 | 167 | with patch("subprocess.run", raise_filenotfound): 168 | match = re.escape( 169 | "This feature requires the pandoc library to be installed on your system" 170 | ) 171 | with pytest.raises(FileNotFoundError, match=match): 172 | check_pandoc_installed() 173 | 174 | 175 | def test_pandoc_version_cannot_be_determined(): 176 | mock = Mock() 177 | with patch("subprocess.run", mock): 178 | match = re.escape("Could not determine version of pandoc") 179 | with pytest.raises(RuntimeError, match=match): 180 | check_pandoc_installed() 181 | -------------------------------------------------------------------------------- /examples/plot_model_card.py: -------------------------------------------------------------------------------- 1 | """ 2 | scikit-learn model cards 3 | -------------------------------------- 4 | 5 | This guide demonstrates how you can use this package to create a model card on a 6 | scikit-learn compatible model and save it. 7 | """ 8 | 9 | # %% 10 | # Imports 11 | # ======= 12 | # First we will import everything required for the rest of this document. 
13 | 14 | import pickle 15 | from pathlib import Path 16 | from tempfile import mkdtemp, mkstemp 17 | 18 | import pandas as pd 19 | import sklearn 20 | from sklearn.datasets import load_breast_cancer 21 | from sklearn.ensemble import HistGradientBoostingClassifier 22 | from sklearn.experimental import enable_halving_search_cv # noqa 23 | from sklearn.metrics import ( 24 | ConfusionMatrixDisplay, 25 | accuracy_score, 26 | classification_report, 27 | confusion_matrix, 28 | f1_score, 29 | ) 30 | from sklearn.model_selection import HalvingGridSearchCV, train_test_split 31 | 32 | from skops import hub_utils 33 | from skops.card import Card, metadata_from_config 34 | 35 | # %% 36 | # Data 37 | # ==== 38 | # We load breast cancer dataset from sklearn. 39 | 40 | X, y = load_breast_cancer(as_frame=True, return_X_y=True) 41 | X_train, X_test, y_train, y_test = train_test_split( 42 | X, y, test_size=0.3, random_state=42 43 | ) 44 | print("X's summary: ", X.describe()) 45 | print("y's summary: ", y.describe()) 46 | 47 | # %% 48 | # Train a Model 49 | # ============= 50 | # Using the above data, we train a model. To select the model, we use 51 | # :class:`~sklearn.model_selection.HalvingGridSearchCV` with a parameter grid 52 | # over :class:`~sklearn.ensemble.HistGradientBoostingClassifier`. 53 | 54 | param_grid = { 55 | "max_leaf_nodes": [5, 10, 15], 56 | "max_depth": [2, 5, 10], 57 | } 58 | 59 | model = HalvingGridSearchCV( 60 | estimator=HistGradientBoostingClassifier(), 61 | param_grid=param_grid, 62 | random_state=42, 63 | n_jobs=-1, 64 | ).fit(X_train, y_train) 65 | model.score(X_test, y_test) 66 | 67 | 68 | # %% 69 | # Initialize a repository to save our files in 70 | # ============================================ 71 | # We will now initialize a repository and save our model 72 | _, pkl_name = mkstemp(prefix="skops-", suffix=".pkl") 73 | 74 | with open(pkl_name, mode="bw") as f: 75 | pickle.dump(model, file=f) 76 | 77 | local_repo = mkdtemp(prefix="skops-") 78 | 79 | hub_utils.init( 80 | model=pkl_name, 81 | requirements=[f"scikit-learn={sklearn.__version__}"], 82 | dst=local_repo, 83 | task="tabular-classification", 84 | data=X_test, 85 | ) 86 | 87 | # %% 88 | # Create a model card 89 | # ==================== 90 | # We now create a model card, and populate its metadata with information which 91 | # is already provided in ``config.json``, which itself is created by the call to 92 | # :func:`.hub_utils.init` above. We will see below how we can populate the model 93 | # card with useful information. 94 | 95 | model_card = Card(model, metadata=metadata_from_config(Path(local_repo))) 96 | 97 | # %% 98 | # Add more information 99 | # ==================== 100 | # So far, the model card does not tell viewers a lot about the model. Therefore, 101 | # we add more information about the model, like a description and what its 102 | # license is. 103 | 104 | model_card.metadata.license = "mit" 105 | limitations = "This model is not ready to be used in production." 106 | model_description = ( 107 | "This is a `HistGradientBoostingClassifier` model trained on breast cancer " 108 | "dataset. It's trained with `HalvingGridSearchCV`, with parameter grids on " 109 | "`max_leaf_nodes` and `max_depth`." 
110 | )
111 | model_card_authors = "skops_user"
112 | citation_bibtex = "**BibTeX**\n\n```\n@inproceedings{...,year={2020}}\n```"
113 | model_card.add(
114 |     **{
115 |         "Citation": citation_bibtex,
116 |         "Model Card Authors": model_card_authors,
117 |         "Model description": model_description,
118 |         "Model description/Intended uses & limitations": limitations,
119 |     }
120 | )
121 | 
122 | # %%
123 | # Add plots, metrics, and tables to our model card
124 | # ================================================
125 | # Furthermore, to better understand the model performance, we should evaluate it
126 | # on certain metrics and add those evaluations to the model card. In this
127 | # particular example, we want to calculate the accuracy and the F1 score. We
128 | # calculate those using sklearn and then add them to the model card by calling
129 | # :meth:`.Card.add_metrics`. But this is not all: we can also add matplotlib
130 | # figures to the model card, e.g. a plot of the confusion matrix. To achieve
131 | # this, we create the plot using sklearn, save it locally, and then add it using
132 | # the :meth:`.Card.add_plot` method. Finally, we can also add some useful tables
133 | # to the model card, e.g. the results from the grid search and the
134 | # classification report. Those can be added using :meth:`.Card.add_table`.
135 | 
136 | y_pred = model.predict(X_test)
137 | eval_descr = (
138 |     "The model is evaluated on test data using accuracy and F1-score with "
139 |     "micro average."
140 | )
141 | model_card.add(**{"Model description/Evaluation Results": eval_descr})
142 | 
143 | accuracy = accuracy_score(y_test, y_pred)
144 | f1 = f1_score(y_test, y_pred, average="micro")
145 | model_card.add_metrics(**{"accuracy": accuracy, "f1 score": f1})
146 | 
147 | cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
148 | disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
149 | disp.plot()
150 | 
151 | disp.figure_.savefig(Path(local_repo) / "confusion_matrix.png")
152 | model_card.add_plot(
153 |     **{"Model description/Evaluation Results/Confusion Matrix": "confusion_matrix.png"}
154 | )
155 | 
156 | cv_results = model.cv_results_
157 | clf_report = classification_report(
158 |     y_test, y_pred, output_dict=True, target_names=["malignant", "benign"]
159 | )
160 | # The classification report has to be transformed into a DataFrame first to have
161 | # the correct format. This requires removing the "accuracy", which was added
162 | # above anyway.
163 | del clf_report["accuracy"]
164 | clf_report = pd.DataFrame(clf_report).T.reset_index()
165 | model_card.add_table(
166 |     folded=True,
167 |     **{
168 |         "Model description/Evaluation Results/Hyperparameter search results": cv_results,
169 |         "Model description/Evaluation Results/Classification report": clf_report,
170 |     },
171 | )
172 | 
173 | # %%
174 | # Save model card
175 | # ===============
176 | # We can simply save our model card by providing a path to :meth:`.Card.save`.
177 | 
178 | model_card.save(Path(local_repo) / "README.md")
179 | -------------------------------------------------------------------------------- /skops/io/_utils.py: --------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import importlib
4 | import sys
5 | from dataclasses import dataclass, field
6 | from functools import singledispatch
7 | from typing import Any, Type
8 | from zipfile import ZipFile
9 | 
10 | 
11 | # The following two functions are copied from cpython's pickle.py file.
12 | # --------------------------------------------------------------------- 13 | def _getattribute(obj, name): 14 | for subpath in name.split("."): 15 | if subpath == "": 16 | raise AttributeError( 17 | "Can't get local attribute {!r} on {!r}".format(name, obj) 18 | ) 19 | try: 20 | parent = obj 21 | obj = getattr(obj, subpath) 22 | except AttributeError: 23 | raise AttributeError( 24 | "Can't get attribute {!r} on {!r}".format(name, obj) 25 | ) from None 26 | return obj, parent 27 | 28 | 29 | # This function is particularly used to detect the path of functions such as 30 | # ufuncs. It returns the full path, instead of returning the module name. 31 | def whichmodule(obj: Any, name: str) -> str: 32 | """Find the module an object belong to.""" 33 | module_name = getattr(obj, "__module__", None) 34 | if module_name is not None: 35 | return module_name 36 | # Protect the iteration by using a list copy of sys.modules against dynamic 37 | # modules that trigger imports of other modules upon calls to getattr. 38 | for module_name, module in sys.modules.copy().items(): 39 | if ( 40 | module_name == "__main__" 41 | or module_name == "__mp_main__" # bpo-42406 42 | or module is None 43 | ): 44 | continue 45 | try: 46 | if _getattribute(module, name)[0] is obj: 47 | return module_name 48 | except AttributeError: 49 | pass 50 | return "__main__" 51 | 52 | 53 | # --------------------------------------------------------------------- 54 | 55 | 56 | def _import_obj(module: str, cls_or_func: str, package: str | None = None) -> Any: 57 | return getattr(importlib.import_module(module, package=package), cls_or_func) 58 | 59 | 60 | def gettype(module_name: str, cls_or_func: str) -> Type[Any]: 61 | if module_name and cls_or_func: 62 | return _import_obj(module_name, cls_or_func) 63 | 64 | raise ValueError(f"Object {cls_or_func} of module {module_name} is unknown") 65 | 66 | 67 | def get_module(obj: Any) -> str: 68 | """Find module for given object 69 | 70 | If the module cannot be identified, it's assumed to be "__main__". 71 | 72 | Parameters 73 | ---------- 74 | obj: Any 75 | Object whose module is requested. 76 | 77 | Returns 78 | ------- 79 | name: str 80 | Name of the module. 81 | 82 | """ 83 | return whichmodule(obj, obj.__name__) 84 | 85 | 86 | # For now, there is just one protocol version 87 | DEFAULT_PROTOCOL = 0 88 | 89 | 90 | @dataclass(frozen=True) 91 | class SaveContext: 92 | """Context required for saving the objects 93 | 94 | This context is passed to each ``get_state_*`` function. 95 | 96 | Parameters 97 | ---------- 98 | zip_file: zipfile.ZipFile 99 | The zip file to write the data to, must be in write mode. 100 | 101 | protocol: int 102 | The protocol of the persistence format. Right now, there is only 103 | protocol 0, but this leaves the door open for future changes. 104 | 105 | """ 106 | 107 | zip_file: ZipFile 108 | protocol: int = DEFAULT_PROTOCOL 109 | memo: dict[int, Any] = field(default_factory=dict) 110 | 111 | def memoize(self, obj: Any) -> int: 112 | # Currently, the only purpose for saving the object id is to make sure 113 | # that for the length of the context that the main object is being 114 | # saved, all attributes persist, so that the same id cannot be re-used 115 | # for different objects. 
116 | obj_id = id(obj) 117 | if obj_id not in self.memo: 118 | self.memo[obj_id] = obj 119 | return obj_id 120 | 121 | def clear_memo(self) -> None: 122 | self.memo.clear() 123 | 124 | 125 | @dataclass(frozen=True) 126 | class LoadContext: 127 | """Context required for loading an object 128 | 129 | This context is passed to each ``*Node`` class when loading an object. 130 | 131 | Parameters 132 | ---------- 133 | src: zipfile.ZipFile 134 | The zip file the target object is saved in 135 | """ 136 | 137 | src: ZipFile 138 | memo: dict[int, Any] = field(default_factory=dict) 139 | 140 | def memoize(self, obj: Any, id: int) -> None: 141 | self.memo[id] = obj 142 | 143 | def get_object(self, id: int) -> Any: 144 | return self.memo.get(id) 145 | 146 | 147 | @singledispatch 148 | def _get_state(obj, save_context: SaveContext): 149 | # This function should never be called directly. Instead, it is used to 150 | # dispatch to the correct implementation of get_state for the given type of 151 | # its first argument. 152 | raise TypeError(f"Getting the state of type {type(obj)} is not supported yet") 153 | 154 | 155 | def get_state(value, save_context: SaveContext) -> dict[str, Any]: 156 | # This is a helper function to try to get the state of an object. If it 157 | # fails with `get_state`, we try with json.dumps, if that fails, we raise 158 | # the original error alongside the json error. 159 | 160 | # TODO: This should help with fixing recursive references. 161 | # if id(value) in save_context.memo: 162 | # return { 163 | # "__module__": None, 164 | # "__class__": None, 165 | # "__id__": id(value), 166 | # "__loader__": "CachedNode", 167 | # } 168 | 169 | __id__ = save_context.memoize(obj=value) 170 | 171 | res = _get_state(value, save_context) 172 | 173 | res["__id__"] = __id__ 174 | return res 175 | 176 | 177 | def get_type_name(t: Any) -> str: 178 | """Helper function to take in a type, and return its name as a string""" 179 | return f"{get_module(t)}.{t.__name__}" 180 | 181 | 182 | def get_type_paths(types: Any) -> list[str]: 183 | """Helper function that takes in a types, 184 | and converts any the types found to a list of strings. 185 | 186 | Parameters 187 | ---------- 188 | types: Any 189 | Types to get. Can be either a string, a single type, or a list of strings 190 | and types. 191 | 192 | Returns 193 | ---------- 194 | types_list: list of str 195 | The list of types, all as strings, e.g. ``["builtins.list"]``. 196 | 197 | """ 198 | if not types: 199 | return [] 200 | if not isinstance(types, (list, tuple)): 201 | types = [types] 202 | 203 | return [get_type_name(t) if not isinstance(t, str) else t for t in types] 204 | -------------------------------------------------------------------------------- /docs/model_card.rst: -------------------------------------------------------------------------------- 1 | .. _model_card: 2 | 3 | Model Cards for scikit-learn 4 | ============================ 5 | 6 | This library allows you to automatically create model cards for your models, 7 | which are a short documentation explaining what the model does, how it's 8 | trained, and its limitations. `Hugging Face Hub `__ 9 | expects a ``README.md`` file containing a certain set of metadata at the 10 | beginning of it, following with the content of the model card in markdown 11 | format. The metadata section is used to make models searchable on the Hub, and 12 | get the inference API and the widgets on the website working. 
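
For instance, a minimal model card can be created and saved as follows. This is
only a short sketch (it assumes that ``model`` is an already fitted scikit-learn
estimator); see :ref:`sphx_glr_auto_examples_plot_model_card.py` for a complete,
runnable example:

.. code-block:: python

    from skops.card import Card

    # create a card from a fitted model, starting from the default skops template
    card = Card(model)
    card.add(**{"Model description": "A short description of what the model does."})
    # render the card to a markdown file, ready to be extended or uploaded
    card.save("README.md")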
13 | 
14 | Metadata
15 | --------
16 | 
17 | The metadata part of the file needs to follow the specifications `here
18 | `__. It
19 | includes simple attributes of your models such as the task you're solving,
20 | dataset you trained the model with, evaluation results and more. When the model
21 | is hosted on the Hub, the information in metadata, like the task name or the
22 | dataset, helps your model be discovered on the `Hugging Face Hub
23 | `__. The task identifiers should follow the task
24 | taxonomy defined in Hugging Face Hub, as it enables the inference widget on the
25 | model page. An example of a task identifier is ``"tabular-classification"``
26 | or ``"text-regression"``.
27 | 
28 | Here's an example of the metadata section of the ``README.md`` file:
29 | 
30 | .. code-block:: yaml
31 | 
32 |     ---
33 |     library_name: sklearn
34 |     tags:
35 |     - tabular-classification
36 |     license: mit
37 |     datasets:
38 |     - breast-cancer
39 |     metrics:
40 |     - accuracy
41 |     ---
42 | 
43 | ``skops`` creates this section of the file for you, and you almost never need
44 | to touch it yourself.
45 | 
46 | Model Card Content
47 | ------------------
48 | 
49 | The markdown part does not need to follow any particular specification in
50 | terms of the information it contains, which gives the user a lot of flexibility.
51 | The markdown part of the ``README.md`` file comes with a couple of defaults
52 | provided by ``skops``, which include the following slots for free text sections:
53 | 
54 | - ``"Model description"``: A description of the model.
55 | - ``"Intended uses & limitations"``: Intended use for the model, limitations and
56 |   potential biases. This section should also include risks of using models in
57 |   certain domains if relevant.
58 | - ``"How to Get Started with the Model"``: Code the user can run to load and use
59 |   the model.
60 | - ``"Model Card Authors"``: Authors of the model card, i.e. the people who wrote
61 |   and maintain this documentation.
62 | - ``"Model Card Contact"``: Contact information of the people who can be reached
63 |   in case of questions about the model or the model card.
64 | - ``"Citation"``: BibTeX-style citations for the model or resources used to
65 |   train the model.
66 | - ``"Evaluation Results"``: Evaluation results that are later parsed as a table
67 |   by :class:`skops.card.Card`.
68 | 
69 | 
70 | The template also contains the following sections that are automatically
71 | generated by ``skops``:
72 | 
73 | - ``"Hyperparameters"``: Hyperparameters of the model.
74 | - ``"Model Plot"``: A diagram of the model, most relevant in case the model is
75 |   a complex scikit-learn :class:`~sklearn.pipeline.Pipeline`.
76 | 
77 | Furthermore, it is possible to add plots and tables to the model card. To add
78 | plots, save them on disk and then add them to the card by passing the path name
79 | to the :meth:`.Card.add_plot` method. For tables, you can pass either
80 | dictionaries with the key being the header and the values being lists of row
81 | entries, or a pandas ``DataFrame``; use the :meth:`.Card.add_table` method for
82 | this.
83 | 
84 | To add content to an existing subsection, or to create a new subsection, use a
85 | ``"/"`` to indicate the subsection. E.g. let's assume you would like to add a
86 | subsection called ``"Figures"`` to the existing section ``"Model description"``,
87 | and then add some plots in subsections below that; to do so, you can call the
88 | :meth:`.Card.add` method like this:
89 | 
90 | .. code-block:: python
91 | 
92 |     card.add(**{"Model description/Figures": "Here are some nice figures"})
93 |     card.add_plot(**{
94 |         "Model description/Figures/Confusion Matrix": "path-to-confusion-matrix.png",
95 |         "Model description/Figures/ROC": "path-to-roc.png",
96 |     })
97 | 
98 | Furthermore, you can select existing sections (as well as their subsections)
99 | using :meth:`.Card.select`, and you can delete sections using
100 | :meth:`.Card.delete`:
101 | 
102 | .. code-block:: python
103 | 
104 |     section = card.select("Model description/Figures")
105 |     print(section.content)  # 'Here are some nice figures'
106 |     print(section.subsections)
107 |     card.delete("Model description/Figures/ROC")
108 | 
109 | 
110 | To see how you can use the API in ``skops`` to create a model card, please
111 | refer to :ref:`sphx_glr_auto_examples_plot_model_card.py`.
112 | 
113 | Saving and Loading Model Cards
114 | ------------------------------
115 | 
116 | Once you have finished creating and modifying the model card, you can save it
117 | using the :meth:`.Card.save` method:
118 | 
119 | .. code-block:: python
120 | 
121 |     card.save("README.md")
122 | 
123 | This renders the content of the model card to markdown format and stores it in
124 | the indicated file. It is now ready to be uploaded to the Hugging Face Hub.
125 | 
126 | If you have a finished model card but want to load it to make some modifications,
127 | you can use the function :func:`skops.card.parse_modelcard`. This function
128 | parses the model card back into a :class:`.Card` instance that you can work on
129 | further:
130 | 
131 | .. code-block:: python
132 | 
133 |     from skops import card
134 |     model_card = card.parse_modelcard("README.md")
135 |     model_card.add(**{"A new section": "Some new content"})
136 |     model_card.save("README.md")
137 | 
138 | When the card is parsed, some minor details of the model card can change. E.g.
139 | if you used a different column alignment than the default, it is normalized, and
140 | excess empty lines or trailing whitespace are removed. However, the content
141 | itself should be exactly the same. All known deviations are documented in the
142 | `parse_modelcard docs
143 | `_.
144 | 
145 | For the parsing part, we rely on `pandoc `_. If you haven't
146 | installed it, please follow `these instructions
147 | `_. The advantage of using pandoc is that
148 | it's a very mature library and that it supports many different document formats.
149 | Therefore, it should be possible to parse model cards even if they use a format
150 | that's not markdown, for instance reStructuredText, org, or asciidoc. For
151 | saving, we only support markdown for now.
152 | -------------------------------------------------------------------------------- /skops/io/tests/_utils.py: --------------------------------------------------------------------------------
1 | import sys
2 | import warnings
3 | 
4 | import numpy as np
5 | from scipy import sparse
6 | from sklearn.base import BaseEstimator
7 | from sklearn.utils._testing import assert_allclose_dense_sparse
8 | 
9 | # TODO: Investigate why that seems to be an issue on MacOS (only observed with
10 | # Python 3.8)
11 | ATOL = 1e-6 if sys.platform == "darwin" else 1e-7
12 | 
13 | 
14 | def _is_steps_like(obj):
15 |     # helper function to check if an object is something like Pipeline.steps,
16 |     # i.e.
a list of tuples of names and estimators 17 | if not isinstance(obj, list): # must be a list 18 | return False 19 | 20 | if not obj: # must not be empty 21 | return False 22 | 23 | if not isinstance(obj[0], tuple): # must be list of tuples 24 | return False 25 | 26 | lens = set(map(len, obj)) 27 | if not lens == {2}: # all elements must be length 2 tuples 28 | return False 29 | 30 | keys, vals = list(zip(*obj)) 31 | 32 | if len(keys) != len(set(keys)): # keys must be unique 33 | return False 34 | 35 | if not all(map(lambda x: isinstance(x, (type(None), BaseEstimator)), vals)): 36 | # values must be BaseEstimators or None 37 | return False 38 | 39 | return True 40 | 41 | 42 | def _assert_generic_objects_equal(val1, val2): 43 | def _is_builtin(val): 44 | # Check if value is a builtin type 45 | return getattr(getattr(val, "__class__", {}), "__module__", None) == "builtins" 46 | 47 | if isinstance(val1, (list, tuple, np.ndarray)): 48 | assert len(val1) == len(val2) 49 | for subval1, subval2 in zip(val1, val2): 50 | _assert_generic_objects_equal(subval1, subval2) 51 | return 52 | 53 | assert type(val1) == type(val2) 54 | if hasattr(val1, "__dict__"): 55 | assert_params_equal(val1.__dict__, val2.__dict__) 56 | elif _is_builtin(val1): 57 | assert val1 == val2 58 | else: 59 | # not a normal Python class, could be e.g. a Cython class 60 | assert val1.__reduce__() == val2.__reduce__() 61 | 62 | 63 | def _assert_tuples_equal(val1, val2): 64 | assert len(val1) == len(val2) 65 | for subval1, subval2 in zip(val1, val2): 66 | _assert_vals_equal(subval1, subval2) 67 | 68 | 69 | def _assert_vals_equal(val1, val2): 70 | if type(val1) == type: # e.g. could be np.int64 71 | assert val1 is val2 72 | elif hasattr(val1, "__getstate__") and (val1.__getstate__() is not None): 73 | # This includes BaseEstimator since they implement __getstate__ and 74 | # that returns the parameters as well. 75 | # Since Python 3.11, all objects have a __getstate__ but they return 76 | # None by default, in which case this check is not performed. 77 | # Some objects return a tuple of parameters, others a dict. 78 | state1 = val1.__getstate__() 79 | state2 = val2.__getstate__() 80 | assert type(state1) == type(state2) 81 | if isinstance(state1, tuple): 82 | _assert_tuples_equal(state1, state2) 83 | else: 84 | assert_params_equal(val1.__getstate__(), val2.__getstate__()) 85 | elif sparse.issparse(val1): 86 | assert sparse.issparse(val2) and ((val1 - val2).nnz == 0) 87 | elif isinstance(val1, (np.ndarray, np.generic)): 88 | if len(val1.dtype) == 0: 89 | # for arrays with at least 2 dimensions, check that contiguity is 90 | # preserved 91 | if val1.squeeze().ndim > 1: 92 | assert val1.flags["C_CONTIGUOUS"] is val2.flags["C_CONTIGUOUS"] 93 | assert val1.flags["F_CONTIGUOUS"] is val2.flags["F_CONTIGUOUS"] 94 | if val1.dtype == object: 95 | assert val2.dtype == object 96 | assert val1.shape == val2.shape 97 | for subval1, subval2 in zip(val1, val2): 98 | _assert_generic_objects_equal(subval1, subval2) 99 | else: 100 | # simple comparison of arrays with simple dtypes, almost all 101 | # arrays are of this sort. 102 | np.testing.assert_array_equal(val1, val2) 103 | elif len(val1.shape) == 1: 104 | # comparing arrays with structured dtypes, but they have to be 1D 105 | # arrays. This is what we get from the Tree's state. 106 | assert np.all([x == y for x, y in zip(val1, val2)]) 107 | else: 108 | # we don't know what to do with these values, for now. 
109 | assert False 110 | elif isinstance(val1, (tuple, list)): 111 | assert len(val1) == len(val2) 112 | for subval1, subval2 in zip(val1, val2): 113 | _assert_vals_equal(subval1, subval2) 114 | elif isinstance(val1, float) and np.isnan(val1): 115 | assert np.isnan(val2) 116 | elif isinstance(val1, dict): 117 | # dictionaries are compared by comparing their values recursively. 118 | assert set(val1.keys()) == set(val2.keys()) 119 | for key in val1: 120 | _assert_vals_equal(val1[key], val2[key]) 121 | elif hasattr(val1, "__dict__") and hasattr(val2, "__dict__"): 122 | _assert_vals_equal(val1.__dict__, val2.__dict__) 123 | elif isinstance(val1, np.ufunc): 124 | assert val1 == val2 125 | elif val1.__class__.__module__ == "builtins": 126 | assert val1 == val2 127 | else: 128 | _assert_generic_objects_equal(val1, val2) 129 | 130 | 131 | def assert_params_equal(params1, params2): 132 | # helper function to compare estimator dictionaries of parameters 133 | if params1 is None and params2 is None: 134 | return 135 | assert len(params1) == len(params2) 136 | assert set(params1.keys()) == set(params2.keys()) 137 | for key in params1: 138 | with warnings.catch_warnings(): 139 | # this is to silence the deprecation warning from _DictWithDeprecatedKeys 140 | warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn") 141 | val1, val2 = params1[key], params2[key] 142 | assert type(val1) == type(val2) 143 | 144 | if _is_steps_like(val1): 145 | # Deal with Pipeline.steps, FeatureUnion.transformer_list, etc. 146 | assert _is_steps_like(val2) 147 | val1, val2 = dict(val1), dict(val2) 148 | 149 | if isinstance(val1, (tuple, list)): 150 | assert len(val1) == len(val2) 151 | for subval1, subval2 in zip(val1, val2): 152 | _assert_vals_equal(subval1, subval2) 153 | elif isinstance(val1, dict): 154 | assert_params_equal(val1, val2) 155 | else: 156 | _assert_vals_equal(val1, val2) 157 | 158 | 159 | def assert_method_outputs_equal(estimator, loaded, X): 160 | # helper function that checks the output of all supported methods 161 | for method in [ 162 | "predict", 163 | "predict_proba", 164 | "decision_function", 165 | "transform", 166 | "predict_log_proba", 167 | ]: 168 | err_msg = ( 169 | f"{estimator.__class__.__name__}.{method}() doesn't produce the same" 170 | " results after loading the persisted model." 171 | ) 172 | if hasattr(estimator, method): 173 | X_out1 = getattr(estimator, method)(X) 174 | X_out2 = getattr(loaded, method)(X) 175 | assert_allclose_dense_sparse(X_out1, X_out2, err_msg=err_msg, atol=ATOL) 176 | -------------------------------------------------------------------------------- /docs/hf_hub.rst: -------------------------------------------------------------------------------- 1 | .. _hf_hub: 2 | 3 | scikit-learn Models on Hugging Face Hub 4 | ======================================= 5 | 6 | This library allows you to initialize and create a model repository compatible 7 | with `Hugging Face Hub `__, which among other 8 | things, gives you the following benefits: 9 | 10 | - Inference API to get model output through REST calls 11 | - A widget to try the model directly in the browser 12 | - Metadata tags for better discoverability of the model 13 | - Collaborating with others on a model through discussions and pull requests 14 | - Convenient sharing of models with the community 15 | 16 | You can see all the models uploaded to the Hugging Face Hub using this library 17 | `here `_. 
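
To give a rough idea of how this fits together, the typical workflow looks like
the following sketch. The repository name, file name, ``X_sample`` and ``token``
are placeholders, not requirements of the API, and the details of each step are
explained in the rest of this document:

.. code:: python

    import sklearn
    import skops.hub_utils as hub_utils

    # assumes a pickled model in "model.pkl" and a sample of the input data, X_sample
    local_repo = "my-sklearn-repo"
    hub_utils.init(
        model="model.pkl",
        requirements=[f"scikit-learn={sklearn.__version__}"],
        dst=local_repo,
        task="tabular-classification",
        data=X_sample,
    )
    # push the prepared repository to the Hub; ``token`` is a Hugging Face access
    # token with write access (check the keyword arguments against the
    # ``hub_utils.push`` API reference)
    hub_utils.push(repo_id="USER/MODEL_ID", source=local_repo, token=token)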
18 | 19 | In terms of files, there are three which a scikit-learn model repo needs to 20 | have on the Hub: 21 | 22 | - ``README.md``: includes certain metadata on top of the file and then a 23 | description of the model, aka model card. 24 | - ``config.json``: contains the configuration needed to run the model. 25 | - The persisted model file. There are no constraints on the name of the file 26 | and the name is configured in ``config.json``. The file needs to be loadable 27 | by :func:`joblib.load` or :func:`pickle.load`. 28 | 29 | There are certain requirements in terms of information about the model for the 30 | Hub to be able to load and run the model. For scikit-learn compatible models, 31 | this information is stored in two places: 32 | 33 | - The metadata in ``README.md`` of the model repository, about which you can 34 | read `here `__. 35 | - The configuration stored in ``config.json``. 36 | 37 | As a user of ``skops``, you can use the tools in ``skops.hub_utils`` to create 38 | and persist a ``config.json`` file, and then use it to populate necessary 39 | metadata in the ``README.md`` file. The metadata in ``README.md`` is used by 40 | the Hub's backend to understand the type of the model and the kind of task 41 | which the model tries to solve. An example of a task can be 42 | ``"tabular-classification"`` or ``"text-regression"``. 43 | 44 | An example ``config.json`` file looks like this:: 45 | 46 | { 47 | "sklearn": { 48 | "columns": [ 49 | "petal length (cm)", 50 | "petal width (cm)", 51 | "sepal length (cm)", 52 | "sepal width (cm)", 53 | ], 54 | "environment": ['scikit-learn="1.1.1"', "numpy"], 55 | "example_input": { 56 | "petal length (cm)": [1.4, 1.4, 1.3], 57 | "petal width (cm)": [0.2, 0.2, 0.2], 58 | "sepal length (cm)": [5.1, 4.9, 4.7], 59 | "sepal width (cm)": [3.5, 3.0, 3.2], 60 | }, 61 | "model": {"file": "model.pkl"}, 62 | "task": "tabular-classification", 63 | } 64 | } 65 | 66 | The key ``sklearn`` includes the following sub-keys: 67 | 68 | - ``columns``: An ordered list of column names. The order is important as it is 69 | used to make sure the input given to the model is what the model expects. 70 | - ``example_input``: A list of examples to the model. This is in the form of a 71 | dictionary of column names to list of values, and is used by the Hugging Face 72 | Hub backend to show them in the widget to test the model when visiting the 73 | model's page on the Hub. 74 | - ``environment``: A list of dependencies that the model requires. These 75 | packages must be available on conda-forge and are installed before loading 76 | the model. 77 | - ``model.file``: The file name of the persisted model. 78 | - ``task``: The task of the model. 79 | 80 | You almost never need to create or touch this file manually, and it's created 81 | when you call :func:`skops.hub_utils.init`. 82 | 83 | It is recommended to include the script itself that creates the whole output in 84 | the upload. This way, the results are easily reproducible for others. To achieve 85 | this, call :func:`skops.hub_utils.add_files`: 86 | 87 | .. code:: python 88 | 89 | # contents of train.py 90 | ... 91 | hub_utils.init(model, dst=local_repo) 92 | hub_utils.add_files(__file__, dst=local_repo) # adds train.py to repo 93 | hub_utils.push(...) 94 | 95 | You may of course add more files if they're useful. 96 | 97 | .. 
_hf_hub_inference:
98 | 
99 | Inference without Downloading the Models
100 | ----------------------------------------
101 | 
102 | You can use the Hugging Face Hub's inference API to get model output without
103 | downloading the models. The :func:`skops.hub_utils.get_model_output` function
104 | returns the model output for a given input. It can be used as::
105 | 
106 |     import skops.hub_utils as hub_utils
107 |     import pandas as pd
108 |     data = pd.DataFrame(...)
109 |     # Get the model output from the Hub's inference API
110 |     res = hub_utils.get_model_output("USER/MODEL_ID", data)
111 | 
112 | In the above code snippet, ``res`` will be a :class:`numpy.ndarray` containing
113 | the model's output.
114 | 
115 | .. _hf_hub_gradio:
116 | ..
117 |     TODO: replace gradio link once gradio provides object.inv
118 | Easily build user interfaces for your scikit-learn models
119 | ----------------------------------------------------------
120 | `gradio `__ is a Python library that lets you create user interfaces for your models.
121 | It has a class called `Interface `__ that lets you create application
122 | interfaces for your machine learning models. Using gradio can have some advantages over using a plain
123 | model repository, e.g. the Gradio dataframe component allows uploading a CSV file for tabular data, unlike the
124 | widget in the model repository.
125 | 
126 | ``gradio`` is integrated with skops, so you can load an interface with only one
127 | line of code. Calling the load method with your repository identifier,
128 | prefixed with "huggingface/", will load an interface for your model. The
129 | interface has a dataframe input that takes samples and a dataframe output that
130 | returns predictions. It also uses the example input that was previously pushed
131 | to the repository with skops.
132 | Calling `gr.Interface.launch() `__ will launch your application.
133 | 
134 | .. code:: python
135 | 
136 |     import gradio as gr
137 |     repo_id = "scikit-learn/tabular-playground"
138 |     gr.Interface.load(f"huggingface/{repo_id}").launch()
139 | 
140 | 
141 | You can further customize your UI, add a description, a title, and more. If you'd
142 | like to share your demo, you can set ``share`` to ``True`` in `gr.Interface.launch() `__.
143 | 
144 | .. code:: python
145 | 
146 |     title = "Supersoaker Defective Product Prediction"
147 |     description = ("This model predicts Supersoaker production line failures. "
148 |                    "Drag and drop any slice from the dataset or edit values as you wish in the "
149 |                    "dataframe component below.")
150 |     gr.Interface.load(f"huggingface/{repo_id}", title=title, description=description)
151 | 
152 | Sharing your local application this way only works for a limited time.
153 | If you want to share your application permanently, you can deploy it to
154 | Hugging Face Spaces. You can check out `this blog `__
155 | on how to do it.
156 | For more information, please refer to the documentation of `gradio `__.
157 | 
158 | It's also possible to spawn a gradio space directly from the model repository.
159 | To achieve this, from the model page, click on ``Deploy`` (top right corner) >
160 | ``Spaces`` > ``Create new Space``, then follow the instructions. After
161 | finishing, you get a gradio space hosted on Hugging Face Hub, with all the
162 | benefits that this brings.
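
Putting the pieces above together, a customized interface can be loaded and
shared in a few lines. This is only a sketch based on the snippets above;
``share=True`` creates a temporary public link as described earlier:

.. code:: python

    import gradio as gr

    repo_id = "scikit-learn/tabular-playground"
    title = "Supersoaker Defective Product Prediction"
    description = "This model predicts Supersoaker production line failures."
    # load the interface for the model hosted on the Hub and launch it with a public link
    gr.Interface.load(
        f"huggingface/{repo_id}", title=title, description=description
    ).launch(share=True)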
163 | -------------------------------------------------------------------------------- /skops/io/_persist.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib 4 | import io 5 | import json 6 | from pathlib import Path 7 | from typing import Any, BinaryIO, Sequence 8 | from zipfile import ZipFile 9 | 10 | import skops 11 | 12 | from ._audit import NODE_TYPE_MAPPING, audit_tree, get_tree 13 | from ._utils import LoadContext, SaveContext, _get_state, get_state 14 | 15 | # We load the dispatch functions from the corresponding modules and register 16 | # them. 17 | modules = ["._general", "._numpy", "._scipy", "._sklearn"] 18 | for module_name in modules: 19 | # register exposed functions for get_state and get_tree 20 | module = importlib.import_module(module_name, package="skops.io") 21 | for cls, method in getattr(module, "GET_STATE_DISPATCH_FUNCTIONS", []): 22 | _get_state.register(cls)(method) 23 | # populate the the dict used for dispatching get_tree functions 24 | NODE_TYPE_MAPPING.update(module.NODE_TYPE_MAPPING) 25 | 26 | 27 | def _save(obj: Any) -> io.BytesIO: 28 | buffer = io.BytesIO() 29 | 30 | with ZipFile(buffer, "w") as zip_file: 31 | save_context = SaveContext(zip_file=zip_file) 32 | state = get_state(obj, save_context) 33 | save_context.clear_memo() 34 | 35 | state["protocol"] = save_context.protocol 36 | state["_skops_version"] = skops.__version__ 37 | zip_file.writestr("schema.json", json.dumps(state, indent=2)) 38 | 39 | return buffer 40 | 41 | 42 | def dump(obj: Any, file: str | Path | BinaryIO) -> None: 43 | """Save an object using the skops persistence format. 44 | 45 | Skops aims at providing a secure persistence feature that does not rely on 46 | :mod:`pickle`, which is inherently insecure. For more information, please 47 | visit the :ref:`persistence` documentation. 48 | 49 | .. warning:: 50 | 51 | This feature is heavily under development, which means the API is 52 | unstable and there might be security issues at the moment. Therefore, 53 | use caution when loading files from sources you don't trust. 54 | 55 | Parameters 56 | ---------- 57 | obj: object 58 | The object to be saved. Usually a scikit-learn compatible model. 59 | 60 | file: str, path, or file-like object 61 | The file name. A zip archive will automatically created. As a matter of 62 | convention, we recommend to use the ".skops" file extension, e.g. 63 | ``save(model, "my-model.skops")``. 64 | 65 | """ 66 | buffer = _save(obj) 67 | 68 | if isinstance(file, (str, Path)): 69 | with open(file, "wb") as f: 70 | f.write(buffer.getbuffer()) 71 | else: 72 | file.write(buffer.getbuffer()) 73 | 74 | 75 | def dumps(obj: Any) -> bytes: 76 | """Save an object using the skops persistence format as a bytes object. 77 | 78 | .. warning:: 79 | 80 | This feature is heavily under development, which means the API is 81 | unstable and there might be security issues at the moment. Therefore, 82 | use caution when loading files from sources you don't trust. 83 | 84 | Parameters 85 | ---------- 86 | obj: object 87 | The object to be saved. Usually a scikit-learn compatible model. 88 | 89 | """ 90 | buffer = _save(obj) 91 | return buffer.getbuffer().tobytes() 92 | 93 | 94 | def load(file: str | Path, trusted: bool | Sequence[str] = False) -> Any: 95 | """Load an object saved with the skops persistence format. 
96 | 97 | Skops aims at providing a secure persistence feature that does not rely on 98 | :mod:`pickle`, which is inherently insecure. For more information, please 99 | visit the :ref:`persistence` documentation. 100 | 101 | .. warning:: 102 | 103 | This feature is heavily under development, which means the API is 104 | unstable and there might be security issues at the moment. Therefore, 105 | use caution when loading files from sources you don't trust. 106 | 107 | Parameters 108 | ---------- 109 | file: str or pathlib.Path 110 | The file name of the object to be loaded. 111 | 112 | trusted: bool, or list of str, default=False 113 | If ``True``, the object will be loaded without any security checks. If 114 | ``False``, the object will be loaded only if there are only trusted 115 | objects in the dumped file. If a list of strings, the object will be 116 | loaded only if there are only trusted objects and objects of types 117 | listed in ``trusted`` are in the dumped file. 118 | 119 | Returns 120 | ------- 121 | instance: object 122 | The loaded object. 123 | 124 | """ 125 | with ZipFile(file, "r") as input_zip: 126 | schema = input_zip.read("schema.json") 127 | load_context = LoadContext(src=input_zip) 128 | tree = get_tree(json.loads(schema), load_context) 129 | audit_tree(tree, trusted) 130 | instance = tree.construct() 131 | 132 | return instance 133 | 134 | 135 | def loads(data: bytes, trusted: bool | Sequence[str] = False) -> Any: 136 | """Load an object saved with the skops persistence format from a bytes 137 | object. 138 | 139 | .. warning:: 140 | 141 | This feature is heavily under development, which means the API is 142 | unstable and there might be security issues at the moment. Therefore, 143 | use caution when loading files from sources you don't trust. 144 | 145 | Parameters 146 | ---------- 147 | data: bytes 148 | The dumped data to be loaded in bytes format. 149 | 150 | trusted: bool, or list of str, default=False 151 | If ``True``, the object will be loaded without any security checks. If 152 | ``False``, the object will be loaded only if there are only trusted 153 | objects in the dumped file. If a list of strings, the object will be 154 | loaded only if there are only trusted objects and objects of types 155 | listed in ``trusted`` are in the dumped file. 156 | 157 | Returns 158 | ------- 159 | instance: object 160 | The loaded object. 161 | """ 162 | if isinstance(data, str): 163 | raise TypeError("Can't load skops format from string, pass bytes") 164 | 165 | with ZipFile(io.BytesIO(data), "r") as zip_file: 166 | schema = json.loads(zip_file.read("schema.json")) 167 | load_context = LoadContext(src=zip_file) 168 | tree = get_tree(schema, load_context) 169 | audit_tree(tree, trusted) 170 | instance = tree.construct() 171 | 172 | return instance 173 | 174 | 175 | def get_untrusted_types( 176 | *, data: bytes | None = None, file: str | Path | None = None 177 | ) -> list[str]: 178 | """Get a list of untrusted types in a skops dump. 179 | 180 | Parameters 181 | ---------- 182 | data: bytes 183 | The data to be checked, in bytes format. 184 | 185 | file: str or Path 186 | The file to be checked. 187 | 188 | Returns 189 | ------- 190 | untrusted_types: list of str 191 | The list of untrusted types in the dump. 192 | 193 | Notes 194 | ----- 195 | Only one of data or file should be passed. 
196 | """ 197 | if data and file: 198 | raise ValueError("Only one of data or file should be passed.") 199 | if not data and not file: 200 | raise ValueError("Exactly one of data or file should be passed.") 201 | 202 | content: io.BytesIO | str | Path 203 | if data: 204 | content = io.BytesIO(data) 205 | else: 206 | # mypy doesn't understand that file cannot be None here, thus ignore 207 | content = file # type: ignore 208 | 209 | with ZipFile(content, "r") as zip_file: 210 | schema = json.loads(zip_file.read("schema.json")) 211 | tree = get_tree(schema, load_context=LoadContext(src=zip_file)) 212 | untrusted_types = tree.get_unsafe_set() 213 | 214 | return sorted(untrusted_types) 215 | -------------------------------------------------------------------------------- /docs/persistence.rst: -------------------------------------------------------------------------------- 1 | .. _persistence: 2 | 3 | Secure persistence with skops 4 | ============================= 5 | 6 | .. warning:: 7 | 8 | This feature is heavily under development, which means the API is unstable 9 | and there might be security issues at the moment. Therefore, use caution 10 | when loading files from sources you don't trust. 11 | 12 | Skops offers a way to save and load sklearn models without using :mod:`pickle`. 13 | The ``pickle`` module is not secure, but with skops, you can [more] securely 14 | save and load models without using ``pickle``. 15 | 16 | ``Pickle`` is the standard serialization format for sklearn and for Python in 17 | general (``cloudpickle`` and ``joblib`` use the same format). One of the main 18 | advantages of ``pickle`` is that it can be used for almost all Python objects 19 | but this flexibility also makes it inherently insecure. This is because loading 20 | certain types of objects requires the ability to run arbitrary code, which can 21 | be misused for malicious purposes. For example, an attacker can use it to steal 22 | secrets from your machine or install a virus. As the `Python docs 23 | `__ say: 24 | 25 | .. warning:: 26 | 27 | The pickle module is not secure. Only unpickle data you trust. It is 28 | possible to construct malicious pickle data which will execute arbitrary 29 | code during unpickling. Never unpickle data that could have come from an 30 | untrusted source, or that could have been tampered with. 31 | 32 | In contrast to ``pickle``, the :func:`skops.io.dump` and :func:`skops.io.load` 33 | functions have a more limited scope, while preventing users from running 34 | arbitrary code or loading unknown and malicious objects. 35 | 36 | When loading a file, :func:`skops.io.load`/:func:`skops.io.loads` will traverse 37 | the input, check for known and unknown types, and will only construct those 38 | objects if they are trusted, either by default or by the user. 39 | 40 | .. note:: 41 | You can try out converting your existing pickle files to the skops format 42 | using this Space on Hugging Face Hub: 43 | `pickle-to-skops `__. 44 | 45 | Usage 46 | ----- 47 | 48 | The code snippet below illustrates how to use :func:`skops.io.dump` and 49 | :func:`skops.io.load`. Note that one needs `XGBoost 50 | `__ installed to run this: 51 | 52 | .. 
code:: python
53 | 
54 |     from xgboost.sklearn import XGBClassifier
55 |     from sklearn.model_selection import GridSearchCV, train_test_split
56 |     from sklearn.datasets import load_iris
57 |     from skops.io import dump, load
58 | 
59 |     X, y = load_iris(return_X_y=True)
60 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
61 |     param_grid = {"tree_method": ["exact", "approx", "hist"]}
62 |     clf = GridSearchCV(XGBClassifier(), param_grid=param_grid).fit(X_train, y_train)
63 |     print(clf.score(X_test, y_test))
64 |     # 0.9666666666666667
65 |     dump(clf, "my-model.skops")
66 |     # ...
67 |     loaded = load("my-model.skops", trusted=True)
68 |     print(loaded.score(X_test, y_test))
69 |     # 0.9666666666666667
70 | 
71 |     # in memory
72 |     from skops.io import dumps, loads
73 |     serialized = dumps(clf)
74 |     loaded = loads(serialized, trusted=True)
75 | 
76 | Note that you should only load files with ``trusted=True`` if you trust the
77 | source. Otherwise you can get a list of untrusted types present in the dump
78 | using :func:`skops.io.get_untrusted_types`:
79 | 
80 | .. code:: python
81 | 
82 |     from skops.io import get_untrusted_types
83 |     unknown_types = get_untrusted_types(file="my-model.skops")
84 |     print(unknown_types)
85 |     # ['numpy.float64', 'numpy.int64', 'sklearn.metrics._scorer._passthrough_scorer',
86 |     #  'xgboost.core.Booster', 'xgboost.sklearn.XGBClassifier']
87 | 
88 | Note that everything in the above list is safe to load. We already have many
89 | types included as trusted by default, and some of the above values might be
90 | added to that list in the future.
91 | 
92 | Once you have checked the list and validated that everything in it is safe,
93 | you can load the file with ``trusted=unknown_types``:
94 | 
95 | .. code:: python
96 | 
97 |     loaded = load("my-model.skops", trusted=unknown_types)
98 | 
99 | At the moment, we support the vast majority of sklearn estimators. This
100 | includes complex use cases such as :class:`sklearn.pipeline.Pipeline`,
101 | :class:`sklearn.model_selection.GridSearchCV`, classes using objects defined in
102 | Cython such as :class:`sklearn.tree.DecisionTreeClassifier`, and more. If you
103 | discover an sklearn estimator that does not work, please open an issue on the
104 | skops `GitHub page `__ and let us
105 | know.
106 | 
107 | At the moment, ``skops`` cannot persist arbitrary Python code. This means if
108 | you have custom functions (say, a custom function to be used with
109 | :class:`sklearn.preprocessing.FunctionTransformer`), they will not work. However,
110 | most ``numpy`` and ``scipy`` functions should work. Therefore, you can save
111 | objects having references to functions such as ``numpy.sqrt``.
112 | 
113 | Command Line Interface
114 | ######################
115 | 
116 | Skops has a command line interface to convert scikit-learn models persisted with
117 | ``pickle`` to ``skops`` files.
118 | 
119 | To convert a file from the command line, use the ``skops convert`` entrypoint.
120 | 
121 | Below is an example call to convert a file ``my_model.pkl`` to ``my_model.skops``:
122 | 
123 | .. code:: console
124 | 
125 |     skops convert my_model.pkl
126 | 
127 | To convert multiple files, you can use bash commands to iterate the above call.
128 | For example, to convert all ``.pkl`` files in the current directory:
129 | 
130 | .. code:: console
131 | 
132 |     for FILE in *.pkl; do skops convert "$FILE"; done
133 | 
134 | Further help for the different supported options can be found by calling
135 | ``skops convert --help`` in a terminal.
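
If you prefer to do the conversion from Python rather than from the shell, the
``skops convert`` command corresponds roughly to the following sketch (only
unpickle files that you trust, since this intermediate step still relies on
``pickle``):

.. code:: python

    import pickle

    from skops.io import dump

    # load the object from the existing pickle file (this is the unsafe step)
    with open("my_model.pkl", "rb") as f:
        obj = pickle.load(f)

    # re-save the object in the skops format
    dump(obj, "my_model.skops")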
136 | 
137 | 
138 | Supported libraries
139 | -------------------
140 | 
141 | Skops intends to support all of **scikit-learn**, that is, not only its
142 | estimators, but also other classes like cross validation splitters. Furthermore,
143 | most types from **numpy** and **scipy** should be supported, such as (sparse)
144 | arrays, dtypes, random generators, and ufuncs.
145 | 
146 | Apart from this core, we plan to support machine learning libraries commonly
147 | used by the community. So far, we have tested the following libraries:
148 | 
149 | - `LightGBM `_ (scikit-learn API)
150 | - `XGBoost `_ (scikit-learn API)
151 | - `CatBoost `_
152 | 
153 | If you run into a problem using any of the mentioned libraries, this could mean
154 | there is a bug in skops. Please open an issue on `our issue tracker
155 | `__ (but please check first if a
156 | corresponding issue already exists).
157 | 
158 | Note that we do not audit these libraries for security issues.
159 | Therefore, you should only load a skops file containing a model of any of those
160 | libraries if you trust them to be secure. It's not a perfect solution, but it's
161 | still better than trusting pickle files, which anyone can tamper with easily.
162 | 
163 | Roadmap
164 | -------
165 | There needs to be more testing to harden the loader and make sure we don't run
166 | arbitrary code when it's not intended. However, the safety mechanisms already
167 | in place should prevent most cases of abuse.
168 | 
169 | At the moment, persisting and loading arbitrary C extension types is not
170 | possible, unless a Python object wraps around them and handles persistence and
171 | loading via ``__getstate__`` and ``__setstate__``. We plan to develop an API
172 | which would help third-party libraries to make their C extension types
173 | ``skops`` compatible.
174 | 
175 | You can check on our `issue tracker
176 | `__ which features are
177 | planned for the near future.
178 | -------------------------------------------------------------------------------- /skops/card/_templates.py: --------------------------------------------------------------------------------
1 | """Templates for model cards
2 | 
3 | To add a new template, define it as a dictionary where the key is the section
4 | and the value is the content of the section. If the content is empty but should
5 | be filled by the user, set it to be the ``CONTENT_PLACEHOLDER``.
6 | 
7 | After defining the template itself, add it as another enum value in the
8 | ``Templates`` enum.
9 | 
10 | Finally, if there is a corresponding section in the new template, some methods
11 | on the ``Card`` class should be adjusted to make use of the template. First of
12 | all, ``_fill_default_sections`` should be used to populate the model card with
13 | the template.
14 | 
15 | Furthermore, some specific methods might require adjusting. For example, the
16 | ``Card._add_hyperparams`` method will add a table of model hyperparameters, but
17 | it needs to know in what section to put them. So if the template contains a
18 | corresponding section, modify the method to put the hyperparameters inside that
19 | section.
20 | 21 | """ 22 | 23 | from enum import Enum 24 | 25 | 26 | class Templates(Enum): 27 | skops = "skops" 28 | 29 | 30 | CONTENT_PLACEHOLDER = "[More Information Needed]" 31 | """When there is a section but content has yet to be added by the user, show 32 | this""" 33 | 34 | # fmt: off 35 | SKOPS_TEMPLATE = { 36 | "Model description": CONTENT_PLACEHOLDER, 37 | "Model description/Intended uses & limitations": CONTENT_PLACEHOLDER, 38 | "Model description/Training Procedure": "", 39 | "Model description/Training Procedure/Hyperparameters": CONTENT_PLACEHOLDER, 40 | "Model description/Training Procedure/Model Plot": "The model plot is below.", 41 | "Model description/Evaluation Results": CONTENT_PLACEHOLDER, 42 | "How to Get Started with the Model": CONTENT_PLACEHOLDER, 43 | "Model Card Authors": ( 44 | f"This model card is written by following authors:\n\n{CONTENT_PLACEHOLDER}" 45 | ), 46 | "Model Card Contact": ( 47 | "You can contact the model card authors through following channels:\n" 48 | f"{CONTENT_PLACEHOLDER}" 49 | ), 50 | "Citation": ( 51 | "Below you can find information related to citation.\n\n**BibTeX:**\n```\n" 52 | f"{CONTENT_PLACEHOLDER}\n```" 53 | ), 54 | } 55 | 56 | # The template below corresponds to the HF Hub default template, but is geared 57 | # towards deep learning models, especially language models, and thus is not a 58 | # good fit for most sklearn models. 59 | _HUB_TEMPLATE = { 60 | "Model Card": "", 61 | # Provide a quick summary of what the model is/does. 62 | "Model Details": "", 63 | "Model Details/Model Description": "", 64 | # Provide a longer summary of what this model is. 65 | "Model Details/Model Description/Developed by": CONTENT_PLACEHOLDER, 66 | "Model Details/Model Description/Shared by [optional]": CONTENT_PLACEHOLDER, 67 | "Model Details/Model Description/Model type": CONTENT_PLACEHOLDER, 68 | "Model Details/Model Description/Language(s) (NLP)": CONTENT_PLACEHOLDER, 69 | "Model Details/Model Description/License": CONTENT_PLACEHOLDER, 70 | "Model Details/Model Description/Finetuned from model [optional]": CONTENT_PLACEHOLDER, 71 | "Model Details/Model Description/Resources for more information": CONTENT_PLACEHOLDER, 72 | 73 | "Uses": "", 74 | # Address questions around how the model is intended to be used, including 75 | # the foreseeable users of the model and those affected by the model. 76 | "Uses/Direct Use": CONTENT_PLACEHOLDER, 77 | # This section is for the model use without fine-tuning or plugging into a 78 | # larger ecosystem/app. 79 | "Uses/Downstream Use [optional]": CONTENT_PLACEHOLDER, 80 | # This section is for the model use when fine-tuned for a task, or when 81 | # plugged into a larger ecosystem/app. 82 | "Uses/Out-of-Scope Use": CONTENT_PLACEHOLDER, 83 | # This section addresses misuse, malicious use, and uses that the model will 84 | # not work well for. 85 | 86 | "Bias, Risks, and Limitations": CONTENT_PLACEHOLDER, 87 | # This section is meant to convey both technical and sociotechnical 88 | # limitations. 89 | "Bias, Risks, and Limitations/Recommendations": ( 90 | "Users (both direct and downstream) should be made aware of the risks, biases " 91 | "and limitations of the model. More information needed for further " 92 | "recommendations." 93 | ), 94 | # This section is meant to convey recommendations with respect to the bias, 95 | # risk, and technical limitations. 
96 | 97 | "Training Details": "", 98 | "Training Details/Training Data": CONTENT_PLACEHOLDER, 99 | # This should link to a Data Card, perhaps with a short stub of information 100 | # on what the training data is all about as well as documentation related to 101 | # data pre-processing or additional filtering. 102 | "Training Details/Training Procedure [optional]": "", 103 | # This relates heavily to the Technical Specifications. Content here should 104 | # link to that section when it is relevant to the training procedure. 105 | "Training Details/Training Procedure [optional]/Preprocessing": CONTENT_PLACEHOLDER, 106 | "Training Details/Training Procedure [optional]/Speeds, Sizes, Times": CONTENT_PLACEHOLDER, 107 | # This section provides information about throughput, start/end time, 108 | # checkpoint size if relevant, etc. 109 | 110 | "Evaluation": "", 111 | # This section describes the evaluation protocols and provides the results. 112 | "Evaluation/Testing Data, Factors & Metrics": "", 113 | "Evaluation/Testing Data, Factors & Metrics/Testing Data": CONTENT_PLACEHOLDER, 114 | # This should link to a Data Card if possible 115 | "Evaluation/Testing Data, Factors & Metrics/Factors": CONTENT_PLACEHOLDER, 116 | # These are the things the evaluation is disaggregating by, e.g., 117 | # subpopulations or domains. 118 | "Evaluation/Testing Data, Factors & Metrics/Metrics": CONTENT_PLACEHOLDER, 119 | # These are the evaluation metrics being used, ideally with a description of 120 | # why. 121 | "Evaluation/Results": CONTENT_PLACEHOLDER, 122 | 123 | "Model Examination [optional]": CONTENT_PLACEHOLDER, 124 | # Relevant interpretability work for the model goes here. 125 | 126 | "Environmental Impact": ( 127 | "Carbon emissions can be estimated using the " 128 | "[Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) " 129 | "presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700)." 130 | ), 131 | # Total emissions (in grams of CO2eq) and additional considerations, such as 132 | # electricity usage, go here. Edit the suggested text below accordingly" 133 | "Environmental Impact/Hardware Type": CONTENT_PLACEHOLDER, 134 | "Environmental Impact/Hours used": CONTENT_PLACEHOLDER, 135 | "Environmental Impact/Cloud Provider": CONTENT_PLACEHOLDER, 136 | "Environmental Impact/Compute Region": CONTENT_PLACEHOLDER, 137 | "Environmental Impact/Carbon Emitted": CONTENT_PLACEHOLDER, 138 | 139 | "Technical Specifications [optional]": "", 140 | "Technical Specifications [optional]/Model Architecture and Objective": CONTENT_PLACEHOLDER, 141 | "Technical Specifications [optional]/Compute Infrastructure": CONTENT_PLACEHOLDER, 142 | "Technical Specifications [optional]/Compute Infrastructure/Hardware": CONTENT_PLACEHOLDER, 143 | "Technical Specifications [optional]/Compute Infrastructure/Software": CONTENT_PLACEHOLDER, 144 | 145 | "Citation [optional]": "", 146 | # If there is a paper or blog post introducing the model, the APA and Bibtex 147 | # information for that should go in this section. 148 | "Citation [optional]/BibTeX": CONTENT_PLACEHOLDER, 149 | "Citation [optional]/APA": CONTENT_PLACEHOLDER, 150 | 151 | "Glossary [optional]": "", 152 | # If relevant, include terms and calculations in this section that can help 153 | # readers understand the model or model card. 
154 | 155 | "More Information [optional]": CONTENT_PLACEHOLDER, 156 | "Model Card Authors [optional]": CONTENT_PLACEHOLDER, 157 | "Model Card Contact": CONTENT_PLACEHOLDER, 158 | "How to Get Started with the Model": f"""Use the code below to get started with the model. 159 | 160 |
161 | Click to expand 162 | 163 | {CONTENT_PLACEHOLDER} 164 | 165 |
""", 166 | } 167 | # fmt: on 168 | -------------------------------------------------------------------------------- /skops/card/tests/examples/clip-vit-large-patch14.md: -------------------------------------------------------------------------------- 1 | --- 2 | tags: 3 | - vision 4 | widget: 5 | - src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/cat-dog-music.png 6 | candidate_labels: playing music, playing sports 7 | example_title: Cat & Dog 8 | --- 9 | 10 | # Model Card: CLIP 11 | 12 | 13 | 14 | Disclaimer: The model card is taken and modified from the official CLIP repository, it can be found [here](https://github.com/openai/CLIP/blob/main/model-card.md). 15 | 16 | ## Model Details 17 | 18 | The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner. It was not developed for general model deployment - to deploy models like CLIP, researchers will first need to carefully study their capabilities in relation to the specific context they’re being deployed within. 19 | 20 | ### Model Date 21 | 22 | January 2021 23 | 24 | ### Model Type 25 | 26 | The base model uses a ViT-L/14 Transformer architecture as an image encoder and uses a masked self-attention Transformer as a text encoder. These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss. 27 | 28 | The original implementation had two variants: one using a ResNet image encoder and the other using a Vision Transformer. This repository has the variant with the Vision Transformer. 29 | 30 | 31 | ### Documents 32 | 33 | - [Blog Post](https://openai.com/blog/clip/) 34 | - [CLIP Paper](https://arxiv.org/abs/2103.00020) 35 | 36 | 37 | ### Use with Transformers 38 | 39 | ```python 40 | from PIL import Image 41 | import requests 42 | 43 | from transformers import CLIPProcessor, CLIPModel 44 | 45 | model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14") 46 | processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14") 47 | 48 | url = "http://images.cocodataset.org/val2017/000000039769.jpg" 49 | image = Image.open(requests.get(url, stream=True).raw) 50 | 51 | inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) 52 | 53 | outputs = model(**inputs) 54 | logits_per_image = outputs.logits_per_image # this is the image-text similarity score 55 | probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities 56 | ``` 57 | 58 | 59 | ## Model Use 60 | 61 | ### Intended Use 62 | 63 | The model is intended as a research output for research communities. We hope that this model will enable researchers to better understand and explore zero-shot, arbitrary image classification. We also hope it can be used for interdisciplinary studies of the potential impact of such models - the CLIP paper includes a discussion of potential downstream impacts to provide an example for this sort of analysis. 64 | 65 | #### Primary intended uses 66 | 67 | The primary intended users of these models are AI researchers. 68 | 69 | We primarily imagine the model will be used by researchers to better understand robustness, generalization, and other capabilities, biases, and constraints of computer vision models. 
70 | 71 | ### Out-of-Scope Use Cases 72 | 73 | **Any** deployed use case of the model - whether commercial or not - is currently out of scope. Non-deployed use cases such as image search in a constrained environment, are also not recommended unless there is thorough in-domain testing of the model with a specific, fixed class taxonomy. This is because our safety assessment demonstrated a high need for task specific testing especially given the variability of CLIP’s performance with different class taxonomies. This makes untested and unconstrained deployment of the model in any use case currently potentially harmful. 74 | 75 | Certain use cases which would fall under the domain of surveillance and facial recognition are always out-of-scope regardless of performance of the model. This is because the use of artificial intelligence for tasks such as these can be premature currently given the lack of testing norms and checks to ensure its fair use. 76 | 77 | Since the model has not been purposefully trained in or evaluated on any languages other than English, its use should be limited to English language use cases. 78 | 79 | 80 | 81 | ## Data 82 | 83 | The model was trained on publicly available image-caption data. This was done through a combination of crawling a handful of websites and using commonly-used pre-existing image datasets such as [YFCC100M](http://projects.dfki.uni-kl.de/yfcc100m/). A large portion of the data comes from our crawling of the internet. This means that the data is more representative of people and societies most connected to the internet which tend to skew towards more developed nations, and younger, male users. 84 | 85 | ### Data Mission Statement 86 | 87 | Our goal with building this dataset was to test out robustness and generalizability in computer vision tasks. As a result, the focus was on gathering large quantities of data from different publicly-available internet data sources. The data was gathered in a mostly non-interventionist manner. However, we only crawled websites that had policies against excessively violent and adult images and allowed us to filter out such content. We do not intend for this dataset to be used as the basis for any commercial or deployed model and will not be releasing the dataset. 88 | 89 | 90 | 91 | ## Performance and Limitations 92 | 93 | ### Performance 94 | 95 | We have evaluated the performance of CLIP on a wide range of benchmarks across a variety of computer vision datasets such as OCR to texture recognition to fine-grained classification. The paper describes model performance on the following datasets: 96 | 97 | - Food101 98 | - CIFAR10 99 | - CIFAR100 100 | - Birdsnap 101 | - SUN397 102 | - Stanford Cars 103 | - FGVC Aircraft 104 | - VOC2007 105 | - DTD 106 | - Oxford-IIIT Pet dataset 107 | - Caltech101 108 | - Flowers102 109 | - MNIST 110 | - SVHN 111 | - IIIT5K 112 | - Hateful Memes 113 | - SST-2 114 | - UCF101 115 | - Kinetics700 116 | - Country211 117 | - CLEVR Counting 118 | - KITTI Distance 119 | - STL-10 120 | - RareAct 121 | - Flickr30 122 | - MSCOCO 123 | - ImageNet 124 | - ImageNet-A 125 | - ImageNet-R 126 | - ImageNet Sketch 127 | - ObjectNet (ImageNet Overlap) 128 | - Youtube-BB 129 | - ImageNet-Vid 130 | 131 | ## Limitations 132 | 133 | CLIP and our analysis of it have a number of limitations. CLIP currently struggles with respect to certain tasks such as fine grained classification and counting objects. 
CLIP also poses issues with regards to fairness and bias which we discuss in the paper and briefly in the next section. Additionally, our approach to testing CLIP also has an important limitation- in many cases we have used linear probes to evaluate the performance of CLIP and there is evidence suggesting that linear probes can underestimate model performance. 134 | 135 | ### Bias and Fairness 136 | 137 | We find that the performance of CLIP - and the specific biases it exhibits - can depend significantly on class design and the choices one makes for categories to include and exclude. We tested the risk of certain kinds of denigration with CLIP by classifying images of people from [Fairface](https://arxiv.org/abs/1908.04913) into crime-related and non-human animal categories. We found significant disparities with respect to race and gender. Additionally, we found that these disparities could shift based on how the classes were constructed. (Details captured in the Broader Impacts Section in the paper). 138 | 139 | We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. 140 | 141 | 142 | 143 | ## Feedback 144 | 145 | ### Where to send questions or comments about the model 146 | 147 | Please use [this Google Form](https://forms.gle/Uv7afRH5dvY34ZEs9) 148 | -------------------------------------------------------------------------------- /skops/card/tests/examples/gpt2.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: en 3 | tags: 4 | - exbert 5 | 6 | license: mit 7 | --- 8 | 9 | # GPT-2 10 | 11 | 12 | 13 | Test the whole generation capabilities here: https://transformer.huggingface.co/doc/gpt2-large 14 | 15 | Pretrained model on English language using a causal language modeling (CLM) objective. It was introduced in 16 | [this paper](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) 17 | and first released at [this page](https://openai.com/blog/better-language-models/). 18 | 19 | Disclaimer: The team releasing GPT-2 also wrote a 20 | [model card](https://github.com/openai/gpt-2/blob/master/model_card.md) for their model. Content from this model card 21 | has been written by the Hugging Face team to complete the information they provided and give specific examples of bias. 22 | 23 | ## Model description 24 | 25 | GPT-2 is a transformers model pretrained on a very large corpus of English data in a self-supervised fashion. This 26 | means it was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots 27 | of publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, 28 | it was trained to guess the next word in sentences. 
29 | 30 | More precisely, inputs are sequences of continuous text of a certain length and the targets are the same sequence, 31 | shifted one token (word or piece of word) to the right. The model uses internally a mask-mechanism to make sure the 32 | predictions for the token `i` only uses the inputs from `1` to `i` but not the future tokens. 33 | 34 | This way, the model learns an inner representation of the English language that can then be used to extract features 35 | useful for downstream tasks. The model is best at what it was pretrained for however, which is generating texts from a 36 | prompt. 37 | 38 | This is the **smallest** version of GPT-2, with 124M parameters. 39 | 40 | **Related Models:** [GPT-Large](https://huggingface.co/gpt2-large), [GPT-Medium](https://huggingface.co/gpt2-medium) and [GPT-XL](https://huggingface.co/gpt2-xl) 41 | 42 | ## Intended uses & limitations 43 | 44 | You can use the raw model for text generation or fine-tune it to a downstream task. See the 45 | [model hub](https://huggingface.co/models?filter=gpt2) to look for fine-tuned versions on a task that interests you. 46 | 47 | ### How to use 48 | 49 | You can use this model directly with a pipeline for text generation. Since the generation relies on some randomness, we 50 | set a seed for reproducibility: 51 | 52 | ```python 53 | >>> from transformers import pipeline, set_seed 54 | >>> generator = pipeline('text-generation', model='gpt2') 55 | >>> set_seed(42) 56 | >>> generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5) 57 | 58 | [{'generated_text': "Hello, I'm a language model, a language for thinking, a language for expressing thoughts."}, 59 | {'generated_text': "Hello, I'm a language model, a compiler, a compiler library, I just want to know how I build this kind of stuff. I don"}, 60 | {'generated_text': "Hello, I'm a language model, and also have more than a few of your own, but I understand that they're going to need some help"}, 61 | {'generated_text': "Hello, I'm a language model, a system model. I want to know my language so that it might be more interesting, more user-friendly"}, 62 | {'generated_text': 'Hello, I\'m a language model, not a language model"\n\nThe concept of "no-tricks" comes in handy later with new'}] 63 | ``` 64 | 65 | Here is how to use this model to get the features of a given text in PyTorch: 66 | 67 | ```python 68 | from transformers import GPT2Tokenizer, GPT2Model 69 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 70 | model = GPT2Model.from_pretrained('gpt2') 71 | text = "Replace me by any text you'd like." 72 | encoded_input = tokenizer(text, return_tensors='pt') 73 | output = model(**encoded_input) 74 | ``` 75 | 76 | and in TensorFlow: 77 | 78 | ```python 79 | from transformers import GPT2Tokenizer, TFGPT2Model 80 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 81 | model = TFGPT2Model.from_pretrained('gpt2') 82 | text = "Replace me by any text you'd like." 83 | encoded_input = tokenizer(text, return_tensors='tf') 84 | output = model(encoded_input) 85 | ``` 86 | 87 | ### Limitations and bias 88 | 89 | The training data used for this model has not been released as a dataset one can browse. We know it contains a lot of 90 | unfiltered content from the internet, which is far from neutral. 
As the openAI team themselves point out in their 91 | [model card](https://github.com/openai/gpt-2/blob/master/model_card.md#out-of-scope-use-cases): 92 | 93 | > Because large-scale language models like GPT-2 do not distinguish fact from fiction, we don’t support use-cases 94 | > that require the generated text to be true. 95 | > 96 | > Additionally, language models like GPT-2 reflect the biases inherent to the systems they were trained on, so we do 97 | > not recommend that they be deployed into systems that interact with humans > unless the deployers first carry out a 98 | > study of biases relevant to the intended use-case. We found no statistically significant difference in gender, race, 99 | > and religious bias probes between 774M and 1.5B, implying all versions of GPT-2 should be approached with similar 100 | > levels of caution around use cases that are sensitive to biases around human attributes. 101 | 102 | Here's an example of how the model can have biased predictions: 103 | 104 | ```python 105 | >>> from transformers import pipeline, set_seed 106 | >>> generator = pipeline('text-generation', model='gpt2') 107 | >>> set_seed(42) 108 | >>> generator("The White man worked as a", max_length=10, num_return_sequences=5) 109 | 110 | [{'generated_text': 'The White man worked as a mannequin for'}, 111 | {'generated_text': 'The White man worked as a maniser of the'}, 112 | {'generated_text': 'The White man worked as a bus conductor by day'}, 113 | {'generated_text': 'The White man worked as a plumber at the'}, 114 | {'generated_text': 'The White man worked as a journalist. He had'}] 115 | 116 | >>> set_seed(42) 117 | >>> generator("The Black man worked as a", max_length=10, num_return_sequences=5) 118 | 119 | [{'generated_text': 'The Black man worked as a man at a restaurant'}, 120 | {'generated_text': 'The Black man worked as a car salesman in a'}, 121 | {'generated_text': 'The Black man worked as a police sergeant at the'}, 122 | {'generated_text': 'The Black man worked as a man-eating monster'}, 123 | {'generated_text': 'The Black man worked as a slave, and was'}] 124 | ``` 125 | 126 | This bias will also affect all fine-tuned versions of this model. 127 | 128 | ## Training data 129 | 130 | The OpenAI team wanted to train this model on a corpus as large as possible. To build it, they scraped all the web 131 | pages from outbound links on Reddit which received at least 3 karma. Note that all Wikipedia pages were removed from 132 | this dataset, so the model was not trained on any part of Wikipedia. The resulting dataset (called WebText) weights 133 | 40GB of texts but has not been publicly released. You can find a list of the top 1,000 domains present in WebText 134 | [here](https://github.com/openai/gpt-2/blob/master/domains.txt). 135 | 136 | ## Training procedure 137 | 138 | ### Preprocessing 139 | 140 | The texts are tokenized using a byte-level version of Byte Pair Encoding (BPE) (for unicode characters) and a 141 | vocabulary size of 50,257. The inputs are sequences of 1024 consecutive tokens. 142 | 143 | The larger model was trained on 256 cloud TPU v3 cores. The training duration was not disclosed, nor were the exact 144 | details of training. 
145 | 146 | ## Evaluation results 147 | 148 | The model achieves the following results without any fine-tuning (zero-shot): 149 | 150 | | Dataset | LAMBADA | LAMBADA | CBT-CN | CBT-NE | WikiText2 | PTB | enwiki8 | text8 | WikiText103 | 1BW | 151 | |:--------:|:-------:|:-------:|:------:|:------:|:---------:|:------:|:-------:|:------:|:-----------:|:-----:| 152 | | (metric) | (PPL) | (ACC) | (ACC) | (ACC) | (PPL) | (PPL) | (BPB) | (BPC) | (PPL) | (PPL) | 153 | | | 35.13 | 45.99 | 87.65 | 83.4 | 29.41 | 65.85 | 1.16 | 1,17 | 37.50 | 75.20 | 154 | 155 | 156 | ### BibTeX entry and citation info 157 | 158 | ```bibtex 159 | @article{radford2019language, 160 | title={Language Models are Unsupervised Multitask Learners}, 161 | author={Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya}, 162 | year={2019} 163 | } 164 | ``` 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /skops/io/_sklearn.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Callable, Sequence, Type 4 | 5 | from sklearn.cluster import Birch 6 | 7 | try: 8 | # TODO: remove once support for sklearn<1.2 is dropped. See #187 9 | from sklearn.covariance._graph_lasso import _DictWithDeprecatedKeys 10 | except ImportError: 11 | _DictWithDeprecatedKeys = None 12 | from sklearn.linear_model._sgd_fast import ( 13 | EpsilonInsensitive, 14 | Hinge, 15 | Huber, 16 | Log, 17 | LossFunction, 18 | ModifiedHuber, 19 | SquaredEpsilonInsensitive, 20 | SquaredHinge, 21 | SquaredLoss, 22 | ) 23 | from sklearn.tree._tree import Tree 24 | 25 | from ._audit import Node, get_tree 26 | from ._general import unsupported_get_state 27 | from ._utils import LoadContext, SaveContext, get_module, get_state, gettype 28 | from .exceptions import UnsupportedTypeException 29 | 30 | ALLOWED_SGD_LOSSES = { 31 | ModifiedHuber, 32 | Hinge, 33 | SquaredHinge, 34 | Log, 35 | SquaredLoss, 36 | Huber, 37 | EpsilonInsensitive, 38 | SquaredEpsilonInsensitive, 39 | } 40 | 41 | UNSUPPORTED_TYPES = {Birch} 42 | 43 | 44 | def reduce_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 45 | # This method is for objects for which we have to use the __reduce__ 46 | # method to get the state. 47 | res = { 48 | "__class__": obj.__class__.__name__, 49 | "__module__": get_module(type(obj)), 50 | } 51 | 52 | # We get the output of __reduce__ and use it to reconstruct the object. 53 | # For security reasons, we don't save the constructor object returned by 54 | # __reduce__, and instead use the pre-defined constructor for the object 55 | # that we know. This avoids having a function such as `eval()` as the 56 | # "constructor", abused by attackers. 57 | # 58 | # We can/should also look into removing __reduce__ from scikit-learn, 59 | # and that is not impossible. Most objects which use this don't really 60 | # need it. 61 | # 62 | # More info on __reduce__: 63 | # https://docs.python.org/3/library/pickle.html#object.__reduce__ 64 | # 65 | # As a good example, this makes Tree object to be serializable. 66 | reduce = obj.__reduce__() 67 | res["__reduce__"] = {} 68 | res["__reduce__"]["args"] = get_state(reduce[1], save_context) 69 | 70 | if len(reduce) == 3: 71 | # reduce includes what's needed for __getstate__ and we don't need to 72 | # call __getstate__ directly. 
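# (Note: per the pickle protocol, __reduce__ returns either a 2-tuple of
# (constructor, args) or a longer tuple whose third element is the state that
# __getstate__/__setstate__ would otherwise handle, which is why reduce[2] is
# used directly on the next line.)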
73 | attrs = reduce[2] 74 | elif hasattr(obj, "__getstate__"): 75 | # since python311 __getstate__ is defined for `object` and might return 76 | # None 77 | attrs = obj.__getstate__() or {} 78 | elif hasattr(obj, "__dict__"): 79 | attrs = obj.__dict__ 80 | else: 81 | attrs = {} 82 | 83 | if not isinstance(attrs, (dict, tuple)): 84 | raise UnsupportedTypeException( 85 | f"Objects of type {res['__class__']} not supported yet" 86 | ) 87 | 88 | res["content"] = get_state(attrs, save_context) 89 | return res 90 | 91 | 92 | class ReduceNode(Node): 93 | def __init__( 94 | self, 95 | state: dict[str, Any], 96 | load_context: LoadContext, 97 | constructor: Type[Any] | Callable[..., Any], 98 | trusted: bool | Sequence[str] = False, 99 | ) -> None: 100 | super().__init__(state, load_context, trusted) 101 | reduce = state["__reduce__"] 102 | self.children = { 103 | "attrs": get_tree(state["content"], load_context), 104 | "args": get_tree(reduce["args"], load_context), 105 | "constructor": constructor, 106 | } 107 | 108 | def _construct(self): 109 | args = self.children["args"].construct() 110 | constructor = self.children["constructor"] 111 | instance = constructor(*args) 112 | attrs = self.children["attrs"].construct() 113 | if not attrs: 114 | # nothing more to do 115 | return instance 116 | 117 | if isinstance(args, tuple) and not hasattr(instance, "__setstate__"): 118 | raise UnsupportedTypeException( 119 | f"Objects of type {constructor} are not supported yet" 120 | ) 121 | 122 | if hasattr(instance, "__setstate__"): 123 | instance.__setstate__(attrs) 124 | elif isinstance(attrs, dict): 125 | instance.__dict__.update(attrs) 126 | else: 127 | # we (probably) got tuple attrs but cannot setstate with them 128 | raise UnsupportedTypeException( 129 | f"Objects of type {constructor} are not supported yet" 130 | ) 131 | 132 | return instance 133 | 134 | 135 | def tree_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 136 | state = reduce_get_state(obj, save_context) 137 | state["__loader__"] = "TreeNode" 138 | return state 139 | 140 | 141 | class TreeNode(ReduceNode): 142 | def __init__( 143 | self, 144 | state: dict[str, Any], 145 | load_context: LoadContext, 146 | trusted: bool | Sequence[str] = False, 147 | ) -> None: 148 | super().__init__(state, load_context, constructor=Tree, trusted=trusted) 149 | self.trusted = self._get_trusted(trusted, [get_module(Tree) + ".Tree"]) 150 | 151 | 152 | def sgd_loss_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 153 | state = reduce_get_state(obj, save_context) 154 | state["__loader__"] = "SGDNode" 155 | return state 156 | 157 | 158 | class SGDNode(ReduceNode): 159 | def __init__( 160 | self, 161 | state: dict[str, Any], 162 | load_context: LoadContext, 163 | trusted: bool | Sequence[str] = False, 164 | ) -> None: 165 | # TODO: make sure trusted here makes sense and used. 166 | super().__init__( 167 | state, 168 | load_context, 169 | constructor=gettype(state["__module__"], state["__class__"]), 170 | trusted=False, 171 | ) 172 | self.trusted = self._get_trusted( 173 | trusted, [get_module(x) + "." + x.__name__ for x in ALLOWED_SGD_LOSSES] 174 | ) 175 | 176 | 177 | # TODO: remove once support for sklearn<1.2 is dropped. 
178 | def _DictWithDeprecatedKeys_get_state( 179 | obj: Any, save_context: SaveContext 180 | ) -> dict[str, Any]: 181 | res = { 182 | "__class__": obj.__class__.__name__, 183 | "__module__": get_module(type(obj)), 184 | "__loader__": "_DictWithDeprecatedKeysNode", 185 | } 186 | content = {} 187 | # explicitly pass a dict object instead of _DictWithDeprecatedKeys and 188 | # later construct a _DictWithDeprecatedKeys object. 189 | content["main"] = get_state(dict(obj), save_context) 190 | content["_deprecated_key_to_new_key"] = get_state( 191 | obj._deprecated_key_to_new_key, save_context 192 | ) 193 | res["content"] = content 194 | return res 195 | 196 | 197 | # TODO: remove once support for sklearn<1.2 is dropped. 198 | class _DictWithDeprecatedKeysNode(Node): 199 | # _DictWithDeprecatedKeys is just a wrapper for dict 200 | def __init__( 201 | self, 202 | state: dict[str, Any], 203 | load_context: LoadContext, 204 | trusted: bool | Sequence[str] = False, 205 | ) -> None: 206 | super().__init__(state, load_context, trusted) 207 | self.trusted = [ 208 | get_module(_DictWithDeprecatedKeysNode) + "._DictWithDeprecatedKeys" 209 | ] 210 | self.children = { 211 | "main": get_tree(state["content"]["main"], load_context), 212 | "_deprecated_key_to_new_key": get_tree( 213 | state["content"]["_deprecated_key_to_new_key"], load_context 214 | ), 215 | } 216 | 217 | def _construct(self): 218 | instance = _DictWithDeprecatedKeys(**self.children["main"].construct()) 219 | instance._deprecated_key_to_new_key = self.children[ 220 | "_deprecated_key_to_new_key" 221 | ].construct() 222 | return instance 223 | 224 | 225 | # tuples of type and function that gets the state of that type 226 | GET_STATE_DISPATCH_FUNCTIONS = [ 227 | (LossFunction, sgd_loss_get_state), 228 | (Tree, tree_get_state), 229 | ] 230 | for type_ in UNSUPPORTED_TYPES: 231 | GET_STATE_DISPATCH_FUNCTIONS.append((type_, unsupported_get_state)) 232 | 233 | # tuples of type and function that creates the instance of that type 234 | NODE_TYPE_MAPPING = { 235 | "SGDNode": SGDNode, 236 | "TreeNode": TreeNode, 237 | } 238 | 239 | # TODO: remove once support for sklearn<1.2 is dropped. 240 | # Starting from sklearn 1.2, _DictWithDeprecatedKeys is removed as it's no 241 | # longer needed for GraphicalLassoCV, see #187. 242 | if _DictWithDeprecatedKeys is not None: 243 | GET_STATE_DISPATCH_FUNCTIONS.append( 244 | (_DictWithDeprecatedKeys, _DictWithDeprecatedKeys_get_state) 245 | ) 246 | NODE_TYPE_MAPPING[ 247 | "_DictWithDeprecatedKeysNode" 248 | ] = _DictWithDeprecatedKeysNode # type: ignore 249 | -------------------------------------------------------------------------------- /skops/io/_numpy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import io 4 | from typing import Any, Sequence 5 | 6 | import numpy as np 7 | 8 | from ._audit import Node, get_tree 9 | from ._utils import LoadContext, SaveContext, get_module, get_state, gettype 10 | from .exceptions import UnsupportedTypeException 11 | 12 | 13 | def ndarray_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 14 | res = { 15 | "__class__": obj.__class__.__name__, 16 | "__module__": get_module(type(obj)), 17 | "__loader__": "NdArrayNode", 18 | } 19 | 20 | try: 21 | # If the dtype is object, np.save should not work with 22 | # allow_pickle=False, therefore we convert them to a list and 23 | # recursively call get_state on it. 
24 | if obj.dtype == object: 25 | obj_serialized = get_state(obj.tolist(), save_context) 26 | res["content"] = obj_serialized["content"] 27 | res["type"] = "json" 28 | res["shape"] = get_state(obj.shape, save_context) 29 | else: 30 | data_buffer = io.BytesIO() 31 | np.save(data_buffer, obj, allow_pickle=False) 32 | # Memoize the object and then check if it's file name (containing 33 | # the object id) already exists. If it does, there is no need to 34 | # save the object again. Memoizitation is necessary since for 35 | # ephemeral objects, the same id might otherwise be reused. 36 | obj_id = save_context.memoize(obj) 37 | f_name = f"{obj_id}.npy" 38 | if f_name not in save_context.zip_file.namelist(): 39 | save_context.zip_file.writestr(f_name, data_buffer.getbuffer()) 40 | res.update(type="numpy", file=f_name) 41 | except ValueError: 42 | # Couldn't save the numpy array with either method 43 | raise UnsupportedTypeException( 44 | f"numpy arrays of dtype {obj.dtype} are not supported yet, please " 45 | "open an issue at https://github.com/skops-dev/skops/issues and " 46 | "report your error" 47 | ) 48 | 49 | return res 50 | 51 | 52 | class NdArrayNode(Node): 53 | def __init__( 54 | self, 55 | state: dict[str, Any], 56 | load_context: LoadContext, 57 | trusted: bool | Sequence[str] = False, 58 | ) -> None: 59 | super().__init__(state, load_context, trusted) 60 | self.type = state["type"] 61 | self.trusted = self._get_trusted(trusted, [np.ndarray]) 62 | if self.type == "numpy": 63 | self.children = { 64 | "content": io.BytesIO(load_context.src.read(state["file"])) 65 | } 66 | elif self.type == "json": 67 | self.children = { 68 | "content": [ # type: ignore 69 | get_tree(o, load_context) for o in state["content"] # type: ignore 70 | ], 71 | "shape": get_tree(state["shape"], load_context), 72 | } 73 | else: 74 | raise ValueError(f"Unknown type {self.type}.") 75 | 76 | def _construct(self): 77 | # Dealing with a regular numpy array, where dtype != object 78 | if self.type == "numpy": 79 | content = np.load(self.children["content"], allow_pickle=False) 80 | if f"{self.module_name}.{self.class_name}" != "numpy.ndarray": 81 | content = gettype(self.module_name, self.class_name)(content) 82 | return content 83 | 84 | if self.type == "json": 85 | # We explicitly set the dtype to "O" since we only save object 86 | # arrays in json. 87 | shape = self.children["shape"].construct() 88 | tmp = [o.construct() for o in self.children["content"]] 89 | 90 | # TODO: this is a hack to get the correct shape of the array. We 91 | # should find _a better way_ to do this. 
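# For 1-d arrays we pre-allocate an empty object array and fill it element by
# element; passing the list straight to np.array(..., dtype="O") could
# otherwise stack equal-length element sequences into an extra dimension
# instead of keeping them as individual objects.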
92 | if len(shape) == 1: 93 | content = np.ndarray(shape=len(tmp), dtype="O") 94 | for i, v in enumerate(tmp): 95 | content[i] = v 96 | else: 97 | content = np.array(tmp, dtype="O") 98 | 99 | return content 100 | 101 | raise ValueError(f"Unknown type for a numpy object: {self.type}.") 102 | 103 | 104 | def maskedarray_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 105 | res = { 106 | "__class__": obj.__class__.__name__, 107 | "__module__": get_module(type(obj)), 108 | "__loader__": "MaskedArrayNode", 109 | "content": { 110 | "data": get_state(obj.data, save_context), 111 | "mask": get_state(obj.mask, save_context), 112 | }, 113 | } 114 | return res 115 | 116 | 117 | class MaskedArrayNode(Node): 118 | def __init__( 119 | self, 120 | state: dict[str, Any], 121 | load_context: LoadContext, 122 | trusted: bool | Sequence[str] = False, 123 | ) -> None: 124 | super().__init__(state, load_context, trusted) 125 | self.trusted = self._get_trusted(trusted, [np.ma.MaskedArray]) 126 | self.children = { 127 | "data": get_tree(state["content"]["data"], load_context), 128 | "mask": get_tree(state["content"]["mask"], load_context), 129 | } 130 | 131 | def _construct(self): 132 | data = self.children["data"].construct() 133 | mask = self.children["mask"].construct() 134 | return np.ma.MaskedArray(data, mask) 135 | 136 | 137 | def random_state_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 138 | content = get_state(obj.get_state(legacy=False), save_context) 139 | res = { 140 | "__class__": obj.__class__.__name__, 141 | "__module__": get_module(type(obj)), 142 | "__loader__": "RandomStateNode", 143 | "content": content, 144 | } 145 | return res 146 | 147 | 148 | class RandomStateNode(Node): 149 | def __init__( 150 | self, 151 | state: dict[str, Any], 152 | load_context: LoadContext, 153 | trusted: bool | Sequence[str] = False, 154 | ) -> None: 155 | super().__init__(state, load_context, trusted) 156 | self.children = {"content": get_tree(state["content"], load_context)} 157 | self.trusted = self._get_trusted(trusted, [np.random.RandomState]) 158 | 159 | def _construct(self): 160 | random_state = gettype(self.module_name, self.class_name)() 161 | random_state.set_state(self.children["content"].construct()) 162 | return random_state 163 | 164 | 165 | def random_generator_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 166 | bit_generator_state = obj.bit_generator.state 167 | res = { 168 | "__class__": obj.__class__.__name__, 169 | "__module__": get_module(type(obj)), 170 | "__loader__": "RandomGeneratorNode", 171 | "content": {"bit_generator": bit_generator_state}, 172 | } 173 | return res 174 | 175 | 176 | class RandomGeneratorNode(Node): 177 | def __init__( 178 | self, 179 | state: dict[str, Any], 180 | load_context: LoadContext, 181 | trusted: bool | Sequence[str] = False, 182 | ) -> None: 183 | super().__init__(state, load_context, trusted) 184 | self.children = {"bit_generator_state": state["content"]["bit_generator"]} 185 | self.trusted = self._get_trusted(trusted, [np.random.Generator]) 186 | 187 | def _construct(self): 188 | # first restore the state of the bit generator 189 | bit_generator = gettype( 190 | "numpy.random", self.children["bit_generator_state"]["bit_generator"] 191 | )() 192 | bit_generator.state = self.children["bit_generator_state"] 193 | 194 | # next create the generator instance 195 | return gettype(self.module_name, self.class_name)(bit_generator=bit_generator) 196 | 197 | 198 | # For numpy.ufunc we need to get the type from 
the type's module, but for other 199 | # functions we get it from objet's module directly. Therefore sett a especial 200 | # get_state method for them here. The load is the same as other functions. 201 | def ufunc_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 202 | res = { 203 | "__class__": obj.__class__.__name__, # ufunc 204 | "__module__": get_module(type(obj)), # numpy 205 | "__loader__": "FunctionNode", 206 | "content": { 207 | "module_path": get_module(obj), 208 | "function": obj.__name__, 209 | }, 210 | } 211 | return res 212 | 213 | 214 | def dtype_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: 215 | # we use numpy's internal save mechanism to store the dtype by 216 | # saving/loading an empty array with that dtype. 217 | tmp: np.typing.NDArray = np.ndarray(0, dtype=obj) 218 | res = { 219 | "__class__": "dtype", 220 | "__module__": "numpy", 221 | "__loader__": "DTypeNode", 222 | "content": get_state(tmp, save_context), 223 | } 224 | return res 225 | 226 | 227 | class DTypeNode(Node): 228 | def __init__( 229 | self, 230 | state: dict[str, Any], 231 | load_context: LoadContext, 232 | trusted: bool | Sequence[str] = False, 233 | ) -> None: 234 | super().__init__(state, load_context, trusted) 235 | self.children = {"content": get_tree(state["content"], load_context)} 236 | # TODO: what should we trust? 237 | self.trusted = self._get_trusted(trusted, []) 238 | 239 | def _construct(self): 240 | # we use numpy's internal save mechanism to store the dtype by 241 | # saving/loading an empty array with that dtype. 242 | return self.children["content"].construct().dtype 243 | 244 | 245 | # tuples of type and function that gets the state of that type 246 | GET_STATE_DISPATCH_FUNCTIONS = [ 247 | (np.generic, ndarray_get_state), 248 | (np.ndarray, ndarray_get_state), 249 | (np.ma.MaskedArray, maskedarray_get_state), 250 | (np.ufunc, ufunc_get_state), 251 | (np.dtype, dtype_get_state), 252 | (np.random.RandomState, random_state_get_state), 253 | (np.random.Generator, random_generator_get_state), 254 | ] 255 | # tuples of type and function that creates the instance of that type 256 | NODE_TYPE_MAPPING = { 257 | "NdArrayNode": NdArrayNode, 258 | "MaskedArrayNode": MaskedArrayNode, 259 | "DTypeNode": DTypeNode, 260 | "RandomStateNode": RandomStateNode, 261 | "RandomGeneratorNode": RandomGeneratorNode, 262 | } 263 | -------------------------------------------------------------------------------- /skops/card/tests/examples/bert-base-uncased.md: -------------------------------------------------------------------------------- 1 | --- 2 | language: en 3 | tags: 4 | - exbert 5 | license: apache-2.0 6 | datasets: 7 | - bookcorpus 8 | - wikipedia 9 | --- 10 | 11 | # BERT base model (uncased) 12 | 13 | 14 | 15 | Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in 16 | [this paper](https://arxiv.org/abs/1810.04805) and first released in 17 | [this repository](https://github.com/google-research/bert). This model is uncased: it does not make a difference 18 | between english and English. 19 | 20 | Disclaimer: The team releasing BERT did not write a model card for this model so this model card has been written by 21 | the Hugging Face team. 22 | 23 | ## Model description 24 | 25 | BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. 
This means it 26 | was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of 27 | publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it 28 | was pretrained with two objectives: 29 | 30 | - Masked language modeling (MLM): taking a sentence, the model randomly masks 15% of the words in the input then run 31 | the entire masked sentence through the model and has to predict the masked words. This is different from traditional 32 | recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like 33 | GPT which internally masks the future tokens. It allows the model to learn a bidirectional representation of the 34 | sentence. 35 | - Next sentence prediction (NSP): the models concatenates two masked sentences as inputs during pretraining. Sometimes 36 | they correspond to sentences that were next to each other in the original text, sometimes not. The model then has to 37 | predict if the two sentences were following each other or not. 38 | 39 | This way, the model learns an inner representation of the English language that can then be used to extract features 40 | useful for downstream tasks: if you have a dataset of labeled sentences, for instance, you can train a standard 41 | classifier using the features produced by the BERT model as inputs. 42 | 43 | ## Model variations 44 | 45 | BERT has originally been released in base and large variations, for cased and uncased input text. The uncased models also strips out an accent markers. 46 | Chinese and multilingual uncased and cased versions followed shortly after. 47 | Modified preprocessing with whole word masking has replaced subpiece masking in a following work, with the release of two models. 48 | Other 24 smaller models are released afterward. 49 | 50 | The detailed release history can be found on the [google-research/bert readme](https://github.com/google-research/bert/blob/master/README.md) on github. 51 | 52 | | Model | #params | Language | 53 | |------------------------|--------------------------------|-------| 54 | | [`bert-base-uncased`](https://huggingface.co/bert-base-uncased) | 110M | English | 55 | | [`bert-large-uncased`](https://huggingface.co/bert-large-uncased) | 340M | English | sub 56 | | [`bert-base-cased`](https://huggingface.co/bert-base-cased) | 110M | English | 57 | | [`bert-large-cased`](https://huggingface.co/bert-large-cased) | 340M | English | 58 | | [`bert-base-chinese`](https://huggingface.co/bert-base-chinese) | 110M | Chinese | 59 | | [`bert-base-multilingual-cased`](https://huggingface.co/bert-base-multilingual-cased) | 110M | Multiple | 60 | | [`bert-large-uncased-whole-word-masking`](https://huggingface.co/bert-large-uncased-whole-word-masking) | 340M | English | 61 | | [`bert-large-cased-whole-word-masking`](https://huggingface.co/bert-large-cased-whole-word-masking) | 340M | English | 62 | 63 | ## Intended uses & limitations 64 | 65 | You can use the raw model for either masked language modeling or next sentence prediction, but it's mostly intended to 66 | be fine-tuned on a downstream task. See the [model hub](https://huggingface.co/models?filter=bert) to look for 67 | fine-tuned versions of a task that interests you. 68 | 69 | Note that this model is primarily aimed at being fine-tuned on tasks that use the whole sentence (potentially masked) 70 | to make decisions, such as sequence classification, token classification or question answering. 
For tasks such as text 71 | generation you should look at model like GPT2. 72 | 73 | ### How to use 74 | 75 | You can use this model directly with a pipeline for masked language modeling: 76 | 77 | ```python 78 | >>> from transformers import pipeline 79 | >>> unmasker = pipeline('fill-mask', model='bert-base-uncased') 80 | >>> unmasker("Hello I'm a [MASK] model.") 81 | [{'sequence': "[CLS] hello i'm a fashion model. [SEP]", 82 | 'score': 0.1073106899857521, 83 | 'token': 4827, 84 | 'token_str': 'fashion'}, 85 | {'sequence': "[CLS] hello i'm a role model. [SEP]", 86 | 'score': 0.08774490654468536, 87 | 'token': 2535, 88 | 'token_str': 'role'}, 89 | {'sequence': "[CLS] hello i'm a new model. [SEP]", 90 | 'score': 0.05338378623127937, 91 | 'token': 2047, 92 | 'token_str': 'new'}, 93 | {'sequence': "[CLS] hello i'm a super model. [SEP]", 94 | 'score': 0.04667217284440994, 95 | 'token': 3565, 96 | 'token_str': 'super'}, 97 | {'sequence': "[CLS] hello i'm a fine model. [SEP]", 98 | 'score': 0.027095865458250046, 99 | 'token': 2986, 100 | 'token_str': 'fine'}] 101 | ``` 102 | 103 | Here is how to use this model to get the features of a given text in PyTorch: 104 | 105 | ```python 106 | from transformers import BertTokenizer, BertModel 107 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 108 | model = BertModel.from_pretrained("bert-base-uncased") 109 | text = "Replace me by any text you'd like." 110 | encoded_input = tokenizer(text, return_tensors='pt') 111 | output = model(**encoded_input) 112 | ``` 113 | 114 | and in TensorFlow: 115 | 116 | ```python 117 | from transformers import BertTokenizer, TFBertModel 118 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 119 | model = TFBertModel.from_pretrained("bert-base-uncased") 120 | text = "Replace me by any text you'd like." 121 | encoded_input = tokenizer(text, return_tensors='tf') 122 | output = model(encoded_input) 123 | ``` 124 | 125 | ### Limitations and bias 126 | 127 | Even if the training data used for this model could be characterized as fairly neutral, this model can have biased 128 | predictions: 129 | 130 | ```python 131 | >>> from transformers import pipeline 132 | >>> unmasker = pipeline('fill-mask', model='bert-base-uncased') 133 | >>> unmasker("The man worked as a [MASK].") 134 | [{'sequence': '[CLS] the man worked as a carpenter. [SEP]', 135 | 'score': 0.09747550636529922, 136 | 'token': 10533, 137 | 'token_str': 'carpenter'}, 138 | {'sequence': '[CLS] the man worked as a waiter. [SEP]', 139 | 'score': 0.0523831807076931, 140 | 'token': 15610, 141 | 'token_str': 'waiter'}, 142 | {'sequence': '[CLS] the man worked as a barber. [SEP]', 143 | 'score': 0.04962705448269844, 144 | 'token': 13362, 145 | 'token_str': 'barber'}, 146 | {'sequence': '[CLS] the man worked as a mechanic. [SEP]', 147 | 'score': 0.03788609802722931, 148 | 'token': 15893, 149 | 'token_str': 'mechanic'}, 150 | {'sequence': '[CLS] the man worked as a salesman. [SEP]', 151 | 'score': 0.037680890411138535, 152 | 'token': 18968, 153 | 'token_str': 'salesman'}] 154 | >>> unmasker("The woman worked as a [MASK].") 155 | [{'sequence': '[CLS] the woman worked as a nurse. [SEP]', 156 | 'score': 0.21981462836265564, 157 | 'token': 6821, 158 | 'token_str': 'nurse'}, 159 | {'sequence': '[CLS] the woman worked as a waitress. [SEP]', 160 | 'score': 0.1597415804862976, 161 | 'token': 13877, 162 | 'token_str': 'waitress'}, 163 | {'sequence': '[CLS] the woman worked as a maid. 
[SEP]', 164 | 'score': 0.1154729500412941, 165 | 'token': 10850, 166 | 'token_str': 'maid'}, 167 | {'sequence': '[CLS] the woman worked as a prostitute. [SEP]', 168 | 'score': 0.037968918681144714, 169 | 'token': 19215, 170 | 'token_str': 'prostitute'}, 171 | {'sequence': '[CLS] the woman worked as a cook. [SEP]', 172 | 'score': 0.03042375110089779, 173 | 'token': 5660, 174 | 'token_str': 'cook'}] 175 | ``` 176 | 177 | This bias will also affect all fine-tuned versions of this model. 178 | 179 | ## Training data 180 | 181 | The BERT model was pretrained on [BookCorpus](https://yknzhu.wixsite.com/mbweb), a dataset consisting of 11,038 182 | unpublished books and [English Wikipedia](https://en.wikipedia.org/wiki/English_Wikipedia) (excluding lists, tables and 183 | headers). 184 | 185 | ## Training procedure 186 | 187 | ### Preprocessing 188 | 189 | The texts are lowercased and tokenized using WordPiece and a vocabulary size of 30,000. The inputs of the model are 190 | then of the form: 191 | 192 | ``` 193 | [CLS] Sentence A [SEP] Sentence B [SEP] 194 | ``` 195 | 196 | With probability 0.5, sentence A and sentence B correspond to two consecutive sentences in the original corpus, and in 197 | the other cases, it's another random sentence in the corpus. Note that what is considered a sentence here is a 198 | consecutive span of text usually longer than a single sentence. The only constrain is that the result with the two 199 | "sentences" has a combined length of less than 512 tokens. 200 | 201 | The details of the masking procedure for each sentence are the following: 202 | - 15% of the tokens are masked. 203 | - In 80% of the cases, the masked tokens are replaced by `[MASK]`. 204 | - In 10% of the cases, the masked tokens are replaced by a random token (different) from the one they replace. 205 | - In the 10% remaining cases, the masked tokens are left as is. 206 | 207 | ### Pretraining 208 | 209 | The model was trained on 4 cloud TPUs in Pod configuration (16 TPU chips total) for one million steps with a batch size 210 | of 256. The sequence length was limited to 128 tokens for 90% of the steps and 512 for the remaining 10%. The optimizer 211 | used is Adam with a learning rate of 1e-4, \\(\beta_{1} = 0.9\\) and \\(\beta_{2} = 0.999\\), a weight decay of 0.01, 212 | learning rate warmup for 10,000 steps and linear decay of the learning rate after. 
213 | 214 | ## Evaluation results 215 | 216 | When fine-tuned on downstream tasks, this model achieves the following results: 217 | 218 | Glue test results: 219 | 220 | | Task | MNLI-(m/mm) | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | Average | 221 | |:----:|:-----------:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|:-------:| 222 | | | 84.6/83.4 | 71.2 | 90.5 | 93.5 | 52.1 | 85.8 | 88.9 | 66.4 | 79.6 | 223 | 224 | 225 | ### BibTeX entry and citation info 226 | 227 | ```bibtex 228 | @article{DBLP:journals/corr/abs-1810-04805, 229 | author = {Jacob Devlin and 230 | Ming{-}Wei Chang and 231 | Kenton Lee and 232 | Kristina Toutanova}, 233 | title = {{BERT:} Pre-training of Deep Bidirectional Transformers for Language 234 | Understanding}, 235 | journal = {CoRR}, 236 | volume = {abs/1810.04805}, 237 | year = {2018}, 238 | url = {http://arxiv.org/abs/1810.04805}, 239 | archivePrefix = {arXiv}, 240 | eprint = {1810.04805}, 241 | timestamp = {Tue, 30 Oct 2018 20:39:56 +0100}, 242 | biburl = {https://dblp.org/rec/journals/corr/abs-1810-04805.bib}, 243 | bibsource = {dblp computer science bibliography, https://dblp.org} 244 | } 245 | ``` 246 | 247 | 248 | 249 | 250 | --------------------------------------------------------------------------------