├── .conda └── meta.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── data ├── .gitignore └── pie.py ├── mypy.ini ├── pastry ├── pypastry ├── __init__.py ├── commands │ ├── __init__.py │ ├── init.py │ ├── print_.py │ └── run.py ├── display.py ├── experiment │ ├── __init__.py │ ├── evaluation.py │ ├── hasher.py │ └── results.py └── paths.py ├── setup.py └── tests ├── display_test.py └── evaluation_test.py /.conda/meta.yml: -------------------------------------------------------------------------------- 1 | {% set name = "pypastry" %} 2 | {% set version = "0.0.1" %} 3 | 4 | package: 5 | name: "{{ name|lower }}" 6 | version: "{{ version }}" 7 | 8 | source: 9 | 10 | #url: ../dist/{{ name }}-{{ version }}.tar.gz 11 | url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.tar.gz 12 | sha256: a1c81cd0e01fe69637a9c6393dd6e97901d5c5fb8270553c9b6a61978fe0c063 13 | 14 | build: 15 | #number: 0 16 | #noarch: python 17 | #script: python setup.py install 18 | number: 0 19 | script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed -vv " 20 | 21 | requirements: 22 | host: 23 | # - conda-build 24 | - gitpython 25 | - pandas 26 | - pip 27 | - pyarrow 28 | - python 29 | - scikit-learn 30 | - tomlkit 31 | run: 32 | - gitpython 33 | - pandas 34 | - pyarrow 35 | - python 36 | - scikit-learn 37 | - tomlkit 38 | 39 | test: 40 | imports: 41 | -pypastry 42 | -pandas 43 | -sklearn 44 | 45 | requires: 46 | -python 47 | -unittest 48 | 49 | 50 | about: 51 | home: https://github.com/datapastry/pypastry 52 | license: MIT 53 | license_family: MIT 54 | license_file: LICENSE 55 | summary: PyPastry machine learning experimentation framework 56 | doc_url: 57 | dev_url: 58 | extra: 59 | recipe-maintainers: 60 | - mrrutledge -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | *__pycache__* 3 | *.egg-info 4 | .idea/ 5 | 6 | ### Python ### 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | pip-wheel-metadata/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # pipenv 76 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 77 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 78 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 79 | # install all needed dependencies. 
80 | #Pipfile.lock 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Spyder project settings 89 | .spyderproject 90 | .spyproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | # Mr Developer 96 | .mr.developer.cfg 97 | .project 98 | .pydevproject 99 | 100 | # mkdocs documentation 101 | /site 102 | /sample 103 | 104 | 105 | # mypy 106 | .mypy_cache/ 107 | .dmypy.json 108 | dmypy.json 109 | 110 | # Pyre type checker 111 | .pyre/ 112 | 113 | #Env 114 | .vscode 115 | /pastry-test 116 | 117 | # End of https://www.gitignore.io/api/git,python -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 DataPastry 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include mypy.ini 3 | recursive-include data *.py 4 | recursive-include data *.gitignore 5 | recursive-include pypastry *.py 6 | recursive-include tests *.py 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PyPastry - the opinionated machine learning experimentation framework 2 | ===================================================================== 3 | 4 | PyPastry is a framework for developers and data scientists to run 5 | machine learning experiments. We enable you to: 6 | 7 | - Iterate quickly. The more experiments you do, the more likely you 8 | are to find something that works well. 9 | - Experiment correctly and consistently. Anything else is not really 10 | an experiment, is it? 11 | - Make experiments reproducible. That means keeping track of your 12 | code state and results. 13 | - Experiment locally. None of that Spark rubbish. 14 | - Use standard tools. Everything is based on Scikit-learn, Pandas and Git. 15 | 16 | Quick start 17 | ----------- 18 | 19 | PyPastry requires python 3.5 or greater. 
20 | 
21 |     > pip install pypastry==0.2.0
22 |     > pastry init pastry-test
23 |     > cd pastry-test
24 |     > pastry run -m "First experiment"
25 |     Got dataset with 10 rows
26 |     Git hash Dataset hash Run start Model Score Duration (s)
27 |     0 aa87ce62 71e8f4fd 2019-08-28 06:39:07 DecisionTreeClassifier 0.933 ± 0.067 0.03
28 | 
29 | The command `pastry init` creates a file called `pie.py` in the `pastry-test` directory. If you open
30 | that up, you should see some code. The important bit is:
31 | 
32 |     def get_experiment():
33 |         dataset = pd.DataFrame({
34 |             'feature': [1, 0, 1, 1, 0, 0, 1, 1, 0, 1],
35 |             'class': [True, False, True, True, False, False, True, True, False, False],
36 |         })
37 |         predictor = DecisionTreeClassifier()
38 |         cross_validator = StratifiedKFold(n_splits=5)
39 |         scorer = make_scorer(f1_score)
40 |         label_column = 'class'
41 |         return Experiment(dataset, label_column, predictor, cross_validator, scorer)
42 | 
43 | This returns an `Experiment` instance that specifies how the experiment should be run. An experiment
44 | consists of:
45 | - `dataset`: a Pandas `DataFrame` where each row is an instance to be used in the experiment.
46 | - `label_column`: the name of the column in `dataset` that contains the label we wish to predict.
47 | - `predictor`: a Scikit-learn predictor, e.g. a classifier, regressor or `Pipeline` object.
48 | - `cross_validator`: a Scikit-learn cross validator that specifies how the data should be split
49 | up when running the experiment.
50 | - `scorer`: a Scikit-learn scorer, or a list of scorers, used to measure how well the predictor has
51 | learnt to generate predictions (see the fuller example sketched at the end of this README).
52 | 
53 | When you type `pastry run`, PyPastry does this:
54 | - Splits `dataset` into one or more train and test sets.
55 | - For each train and test set, trains the `predictor` on the train set, generates predictions
56 | on the test set, and computes the score using the `scorer`.
57 | - Generates a results file in JSON format and stores it in a folder called `results`.
58 | - Outputs the results of the experiment.
59 | - Note that your repository has to be clean (no staged or unstaged changes) for the experiment to run. If you want to run with a dirty repository, call pastry with the force flag `-f`; in that case, however, the results can no longer be tied to an exact code state.
60 | 
61 | The results include:
62 | - Git hash: the commit identifier of the code used to run the experiment. It may carry a `"dirty_"` prefix indicating that the repository was not clean when the experiment ran. The hash refers to the latest commit, so any information about staged or unstaged changes is lost.
63 | - Git summary: the summary message passed with `-m`.
64 | - Dataset hash: a hash generated from the dataset that will change if the dataset changes.
65 | - Run start: the time at which the experiment run started.
66 | - Model: the name of the `predictor` class used.
67 | - Score: the mean ± the standard error of the mean, computed over the different folds generated
68 | by the `cross_validator`.
69 | - Duration: how long the experiment took to run, in seconds.
70 | 
71 | Contributing
72 | ------------
73 | 
74 | PyPastry is at an early stage so there's plenty to do and we'd love to have your contribution.
75 | 
76 | Check out the issues for a list of things that need doing and post a comment if you'd like to take
77 | something on.
78 | 
79 | If you have an idea for something you'd like to do, create an issue.
80 | 
81 | Run `python -m pytest` in the project root to run all tests.
82 | 
83 | Thanks for using PyPastry!
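
A fuller example
----------------

The generated `pie.py` above is deliberately tiny. As an illustrative sketch of a slightly richer
experiment (the CSV file name and the `label` column below are made up, but the arguments match the
`Experiment` constructor in `pypastry/experiment/__init__.py`), you can use a Scikit-learn `Pipeline`
as the predictor and pass a list of scorers:

    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, f1_score, make_scorer
    from sklearn.model_selection import StratifiedKFold
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    from pypastry.experiment import Experiment


    def get_experiment():
        # Hypothetical data file: load your own dataset however you like.
        dataset = pd.read_csv('my_data.csv')
        predictor = Pipeline([
            ('scale', StandardScaler()),
            ('classify', LogisticRegression()),
        ])
        cross_validator = StratifiedKFold(n_splits=5)
        # With a list of scorers, the results table gets one score column per metric.
        scorers = [make_scorer(accuracy_score), make_scorer(f1_score)]
        return Experiment(dataset, 'label', predictor, cross_validator, scorers)

Each metric is still reported as the mean ± the standard error of the mean over the folds produced
by the cross validator.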
84 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | .pypastry/ 2 | __pycache__/ 3 | -------------------------------------------------------------------------------- /data/pie.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.metrics import f1_score, make_scorer 3 | from sklearn.model_selection import StratifiedKFold 4 | 5 | from sklearn.tree import DecisionTreeClassifier 6 | 7 | from pypastry.experiment import Experiment 8 | 9 | 10 | def get_experiment(): 11 | dataset = pd.DataFrame({ 12 | 'feature': [1, 0, 1, 1, 0, 0, 1, 1, 0, 1], 13 | 'class': [True, False, True, True, False, False, True, True, False, False], 14 | }) 15 | predictor = DecisionTreeClassifier() 16 | cross_validator = StratifiedKFold(n_splits=5) 17 | scorer = make_scorer(f1_score) 18 | label_column = 'class' 19 | return Experiment(dataset, label_column, predictor, cross_validator, scorer) 20 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | disallow_untyped_defs = False 4 | 5 | -------------------------------------------------------------------------------- /pastry: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import os 4 | import sys 5 | from pstats import Stats 6 | 7 | PROFILE_PATH = '/tmp/pastry.profile' 8 | 9 | 10 | def parse_and_run(): 11 | parser = argparse.ArgumentParser( 12 | description='Run a machine learning experiment', 13 | usage='''pastry [] 14 | 15 | The command can be: 16 | run Run an experiment 17 | print Print out results from previous experiments 18 | ''') 19 | parser.add_argument('command', help='Subcommand to run') 20 | args = parser.parse_args(sys.argv[1:2]) 21 | 22 | if args.command == 'print': 23 | from pypastry.commands.print_ import run 24 | run() 25 | elif args.command == 'run': 26 | from pypastry.commands.run import run 27 | run() 28 | elif args.command == 'init': 29 | from pypastry.commands.init import run 30 | run() 31 | else: 32 | print("Unrecognised command: {}".format(args.command)) 33 | parser.print_usage() 34 | exit(1) 35 | 36 | 37 | if __name__ == "__main__": 38 | if os.environ.get('PASTRY_PROFILE'): 39 | import cProfile 40 | cProfile.run('parse_and_run()', PROFILE_PATH) 41 | stats = Stats(PROFILE_PATH) 42 | stats.sort_stats('cumulative').print_stats(20) 43 | else: 44 | parse_and_run() 45 | -------------------------------------------------------------------------------- /pypastry/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datapastry/pypastry/9d0093e75f81b56d7b924af8177bee7ea1cfecc0/pypastry/__init__.py -------------------------------------------------------------------------------- /pypastry/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datapastry/pypastry/9d0093e75f81b56d7b924af8177bee7ea1cfecc0/pypastry/commands/__init__.py -------------------------------------------------------------------------------- /pypastry/commands/init.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 
from os import path, mkdir, chdir
4 | 
5 | 
6 | def run():
7 |     parser = argparse.ArgumentParser(prog='pastry init')
8 |     parser.add_argument('directory', nargs='?', type=str,
9 |                         help='Path to directory to create (or default to current directory)')
10 |     args = parser.parse_args(sys.argv[2:])  # type: argparse.Namespace
11 | 
12 |     directory = args.directory if args.directory is not None else '.'
13 | 
14 |     try:
15 |         mkdir(directory)
16 |     except FileExistsError:
17 |         pass
18 | 
19 |     chdir(directory)
20 | 
--------------------------------------------------------------------------------
/pypastry/commands/print_.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 | from pypastry.display import print_cache_file, cache_display, _get_results_dataframe
4 | from pypastry.paths import RESULTS_PATH
5 | from pypastry.experiment.results import ResultsRepo
6 | 
7 | 
8 | def run():
9 |     parser = argparse.ArgumentParser(prog='pastry print')
10 |     parser.add_argument('-l', '--limit', type=int, default=None, help='Limit lines to print')
11 |     parser.add_argument('-e', '--export', type=str, required=False, help='File to output the results in CSV format')
12 | 
13 |     args = parser.parse_args(sys.argv[2:])
14 |     if args.export is not None:
15 |         results = get_results()
16 |         results_dataframe = _get_results_dataframe(results)
17 |         results_dataframe.to_csv(args.export)
18 |         return
19 |     try:
20 |         print_cache_file(args.limit)
21 |     except FileNotFoundError:
22 |         results = get_results()
23 |         cache_display(results)
24 |         print_cache_file(args.limit)
25 | 
26 | 
27 | def get_results():
28 |     results_repo = ResultsRepo(RESULTS_PATH)
29 |     results = results_repo.get_results()
30 |     return results
31 | 
--------------------------------------------------------------------------------
/pypastry/commands/run.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 | 
4 | from pypastry.experiment.evaluation import run_experiment
5 | 
6 | 
7 | def run():
8 |     parser = argparse.ArgumentParser(prog='pastry run')
9 |     parser.add_argument('-m', '--message', default="", type=str, help='Summary message about the experiment.')
10 |     parser.add_argument('-f', '--force', action='store_true', help='Force a re-run of the experiment')
11 |     parser.add_argument('-p', '--no-print', action='store_true', help='Do not print results.')
12 | 
13 |     args = parser.parse_args(sys.argv[2:])
14 | 
15 |     sys.path.append('.')
16 |     import pie
17 |     experiment = pie.get_experiment()
18 |     force = args.force
19 |     message = args.message
20 | 
21 |     run_experiment(experiment, message, force, show_results=not args.no_print)
22 | 
--------------------------------------------------------------------------------
/pypastry/display.py:
--------------------------------------------------------------------------------
1 | """
2 | Handle displaying results.
3 | 
4 | This code needs to display results really fast. That's why it has some odd things:
5 | - Inline imports
6 | - Lack of typing for the results_repo to avoid unnecessary imports
7 | 
8 | This is because I don't like having to wait a second for things to be imported.
9 | I want my pastry now!
10 | """ 11 | import os 12 | from typing import Any, Dict, List, Iterator, TYPE_CHECKING 13 | 14 | from pandas import Series 15 | 16 | from pypastry.paths import DISPLAY_PATH, DISPLAY_DIR 17 | if TYPE_CHECKING: 18 | import pypastry 19 | 20 | 21 | def cache_display(results_from_repo: Iterator['pypastry.experiment.results.Result']) -> None: 22 | results_dataframe = _get_results_dataframe(results_from_repo) 23 | display = repr(results_dataframe) 24 | 25 | try: 26 | os.mkdir(DISPLAY_DIR) 27 | except FileExistsError: 28 | pass 29 | 30 | with open(DISPLAY_PATH, 'w') as output_file: 31 | output_file.write(display) 32 | 33 | 34 | def _get_results_dataframe(results_from_repo: Iterator['pypastry.experiment.results.Result']) -> 'DataFrame': 35 | from pandas import DataFrame, set_option 36 | set_option('display.max_rows', None) 37 | set_option('display.max_columns', None) 38 | set_option('display.width', None) 39 | set_option('display.max_colwidth', -1) 40 | results = [] 41 | for repo_result in results_from_repo: 42 | data = repo_result.data 43 | result = { 44 | 'Git hash': data["git_hash"] if "git_hash" in data else "Unavailable", 45 | 'Summary': data["git_summary"] if "git_summary" in data else "Unavailable", 46 | 'Dataset hash': data['dataset']['hash'][:8], 47 | 'Dataset size': data['dataset']['size'] if "size" in data["dataset"] else "Unavailable", 48 | 'Result JSON name': data['result_json_name'] if "result_json_name" in data else "Unavailable", 49 | 'Run start': data['run_start'][:19], 50 | 'Model': data['model_info']['type'], 51 | 'Duration (s)': "{:.2f}".format(data['run_seconds']), 52 | } 53 | 54 | try: 55 | scores = DataFrame(data['results']) 56 | for row in scores.itertuples(): 57 | result[row.Index] = "{:.3f} ± {:.3f}".format(row.test_score, row.test_score_sem) 58 | except ValueError: 59 | result['Score'] = "{:.3f} ± {:.3f}".format(data['results']['test_score'], 60 | data['results']['test_score_sem']) 61 | 62 | results.append(result) 63 | results_dataframe = DataFrame(results) 64 | return results_dataframe.sort_values(by='Run start').reset_index(drop=True) 65 | 66 | 67 | def print_cache_file(limit=False): 68 | with open(DISPLAY_PATH) as display_file: 69 | print(display_file.read()) 70 | # read_lines = display_file.read() 71 | # read_list = read_lines.split("\n") 72 | # if limit: 73 | # limit = min(limit, len(read_list)-3) 74 | # # to avoid printing more than there is 75 | # else: 76 | # limit = len(read_list)-3 77 | # print(read_list[0]) 78 | # print("\n".join(read_list[-(2+limit):-2])) 79 | # print(read_list[-1]) 80 | 81 | -------------------------------------------------------------------------------- /pypastry/experiment/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Union, Iterable 2 | 3 | from pandas import DataFrame 4 | from sklearn.base import BaseEstimator 5 | from sklearn.metrics import accuracy_score, make_scorer 6 | from sklearn.metrics._scorer import _BaseScorer as BaseScorer 7 | 8 | 9 | class Experiment: 10 | def __init__(self, dataset: DataFrame, label_column: str, predictor: BaseEstimator, 11 | cross_validator: Any = None, scorer: Union[BaseScorer, Iterable[BaseScorer]] = None, 12 | group_column: str=None, test_set: DataFrame = None, average_scores_on_instances: bool = False, 13 | additional_info: Callable[[BaseEstimator], Any] = None): 14 | if (test_set is not None) == (cross_validator is not None): 15 | raise ValueError("You must specify either a cross validator or a test set (and not 
both)") 16 | 17 | if average_scores_on_instances and group_column is not None: 18 | raise ValueError("You can only average on instances when not grouping instances") 19 | 20 | if scorer is None: 21 | scorer = [make_scorer(accuracy_score)] 22 | 23 | if not isinstance(scorer, Iterable): 24 | if not isinstance(scorer, BaseScorer): 25 | raise ValueError("Scorer must be created using make_scorer()") 26 | 27 | scorer = [scorer] 28 | 29 | self.dataset = dataset 30 | self.label_column = label_column 31 | self.predictor = predictor 32 | self.cross_validator = cross_validator 33 | self.scorer = scorer 34 | self.group_column = group_column 35 | self.test_set = test_set 36 | self.average_scores_on_instances = average_scores_on_instances 37 | self.additional_info = additional_info 38 | -------------------------------------------------------------------------------- /pypastry/experiment/evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | from types import ModuleType 4 | from typing import Any, Dict, Tuple, List 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from git import Repo 10 | from joblib import Parallel, delayed 11 | from pandas import Series 12 | from sklearn.base import BaseEstimator, is_classifier, clone 13 | from sklearn.metrics._scorer import _BaseScorer 14 | from sklearn.model_selection import check_cv, PredefinedSplit 15 | 16 | from pypastry import display 17 | from pypastry.experiment import Experiment 18 | from pypastry.experiment.hasher import get_dataset_hash 19 | from pypastry.experiment.results import ResultsRepo 20 | from pypastry.paths import REPO_PATH, RESULTS_PATH 21 | 22 | MAX_PARAMETER_VALUE_LENGTH = 500 23 | 24 | 25 | class DirtyRepoError(Exception): 26 | def __init__(self, message): 27 | super().__init__(message) 28 | 29 | 30 | class ExperimentRunner: 31 | def __init__(self, git_repo: Repo, results_repo: ResultsRepo, results_display: ModuleType): 32 | self.git_repo = git_repo 33 | self.results_repo = results_repo 34 | self.results_display = results_display 35 | 36 | def run_experiment( 37 | self, 38 | experiment: Experiment, 39 | message: str = "", 40 | force: bool = False, 41 | limit: int = None, 42 | show_results: bool = True, 43 | ) -> Tuple[List[BaseEstimator], Path]: 44 | 45 | print("Got dataset with {} rows".format(len(experiment.dataset))) 46 | if force or not self.git_repo.is_dirty(): 47 | print("Running evaluation") 48 | estimators, result_file_path = self._run_evaluation(experiment, message) 49 | results = self.results_repo.get_results() 50 | self.results_display.cache_display(results) 51 | else: 52 | raise DirtyRepoError("There are untracked/unstaged/staged changes in git repo, force flag was not given. 
" 53 | "Please commit your changes or provide force flag - note that in this case " 54 | "saved commit hash in your result file will not correspond to the actual code!") 55 | if show_results: 56 | self.results_display.print_cache_file(limit) 57 | 58 | return estimators, result_file_path 59 | 60 | def _run_evaluation(self, experiment: Experiment, message: str) -> Tuple[List[BaseEstimator], Path]: 61 | run_info, estimators = evaluate_predictor(experiment) 62 | dataset_hash = get_dataset_hash(experiment.dataset, experiment.test_set) 63 | dataset_info = { 64 | 'hash': dataset_hash, 65 | 'columns': experiment.dataset.columns.tolist(), 66 | 'size': len(experiment.dataset), 67 | } 68 | git_info = { 69 | "git_hash_msg": ("dirty_" if self.git_repo.is_dirty() else "") + self.git_repo.head.object.hexsha[:8], 70 | "git_summary_msg": message, 71 | } 72 | result_file_path = self.results_repo.save_results(run_info, dataset_info, git_info=git_info) 73 | 74 | return estimators, result_file_path 75 | 76 | 77 | def evaluate_predictor(experiment: Experiment) -> Dict[str, Tuple[Any, List[BaseEstimator]]]: 78 | start = datetime.utcnow() 79 | scores, estimators = _get_scores_and_estimators(experiment) 80 | end = datetime.utcnow() 81 | 82 | additional = experiment.additional_info 83 | additional_info = [additional(estimator) if additional is not None else None 84 | for estimator in estimators] 85 | 86 | if experiment.group_column is not None: 87 | for group, score_values in scores: 88 | score_values[experiment.group_column] = group 89 | 90 | values = [x[1] for x in scores] 91 | 92 | scores_array = pd.DataFrame(values) 93 | 94 | mean_score = scores_array.mean().to_dict() 95 | sem_score = scores_array.sem().to_dict() 96 | results = {'test_score': mean_score, 'test_score_sem': sem_score} 97 | 98 | model_info = get_model_info(experiment.predictor) 99 | 100 | run_info = { 101 | 'run_start': str(start), 102 | 'run_end': str(end), 103 | 'run_seconds': (end - start).total_seconds(), 104 | 'results': results, 105 | 'results_detail': scores_array.to_dict('list'), 106 | 'model_info': model_info, 107 | 'additional_info': additional_info, 108 | } 109 | 110 | return run_info, estimators 111 | 112 | 113 | def get_model_info(model: BaseEstimator): 114 | all_info = model.get_params() 115 | info = {key: value for key, value in all_info.items() 116 | if len(json.dumps(value, default=str)) < MAX_PARAMETER_VALUE_LENGTH} 117 | info['type'] = type(model).__name__ 118 | return info 119 | 120 | 121 | def _get_scores_and_estimators(experiment: Experiment) -> Tuple[List[float], List[Any]]: 122 | if experiment.test_set is not None: 123 | assert experiment.cross_validator is None, "Cannot use a cross validator with train test split" 124 | dataset = pd.concat([experiment.dataset, experiment.test_set]) 125 | split = np.array([-1] * len(experiment.dataset) + [1] * len(experiment.test_set)) 126 | cross_validator = PredefinedSplit(split) 127 | else: 128 | dataset = experiment.dataset 129 | cross_validator = experiment.cross_validator 130 | 131 | X = dataset.drop(columns=[experiment.label_column]) 132 | y = dataset[experiment.label_column] 133 | if experiment.group_column is None: 134 | if experiment.average_scores_on_instances: 135 | groups = Series(range(len(X)), index=X.index) 136 | else: 137 | groups = None 138 | else: 139 | groups = X[experiment.group_column] 140 | X = X.drop(columns=[experiment.group_column]) 141 | 142 | cv = check_cv(cross_validator, y, classifier=is_classifier(experiment.predictor)) 143 | train_test = cv.split(X, y, 
groups) 144 | 145 | # We clone the estimator to make sure that all the folds are 146 | # independent, and that it is pickle-able. 147 | parallel = Parallel(n_jobs=None, verbose=False, 148 | pre_dispatch='2*n_jobs') 149 | scores_and_estimators = parallel( 150 | delayed(_fit_and_predict)( 151 | clone(experiment.predictor), X, y, train, test, groups, experiment.scorer) 152 | for train, test in train_test) 153 | scores_lists, estimators = zip(*scores_and_estimators) 154 | scores = [score for score_list in scores_lists for score in score_list] 155 | return scores, estimators 156 | 157 | 158 | def _fit_and_predict(estimator: BaseEstimator, X, y, train, test, groups, scorer): 159 | if groups is not None: 160 | scores = _fit_and_predict_groups(X, estimator, groups, scorer, test, train, y) 161 | else: 162 | scores = _fit_and_predict_simple(X, estimator, scorer, test, train, y) 163 | return scores, estimator 164 | 165 | 166 | def _fit_and_predict_simple(X, estimator, scorers, test, train, y): 167 | X_train = X.iloc[train] 168 | y_train = y.iloc[train] 169 | estimator.fit(X_train, y_train) 170 | X_test = X.iloc[test] 171 | y_test = y.iloc[test] 172 | score = _score(scorers, estimator, X_test, y_test) 173 | return [(None, score)] 174 | 175 | 176 | def _fit_and_predict_groups(X, estimator, groups, scorers, test, train, y): 177 | X_train = X.iloc[train] 178 | y_train = y.iloc[train] 179 | estimator.fit(X_train, y_train) 180 | X_test = X.iloc[test] 181 | y_test = y.iloc[test] 182 | groups_test = groups.iloc[test] 183 | test_df = pd.DataFrame(X_test) 184 | test_df['y'] = y_test 185 | test_df['groups'] = groups_test 186 | test_groups = test_df.groupby('groups') 187 | scores = [] 188 | for key, group in test_groups: 189 | X_group = group[X.columns] 190 | score = _score(scorers, estimator, X_group, group['y']) 191 | scores.append((key, score)) 192 | return scores 193 | 194 | 195 | def _score(scorers: List[_BaseScorer], estimator, X_test, y_test): 196 | scores = {} 197 | for scorer in scorers: 198 | score = scorer(estimator, X_test, y_test) 199 | score_name = scorer._score_func.__name__ 200 | sign = scorer._sign 201 | score_ignoring_sign = score*sign 202 | scores[score_name] = score_ignoring_sign 203 | return scores 204 | 205 | 206 | def run_experiment(experiment, message="", force=False, show_results=True) -> Tuple[List[BaseEstimator], Path]: 207 | git_repo = Repo(REPO_PATH, search_parent_directories=True) # type: pypastry.experiment.Experiment 208 | results_repo = ResultsRepo(RESULTS_PATH) # type: pypastry.experiment.results.ResultsRepo 209 | runner = ExperimentRunner(git_repo, results_repo, display) # type: 210 | # pypastry.experiment.evaluation.ExperimentRunner 211 | return runner.run_experiment( 212 | experiment=experiment, 213 | message=message, 214 | force=force, 215 | show_results=show_results, 216 | ) 217 | -------------------------------------------------------------------------------- /pypastry/experiment/hasher.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from io import BytesIO 3 | 4 | import pandas as pd 5 | from pandas import DataFrame 6 | 7 | BLOCKSIZE = 65536 8 | 9 | 10 | def get_dataset_hash(dataset: DataFrame, test_set: DataFrame = None) -> str: 11 | buffer = BytesIO() 12 | dataset.to_parquet(buffer) 13 | if test_set is not None: 14 | test_set.to_parquet(buffer) 15 | data = buffer.getvalue() 16 | 17 | return get_bytes_hash(data) 18 | 19 | 20 | def get_bytes_hash(data: bytes): 21 | hasher = hashlib.sha1() 22 | for i in range(0, 
len(data), BLOCKSIZE): 23 | block = data[i:i + BLOCKSIZE] 24 | hasher.update(block) 25 | return hasher.hexdigest() 26 | 27 | 28 | if __name__ == "__main__": 29 | data = pd.read_csv('../data/iris.csv') 30 | hash = get_dataset_hash(data) 31 | print(hash) 32 | -------------------------------------------------------------------------------- /pypastry/experiment/results.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import glob 4 | from pathlib import Path 5 | from tempfile import NamedTemporaryFile 6 | from typing import Dict, Any, NamedTuple 7 | 8 | 9 | Result = NamedTuple('Result', [('data', Dict[str, Any])]) 10 | 11 | 12 | class ResultsRepo: 13 | def __init__(self, results_path: str): 14 | self.results_path = results_path 15 | 16 | def save_results(self, run_info: Dict[str, Any], dataset_info: Dict[str, Any], git_info: Dict[str, str]) -> Path: 17 | try: 18 | os.mkdir(self.results_path) 19 | except FileExistsError: 20 | pass 21 | run_info['dataset'] = dataset_info 22 | run_info['git_hash'] = git_info["git_hash_msg"] 23 | run_info['git_summary'] = git_info["git_summary_msg"] 24 | with NamedTemporaryFile(mode='w', prefix='result-', suffix='.json', 25 | dir=self.results_path, delete=False) as output_file: 26 | result_file_path = Path(output_file.name) 27 | run_info["result_json_name"] = result_file_path.name 28 | json.dump(run_info, output_file, indent=4, default=str) 29 | output_file.flush() 30 | 31 | return result_file_path 32 | 33 | def get_results(self): 34 | for path in glob.glob(os.path.join(self.results_path, "*.json")): 35 | with open(str(path), "r") as results_file: 36 | result_json = json.load(results_file) 37 | yield Result(result_json) 38 | -------------------------------------------------------------------------------- /pypastry/paths.py: -------------------------------------------------------------------------------- 1 | DISPLAY_DIR = '.pypastry' 2 | DISPLAY_PATH = DISPLAY_DIR + '/display.txt' 3 | RESULTS_PATH = 'results' 4 | REPO_PATH = '.' 
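
A note on how these pieces fit together: `ResultsRepo` (above) writes one `result-*.json` file per
run into the folder named by `RESULTS_PATH`, and the display module caches its rendered table under
`DISPLAY_DIR`. The following sketch is illustrative only; the run, dataset and git values are copied
from the README's quick-start example rather than taken from a real run:

    from pypastry.experiment.results import ResultsRepo
    from pypastry.paths import RESULTS_PATH

    repo = ResultsRepo(RESULTS_PATH)
    result_path = repo.save_results(
        run_info={
            'run_start': '2019-08-28 06:39:07',
            'run_seconds': 0.03,
            'results': {'test_score': {'f1_score': 0.933},
                        'test_score_sem': {'f1_score': 0.067}},
            'model_info': {'type': 'DecisionTreeClassifier'},
        },
        dataset_info={'hash': '71e8f4fd', 'columns': ['feature', 'class'], 'size': 10},
        git_info={'git_hash_msg': 'aa87ce62', 'git_summary_msg': 'First experiment'},
    )
    print(result_path.name)  # a name like result-XXXXXXXX.json inside the results folder

    for result in repo.get_results():
        # Each Result wraps the parsed JSON dict in its .data attribute.
        print(result.data['git_hash'], result.data['results']['test_score'])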
5 | 6 | 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from setuptools import setup, find_packages 4 | 5 | #check to make sure the python version is compatible 6 | if sys.version_info < (3, 6): 7 | sys.exit('Sorry, PyPastry requires Python version 3.6 or greater') 8 | 9 | # Reading in the ReadMe file as the doc file 10 | with open("README.md", "r") as fh: 11 | long_description = fh.read() 12 | 13 | setup( 14 | name='pypastry', 15 | version='0.3.0', 16 | description='PyPastry machine learning experimentation framework', 17 | author='Daoud Clarke', 18 | url='https://github.com/datapastry/pypastry', 19 | scripts=['pastry'], 20 | install_requires=['tomlkit', 'pandas', 'scikit-learn', 'pyarrow', 'gitpython', 'pytest'], 21 | #To find the packages 22 | packages=find_packages(), 23 | #To read in data file modules 24 | py_modules=['data/pie'], 25 | # commands that can be run in a console in the commands folder 26 | entry_points={ 27 | 'console_scripts': [ 28 | 'init = pypastry.commands.init:run', 29 | 'print = pypastry.commands.print_:run', 30 | 'run = pypastry.commands.run:run' 31 | ]}, 32 | package_data={ 33 | 34 | '' : ['data/*.gitignore'], 35 | # And include any *.gitignore files found in the 'data' package, too: 36 | 'data': ['*.gitignore'], 37 | 38 | }, 39 | long_description=long_description, 40 | long_description_content_type='text/markdown', 41 | # Make the setup file aware of the Manifest file 42 | include_package_data=True, 43 | #Minimum requirement of python, licesnse, and operating system. 44 | classifiers=[ 45 | "Programming Language :: Python :: 3.6", 46 | "Programming Language :: Python :: 3.7", 47 | "Programming Language :: Python :: 3.8", 48 | "License :: OSI Approved :: MIT License", 49 | "Operating System :: OS Independent"], 50 | python_requires='>=3.5', 51 | ) 52 | -------------------------------------------------------------------------------- /tests/display_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pypastry.display import _get_results_dataframe 4 | from pypastry.experiment.results import Result 5 | 6 | 7 | @pytest.fixture 8 | def get_result_dict(): 9 | 10 | return { 11 | "run_start": "2020-01-31 09:19:44.261276", 12 | "run_end": "2020-01-31 09:19:48.743458", 13 | "run_seconds": 4.482182, 14 | "results": { 15 | "test_score": { 16 | "mean_relative_error": 0.5, 17 | "mean_absolute_error": 100, 18 | "mean_squared_error": 1000 19 | }, 20 | "test_score_sem": { 21 | "mean_relative_error": 0.01, 22 | "mean_absolute_error": 1.0, 23 | "mean_squared_error": 10.0 24 | } 25 | }, 26 | "model_info": { 27 | "n_neighbors": 10, 28 | "type": "KNearestNeighbor" 29 | }, 30 | "additional_info": [ 31 | None 32 | ], 33 | "dataset": { 34 | "hash": "998c9dea0afb12d91a8c67f256f80b0a603dd59b", 35 | "columns": [ 36 | "input", 37 | "output" 38 | ], 39 | "size": 100, 40 | }, 41 | "git_hash": "123456781234567812345678", 42 | "git_summary": "12345678", 43 | "result_json_name": "jsonhash", 44 | } 45 | 46 | 47 | def test_get_display(get_result_dict): 48 | result = Result(get_result_dict) 49 | results_dataframe = _get_results_dataframe([result]) 50 | row = results_dataframe.iloc[0].to_dict() 51 | 52 | expected = { 53 | 'Git hash': get_result_dict["git_hash"], 54 | 'Summary': get_result_dict["git_summary"], 55 | 'Dataset size': get_result_dict["dataset"]["size"], 56 | 'Dataset hash': 
get_result_dict["dataset"]["hash"][:8], 57 | 'Run start': get_result_dict["run_start"][:19], 58 | 'Model': get_result_dict['model_info']["type"], 59 | 'Result JSON name': get_result_dict['result_json_name'], 60 | 'Duration (s)': "{:.2f}".format(get_result_dict['run_seconds']), 61 | 'mean_relative_error': "{:.3f} ± {:.3f}".format( 62 | get_result_dict["results"]["test_score"]["mean_relative_error"], 63 | get_result_dict["results"]["test_score_sem"]["mean_relative_error"] 64 | ), 65 | 'mean_absolute_error': "{:.3f} ± {:.3f}".format( 66 | get_result_dict["results"]["test_score"]["mean_absolute_error"], 67 | get_result_dict["results"]["test_score_sem"]["mean_absolute_error"] 68 | ), 69 | 'mean_squared_error': "{:.3f} ± {:.3f}".format( 70 | get_result_dict["results"]["test_score"]["mean_squared_error"], 71 | get_result_dict["results"]["test_score_sem"]["mean_squared_error"] 72 | ), 73 | } 74 | assert expected == row 75 | -------------------------------------------------------------------------------- /tests/evaluation_test.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, MagicMock 2 | 3 | import pytest 4 | from pandas import DataFrame 5 | from sklearn.dummy import DummyClassifier 6 | from sklearn.metrics import accuracy_score, make_scorer, precision_score 7 | from sklearn.model_selection import StratifiedShuffleSplit, GroupShuffleSplit 8 | from sklearn.tree import DecisionTreeClassifier 9 | 10 | from pypastry.experiment import Experiment 11 | from pypastry.experiment.evaluation import ExperimentRunner, evaluate_predictor, DirtyRepoError 12 | 13 | 14 | @pytest.fixture 15 | def simple_dataset(): 16 | return DataFrame({ 17 | 'a': [1, 1, 0, 0], 18 | 'b': [1, 1, 0, 0], 19 | }) 20 | 21 | 22 | @pytest.fixture 23 | def grouped_dataset(): 24 | label = [i % 2 for i in range(100)] 25 | return DataFrame({ 26 | 'a': label, 27 | 'b': label, 28 | 'g': [i // 2 for i in range(100)] 29 | }) 30 | 31 | 32 | @pytest.fixture 33 | def get_predictor(): 34 | return DecisionTreeClassifier() 35 | 36 | 37 | @pytest.fixture 38 | def get_cross_validator(): 39 | return StratifiedShuffleSplit(n_splits=1, test_size=0.5) 40 | 41 | 42 | @pytest.fixture 43 | def get_scorer(): 44 | return make_scorer(accuracy_score) 45 | 46 | 47 | @pytest.mark.parametrize("dirty, force", [(False, False), (False, True), (True, False), (True, True)]) 48 | def test_simple_evaluation(dirty, force, simple_dataset): 49 | 50 | cross_validation = StratifiedShuffleSplit(n_splits=1, test_size=0.5) 51 | predictor = DecisionTreeClassifier() 52 | scorer = make_scorer(accuracy_score) 53 | 54 | experiment = Experiment(simple_dataset, 'b', predictor, cross_validation, scorer) 55 | 56 | git_mock = Mock() 57 | git_mock.is_dirty.return_value = dirty 58 | git_mock.head.object.hexsha = MagicMock() 59 | results_repo_mock = Mock() 60 | results_display_mock = Mock() 61 | runner = ExperimentRunner(git_mock, results_repo_mock, results_display_mock) 62 | 63 | try: 64 | runner.run_experiment(experiment, "msg", force) 65 | except DirtyRepoError: 66 | if dirty is True and force is False: 67 | # Expected behaviour. 
68 | return 69 | else: 70 | raise 71 | 72 | call_args_list = results_repo_mock.save_results.call_args_list 73 | assert 1 == len(call_args_list) 74 | run_info, dataset_info = call_args_list[0][0] 75 | print("Run info", run_info) 76 | 77 | results = run_info['results'] 78 | assert {'accuracy_score': 1.0} == results['test_score'] 79 | assert ['a', 'b'] == dataset_info['columns'] 80 | 81 | # TODO: check the hash. Need to find a way to make this consistent between python versions etc. 82 | # assert '28ea628a50a47c726a9b0ec437c88fc4742d81fd' == dataset_info['hash'] 83 | 84 | assert 1 == len(results_display_mock.cache_display.call_args_list) 85 | print(results_display_mock.cache_display.call_args[0]) 86 | assert len(results_display_mock.cache_display.call_args[0]) > 0 87 | assert 1 == len(results_display_mock.print_cache_file.call_args_list) 88 | 89 | 90 | @pytest.mark.parametrize("dirty, force", [(False, False), (False, True), (True, False), (True, True)]) 91 | def test_grouped_evaluation(dirty, force, grouped_dataset): 92 | 93 | cross_validation = GroupShuffleSplit(n_splits=1, test_size=0.5) 94 | predictor = DummyClassifier(strategy='constant', constant=1) 95 | scorer = make_scorer(accuracy_score) 96 | 97 | experiment = Experiment(grouped_dataset, 'b', predictor, cross_validation, scorer, group_column='g') 98 | 99 | git_mock = Mock() 100 | git_mock.is_dirty.return_value = dirty 101 | git_mock.head.object.hexsha = MagicMock() 102 | results_repo_mock = Mock() 103 | results_display_mock = Mock() 104 | runner = ExperimentRunner(git_mock, results_repo_mock, results_display_mock) 105 | 106 | try: 107 | runner.run_experiment(experiment, "msg", force) 108 | except DirtyRepoError: 109 | if dirty is True and force is False: 110 | # Expected behaviour. 111 | return 112 | else: 113 | raise 114 | 115 | assert 1 == len(results_repo_mock.save_results.call_args_list) 116 | run_info, dataset_info = results_repo_mock.save_results.call_args[0] 117 | 118 | print("Run infos", run_info) 119 | 120 | assert run_info['results']["test_score"]["accuracy_score"] == 0.5 121 | assert run_info['results']["test_score_sem"]["accuracy_score"] == 0.0 122 | 123 | 124 | def test_multiple_scorers(simple_dataset): 125 | 126 | cross_validation = StratifiedShuffleSplit(n_splits=2, test_size=0.5) 127 | predictor = DummyClassifier(strategy='constant', constant=1) 128 | scorer = [make_scorer(accuracy_score), make_scorer(precision_score)] 129 | 130 | experiment = Experiment(simple_dataset, 'b', predictor, cross_validation, scorer) 131 | 132 | run_info, _ = evaluate_predictor(experiment) 133 | results = run_info['results'] 134 | print("Results", results) 135 | 136 | expected_results = { 137 | 'test_score': {'accuracy_score': 0.5, 'precision_score': 0.5}, 138 | 'test_score_sem': {'accuracy_score': 0.0, 'precision_score': 0.0}, 139 | } 140 | 141 | assert expected_results == results 142 | --------------------------------------------------------------------------------
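
One path that the tests above leave uncovered is hold-out evaluation, where an `Experiment` gets an
explicit `test_set` instead of a cross validator. A sketch of such a test, reusing the existing
imports and the `simple_dataset` fixture from `tests/evaluation_test.py` (the test name and the
four-row hold-out frame are made up for illustration):

    def test_holdout_evaluation(simple_dataset):
        # Hold-out evaluation: supply a test_set and no cross validator.
        test_set = DataFrame({
            'a': [1, 0, 1, 0],
            'b': [1, 0, 1, 0],
        })
        predictor = DecisionTreeClassifier()
        scorer = make_scorer(accuracy_score)

        experiment = Experiment(simple_dataset, 'b', predictor,
                                cross_validator=None, scorer=scorer, test_set=test_set)

        run_info, estimators = evaluate_predictor(experiment)

        # One train/test split: fit on the whole dataset, score once on the hold-out set.
        # The fixture's feature predicts the label exactly, so the score should be 1.0.
        assert len(estimators) == 1
        assert run_info['results']['test_score'] == {'accuracy_score': 1.0}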