├── .conda └── meta.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── data ├── .gitignore └── pie.py ├── mypy.ini ├── pastry ├── pypastry ├── __init__.py ├── commands │ ├── __init__.py │ ├── init.py │ ├── print_.py │ └── run.py ├── display.py ├── experiment │ ├── __init__.py │ ├── evaluation.py │ ├── hasher.py │ └── results.py └── paths.py ├── setup.py └── tests ├── display_test.py └── evaluation_test.py /.conda/meta.yml: -------------------------------------------------------------------------------- 1 | {% set name = "pypastry" %} 2 | {% set version = "0.0.1" %} 3 | 4 | package: 5 | name: "{{ name|lower }}" 6 | version: "{{ version }}" 7 | 8 | source: 9 | 10 | #url: ../dist/{{ name }}-{{ version }}.tar.gz 11 | url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/{{ name }}-{{ version }}.tar.gz 12 | sha256: a1c81cd0e01fe69637a9c6393dd6e97901d5c5fb8270553c9b6a61978fe0c063 13 | 14 | build: 15 | #number: 0 16 | #noarch: python 17 | #script: python setup.py install 18 | number: 0 19 | script: "{{ PYTHON }} -m pip install . --no-deps --ignore-installed -vv " 20 | 21 | requirements: 22 | host: 23 | # - conda-build 24 | - gitpython 25 | - pandas 26 | - pip 27 | - pyarrow 28 | - python 29 | - scikit-learn 30 | - tomlkit 31 | run: 32 | - gitpython 33 | - pandas 34 | - pyarrow 35 | - python 36 | - scikit-learn 37 | - tomlkit 38 | 39 | test: 40 | imports: 41 | -pypastry 42 | -pandas 43 | -sklearn 44 | 45 | requires: 46 | -python 47 | -unittest 48 | 49 | 50 | about: 51 | home: https://github.com/datapastry/pypastry 52 | license: MIT 53 | license_family: MIT 54 | license_file: LICENSE 55 | summary: PyPastry machine learning experimentation framework 56 | doc_url: 57 | dev_url: 58 | extra: 59 | recipe-maintainers: 60 | - mrrutledge -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | *__pycache__* 3 | *.egg-info 4 | .idea/ 5 | 6 | ### Python ### 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | pip-wheel-metadata/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # pipenv 76 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 77 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 78 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 79 | # install all needed dependencies. 
80 | #Pipfile.lock 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Spyder project settings 89 | .spyderproject 90 | .spyproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | # Mr Developer 96 | .mr.developer.cfg 97 | .project 98 | .pydevproject 99 | 100 | # mkdocs documentation 101 | /site 102 | /sample 103 | 104 | 105 | # mypy 106 | .mypy_cache/ 107 | .dmypy.json 108 | dmypy.json 109 | 110 | # Pyre type checker 111 | .pyre/ 112 | 113 | #Env 114 | .vscode 115 | /pastry-test 116 | 117 | # End of https://www.gitignore.io/api/git,python -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 DataPastry 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include mypy.ini 3 | recursive-include data *.py 4 | recursive-include data *.gitignore 5 | recursive-include pypastry *.py 6 | recursive-include tests *.py 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PyPastry - the opinionated machine learning experimentation framework 2 | ===================================================================== 3 | 4 | PyPastry is a framework for developers and data scientists to run 5 | machine learning experiments. We enable you to: 6 | 7 | - Iterate quickly. The more experiments you do, the more likely you 8 | are to find something that works well. 9 | - Experiment correctly and consistently. Anything else is not really 10 | an experiment, is it? 11 | - Make experiments reproducible. That means keeping track of your 12 | code state and results. 13 | - Experiment locally. None of that Spark rubbish. 14 | - Use standard tools. Everything is based on Scikit-learn, Pandas and Git. 15 | 16 | Quick start 17 | ----------- 18 | 19 | PyPastry requires python 3.5 or greater. 
20 | 
21 |     > pip install pypastry==0.2.0
22 |     > pastry init pastry-test
23 |     > cd pastry-test
24 |     > pastry run -m "First experiment"
25 |     Got dataset with 10 rows
26 |     Git hash Dataset hash Run start Model Score Duration (s)
27 |     0 aa87ce62 71e8f4fd 2019-08-28 06:39:07 DecisionTreeClassifier 0.933 ± 0.067 0.03
28 | 
29 | The command `pastry init` creates a file called `pie.py` in the `pastry-test` directory. If you open
30 | that up, you should see some code. The important bit is:
31 | 
32 |     def get_experiment():
33 |         dataset = pd.DataFrame({
34 |             'feature': [1, 0, 1, 1, 0, 0, 1, 1, 0, 1],
35 |             'class': [True, False, True, True, False, False, True, True, False, False],
36 |         })
37 |         predictor = DecisionTreeClassifier()
38 |         cross_validator = StratifiedKFold(n_splits=5)
39 |         scorer = make_scorer(f1_score)
40 |         label_column = 'class'
41 |         return Experiment(dataset, label_column, predictor, cross_validator, scorer)
42 | 
43 | This returns an `Experiment` instance that specifies how the experiment should be run. An experiment
44 | consists of:
45 | - `dataset`: a Pandas `DataFrame` where each row is an instance to be used in the experiment.
46 | - `label_column`: the name of the column in `dataset` that contains the label we wish to predict.
47 | - `predictor`: a Scikit-learn predictor, e.g. a classifier, regressor or `Pipeline` object.
48 | - `cross_validator`: a Scikit-learn cross validator that specifies how the data should be split
49 | up when running the experiment.
50 | - `scorer`: a Scikit-learn scorer, or a list of scorers, used to measure how well the predictor has
51 | learnt to generate predictions (see the fuller example sketched at the end of this README).
52 | 
53 | When you type `pastry run`, PyPastry does this:
54 | - Splits `dataset` into one or more train and test sets.
55 | - For each train and test set, trains the `predictor` on the train set, generates predictions
56 | on the test set, and computes the score using the `scorer`.
57 | - Generates a results file in JSON format and stores it in a folder called `results`.
58 | - Outputs the results of the experiment.
59 | - Note that your repository has to be clean (no staged or unstaged changes) for the experiment to run. If you want to run with a dirty repository, call pastry with the force flag `-f`; in that case, however, the results can no longer be tied to an exact code state.
60 | 
61 | The results include:
62 | - Git hash: the commit identifier of the code used to run the experiment. It may carry a `"dirty_"` prefix indicating that the repository was not clean when the experiment ran. The hash refers to the latest commit, so any information about staged or unstaged changes is lost.
63 | - Git summary: the summary message passed with `-m`.
64 | - Dataset hash: a hash generated from the dataset that will change if the dataset changes.
65 | - Run start: the time at which the experiment run started.
66 | - Model: the name of the `predictor` class used.
67 | - Score: the mean ± the standard error of the mean, computed over the different folds generated
68 | by the `cross_validator`.
69 | - Duration: how long the experiment took to run, in seconds.
70 | 
71 | Contributing
72 | ------------
73 | 
74 | PyPastry is at an early stage so there's plenty to do and we'd love to have your contribution.
75 | 
76 | Check out the issues for a list of things that need doing and post a comment if you'd like to take
77 | something on.
78 | 
79 | If you have an idea for something you'd like to do, create an issue.
80 | 
81 | Run `python -m pytest` in the project root to run all tests.
82 | 
83 | Thanks for using PyPastry!
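
A fuller example
----------------

The generated `pie.py` above is deliberately tiny. As an illustrative sketch of a slightly richer
experiment (the CSV file name and the `label` column below are made up, but the arguments match the
`Experiment` constructor in `pypastry/experiment/__init__.py`), you can use a Scikit-learn `Pipeline`
as the predictor and pass a list of scorers:

    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, f1_score, make_scorer
    from sklearn.model_selection import StratifiedKFold
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    from pypastry.experiment import Experiment


    def get_experiment():
        # Hypothetical data file: load your own dataset however you like.
        dataset = pd.read_csv('my_data.csv')
        predictor = Pipeline([
            ('scale', StandardScaler()),
            ('classify', LogisticRegression()),
        ])
        cross_validator = StratifiedKFold(n_splits=5)
        # With a list of scorers, the results table gets one score column per metric.
        scorers = [make_scorer(accuracy_score), make_scorer(f1_score)]
        return Experiment(dataset, 'label', predictor, cross_validator, scorers)

Each metric is still reported as the mean ± the standard error of the mean over the folds produced
by the cross validator.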
84 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | .pypastry/ 2 | __pycache__/ 3 | -------------------------------------------------------------------------------- /data/pie.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.metrics import f1_score, make_scorer 3 | from sklearn.model_selection import StratifiedKFold 4 | 5 | from sklearn.tree import DecisionTreeClassifier 6 | 7 | from pypastry.experiment import Experiment 8 | 9 | 10 | def get_experiment(): 11 | dataset = pd.DataFrame({ 12 | 'feature': [1, 0, 1, 1, 0, 0, 1, 1, 0, 1], 13 | 'class': [True, False, True, True, False, False, True, True, False, False], 14 | }) 15 | predictor = DecisionTreeClassifier() 16 | cross_validator = StratifiedKFold(n_splits=5) 17 | scorer = make_scorer(f1_score) 18 | label_column = 'class' 19 | return Experiment(dataset, label_column, predictor, cross_validator, scorer) 20 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | disallow_untyped_defs = False 4 | 5 | -------------------------------------------------------------------------------- /pastry: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import os 4 | import sys 5 | from pstats import Stats 6 | 7 | PROFILE_PATH = '/tmp/pastry.profile' 8 | 9 | 10 | def parse_and_run(): 11 | parser = argparse.ArgumentParser( 12 | description='Run a machine learning experiment', 13 | usage='''pastry [] 14 | 15 | The command can be: 16 | run Run an experiment 17 | print Print out results from previous experiments 18 | ''') 19 | parser.add_argument('command', help='Subcommand to run') 20 | args = parser.parse_args(sys.argv[1:2]) 21 | 22 | if args.command == 'print': 23 | from pypastry.commands.print_ import run 24 | run() 25 | elif args.command == 'run': 26 | from pypastry.commands.run import run 27 | run() 28 | elif args.command == 'init': 29 | from pypastry.commands.init import run 30 | run() 31 | else: 32 | print("Unrecognised command: {}".format(args.command)) 33 | parser.print_usage() 34 | exit(1) 35 | 36 | 37 | if __name__ == "__main__": 38 | if os.environ.get('PASTRY_PROFILE'): 39 | import cProfile 40 | cProfile.run('parse_and_run()', PROFILE_PATH) 41 | stats = Stats(PROFILE_PATH) 42 | stats.sort_stats('cumulative').print_stats(20) 43 | else: 44 | parse_and_run() 45 | -------------------------------------------------------------------------------- /pypastry/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datapastry/pypastry/9d0093e75f81b56d7b924af8177bee7ea1cfecc0/pypastry/__init__.py -------------------------------------------------------------------------------- /pypastry/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datapastry/pypastry/9d0093e75f81b56d7b924af8177bee7ea1cfecc0/pypastry/commands/__init__.py -------------------------------------------------------------------------------- /pypastry/commands/init.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 
from os import path, mkdir, chdir
4 | 
5 | 
6 | def run():
7 |     parser = argparse.ArgumentParser(prog='pastry init')
8 |     parser.add_argument('directory', nargs='?', type=str,
9 |                         help='Path to directory to create (or default to current directory)')
10 |     args = parser.parse_args(sys.argv[2:])  # type: argparse.Namespace
11 | 
12 |     directory = args.directory if args.directory is not None else '.'
13 | 
14 |     try:
15 |         mkdir(directory)
16 |     except FileExistsError:
17 |         pass
18 | 
19 |     chdir(directory)
20 | 
--------------------------------------------------------------------------------
/pypastry/commands/print_.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 | from pypastry.display import print_cache_file, cache_display, _get_results_dataframe
4 | from pypastry.paths import RESULTS_PATH
5 | from pypastry.experiment.results import ResultsRepo
6 | 
7 | 
8 | def run():
9 |     parser = argparse.ArgumentParser(prog='pastry print')
10 |     parser.add_argument('-l', '--limit', type=int, default=None, help='Limit lines to print')
11 |     parser.add_argument('-e', '--export', type=str, required=False, help='File to output the results in CSV format')
12 | 
13 |     args = parser.parse_args(sys.argv[2:])
14 |     if args.export is not None:
15 |         results = get_results()
16 |         results_dataframe = _get_results_dataframe(results)
17 |         results_dataframe.to_csv(args.export)
18 |         return
19 |     try:
20 |         print_cache_file(args.limit)
21 |     except FileNotFoundError:
22 |         results = get_results()
23 |         cache_display(results)
24 |         print_cache_file(args.limit)
25 | 
26 | 
27 | def get_results():
28 |     results_repo = ResultsRepo(RESULTS_PATH)
29 |     results = results_repo.get_results()
30 |     return results
31 | 
--------------------------------------------------------------------------------
/pypastry/commands/run.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 | 
4 | from pypastry.experiment.evaluation import run_experiment
5 | 
6 | 
7 | def run():
8 |     parser = argparse.ArgumentParser(prog='pastry run')
9 |     parser.add_argument('-m', '--message', default="", type=str, help='Summary message about the experiment.')
10 |     parser.add_argument('-f', '--force', action='store_true', help='Force a re-run of the experiment')
11 |     parser.add_argument('-p', '--no-print', action='store_true', help='Do not print results.')
12 | 
13 |     args = parser.parse_args(sys.argv[2:])
14 | 
15 |     sys.path.append('.')
16 |     import pie
17 |     experiment = pie.get_experiment()
18 |     force = args.force
19 |     message = args.message
20 | 
21 |     run_experiment(experiment, message, force, show_results=not args.no_print)
22 | 
--------------------------------------------------------------------------------
/pypastry/display.py:
--------------------------------------------------------------------------------
1 | """
2 | Handle displaying results.
3 | 
4 | This code needs to display results really fast. That's why it has some odd things:
5 | - Inline imports
6 | - Lack of typing for the results_repo to avoid unnecessary imports
7 | 
8 | This is because I don't like having to wait a second for things to be imported.
9 | I want my pastry now!
10 | """ 11 | import os 12 | from typing import Any, Dict, List, Iterator, TYPE_CHECKING 13 | 14 | from pandas import Series 15 | 16 | from pypastry.paths import DISPLAY_PATH, DISPLAY_DIR 17 | if TYPE_CHECKING: 18 | import pypastry 19 | 20 | 21 | def cache_display(results_from_repo: Iterator['pypastry.experiment.results.Result']) -> None: 22 | results_dataframe = _get_results_dataframe(results_from_repo) 23 | display = repr(results_dataframe) 24 | 25 | try: 26 | os.mkdir(DISPLAY_DIR) 27 | except FileExistsError: 28 | pass 29 | 30 | with open(DISPLAY_PATH, 'w') as output_file: 31 | output_file.write(display) 32 | 33 | 34 | def _get_results_dataframe(results_from_repo: Iterator['pypastry.experiment.results.Result']) -> 'DataFrame': 35 | from pandas import DataFrame, set_option 36 | set_option('display.max_rows', None) 37 | set_option('display.max_columns', None) 38 | set_option('display.width', None) 39 | set_option('display.max_colwidth', -1) 40 | results = [] 41 | for repo_result in results_from_repo: 42 | data = repo_result.data 43 | result = { 44 | 'Git hash': data["git_hash"] if "git_hash" in data else "Unavailable", 45 | 'Summary': data["git_summary"] if "git_summary" in data else "Unavailable", 46 | 'Dataset hash': data['dataset']['hash'][:8], 47 | 'Dataset size': data['dataset']['size'] if "size" in data["dataset"] else "Unavailable", 48 | 'Result JSON name': data['result_json_name'] if "result_json_name" in data else "Unavailable", 49 | 'Run start': data['run_start'][:19], 50 | 'Model': data['model_info']['type'], 51 | 'Duration (s)': "{:.2f}".format(data['run_seconds']), 52 | } 53 | 54 | try: 55 | scores = DataFrame(data['results']) 56 | for row in scores.itertuples(): 57 | result[row.Index] = "{:.3f} ± {:.3f}".format(row.test_score, row.test_score_sem) 58 | except ValueError: 59 | result['Score'] = "{:.3f} ± {:.3f}".format(data['results']['test_score'], 60 | data['results']['test_score_sem']) 61 | 62 | results.append(result) 63 | results_dataframe = DataFrame(results) 64 | return results_dataframe.sort_values(by='Run start').reset_index(drop=True) 65 | 66 | 67 | def print_cache_file(limit=False): 68 | with open(DISPLAY_PATH) as display_file: 69 | print(display_file.read()) 70 | # read_lines = display_file.read() 71 | # read_list = read_lines.split("\n") 72 | # if limit: 73 | # limit = min(limit, len(read_list)-3) 74 | # # to avoid printing more than there is 75 | # else: 76 | # limit = len(read_list)-3 77 | # print(read_list[0]) 78 | # print("\n".join(read_list[-(2+limit):-2])) 79 | # print(read_list[-1]) 80 | 81 | -------------------------------------------------------------------------------- /pypastry/experiment/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Union, Iterable 2 | 3 | from pandas import DataFrame 4 | from sklearn.base import BaseEstimator 5 | from sklearn.metrics import accuracy_score, make_scorer 6 | from sklearn.metrics._scorer import _BaseScorer as BaseScorer 7 | 8 | 9 | class Experiment: 10 | def __init__(self, dataset: DataFrame, label_column: str, predictor: BaseEstimator, 11 | cross_validator: Any = None, scorer: Union[BaseScorer, Iterable[BaseScorer]] = None, 12 | group_column: str=None, test_set: DataFrame = None, average_scores_on_instances: bool = False, 13 | additional_info: Callable[[BaseEstimator], Any] = None): 14 | if (test_set is not None) == (cross_validator is not None): 15 | raise ValueError("You must specify either a cross validator or a test set (and not 
both)") 16 | 17 | if average_scores_on_instances and group_column is not None: 18 | raise ValueError("You can only average on instances when not grouping instances") 19 | 20 | if scorer is None: 21 | scorer = [make_scorer(accuracy_score)] 22 | 23 | if not isinstance(scorer, Iterable): 24 | if not isinstance(scorer, BaseScorer): 25 | raise ValueError("Scorer must be created using make_scorer()") 26 | 27 | scorer = [scorer] 28 | 29 | self.dataset = dataset 30 | self.label_column = label_column 31 | self.predictor = predictor 32 | self.cross_validator = cross_validator 33 | self.scorer = scorer 34 | self.group_column = group_column 35 | self.test_set = test_set 36 | self.average_scores_on_instances = average_scores_on_instances 37 | self.additional_info = additional_info 38 | -------------------------------------------------------------------------------- /pypastry/experiment/evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | from types import ModuleType 4 | from typing import Any, Dict, Tuple, List 5 | from pathlib import Path 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from git import Repo 10 | from joblib import Parallel, delayed 11 | from pandas import Series 12 | from sklearn.base import BaseEstimator, is_classifier, clone 13 | from sklearn.metrics._scorer import _BaseScorer 14 | from sklearn.model_selection import check_cv, PredefinedSplit 15 | 16 | from pypastry import display 17 | from pypastry.experiment import Experiment 18 | from pypastry.experiment.hasher import get_dataset_hash 19 | from pypastry.experiment.results import ResultsRepo 20 | from pypastry.paths import REPO_PATH, RESULTS_PATH 21 | 22 | MAX_PARAMETER_VALUE_LENGTH = 500 23 | 24 | 25 | class DirtyRepoError(Exception): 26 | def __init__(self, message): 27 | super().__init__(message) 28 | 29 | 30 | class ExperimentRunner: 31 | def __init__(self, git_repo: Repo, results_repo: ResultsRepo, results_display: ModuleType): 32 | self.git_repo = git_repo 33 | self.results_repo = results_repo 34 | self.results_display = results_display 35 | 36 | def run_experiment( 37 | self, 38 | experiment: Experiment, 39 | message: str = "", 40 | force: bool = False, 41 | limit: int = None, 42 | show_results: bool = True, 43 | ) -> Tuple[List[BaseEstimator], Path]: 44 | 45 | print("Got dataset with {} rows".format(len(experiment.dataset))) 46 | if force or not self.git_repo.is_dirty(): 47 | print("Running evaluation") 48 | estimators, result_file_path = self._run_evaluation(experiment, message) 49 | results = self.results_repo.get_results() 50 | self.results_display.cache_display(results) 51 | else: 52 | raise DirtyRepoError("There are untracked/unstaged/staged changes in git repo, force flag was not given. 
" 53 | "Please commit your changes or provide force flag - note that in this case " 54 | "saved commit hash in your result file will not correspond to the actual code!") 55 | if show_results: 56 | self.results_display.print_cache_file(limit) 57 | 58 | return estimators, result_file_path 59 | 60 | def _run_evaluation(self, experiment: Experiment, message: str) -> Tuple[List[BaseEstimator], Path]: 61 | run_info, estimators = evaluate_predictor(experiment) 62 | dataset_hash = get_dataset_hash(experiment.dataset, experiment.test_set) 63 | dataset_info = { 64 | 'hash': dataset_hash, 65 | 'columns': experiment.dataset.columns.tolist(), 66 | 'size': len(experiment.dataset), 67 | } 68 | git_info = { 69 | "git_hash_msg": ("dirty_" if self.git_repo.is_dirty() else "") + self.git_repo.head.object.hexsha[:8], 70 | "git_summary_msg": message, 71 | } 72 | result_file_path = self.results_repo.save_results(run_info, dataset_info, git_info=git_info) 73 | 74 | return estimators, result_file_path 75 | 76 | 77 | def evaluate_predictor(experiment: Experiment) -> Dict[str, Tuple[Any, List[BaseEstimator]]]: 78 | start = datetime.utcnow() 79 | scores, estimators = _get_scores_and_estimators(experiment) 80 | end = datetime.utcnow() 81 | 82 | additional = experiment.additional_info 83 | additional_info = [additional(estimator) if additional is not None else None 84 | for estimator in estimators] 85 | 86 | if experiment.group_column is not None: 87 | for group, score_values in scores: 88 | score_values[experiment.group_column] = group 89 | 90 | values = [x[1] for x in scores] 91 | 92 | scores_array = pd.DataFrame(values) 93 | 94 | mean_score = scores_array.mean().to_dict() 95 | sem_score = scores_array.sem().to_dict() 96 | results = {'test_score': mean_score, 'test_score_sem': sem_score} 97 | 98 | model_info = get_model_info(experiment.predictor) 99 | 100 | run_info = { 101 | 'run_start': str(start), 102 | 'run_end': str(end), 103 | 'run_seconds': (end - start).total_seconds(), 104 | 'results': results, 105 | 'results_detail': scores_array.to_dict('list'), 106 | 'model_info': model_info, 107 | 'additional_info': additional_info, 108 | } 109 | 110 | return run_info, estimators 111 | 112 | 113 | def get_model_info(model: BaseEstimator): 114 | all_info = model.get_params() 115 | info = {key: value for key, value in all_info.items() 116 | if len(json.dumps(value, default=str)) < MAX_PARAMETER_VALUE_LENGTH} 117 | info['type'] = type(model).__name__ 118 | return info 119 | 120 | 121 | def _get_scores_and_estimators(experiment: Experiment) -> Tuple[List[float], List[Any]]: 122 | if experiment.test_set is not None: 123 | assert experiment.cross_validator is None, "Cannot use a cross validator with train test split" 124 | dataset = pd.concat([experiment.dataset, experiment.test_set]) 125 | split = np.array([-1] * len(experiment.dataset) + [1] * len(experiment.test_set)) 126 | cross_validator = PredefinedSplit(split) 127 | else: 128 | dataset = experiment.dataset 129 | cross_validator = experiment.cross_validator 130 | 131 | X = dataset.drop(columns=[experiment.label_column]) 132 | y = dataset[experiment.label_column] 133 | if experiment.group_column is None: 134 | if experiment.average_scores_on_instances: 135 | groups = Series(range(len(X)), index=X.index) 136 | else: 137 | groups = None 138 | else: 139 | groups = X[experiment.group_column] 140 | X = X.drop(columns=[experiment.group_column]) 141 | 142 | cv = check_cv(cross_validator, y, classifier=is_classifier(experiment.predictor)) 143 | train_test = cv.split(X, y, 
groups) 144 | 145 | # We clone the estimator to make sure that all the folds are 146 | # independent, and that it is pickle-able. 147 | parallel = Parallel(n_jobs=None, verbose=False, 148 | pre_dispatch='2*n_jobs') 149 | scores_and_estimators = parallel( 150 | delayed(_fit_and_predict)( 151 | clone(experiment.predictor), X, y, train, test, groups, experiment.scorer) 152 | for train, test in train_test) 153 | scores_lists, estimators = zip(*scores_and_estimators) 154 | scores = [score for score_list in scores_lists for score in score_list] 155 | return scores, estimators 156 | 157 | 158 | def _fit_and_predict(estimator: BaseEstimator, X, y, train, test, groups, scorer): 159 | if groups is not None: 160 | scores = _fit_and_predict_groups(X, estimator, groups, scorer, test, train, y) 161 | else: 162 | scores = _fit_and_predict_simple(X, estimator, scorer, test, train, y) 163 | return scores, estimator 164 | 165 | 166 | def _fit_and_predict_simple(X, estimator, scorers, test, train, y): 167 | X_train = X.iloc[train] 168 | y_train = y.iloc[train] 169 | estimator.fit(X_train, y_train) 170 | X_test = X.iloc[test] 171 | y_test = y.iloc[test] 172 | score = _score(scorers, estimator, X_test, y_test) 173 | return [(None, score)] 174 | 175 | 176 | def _fit_and_predict_groups(X, estimator, groups, scorers, test, train, y): 177 | X_train = X.iloc[train] 178 | y_train = y.iloc[train] 179 | estimator.fit(X_train, y_train) 180 | X_test = X.iloc[test] 181 | y_test = y.iloc[test] 182 | groups_test = groups.iloc[test] 183 | test_df = pd.DataFrame(X_test) 184 | test_df['y'] = y_test 185 | test_df['groups'] = groups_test 186 | test_groups = test_df.groupby('groups') 187 | scores = [] 188 | for key, group in test_groups: 189 | X_group = group[X.columns] 190 | score = _score(scorers, estimator, X_group, group['y']) 191 | scores.append((key, score)) 192 | return scores 193 | 194 | 195 | def _score(scorers: List[_BaseScorer], estimator, X_test, y_test): 196 | scores = {} 197 | for scorer in scorers: 198 | score = scorer(estimator, X_test, y_test) 199 | score_name = scorer._score_func.__name__ 200 | sign = scorer._sign 201 | score_ignoring_sign = score*sign 202 | scores[score_name] = score_ignoring_sign 203 | return scores 204 | 205 | 206 | def run_experiment(experiment, message="", force=False, show_results=True) -> Tuple[List[BaseEstimator], Path]: 207 | git_repo = Repo(REPO_PATH, search_parent_directories=True) # type: pypastry.experiment.Experiment 208 | results_repo = ResultsRepo(RESULTS_PATH) # type: pypastry.experiment.results.ResultsRepo 209 | runner = ExperimentRunner(git_repo, results_repo, display) # type: 210 | # pypastry.experiment.evaluation.ExperimentRunner 211 | return runner.run_experiment( 212 | experiment=experiment, 213 | message=message, 214 | force=force, 215 | show_results=show_results, 216 | ) 217 | -------------------------------------------------------------------------------- /pypastry/experiment/hasher.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from io import BytesIO 3 | 4 | import pandas as pd 5 | from pandas import DataFrame 6 | 7 | BLOCKSIZE = 65536 8 | 9 | 10 | def get_dataset_hash(dataset: DataFrame, test_set: DataFrame = None) -> str: 11 | buffer = BytesIO() 12 | dataset.to_parquet(buffer) 13 | if test_set is not None: 14 | test_set.to_parquet(buffer) 15 | data = buffer.getvalue() 16 | 17 | return get_bytes_hash(data) 18 | 19 | 20 | def get_bytes_hash(data: bytes): 21 | hasher = hashlib.sha1() 22 | for i in range(0, 
len(data), BLOCKSIZE): 23 | block = data[i:i + BLOCKSIZE] 24 | hasher.update(block) 25 | return hasher.hexdigest() 26 | 27 | 28 | if __name__ == "__main__": 29 | data = pd.read_csv('../data/iris.csv') 30 | hash = get_dataset_hash(data) 31 | print(hash) 32 | -------------------------------------------------------------------------------- /pypastry/experiment/results.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import glob 4 | from pathlib import Path 5 | from tempfile import NamedTemporaryFile 6 | from typing import Dict, Any, NamedTuple 7 | 8 | 9 | Result = NamedTuple('Result', [('data', Dict[str, Any])]) 10 | 11 | 12 | class ResultsRepo: 13 | def __init__(self, results_path: str): 14 | self.results_path = results_path 15 | 16 | def save_results(self, run_info: Dict[str, Any], dataset_info: Dict[str, Any], git_info: Dict[str, str]) -> Path: 17 | try: 18 | os.mkdir(self.results_path) 19 | except FileExistsError: 20 | pass 21 | run_info['dataset'] = dataset_info 22 | run_info['git_hash'] = git_info["git_hash_msg"] 23 | run_info['git_summary'] = git_info["git_summary_msg"] 24 | with NamedTemporaryFile(mode='w', prefix='result-', suffix='.json', 25 | dir=self.results_path, delete=False) as output_file: 26 | result_file_path = Path(output_file.name) 27 | run_info["result_json_name"] = result_file_path.name 28 | json.dump(run_info, output_file, indent=4, default=str) 29 | output_file.flush() 30 | 31 | return result_file_path 32 | 33 | def get_results(self): 34 | for path in glob.glob(os.path.join(self.results_path, "*.json")): 35 | with open(str(path), "r") as results_file: 36 | result_json = json.load(results_file) 37 | yield Result(result_json) 38 | -------------------------------------------------------------------------------- /pypastry/paths.py: -------------------------------------------------------------------------------- 1 | DISPLAY_DIR = '.pypastry' 2 | DISPLAY_PATH = DISPLAY_DIR + '/display.txt' 3 | RESULTS_PATH = 'results' 4 | REPO_PATH = '.' 
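
A note on how these pieces fit together: `ResultsRepo` (above) writes one `result-*.json` file per
run into the folder named by `RESULTS_PATH`, and the display module caches its rendered table under
`DISPLAY_DIR`. The following sketch is illustrative only; the run, dataset and git values are copied
from the README's quick-start example rather than taken from a real run:

    from pypastry.experiment.results import ResultsRepo
    from pypastry.paths import RESULTS_PATH

    repo = ResultsRepo(RESULTS_PATH)
    result_path = repo.save_results(
        run_info={
            'run_start': '2019-08-28 06:39:07',
            'run_seconds': 0.03,
            'results': {'test_score': {'f1_score': 0.933},
                        'test_score_sem': {'f1_score': 0.067}},
            'model_info': {'type': 'DecisionTreeClassifier'},
        },
        dataset_info={'hash': '71e8f4fd', 'columns': ['feature', 'class'], 'size': 10},
        git_info={'git_hash_msg': 'aa87ce62', 'git_summary_msg': 'First experiment'},
    )
    print(result_path.name)  # a name like result-XXXXXXXX.json inside the results folder

    for result in repo.get_results():
        # Each Result wraps the parsed JSON dict in its .data attribute.
        print(result.data['git_hash'], result.data['results']['test_score'])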
5 | 6 | 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from setuptools import setup, find_packages 4 | 5 | #check to make sure the python version is compatible 6 | if sys.version_info < (3, 6): 7 | sys.exit('Sorry, PyPastry requires Python version 3.6 or greater') 8 | 9 | # Reading in the ReadMe file as the doc file 10 | with open("README.md", "r") as fh: 11 | long_description = fh.read() 12 | 13 | setup( 14 | name='pypastry', 15 | version='0.3.0', 16 | description='PyPastry machine learning experimentation framework', 17 | author='Daoud Clarke', 18 | url='https://github.com/datapastry/pypastry', 19 | scripts=['pastry'], 20 | install_requires=['tomlkit', 'pandas', 'scikit-learn', 'pyarrow', 'gitpython', 'pytest'], 21 | #To find the packages 22 | packages=find_packages(), 23 | #To read in data file modules 24 | py_modules=['data/pie'], 25 | # commands that can be run in a console in the commands folder 26 | entry_points={ 27 | 'console_scripts': [ 28 | 'init = pypastry.commands.init:run', 29 | 'print = pypastry.commands.print_:run', 30 | 'run = pypastry.commands.run:run' 31 | ]}, 32 | package_data={ 33 | 34 | '' : ['data/*.gitignore'], 35 | # And include any *.gitignore files found in the 'data' package, too: 36 | 'data': ['*.gitignore'], 37 | 38 | }, 39 | long_description=long_description, 40 | long_description_content_type='text/markdown', 41 | # Make the setup file aware of the Manifest file 42 | include_package_data=True, 43 | #Minimum requirement of python, licesnse, and operating system. 44 | classifiers=[ 45 | "Programming Language :: Python :: 3.6", 46 | "Programming Language :: Python :: 3.7", 47 | "Programming Language :: Python :: 3.8", 48 | "License :: OSI Approved :: MIT License", 49 | "Operating System :: OS Independent"], 50 | python_requires='>=3.5', 51 | ) 52 | -------------------------------------------------------------------------------- /tests/display_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pypastry.display import _get_results_dataframe 4 | from pypastry.experiment.results import Result 5 | 6 | 7 | @pytest.fixture 8 | def get_result_dict(): 9 | 10 | return { 11 | "run_start": "2020-01-31 09:19:44.261276", 12 | "run_end": "2020-01-31 09:19:48.743458", 13 | "run_seconds": 4.482182, 14 | "results": { 15 | "test_score": { 16 | "mean_relative_error": 0.5, 17 | "mean_absolute_error": 100, 18 | "mean_squared_error": 1000 19 | }, 20 | "test_score_sem": { 21 | "mean_relative_error": 0.01, 22 | "mean_absolute_error": 1.0, 23 | "mean_squared_error": 10.0 24 | } 25 | }, 26 | "model_info": { 27 | "n_neighbors": 10, 28 | "type": "KNearestNeighbor" 29 | }, 30 | "additional_info": [ 31 | None 32 | ], 33 | "dataset": { 34 | "hash": "998c9dea0afb12d91a8c67f256f80b0a603dd59b", 35 | "columns": [ 36 | "input", 37 | "output" 38 | ], 39 | "size": 100, 40 | }, 41 | "git_hash": "123456781234567812345678", 42 | "git_summary": "12345678", 43 | "result_json_name": "jsonhash", 44 | } 45 | 46 | 47 | def test_get_display(get_result_dict): 48 | result = Result(get_result_dict) 49 | results_dataframe = _get_results_dataframe([result]) 50 | row = results_dataframe.iloc[0].to_dict() 51 | 52 | expected = { 53 | 'Git hash': get_result_dict["git_hash"], 54 | 'Summary': get_result_dict["git_summary"], 55 | 'Dataset size': get_result_dict["dataset"]["size"], 56 | 'Dataset hash': 
get_result_dict["dataset"]["hash"][:8], 57 | 'Run start': get_result_dict["run_start"][:19], 58 | 'Model': get_result_dict['model_info']["type"], 59 | 'Result JSON name': get_result_dict['result_json_name'], 60 | 'Duration (s)': "{:.2f}".format(get_result_dict['run_seconds']), 61 | 'mean_relative_error': "{:.3f} ± {:.3f}".format( 62 | get_result_dict["results"]["test_score"]["mean_relative_error"], 63 | get_result_dict["results"]["test_score_sem"]["mean_relative_error"] 64 | ), 65 | 'mean_absolute_error': "{:.3f} ± {:.3f}".format( 66 | get_result_dict["results"]["test_score"]["mean_absolute_error"], 67 | get_result_dict["results"]["test_score_sem"]["mean_absolute_error"] 68 | ), 69 | 'mean_squared_error': "{:.3f} ± {:.3f}".format( 70 | get_result_dict["results"]["test_score"]["mean_squared_error"], 71 | get_result_dict["results"]["test_score_sem"]["mean_squared_error"] 72 | ), 73 | } 74 | assert expected == row 75 | -------------------------------------------------------------------------------- /tests/evaluation_test.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, MagicMock 2 | 3 | import pytest 4 | from pandas import DataFrame 5 | from sklearn.dummy import DummyClassifier 6 | from sklearn.metrics import accuracy_score, make_scorer, precision_score 7 | from sklearn.model_selection import StratifiedShuffleSplit, GroupShuffleSplit 8 | from sklearn.tree import DecisionTreeClassifier 9 | 10 | from pypastry.experiment import Experiment 11 | from pypastry.experiment.evaluation import ExperimentRunner, evaluate_predictor, DirtyRepoError 12 | 13 | 14 | @pytest.fixture 15 | def simple_dataset(): 16 | return DataFrame({ 17 | 'a': [1, 1, 0, 0], 18 | 'b': [1, 1, 0, 0], 19 | }) 20 | 21 | 22 | @pytest.fixture 23 | def grouped_dataset(): 24 | label = [i % 2 for i in range(100)] 25 | return DataFrame({ 26 | 'a': label, 27 | 'b': label, 28 | 'g': [i // 2 for i in range(100)] 29 | }) 30 | 31 | 32 | @pytest.fixture 33 | def get_predictor(): 34 | return DecisionTreeClassifier() 35 | 36 | 37 | @pytest.fixture 38 | def get_cross_validator(): 39 | return StratifiedShuffleSplit(n_splits=1, test_size=0.5) 40 | 41 | 42 | @pytest.fixture 43 | def get_scorer(): 44 | return make_scorer(accuracy_score) 45 | 46 | 47 | @pytest.mark.parametrize("dirty, force", [(False, False), (False, True), (True, False), (True, True)]) 48 | def test_simple_evaluation(dirty, force, simple_dataset): 49 | 50 | cross_validation = StratifiedShuffleSplit(n_splits=1, test_size=0.5) 51 | predictor = DecisionTreeClassifier() 52 | scorer = make_scorer(accuracy_score) 53 | 54 | experiment = Experiment(simple_dataset, 'b', predictor, cross_validation, scorer) 55 | 56 | git_mock = Mock() 57 | git_mock.is_dirty.return_value = dirty 58 | git_mock.head.object.hexsha = MagicMock() 59 | results_repo_mock = Mock() 60 | results_display_mock = Mock() 61 | runner = ExperimentRunner(git_mock, results_repo_mock, results_display_mock) 62 | 63 | try: 64 | runner.run_experiment(experiment, "msg", force) 65 | except DirtyRepoError: 66 | if dirty is True and force is False: 67 | # Expected behaviour. 
68 | return 69 | else: 70 | raise 71 | 72 | call_args_list = results_repo_mock.save_results.call_args_list 73 | assert 1 == len(call_args_list) 74 | run_info, dataset_info = call_args_list[0][0] 75 | print("Run info", run_info) 76 | 77 | results = run_info['results'] 78 | assert {'accuracy_score': 1.0} == results['test_score'] 79 | assert ['a', 'b'] == dataset_info['columns'] 80 | 81 | # TODO: check the hash. Need to find a way to make this consistent between python versions etc. 82 | # assert '28ea628a50a47c726a9b0ec437c88fc4742d81fd' == dataset_info['hash'] 83 | 84 | assert 1 == len(results_display_mock.cache_display.call_args_list) 85 | print(results_display_mock.cache_display.call_args[0]) 86 | assert len(results_display_mock.cache_display.call_args[0]) > 0 87 | assert 1 == len(results_display_mock.print_cache_file.call_args_list) 88 | 89 | 90 | @pytest.mark.parametrize("dirty, force", [(False, False), (False, True), (True, False), (True, True)]) 91 | def test_grouped_evaluation(dirty, force, grouped_dataset): 92 | 93 | cross_validation = GroupShuffleSplit(n_splits=1, test_size=0.5) 94 | predictor = DummyClassifier(strategy='constant', constant=1) 95 | scorer = make_scorer(accuracy_score) 96 | 97 | experiment = Experiment(grouped_dataset, 'b', predictor, cross_validation, scorer, group_column='g') 98 | 99 | git_mock = Mock() 100 | git_mock.is_dirty.return_value = dirty 101 | git_mock.head.object.hexsha = MagicMock() 102 | results_repo_mock = Mock() 103 | results_display_mock = Mock() 104 | runner = ExperimentRunner(git_mock, results_repo_mock, results_display_mock) 105 | 106 | try: 107 | runner.run_experiment(experiment, "msg", force) 108 | except DirtyRepoError: 109 | if dirty is True and force is False: 110 | # Expected behaviour. 111 | return 112 | else: 113 | raise 114 | 115 | assert 1 == len(results_repo_mock.save_results.call_args_list) 116 | run_info, dataset_info = results_repo_mock.save_results.call_args[0] 117 | 118 | print("Run infos", run_info) 119 | 120 | assert run_info['results']["test_score"]["accuracy_score"] == 0.5 121 | assert run_info['results']["test_score_sem"]["accuracy_score"] == 0.0 122 | 123 | 124 | def test_multiple_scorers(simple_dataset): 125 | 126 | cross_validation = StratifiedShuffleSplit(n_splits=2, test_size=0.5) 127 | predictor = DummyClassifier(strategy='constant', constant=1) 128 | scorer = [make_scorer(accuracy_score), make_scorer(precision_score)] 129 | 130 | experiment = Experiment(simple_dataset, 'b', predictor, cross_validation, scorer) 131 | 132 | run_info, _ = evaluate_predictor(experiment) 133 | results = run_info['results'] 134 | print("Results", results) 135 | 136 | expected_results = { 137 | 'test_score': {'accuracy_score': 0.5, 'precision_score': 0.5}, 138 | 'test_score_sem': {'accuracy_score': 0.0, 'precision_score': 0.0}, 139 | } 140 | 141 | assert expected_results == results 142 | --------------------------------------------------------------------------------
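
One path that the tests above leave uncovered is hold-out evaluation, where an `Experiment` gets an
explicit `test_set` instead of a cross validator. A sketch of such a test, reusing the existing
imports and the `simple_dataset` fixture from `tests/evaluation_test.py` (the test name and the
four-row hold-out frame are made up for illustration):

    def test_holdout_evaluation(simple_dataset):
        # Hold-out evaluation: supply a test_set and no cross validator.
        test_set = DataFrame({
            'a': [1, 0, 1, 0],
            'b': [1, 0, 1, 0],
        })
        predictor = DecisionTreeClassifier()
        scorer = make_scorer(accuracy_score)

        experiment = Experiment(simple_dataset, 'b', predictor,
                                cross_validator=None, scorer=scorer, test_set=test_set)

        run_info, estimators = evaluate_predictor(experiment)

        # One train/test split: fit on the whole dataset, score once on the hold-out set.
        # The fixture's feature predicts the label exactly, so the score should be 1.0.
        assert len(estimators) == 1
        assert run_info['results']['test_score'] == {'accuracy_score': 1.0}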