├── .coveragerc ├── .github └── ISSUE_TEMPLATE.md ├── .gitignore ├── .travis.yml ├── CONTRIBUTING.rst ├── LICENSE ├── ODSC_East_2019 └── ODSC_XGBoost_Interpretability.ipynb ├── ODSC_East_2020 └── ODSC_East_2020_XGBoost_Interpretability.ipynb ├── ODSC_West_2019 └── ODSC_West_2019_XGBoost_Interpretability.ipynb ├── README.md ├── README.rst ├── deploy_steps.md ├── docs-mkdocs ├── docs │ └── index.md └── mkdocs.yml ├── docs ├── Makefile ├── calib_overview.rst ├── conf.py ├── index.rst ├── install.rst ├── interp_overview.rst ├── make.bat ├── rtfd-requirements.txt ├── splinecalib_class.rst └── splinecalib_examples.rst ├── examples ├── Ames_Housing_Analysis.ipynb ├── Calibration_Example_ICU_MIMIC.ipynb ├── Calibration_Example_ICU_MIMIC_Short.ipynb ├── ICU_Mortality_MIMIC.ipynb ├── SplineCalib_Details.ipynb ├── SplineCalib_Multiclass_MNIST.ipynb ├── SplineCalib_Tutorial.ipynb └── data │ ├── Ames_Housing_Data.tsv │ ├── cal_housing_data.csv │ └── lab_vital_icu_table.csv ├── extra └── code │ ├── discrete_dt.c │ ├── discrete_dt.pyx │ ├── discrete_gb.c │ ├── discrete_gb.pyx │ ├── graphs.c │ ├── graphs.pyx │ ├── hypergraphs.c │ ├── hypergraphs.pyx │ ├── setup.py │ ├── structure_dt.c │ ├── structure_dt.pyx │ ├── structure_gb.c │ └── structure_gb.pyx ├── ml_insights ├── CVModel.py ├── __init__.py ├── calibration.py ├── cross_validation.py ├── data │ ├── faux_data.csv │ ├── ortho.csv │ └── para.csv ├── insights.py ├── modeling_utils.py ├── shap_insights.py ├── tests │ ├── __init__.py │ ├── test_example.py │ └── test_utils.py └── utils.py ├── mli_screenshot.png ├── pyproject.toml ├── requirements.txt └── setup.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = ml_insights/* 4 | include = ml_insights/* 5 | omit = */setup.py 6 | [report] 7 | include = ml_insights/* 8 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * ML Inspector version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 15 | ``` 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | .DS_Store 7 | 8 | #Ipython Notebook 9 | .ipynb_checkpoints 10 | 11 | #Ipython Notebook Temp files 12 | *copy*.ipynb 13 | untitled* 14 | untitled*.ipynb 15 | *Copy*.ipynb 16 | Untitled* 17 | Untitled*.ipynb 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | env/ 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | .eggs/ 31 | lib/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | *.log 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *,cover 60 | .hypothesis/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # mkdocs 67 | docs-mkdocs/site/ 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # local 76 | local/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | 3 | env: 4 | global: 5 | - CONDA_DEPS="pip flake8 pytest numpy scipy matplotlib pandas" PIP_DEPS="coveralls pytest-cov" 6 | 7 | matrix: 8 | include: 9 | - os: osx 10 | env: 11 | - PYTHON_VERSION=2.7 12 | - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh" 13 | - os: osx 14 | env: 15 | - PYTHON_VERSION=3.5 16 | - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh" 17 | - os: linux 18 | env: 19 | - PYTHON_VERSION=2.7 20 | - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" 21 | - os: linux 22 | env: 23 | - PYTHON_VERSION=3.5 24 | - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" 25 | 26 | 27 | before_install: 28 | - export MINICONDA=$HOME/miniconda 29 | - export PATH="$MINICONDA/bin:$PATH" 30 | - hash -r 31 | - echo $MINICONDA_URL 32 | - wget $MINICONDA_URL -O miniconda.sh; 33 | - bash miniconda.sh -b -f -p $MINICONDA; 34 | - conda config --set always_yes yes 35 | - conda update conda 36 | - conda info -a 37 | - conda config --add channels conda-forge 38 | - conda install python=$PYTHON_VERSION $CONDA_DEPS 39 | - travis_retry pip install $PIP_DEPS 40 | 41 | install: 42 | - python setup.py install --record installed_files.txt 43 | 44 | script: 45 | - flake8 --ignore N802,N806 `find . -name \*.py | grep -v setup.py | grep -v /doc/` 46 | 47 | - mkdir for_test 48 | - cd for_test 49 | - py.test --pyargs ml_insights --cov-report term-missing --cov=ml_insights 50 | 51 | after_success: 52 | - coveralls 53 | 54 | before_cache: 55 | # clean unused packages & installed files from conda cache 56 | # this makes the cache rebuilt less frequently 57 | - conda clean --tarballs --packages --index-cache 58 | - rm -rf $HOME/miniconda/pkgs/cache 59 | - xargs rm >> import ml_insights as mli 24 | >>> xray = mli.ModelXRay(model, data.sample(500)) 25 | >>> xray.feature_dependence_plots() 26 | 27 | ![mli_screenshot](mli_screenshot.png) 28 | 29 | Find more detailed examples here: 30 | [https://github.com/numeristical/introspective/tree/master/examples](https://github.com/numeristical/introspective/tree/master/examples) 31 | 32 | 33 | ## Other Documentation 34 | 35 | [https://ml-insights.readthedocs.io](https://ml-insights.readthedocs.io) 36 | 37 | Disclaimer 38 | ========== 39 | 40 | We have tested this tool to the best of our ability, but understand that it may have bugs. It was developed on Python 3. Use at your own risk, but feel free to report any bugs to our github. 41 | 42 | Installation 43 | ============= 44 | 45 | $ pip install ml_insights 46 | 47 | 48 | Source 49 | ====== 50 | 51 | Find the latest version on github: https://github.com/numeristical/introspective 52 | 53 | Feel free to fork and contribute! 
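Calibration
===========

ML-Insights also includes the `SplineCalib` class for post-hoc probability calibration (described in more detail in README.rst and the documentation). Below is a minimal sketch mirroring that usage; `model`, `X_train`, `y_train`, `X_valid`, `y_valid`, and `X_test` are placeholders for your own classifier and data, not objects provided by this package.

```
>>> import ml_insights as mli
>>> model.fit(X_train, y_train)                  # train your classifier as usual
>>> sc = mli.SplineCalib()                       # create a calibration object
>>> sc.fit(X_valid, y_valid)                     # fit the calibrator on a validation set (see README.rst)
>>> uncalib_preds = model.predict_proba(X_test)  # raw (possibly miscalibrated) scores
>>> calib_preds = sc.calibrate(uncalib_preds)    # calibrated probabilities
```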
54 | 55 | License 56 | ======= 57 | 58 | Free software: `MIT license `_ 59 | 60 | Developed By 61 | ============ 62 | 63 | - Brian Lucena 64 | - Ramesh Sampath 65 | 66 | References 67 | ========== 68 | 69 | Lucena, B. 2018. Spline-Based Probability Calibration. https://arxiv.org/abs/1809.07751 70 | 71 | Alex Goldstein, Adam Kapelner, Justin Bleich, and Emil Pitkin. 2014. Peeking Inside the Black Box: Visualizing Statistical Learning With Plots of Individual Conditional Expectation. Journal of Computational and Graphical Statistics (March 2014) 72 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ML Insights 2 | =========== 3 | 4 | Welcome to ML-Insights! 5 | 6 | This package contains two core sets of functions: 7 | 8 | 1) Calibration 9 | 2) Interpreting Models 10 | 11 | For probability calibration, the main class is `SplineCalib`. Given a set of model outputs and the "true" classes, you can `fit` a SplineCalib object. That object can then be used to `calibrate` future model predictions post-hoc. 12 | 13 | .. code-block:: python 14 | 15 | >>> model.fit(X_train, y_train) 16 | >>> sc = mli.SplineCalib() 17 | >>> sc.fit(X_valid, y_valid) 18 | >>> uncalib_preds = model.predict_proba(X_test) 19 | >>> calib_preds = sc.calibrate(uncalib_preds) 20 | 21 | 22 | .. code-block:: python 23 | 24 | >>> cv_preds = mli.cv_predictions(model, X_train, y_train) 25 | >>> model.fit(X_train, y_train) 26 | >>> sc = mli.SplineCalib() 27 | >>> sc.fit(cv_preds, y_train) 28 | >>> uncalib_preds = model.predict_proba(X_test) 29 | >>> calib_preds = sc.calibrate(uncalib_preds) 30 | 31 | 32 | 33 | For model interpretability, we provide the `ice_plot` and `histogram_pair` functions as well as other tools. 34 | 35 | 36 | .. code-block:: python 37 | 38 | >>> rd = mli.get_range_dict(X_train) 39 | >>> mli.ice_plot(model, X_test.sample(3), X_train.columns, rd) 40 | 41 | .. code-block:: python 42 | 43 | >>> mli.histogram_pair(df.outcome, df.feature, bins=np.linspace(0,100,11)) 44 | 45 | Please see the documentation and examples at the links below. 46 | 47 | 48 | - `Documentation `_ 49 | - `Notebook Examples and Usage `_ 50 | 51 | 52 | Python 53 | ------ 54 | Python 3.4+ 55 | 56 | 57 | Disclaimer 58 | ========== 59 | 60 | We have tested this tool to the best of our ability, but understand that it may have bugs. It was most recently developed on Python 3.7.3. Use at your own risk, but feel free to report any bugs to our github. 61 | 62 | 63 | Installation 64 | ============= 65 | 66 | .. code-block:: bash 67 | 68 | $ pip install ml_insights 69 | 70 | 71 | Usage 72 | ====== 73 | 74 | .. code-block:: python 75 | 76 | >>> import ml_insights as mli 77 | >>> xray = mli.ModelXRay(model, data) 78 | 79 | .. code-block:: python 80 | 81 | >>> rfm = RandomForestClassifier(n_estimators = 500, class_weight='balanced_subsample') 82 | >>> rfm_cv = mli.SplineCalibratedClassifierCV(rfm) 83 | >>> rfm_cv.fit(X_train,y_train) 84 | >>> test_res_calib_cv = rfm_cv.predict_proba(X_test)[:,1] 85 | >>> log_loss(y_test,test_res_calib_cv) 86 | 87 | Source 88 | ====== 89 | 90 | Find the latest version on github: https://github.com/numeristical/introspective 91 | 92 | Feel free to fork and contribute! 
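Checking Calibration
====================

A rough sketch of how you might verify that calibration helped, reusing the variables from the calibration examples above; it assumes a binary problem, held-out labels `y_test`, and `log_loss` from scikit-learn (not part of this package).

.. code-block:: python

    >>> from sklearn.metrics import log_loss
    >>> log_loss(y_test, uncalib_preds)  # log loss of the raw model probabilities
    >>> log_loss(y_test, calib_preds)    # typically lower once the scores are calibrated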
93 | 94 | License 95 | ======= 96 | 97 | Free software: `MIT license `_ 98 | 99 | Developed By 100 | ============ 101 | 102 | - Brian Lucena 103 | - Ramesh Sampath 104 | 105 | References 106 | ========== 107 | 108 | Alex Goldstein, Adam Kapelner, Justin Bleich, and Emil Pitkin. 2014. Peeking Inside the Black Box: Visualizing Statistical Learning With Plots of Individual Conditional Expectation. Journal of Computational and Graphical Statistics (March 2014) -------------------------------------------------------------------------------- /deploy_steps.md: -------------------------------------------------------------------------------- 1 | To Develop Locally: 2 | 3 | 1. Run `pip install -e .` 4 | 5 | 6 | To Deploy to Test Server: 7 | 8 | 1. Run `python setup.py sdist bdist_wheel` 9 | 10 | 2. Upload using twine - `twine upload -r test dist/ml_insights-0.0.*` 11 | 3. Install from Test PyPi - `pip install -i https://testpypi.python.org/pypi ml_insights --upgrade` 12 | 13 | To Deploy to PyPi Server: 14 | 15 | 1. Run `python setup.py sdist bdist_wheel` 16 | 17 | 2. Upload using twine - `twine upload dist/ml_insights-0.0.*` 18 | 3. Install from Test PyPi - `pip install ml_insights --upgrade` 19 | 20 | 21 | **Note: When uploading to TestPyPi or PyPi, we cannot update the versions. Version numbers need to be updated in setup.py and ml_insights/__init__.py 22 | 23 | 24 | To Upgrade Documentation: 25 | 26 | -------------------------------------------------------------------------------- /docs-mkdocs/docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to MkDocs 2 | 3 | For full documentation visit [mkdocs.org](http://mkdocs.org). 4 | 5 | ## Commands 6 | 7 | * `mkdocs new [dir-name]` - Create a new project. 8 | * `mkdocs serve` - Start the live-reloading docs server. 9 | * `mkdocs build` - Build the documentation site. 10 | * `mkdocs help` - Print this help message. 11 | 12 | ## Project layout 13 | 14 | mkdocs.yml # The configuration file. 15 | docs/ 16 | index.md # The documentation homepage. 17 | ... # Other markdown pages, images and other files. 18 | -------------------------------------------------------------------------------- /docs-mkdocs/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: ML Insights 2 | theme: readthedocs 3 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
16 | 17 | .PHONY: help 18 | help: 19 | @echo "Please use \`make ' where is one of" 20 | @echo " html to make standalone HTML files" 21 | @echo " dirhtml to make HTML files named index.html in directories" 22 | @echo " singlehtml to make a single large HTML file" 23 | @echo " pickle to make pickle files" 24 | @echo " json to make JSON files" 25 | @echo " htmlhelp to make HTML files and a HTML help project" 26 | @echo " qthelp to make HTML files and a qthelp project" 27 | @echo " applehelp to make an Apple Help Book" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " epub3 to make an epub3" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 34 | @echo " text to make text files" 35 | @echo " man to make manual pages" 36 | @echo " texinfo to make Texinfo files" 37 | @echo " info to make Texinfo files and run them through makeinfo" 38 | @echo " gettext to make PO message catalogs" 39 | @echo " changes to make an overview of all changed/added/deprecated items" 40 | @echo " xml to make Docutils-native XML files" 41 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 42 | @echo " linkcheck to check all external links for integrity" 43 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 44 | @echo " coverage to run coverage check of the documentation (if enabled)" 45 | @echo " dummy to check syntax errors of document sources" 46 | 47 | .PHONY: clean 48 | clean: 49 | rm -rf $(BUILDDIR)/* 50 | 51 | .PHONY: html 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | .PHONY: dirhtml 58 | dirhtml: 59 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 60 | @echo 61 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 62 | 63 | .PHONY: singlehtml 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | .PHONY: pickle 70 | pickle: 71 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 72 | @echo 73 | @echo "Build finished; now you can process the pickle files." 74 | 75 | .PHONY: json 76 | json: 77 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 78 | @echo 79 | @echo "Build finished; now you can process the JSON files." 80 | 81 | .PHONY: htmlhelp 82 | htmlhelp: 83 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 84 | @echo 85 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 86 | ".hhp project file in $(BUILDDIR)/htmlhelp." 87 | 88 | .PHONY: qthelp 89 | qthelp: 90 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 91 | @echo 92 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 93 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 94 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/MLInsights.qhcp" 95 | @echo "To view the help file:" 96 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/MLInsights.qhc" 97 | 98 | .PHONY: applehelp 99 | applehelp: 100 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 101 | @echo 102 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 103 | @echo "N.B. 
You won't be able to view it unless you put it in" \ 104 | "~/Library/Documentation/Help or install it in your application" \ 105 | "bundle." 106 | 107 | .PHONY: devhelp 108 | devhelp: 109 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 110 | @echo 111 | @echo "Build finished." 112 | @echo "To view the help file:" 113 | @echo "# mkdir -p $$HOME/.local/share/devhelp/MLInsights" 114 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/MLInsights" 115 | @echo "# devhelp" 116 | 117 | .PHONY: epub 118 | epub: 119 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 120 | @echo 121 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 122 | 123 | .PHONY: epub3 124 | epub3: 125 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 126 | @echo 127 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 128 | 129 | .PHONY: latex 130 | latex: 131 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 132 | @echo 133 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 134 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 135 | "(use \`make latexpdf' here to do that automatically)." 136 | 137 | .PHONY: latexpdf 138 | latexpdf: 139 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 140 | @echo "Running LaTeX files through pdflatex..." 141 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 142 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 143 | 144 | .PHONY: latexpdfja 145 | latexpdfja: 146 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 147 | @echo "Running LaTeX files through platex and dvipdfmx..." 148 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 149 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 150 | 151 | .PHONY: text 152 | text: 153 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 154 | @echo 155 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 156 | 157 | .PHONY: man 158 | man: 159 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 160 | @echo 161 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 162 | 163 | .PHONY: texinfo 164 | texinfo: 165 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 166 | @echo 167 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 168 | @echo "Run \`make' in that directory to run these through makeinfo" \ 169 | "(use \`make info' here to do that automatically)." 170 | 171 | .PHONY: info 172 | info: 173 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 174 | @echo "Running Texinfo files through makeinfo..." 175 | make -C $(BUILDDIR)/texinfo info 176 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 177 | 178 | .PHONY: gettext 179 | gettext: 180 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 181 | @echo 182 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 183 | 184 | .PHONY: changes 185 | changes: 186 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 187 | @echo 188 | @echo "The overview file is in $(BUILDDIR)/changes." 189 | 190 | .PHONY: linkcheck 191 | linkcheck: 192 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 193 | @echo 194 | @echo "Link check complete; look for any errors in the above output " \ 195 | "or in $(BUILDDIR)/linkcheck/output.txt." 
196 | 197 | .PHONY: doctest 198 | doctest: 199 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 200 | @echo "Testing of doctests in the sources finished, look at the " \ 201 | "results in $(BUILDDIR)/doctest/output.txt." 202 | 203 | .PHONY: coverage 204 | coverage: 205 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 206 | @echo "Testing of coverage in the sources finished, look at the " \ 207 | "results in $(BUILDDIR)/coverage/python.txt." 208 | 209 | .PHONY: xml 210 | xml: 211 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 212 | @echo 213 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 214 | 215 | .PHONY: pseudoxml 216 | pseudoxml: 217 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 218 | @echo 219 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 220 | 221 | .PHONY: dummy 222 | dummy: 223 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 224 | @echo 225 | @echo "Build finished. Dummy builder generates no files." 226 | -------------------------------------------------------------------------------- /docs/calib_overview.rst: -------------------------------------------------------------------------------- 1 | Probability Calibration with SplineCalib 2 | ======================================== 3 | 4 | SplineCalib is a tool for probability calibration contained in the ML-Insights package. Often, classification models may have good *discriminative* performance, but have poor *calibration*. SplineCalib post-processes the model scores so that they are better calibrated. 5 | 6 | .. toctree:: 7 | :maxdepth: 3 8 | 9 | splinecalib_examples 10 | splinecalib_class 11 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ML Insights documentation build configuration file, created by 5 | # sphinx-quickstart on Wed Nov 9 13:32:07 2016. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | sys.path.insert(0, os.path.abspath('../ml_insights')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # 28 | # needs_sphinx = '1.0' 29 | import sphinx_rtd_theme 30 | 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [ 36 | 'sphinx.ext.autodoc', 37 | 'sphinx.ext.mathjax', 38 | 'sphinx.ext.viewcode', 39 | 'sphinx.ext.coverage', 40 | 'sphinx.ext.napoleon', 41 | ] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The suffix(es) of source filenames. 
47 | # You can specify multiple suffix as a list of string: 48 | # 49 | # source_suffix = ['.rst', '.md'] 50 | source_suffix = '.rst' 51 | 52 | # The encoding of source files. 53 | # 54 | # source_encoding = 'utf-8-sig' 55 | 56 | # The master toctree document. 57 | master_doc = 'index' 58 | 59 | # General information about the project. 60 | project = 'ML Insights' 61 | copyright = '2020, Brian Lucena and Ramesh Sampath' 62 | author = 'Brian Lucena and Ramesh Sampath' 63 | 64 | # The version info for the project you're documenting, acts as replacement for 65 | # |version| and |release|, also used in various other places throughout the 66 | # built documents. 67 | # 68 | # The short X.Y version. 69 | version = '0.1.0' 70 | # The full version, including alpha/beta/rc tags. 71 | release = '0.1.0' 72 | 73 | # The language for content autogenerated by Sphinx. Refer to documentation 74 | # for a list of supported languages. 75 | # 76 | # This is also used if you do content translation via gettext catalogs. 77 | # Usually you set "language" from the command line for these cases. 78 | language = None 79 | 80 | # There are two options for replacing |today|: either, you set today to some 81 | # non-false value, then it is used: 82 | # 83 | # today = '' 84 | # 85 | # Else, today_fmt is used as the format for a strftime call. 86 | # 87 | # today_fmt = '%B %d, %Y' 88 | 89 | # List of patterns, relative to source directory, that match files and 90 | # directories to ignore when looking for source files. 91 | # This patterns also effect to html_static_path and html_extra_path 92 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 93 | 94 | # The reST default role (used for this markup: `text`) to use for all 95 | # documents. 96 | # 97 | # default_role = None 98 | 99 | # If true, '()' will be appended to :func: etc. cross-reference text. 100 | # 101 | # add_function_parentheses = True 102 | 103 | # If true, the current module name will be prepended to all description 104 | # unit titles (such as .. function::). 105 | # 106 | # add_module_names = True 107 | 108 | # If true, sectionauthor and moduleauthor directives will be shown in the 109 | # output. They are ignored by default. 110 | # 111 | # show_authors = False 112 | 113 | # The name of the Pygments (syntax highlighting) style to use. 114 | pygments_style = 'sphinx' 115 | 116 | # A list of ignored prefixes for module index sorting. 117 | # modindex_common_prefix = [] 118 | 119 | # If true, keep warnings as "system message" paragraphs in the built documents. 120 | # keep_warnings = False 121 | 122 | # If true, `todo` and `todoList` produce output, else they produce nothing. 123 | todo_include_todos = False 124 | 125 | 126 | # -- Options for HTML output ---------------------------------------------- 127 | 128 | # The theme to use for HTML and HTML Help pages. See the documentation for 129 | # a list of builtin themes. 130 | # 131 | # html_theme = 'alabaster' 132 | html_theme = "sphinx_rtd_theme" 133 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 134 | 135 | # Theme options are theme-specific and customize the look and feel of a theme 136 | # further. For a list of options available for each theme, see the 137 | # documentation. 138 | # 139 | # html_theme_options = {} 140 | 141 | # Add any paths that contain custom themes here, relative to this directory. 142 | # html_theme_path = [] 143 | 144 | # The name for this set of Sphinx documents. 145 | # " v documentation" by default. 
146 | # 147 | # html_title = 'ML Insights v0.0.2' 148 | 149 | # A shorter title for the navigation bar. Default is the same as html_title. 150 | # 151 | # html_short_title = None 152 | 153 | # The name of an image file (relative to this directory) to place at the top 154 | # of the sidebar. 155 | # 156 | # html_logo = None 157 | 158 | # The name of an image file (relative to this directory) to use as a favicon of 159 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 160 | # pixels large. 161 | # 162 | # html_favicon = None 163 | 164 | # Add any paths that contain custom static files (such as style sheets) here, 165 | # relative to this directory. They are copied after the builtin static files, 166 | # so a file named "default.css" will overwrite the builtin "default.css". 167 | # html_static_path = ['_static'] 168 | 169 | # Add any extra paths that contain custom files (such as robots.txt or 170 | # .htaccess) here, relative to this directory. These files are copied 171 | # directly to the root of the documentation. 172 | # 173 | # html_extra_path = [] 174 | 175 | # If not None, a 'Last updated on:' timestamp is inserted at every page 176 | # bottom, using the given strftime format. 177 | # The empty string is equivalent to '%b %d, %Y'. 178 | # 179 | # html_last_updated_fmt = None 180 | 181 | # If true, SmartyPants will be used to convert quotes and dashes to 182 | # typographically correct entities. 183 | # 184 | # html_use_smartypants = True 185 | 186 | # Custom sidebar templates, maps document names to template names. 187 | # 188 | # html_sidebars = {} 189 | 190 | # Additional templates that should be rendered to pages, maps page names to 191 | # template names. 192 | # 193 | # html_additional_pages = {} 194 | 195 | # If false, no module index is generated. 196 | # 197 | # html_domain_indices = True 198 | 199 | # If false, no index is generated. 200 | # 201 | # html_use_index = True 202 | 203 | # If true, the index is split into individual pages for each letter. 204 | # 205 | # html_split_index = False 206 | 207 | # If true, links to the reST sources are added to the pages. 208 | # 209 | # html_show_sourcelink = True 210 | 211 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 212 | # 213 | # html_show_sphinx = True 214 | 215 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 216 | # 217 | # html_show_copyright = True 218 | 219 | # If true, an OpenSearch description file will be output, and all pages will 220 | # contain a tag referring to it. The value of this option must be the 221 | # base URL from which the finished HTML is served. 222 | # 223 | # html_use_opensearch = '' 224 | 225 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 226 | # html_file_suffix = None 227 | 228 | # Language to be used for generating the HTML full-text search index. 229 | # Sphinx supports the following languages: 230 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' 231 | # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh' 232 | # 233 | # html_search_language = 'en' 234 | 235 | # A dictionary with options for the search language support, empty by default. 236 | # 'ja' uses this config value. 237 | # 'zh' user can custom change `jieba` dictionary path. 238 | # 239 | # html_search_options = {'type': 'default'} 240 | 241 | # The name of a javascript file (relative to the configuration directory) that 242 | # implements a search results scorer. If empty, the default will be used. 
243 | # 244 | # html_search_scorer = 'scorer.js' 245 | 246 | # Output file base name for HTML help builder. 247 | htmlhelp_basename = 'MLInsightsdoc' 248 | 249 | # -- Options for LaTeX output --------------------------------------------- 250 | 251 | latex_elements = { 252 | # The paper size ('letterpaper' or 'a4paper'). 253 | # 254 | # 'papersize': 'letterpaper', 255 | 256 | # The font size ('10pt', '11pt' or '12pt'). 257 | # 258 | # 'pointsize': '10pt', 259 | 260 | # Additional stuff for the LaTeX preamble. 261 | # 262 | # 'preamble': '', 263 | 264 | # Latex figure (float) alignment 265 | # 266 | # 'figure_align': 'htbp', 267 | } 268 | 269 | # Grouping the document tree into LaTeX files. List of tuples 270 | # (source start file, target name, title, 271 | # author, documentclass [howto, manual, or own class]). 272 | latex_documents = [ 273 | (master_doc, 'MLInsights.tex', 'ML Insights Documentation', 274 | 'Brian Lucena and Ramesh Sampath', 'manual'), 275 | ] 276 | 277 | # The name of an image file (relative to this directory) to place at the top of 278 | # the title page. 279 | # 280 | # latex_logo = None 281 | 282 | # For "manual" documents, if this is true, then toplevel headings are parts, 283 | # not chapters. 284 | # 285 | # latex_use_parts = False 286 | 287 | # If true, show page references after internal links. 288 | # 289 | # latex_show_pagerefs = False 290 | 291 | # If true, show URL addresses after external links. 292 | # 293 | # latex_show_urls = False 294 | 295 | # Documents to append as an appendix to all manuals. 296 | # 297 | # latex_appendices = [] 298 | 299 | # It false, will not define \strong, \code, itleref, \crossref ... but only 300 | # \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added 301 | # packages. 302 | # 303 | # latex_keep_old_macro_names = True 304 | 305 | # If false, no module index is generated. 306 | # 307 | # latex_domain_indices = True 308 | 309 | 310 | # -- Options for manual page output --------------------------------------- 311 | 312 | # One entry per manual page. List of tuples 313 | # (source start file, name, description, authors, manual section). 314 | man_pages = [ 315 | (master_doc, 'mlinsights', 'ML Insights Documentation', 316 | [author], 1) 317 | ] 318 | 319 | # If true, show URL addresses after external links. 320 | # 321 | # man_show_urls = False 322 | 323 | # mock imports 324 | autodoc_mock_imports = ["sklearn"] 325 | 326 | 327 | # -- Options for Texinfo output ------------------------------------------- 328 | 329 | # Grouping the document tree into Texinfo files. List of tuples 330 | # (source start file, target name, title, author, 331 | # dir menu entry, description, category) 332 | texinfo_documents = [ 333 | (master_doc, 'MLInsights', 'ML Insights Documentation', 334 | author, 'MLInsights', 'One line description of project.', 335 | 'Miscellaneous'), 336 | ] 337 | 338 | # Documents to append as an appendix to all manuals. 339 | # 340 | # texinfo_appendices = [] 341 | 342 | # If false, no module index is generated. 343 | # 344 | # texinfo_domain_indices = True 345 | 346 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 347 | # 348 | # texinfo_show_urls = 'footnote' 349 | 350 | # If true, do not generate a @detailmenu in the "Top" node's menu. 
351 | # 352 | # texinfo_no_detailmenu = False 353 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ML_Insights documentation! 2 | ========================== 3 | 4 | Welcome to ML_Insights, home to SplineCalib and the ModelXRay. 5 | 6 | This package contains two main capabilities: 7 | 8 | * SplineCalib: Spline-based probability calibration 9 | * ModelXRay: Tool for model interpretability 10 | 11 | .. toctree:: 12 | :maxdepth: 3 13 | 14 | install 15 | calib_overview 16 | interp_overview 17 | 18 | 19 | Indices 20 | ======= 21 | * :ref:`genindex` 22 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ------------ 3 | 4 | .. code-block:: bash 5 | 6 | $ pip install ml_insights 7 | -------------------------------------------------------------------------------- /docs/interp_overview.rst: -------------------------------------------------------------------------------- 1 | Model Interpretation with ModelXRay 2 | =================================== 3 | 4 | ModelXRay is a tool for model interpretability contained in the ML-Insights package. It provides the capability to easily do Individual Conditional Expectation plots (ICE-plots). 5 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. epub3 to make an epub3 31 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 32 | echo. text to make text files 33 | echo. man to make manual pages 34 | echo. texinfo to make Texinfo files 35 | echo. gettext to make PO message catalogs 36 | echo. changes to make an overview over all changed/added/deprecated items 37 | echo. xml to make Docutils-native XML files 38 | echo. pseudoxml to make pseudoxml-XML files for display purposes 39 | echo. linkcheck to check all external links for integrity 40 | echo. doctest to run all doctests embedded in the documentation if enabled 41 | echo. coverage to run coverage check of the documentation if enabled 42 | echo. 
dummy to check syntax errors of document sources 43 | goto end 44 | ) 45 | 46 | if "%1" == "clean" ( 47 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 48 | del /q /s %BUILDDIR%\* 49 | goto end 50 | ) 51 | 52 | 53 | REM Check if sphinx-build is available and fallback to Python version if any 54 | %SPHINXBUILD% 1>NUL 2>NUL 55 | if errorlevel 9009 goto sphinx_python 56 | goto sphinx_ok 57 | 58 | :sphinx_python 59 | 60 | set SPHINXBUILD=python -m sphinx.__init__ 61 | %SPHINXBUILD% 2> nul 62 | if errorlevel 9009 ( 63 | echo. 64 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 65 | echo.installed, then set the SPHINXBUILD environment variable to point 66 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 67 | echo.may add the Sphinx directory to PATH. 68 | echo. 69 | echo.If you don't have Sphinx installed, grab it from 70 | echo.http://sphinx-doc.org/ 71 | exit /b 1 72 | ) 73 | 74 | :sphinx_ok 75 | 76 | 77 | if "%1" == "html" ( 78 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 79 | if errorlevel 1 exit /b 1 80 | echo. 81 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 82 | goto end 83 | ) 84 | 85 | if "%1" == "dirhtml" ( 86 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 87 | if errorlevel 1 exit /b 1 88 | echo. 89 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 90 | goto end 91 | ) 92 | 93 | if "%1" == "singlehtml" ( 94 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 95 | if errorlevel 1 exit /b 1 96 | echo. 97 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 98 | goto end 99 | ) 100 | 101 | if "%1" == "pickle" ( 102 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 103 | if errorlevel 1 exit /b 1 104 | echo. 105 | echo.Build finished; now you can process the pickle files. 106 | goto end 107 | ) 108 | 109 | if "%1" == "json" ( 110 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 111 | if errorlevel 1 exit /b 1 112 | echo. 113 | echo.Build finished; now you can process the JSON files. 114 | goto end 115 | ) 116 | 117 | if "%1" == "htmlhelp" ( 118 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 119 | if errorlevel 1 exit /b 1 120 | echo. 121 | echo.Build finished; now you can run HTML Help Workshop with the ^ 122 | .hhp project file in %BUILDDIR%/htmlhelp. 123 | goto end 124 | ) 125 | 126 | if "%1" == "qthelp" ( 127 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 128 | if errorlevel 1 exit /b 1 129 | echo. 130 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 131 | .qhcp project file in %BUILDDIR%/qthelp, like this: 132 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\MLInsights.qhcp 133 | echo.To view the help file: 134 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\MLInsights.ghc 135 | goto end 136 | ) 137 | 138 | if "%1" == "devhelp" ( 139 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 140 | if errorlevel 1 exit /b 1 141 | echo. 142 | echo.Build finished. 143 | goto end 144 | ) 145 | 146 | if "%1" == "epub" ( 147 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 148 | if errorlevel 1 exit /b 1 149 | echo. 150 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 151 | goto end 152 | ) 153 | 154 | if "%1" == "epub3" ( 155 | %SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3 156 | if errorlevel 1 exit /b 1 157 | echo. 158 | echo.Build finished. The epub3 file is in %BUILDDIR%/epub3. 
159 | goto end 160 | ) 161 | 162 | if "%1" == "latex" ( 163 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 164 | if errorlevel 1 exit /b 1 165 | echo. 166 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdf" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "latexpdfja" ( 181 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 182 | cd %BUILDDIR%/latex 183 | make all-pdf-ja 184 | cd %~dp0 185 | echo. 186 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 187 | goto end 188 | ) 189 | 190 | if "%1" == "text" ( 191 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 192 | if errorlevel 1 exit /b 1 193 | echo. 194 | echo.Build finished. The text files are in %BUILDDIR%/text. 195 | goto end 196 | ) 197 | 198 | if "%1" == "man" ( 199 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 200 | if errorlevel 1 exit /b 1 201 | echo. 202 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 203 | goto end 204 | ) 205 | 206 | if "%1" == "texinfo" ( 207 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 208 | if errorlevel 1 exit /b 1 209 | echo. 210 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 211 | goto end 212 | ) 213 | 214 | if "%1" == "gettext" ( 215 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 216 | if errorlevel 1 exit /b 1 217 | echo. 218 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 219 | goto end 220 | ) 221 | 222 | if "%1" == "changes" ( 223 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 224 | if errorlevel 1 exit /b 1 225 | echo. 226 | echo.The overview file is in %BUILDDIR%/changes. 227 | goto end 228 | ) 229 | 230 | if "%1" == "linkcheck" ( 231 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 232 | if errorlevel 1 exit /b 1 233 | echo. 234 | echo.Link check complete; look for any errors in the above output ^ 235 | or in %BUILDDIR%/linkcheck/output.txt. 236 | goto end 237 | ) 238 | 239 | if "%1" == "doctest" ( 240 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 241 | if errorlevel 1 exit /b 1 242 | echo. 243 | echo.Testing of doctests in the sources finished, look at the ^ 244 | results in %BUILDDIR%/doctest/output.txt. 245 | goto end 246 | ) 247 | 248 | if "%1" == "coverage" ( 249 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 250 | if errorlevel 1 exit /b 1 251 | echo. 252 | echo.Testing of coverage in the sources finished, look at the ^ 253 | results in %BUILDDIR%/coverage/python.txt. 254 | goto end 255 | ) 256 | 257 | if "%1" == "xml" ( 258 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 259 | if errorlevel 1 exit /b 1 260 | echo. 261 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 262 | goto end 263 | ) 264 | 265 | if "%1" == "pseudoxml" ( 266 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 267 | if errorlevel 1 exit /b 1 268 | echo. 269 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 270 | goto end 271 | ) 272 | 273 | if "%1" == "dummy" ( 274 | %SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy 275 | if errorlevel 1 exit /b 1 276 | echo. 277 | echo.Build finished. Dummy builder generates no files. 
278 | goto end 279 | ) 280 | 281 | :end 282 | -------------------------------------------------------------------------------- /docs/rtfd-requirements.txt: -------------------------------------------------------------------------------- 1 | ml_insights -------------------------------------------------------------------------------- /docs/splinecalib_class.rst: -------------------------------------------------------------------------------- 1 | SplineCalib Class 2 | ----------------- 3 | 4 | .. autoclass:: ml_insights.SplineCalib 5 | :members: fit, calibrate 6 | -------------------------------------------------------------------------------- /docs/splinecalib_examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ======== 3 | 4 | The best way to learn about SplineCalib is to work through some examples. We have provided a few below. 5 | 6 | #. `SplineCalib_Tutorial`_: The most basic introduction to calibration and the SplineCalib class. 7 | #. `SplineCalib_Details`_: A deeper dive into the various settings and parameters of the SplineCalib class. 8 | #. `SplineCalib_Multiclass_MNIST`_: A multiclass calibration example using the MNIST digit data. 9 | 10 | 11 | .. _SplineCalib_Tutorial: https://github.com/numeristical/introspective/tree/master/examples/SplineCalib_Tutorial.ipynb 12 | .. _SplineCalib_Details: https://github.com/numeristical/introspective/tree/master/examples/SplineCalib_Details.ipynb 13 | .. _SplineCalib_Multiclass_MNIST: https://github.com/numeristical/introspective/tree/master/examples/SplineCalib_Multiclass_MNIST.ipynb 14 | -------------------------------------------------------------------------------- /extra/code/discrete_dt.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=True 2 | 3 | """Decision Tree based on Discrete Graph structure""" 4 | import numpy as np 5 | import pandas as pd 6 | import random 7 | cimport numpy as cnp 8 | from libc.math cimport log as clog 9 | from graphs import * 10 | 11 | 12 | class DiscreteGraphDecisionTree(object): 13 | """This class represents a tree built on categorical features, each of which contains 14 | a graph to represent the associated terrain. Splits will be tried according to the 15 | *maximally coarse partitions* returned from the graph class. 16 | 17 | feature_graphs: a dictionary which maps the column names to a graph_undirected object. 18 | The graph_undirected must contain vertices for every possible value of that column 19 | If the graph contains no edges, it will be treated as one-hot encoded. 20 | 21 | loss_fn: Currently there are three options: 22 | 'entropy': will use the information gain to choose the best split (target nust be [0,1]) 23 | 'mse': will use (minimum) mean squared error to choose the best split (target must be numeric) 24 | 'gh': This uses the XGBoost method where the first derivative (g) and second derivative (h) of the 25 | custom loss function must be provided. In this case, the 'g' values should be passed as y_train 26 | and the 'h' values passed as y_train_2 27 | 28 | min_size_split: The size, below which, the tree will not consider splitting further. Default is 2. 29 | 30 | min_leaf_size: The minimum permitted size of a split. Splits will not be considered if they result 31 | in a leaf smaller than min_leaf_size 32 | 33 | max_depth: The maximum depth permitted for the tree. Setting to 1 means creating 'stumps' (a single split). 
34 | 35 | gamma: The minimum improvement required to execute a split (for regularization purposes). 36 | If the improvement of a split does not exceed gamma, then the node will not be split. 37 | 38 | reg_lambda: The L1 shrinkage applied to the coefficients, as in XGBoost. 39 | 40 | node_summary_fn: Given a collection of points at the node, what should be the value of the node. Default is 41 | to take the mean. 42 | 43 | max_splits_to_search: For a feature, what is the maximum number of splits we should search. Categorical features 44 | may have prohibitively many possible splits. If the number exceeds max_splits_to_search, we randomly choose 45 | only max_splits_to_search of them to evaluate. Default is infinity (search all splits) 46 | """ 47 | 48 | def __init__(self, feature_graphs, loss_fn = 'entropy', min_size_split=2, min_leaf_size = 2, max_depth=3, gamma=0, 49 | reg_lambda=1, node_summary_fn = np.mean, max_splits_to_search = np.Inf, msac=13): 50 | self.dec_tree={} 51 | self.dec_tree['feature_graphs'] = feature_graphs 52 | self.num_leafs = 0 53 | self.min_size_split=min_size_split 54 | self.min_leaf_size=min_leaf_size 55 | self.max_depth=max_depth 56 | self.gamma=gamma 57 | self.node_summary_fn=node_summary_fn 58 | self.reg_lambda=reg_lambda 59 | self.max_splits_to_search = max_splits_to_search 60 | self.msac = msac 61 | if loss_fn == 'gh': 62 | self.loss_fn='gh' 63 | self.node_summary_fn=_node_summary_gh 64 | self.split_scorer = _score_data_split_gh 65 | if loss_fn == 'entropy': 66 | self.loss_fn='entropy' 67 | self.split_scorer = _score_data_split_entropy 68 | if loss_fn == 'mse': 69 | self.loss_fn='mse' 70 | self.split_scorer = _score_data_split_mse 71 | 72 | def fit(self, X_train, y_train, y_train_2=None): 73 | # Tree fitting works through a queue of nodes to process (node_to_proc_list) 74 | # The initial node is just the root of the tree 75 | self.node_to_proc_list = [self.dec_tree] 76 | 77 | # Initialize values to what they are at the root of the tree 78 | self.dec_tree['depth']=0 79 | self.dec_tree['mask'] = np.ones(len(y_train)) 80 | self.X_train = X_train 81 | self.y_train = y_train 82 | 83 | # Special handling for 'gh' loss function 84 | if self.loss_fn == 'gh': 85 | self.y_train_2 = y_train_2 86 | self.node_summary_fn = _node_summary_gh 87 | 88 | # Process nodes until none are left to process 89 | while self.node_to_proc_list: 90 | node_to_process = self.node_to_proc_list.pop() 91 | self._process_tree_node(node_to_process) 92 | 93 | def predict(self, X_test): 94 | cdef int i, n=X_test.shape[0] 95 | cdef dict data_row_dict, pointer, col_to_int_dict 96 | cdef frozenset left_set 97 | 98 | col_list = list(X_test.columns) 99 | data_np = X_test.values 100 | col_to_int_dict = {col_list[i]:i for i in range(len(col_list))} 101 | 102 | # Initialize the output vector to all zeros 103 | out_vec = np.zeros(X_test.shape[0]) 104 | 105 | # This iterates through each data point in test set and follows the tree until it 106 | # reaches a leaf node 107 | for i in range(n): 108 | # Put the relevant values for current test point into a dict for quick lookup 109 | data_row_dict = {colname:data_np[i,col_to_int_dict[colname]] for colname in col_list} 110 | pointer = self.dec_tree 111 | while pointer['node_type']=='interior': 112 | curr_element = data_row_dict[pointer['split_feature']] 113 | left_set = pointer['left_split'] 114 | if curr_element in left_set: 115 | pointer = pointer['left_child'] 116 | else: 117 | pointer = pointer['right_child'] 118 | out_vec[i] = pointer['node_summary_val'] 119 | 
return(out_vec) 120 | 121 | def _process_tree_node(self, curr_node): 122 | # Restrict to relevant data for the node in question 123 | X_train_node = self.X_train[curr_node['mask']>0] 124 | 125 | # Get the associated y-values (or g,h values) 126 | # and save information about the current node 127 | if self.loss_fn != 'gh': 128 | y_train_node = self.y_train[curr_node['mask']>0] 129 | curr_node['node_summary_val'] = self.node_summary_fn(y_train_node) 130 | curr_node['num_data_points'] = len(y_train_node) 131 | else: 132 | y_train_g = self.y_train[curr_node['mask']>0] 133 | y_train_h = self.y_train_2[curr_node['mask']>0] 134 | curr_node['node_summary_val'] = _node_summary_gh(y_train_g, y_train_h, self.gamma) 135 | curr_node['num_data_points'] = len(y_train_g) 136 | g_sum = np.sum(y_train_g) 137 | h_sum = np.sum(y_train_h) 138 | 139 | # If we are guaranteed not to split this node further, then mark it as such and move on 140 | if (curr_node['num_data_points']=self.max_depth): 141 | if self.loss_fn != 'gh': 142 | self._wrap_up_node(curr_node, y_train_node) 143 | else: 144 | self._wrap_up_node(curr_node, y_train_g, y_train_h) 145 | return None 146 | 147 | # Determine which features are still "eligible" to be considered 148 | features_to_search = _get_features_to_search(X_train_node, curr_node['feature_graphs']) 149 | 150 | # If no features are eligible (e.g. all x-values are identical in all features) 151 | # Then we similarly summarize the node and move on 152 | if features_to_search==[]: 153 | if self.loss_fn != 'gh': 154 | self._wrap_up_node(curr_node, y_train_node) 155 | else: 156 | self._wrap_up_node(curr_node, y_train_g, y_train_h) 157 | return None 158 | 159 | # best_split_dict holds all the necessary info about a potential split 160 | best_split_dict = _initialize_best_split_dict() 161 | 162 | # Main loop over features to find best split 163 | for feature in features_to_search: 164 | feature_graph = curr_node['feature_graphs'][feature] 165 | if len(feature_graph.edges)==0: # This means to treat the feature as one-hot encoded 166 | possible_splits = [] 167 | vert_list = list(feature_graph.vertices) 168 | # Make a list of splits that are one feature vs the rest (as in one-hot-encoding) 169 | for i in range(len(vert_list)): 170 | tfset = frozenset(vert_list[i:i+1]) 171 | possible_splits.append(frozenset([tfset,frozenset(vert_list)-tfset])) 172 | #print(possible_splits) 173 | index_range = range(len(possible_splits)) 174 | else: 175 | # Query the graph structure to get the possible splits 176 | if (len(feature_graph.mc_partitions)>0): 177 | possible_splits = feature_graph.return_mc_partitions() 178 | else: 179 | possible_splits = feature_graph.return_contracted_partitions(max_size_after_contraction=self.msac) 180 | #print('# possible splits = {}'.format(len(possible_splits))) 181 | #possible_splits = feature_graph.return_mc_partitions() 182 | if (len(possible_splits)>self.max_splits_to_search): 183 | # Randomly choose (with replacement) a subset of possible splits 184 | index_range = np.random.randint(0,len(possible_splits),self.max_splits_to_search) 185 | #print('index_range_len={} msts={}'.format(len(index_range),self.max_splits_to_search)) 186 | else: 187 | index_range = range(len(possible_splits)) 188 | 189 | curr_feature_vec = X_train_node[feature].values 190 | 191 | # Loop within values of each feature 192 | for index in index_range: 193 | curr_partition = list(possible_splits[index]) 194 | left_split = curr_partition[0] 195 | if self.loss_fn != 'gh': 196 | curr_split_dict = 
_eval_curr_split_dict(curr_feature_vec, y_train_node, curr_node['feature_graphs'], 197 | feature, left_split, self.split_scorer, self.min_leaf_size, self.gamma) 198 | else: 199 | curr_split_dict = _eval_curr_split_dict(curr_feature_vec, y_train_g, curr_node['feature_graphs'], 200 | feature, left_split, self.split_scorer, self.min_leaf_size, self.gamma, 201 | y_train_2_node = y_train_h, is_gh=True, g_sum=g_sum, h_sum=h_sum) 202 | 203 | best_split_dict = _compare_curr_to_best(curr_split_dict, best_split_dict) 204 | 205 | 206 | if best_split_dict['best_loss_score'] < np.inf: 207 | # Execute the split 208 | left_mask = self.X_train[best_split_dict['best_split_feature']].isin(best_split_dict['best_left_split']).values 209 | right_mask = self.X_train[best_split_dict['best_split_feature']].isin(best_split_dict['best_right_split']).values 210 | self.perform_split_on_node(curr_node, best_split_dict, curr_node['feature_graphs'], left_mask, right_mask) 211 | else: 212 | if self.loss_fn != 'gh': 213 | self._wrap_up_node(curr_node, y_train_node) 214 | else: 215 | self._wrap_up_node(curr_node, y_train_g, y_train_h) 216 | 217 | return None 218 | 219 | def _wrap_up_node(self, curr_node, y_train_node, y_train_2_node=None): 220 | # Compute summary stats of node and mark it as a leaf 221 | if self.loss_fn!='gh': 222 | curr_node['node_summary_val'] = self.node_summary_fn(y_train_node) 223 | else: 224 | curr_node['node_summary_val'] = _node_summary_gh(y_train_node, y_train_2_node, self.reg_lambda) 225 | curr_node['num_data_points'] = len(y_train_node) 226 | curr_node['node_type'] = 'leaf' 227 | self.num_leafs+=1 228 | curr_node.pop('mask') 229 | 230 | def perform_split_on_node(self, curr_node, best_split_dict, feature_graphs_node, left_mask, right_mask): 231 | # record info about current node 232 | curr_node['left_split'] = best_split_dict['best_left_split'] 233 | curr_node['right_split'] = best_split_dict['best_right_split'] 234 | curr_node['loss_score'] = best_split_dict['best_loss_score'] 235 | curr_node['split_feature'] = best_split_dict['best_split_feature'] 236 | curr_node['node_type'] = 'interior' 237 | curr_mask = curr_node.pop('mask') 238 | 239 | # Create feature graphs for children 240 | feature_graphs_left = feature_graphs_node.copy() 241 | feature_graphs_left[curr_node['split_feature']] = get_induced_subgraph(feature_graphs_left[curr_node['split_feature']], 242 | curr_node['left_split']) 243 | feature_graphs_right = feature_graphs_node.copy() 244 | feature_graphs_right[curr_node['split_feature']] = get_induced_subgraph(feature_graphs_right[curr_node['split_feature']], 245 | curr_node['right_split']) 246 | # Create left and right children 247 | curr_node['left_child'] = {} 248 | curr_node['left_child']['depth'] = curr_node['depth'] + 1 249 | curr_node['left_child']['mask'] = curr_mask * left_mask 250 | curr_node['left_child']['feature_graphs'] = feature_graphs_left 251 | 252 | curr_node['right_child'] = {} 253 | curr_node['right_child']['depth'] = curr_node['depth'] + 1 254 | curr_node['right_child']['mask'] = curr_mask * right_mask 255 | curr_node['right_child']['feature_graphs'] = feature_graphs_right 256 | 257 | # Add left and right children to queue 258 | self.node_to_proc_list.append(curr_node['left_child']) 259 | self.node_to_proc_list.append(curr_node['right_child']) 260 | 261 | 262 | def _get_features_to_search(X_train_node, feature_graphs_node): 263 | num_distinct_values = {} 264 | for feature,graph in feature_graphs_node.items(): 265 | num_distinct_values[feature] = 
len(np.unique(X_train_node[feature])) 266 | 267 | ## Remove features from consideration if they only have <=1 distinct values in the current data 268 | features_to_search = [feature for feature in X_train_node.columns if num_distinct_values[feature]>1] 269 | return(features_to_search) 270 | 271 | 272 | def _initialize_best_split_dict(): 273 | out_dict = {} 274 | out_dict['best_loss_score'] = np.inf 275 | out_dict['best_left_split'] = None 276 | out_dict['best_right_split'] = None 277 | out_dict['best_split_feature'] = None 278 | return(out_dict) 279 | 280 | def _eval_curr_split_dict(curr_feature_vec, y_train_node, feature_graphs_node, feature, frozenset left_split, split_scorer, min_leaf_size, gamma, 281 | y_train_2_node=None, is_gh=False, g_sum=0, h_sum=0): 282 | cdef dict out_dict 283 | cdef frozenset temp_set 284 | cdef list temp_list 285 | 286 | out_dict = {} 287 | out_dict['left_split'] = left_split 288 | out_dict['feature'] = feature 289 | out_dict['right_split'] = frozenset(feature_graphs_node[feature].vertices - left_split) 290 | temp_set = out_dict['left_split'] 291 | temp_list = [x in temp_set for x in curr_feature_vec] 292 | out_dict['mask_left'] = np.array(temp_list) 293 | out_dict['mask_right'] = np.logical_not(out_dict['mask_left']) 294 | if is_gh==False: 295 | out_dict['loss_score'] = split_scorer(out_dict['mask_left'], out_dict['mask_right'], y_train_node.values, min_leaf_size, gamma) 296 | else: 297 | out_dict['loss_score'] = split_scorer(out_dict['mask_left'], out_dict['mask_right'], y_train_node.values, y_train_2_node.values, 298 | min_leaf_size, gamma, g_sum, h_sum) 299 | 300 | return(out_dict) 301 | 302 | def _compare_curr_to_best(curr_split_dict, best_split_dict): 303 | if (curr_split_dict['loss_score'] < best_split_dict['best_loss_score']): 304 | best_split_dict['best_loss_score'] = curr_split_dict['loss_score'] 305 | best_split_dict['best_split_feature'] = curr_split_dict['feature'] 306 | best_split_dict['best_left_split'] = curr_split_dict['left_split'] 307 | best_split_dict['best_right_split'] = curr_split_dict['right_split'] 308 | return(best_split_dict) 309 | 310 | def root_mean_squared_error(vec1, vec2): 311 | return np.sqrt(np.mean((vec1-vec2)**2)) 312 | 313 | 314 | def _score_data_split_mse(mask_left, mask_right, outcome_vec, min_leaf_size, gamma, eps=.0001): 315 | 316 | cdef double mean_left, mean_right,mean_overall,loss_score, n1, n2 317 | 318 | n1 = np.sum(mask_left) 319 | n2 = np.sum(mask_right) 320 | if np.minimum(n1, n2)=0: 332 | loss_score = np.inf 333 | return loss_score 334 | 335 | 336 | 337 | def _score_data_split_entropy(mask_left, mask_right, outcome_vec, min_leaf_size, gamma, eps=.0001): 338 | 339 | cdef double m1,n1,m2,n2,num1,num1a,num2,num2a,lik_rat,loss_score 340 | 341 | m1 = np.sum(outcome_vec[mask_left])+eps 342 | n1 = np.sum(mask_left)+eps 343 | m2 = np.sum(outcome_vec[mask_right])+eps 344 | n2 = np.sum(mask_right)+eps 345 | if np.minimum(n1, n2)=0: 350 | loss_score = np.inf 351 | return loss_score 352 | 353 | cdef double get_lik_rat(double m1, double n1, double m2, double n2, eps): 354 | cdef double num1, num2 355 | num1 = m1*clog(((m1/n1)/((m1+m2)/(n1+n2)))+eps) + (n1-m1+eps) * clog((((n1-m1)/n1)/((((n1+n2)-(m1+m2)))/(n1+n2)))) 356 | num2 = m2*clog(((m2/n2)/((m1+m2)/(n1+n2)))) + (n2-m2+eps) * clog((((n2-m2)/n2)/((((n1+n2)-(m1+m2)))/(n1+n2)))) 357 | return num1+num2 358 | 359 | def _score_data_split_gh(mask_left, mask_right, outcome_vec_g, outcome_vec_h, min_leaf_size, gamma, g_sum, h_sum): 360 | cdef double loss_score, g_left, g_right, 
h_left, h_right, n_left, n_right, vec_len 361 | 362 | vec_len = len(outcome_vec_g) 363 | g_left = np.sum(outcome_vec_g[mask_left]) 364 | g_right = g_sum - g_left 365 | #g_right = np.sum(outcome_vec_g[mask_right]) 366 | h_left = np.sum(outcome_vec_h[mask_left]) 367 | h_right = h_sum - h_left 368 | #h_right = np.sum(outcome_vec_h[mask_right]) 369 | n_left = np.sum(mask_left) 370 | n_right = vec_len - n_left 371 | #n_right = np.sum(mask_right) 372 | if np.minimum(n_left, n_right)=0: 376 | loss_score = np.inf 377 | return loss_score 378 | 379 | cdef double _get_gh_score(double g_left, double g_right, double h_left, double h_right, double gamma): 380 | return(.5*( ((g_left**2)/(h_left+gamma)) + ((g_right**2)/(h_right+gamma)) - (((g_left + g_right)**2)/(h_left + h_right+gamma)))-gamma) 381 | 382 | def _node_summary_gh(y_vec_g, y_vec_h, reg_lambda): 383 | out_val = -np.sum(y_vec_g)/(np.sum(y_vec_h)+reg_lambda) 384 | return(out_val) 385 | -------------------------------------------------------------------------------- /extra/code/discrete_gb.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=True 2 | 3 | """Decision Tree Gradient Boosting based on Discrete Graph structure""" 4 | import numpy as np 5 | import pandas as pd 6 | cimport numpy as cnp 7 | from libc.math cimport log as clog 8 | from discrete_dt import * 9 | from graphs import * 10 | from sklearn.metrics import log_loss, mean_squared_error 11 | 12 | 13 | class DiscreteGraphGB(object): 14 | 15 | def __init__(self, num_trees, feature_graphs, mode='classification', loss_fn = 'entropy', min_size_split=2, min_leaf_size = 1, max_depth=3, gamma=0, 16 | reg_lambda=1, node_summary_fn = np.mean, learning_rate=.1, max_splits_to_search=np.Inf, msac=100): 17 | self.num_trees = num_trees 18 | self.num_trees_for_prediction = num_trees 19 | self.dec_tree_list = [] 20 | self.feature_graphs = feature_graphs 21 | self.min_size_split=min_size_split 22 | self.min_leaf_size=min_leaf_size 23 | self.max_depth=max_depth 24 | self.gamma=gamma 25 | self.node_summary_fn=node_summary_fn 26 | self.learning_rate = learning_rate 27 | self.loss_fn = loss_fn 28 | self.max_splits_to_search = max_splits_to_search 29 | self.msac = msac 30 | self.mode = mode 31 | if loss_fn == 'entropy': 32 | self.loss_fn_der_1 = _entropy_link_der_1 33 | self.loss_fn_der_2 = _entropy_link_der_2 34 | if loss_fn == 'mse': 35 | self.loss_fn_der_1 = _mse_der_1 36 | self.loss_fn_der_2 = _mse_der_2 37 | # if features=='auto': 38 | # self.features=list(self.dec_tree['feature_graphs'].keys()) 39 | 40 | def fit(self, X_train, y_train, eval_set = None, eval_freq=10, 41 | early_stop_past_steps=0, choose_best_eval=True): 42 | # cdef int i, n =self.num_trees 43 | self.eval_freq=eval_freq 44 | eval_len = np.floor(self.num_trees/self.eval_freq).astype(int) 45 | self.eval_results = np.zeros(eval_len) 46 | n =self.num_trees 47 | self.initial_pred = np.mean(y_train) 48 | stop_now=False 49 | if eval_set is not None: 50 | X_valid = eval_set[0] 51 | y_valid = eval_set[1] 52 | for i in range(n): 53 | 54 | # Get predictions of current model 55 | if i==0: 56 | curr_answer = self.initial_pred * np.ones(len(y_train)) 57 | if eval_set is not None: 58 | curr_test_answer = self.initial_pred * np.ones(len(y_valid)) 59 | if self.mode == 'classification': 60 | curr_loss= log_loss(y_valid, 1/(1+np.exp(-curr_test_answer))) 61 | print("i=0, test_set_log_loss = {}".format(curr_loss)) 62 | else: 63 | curr_loss= mean_squared_error(y_valid, curr_test_answer) 64 | print("i=0. 
test_set_mse = {}".format(curr_loss)) 65 | 66 | else: 67 | curr_answer = curr_answer + self.learning_rate * self.dec_tree_list[i-1].predict(X_train) 68 | if eval_set is not None: 69 | curr_test_answer = curr_test_answer + self.learning_rate * self.dec_tree_list[i-1].predict(X_valid) 70 | if ((i+1)%self.eval_freq==1): 71 | if self.mode == 'classification': 72 | curr_loss= log_loss(y_valid, 1/(1+np.exp(-curr_test_answer))) 73 | print("i={}, test_set_log_loss = {}".format(i,curr_loss)) 74 | else: 75 | curr_loss= mean_squared_error(y_valid, curr_test_answer) 76 | print("i={}, test_set_mse = {}".format(i,curr_loss)) 77 | 78 | curr_step=np.floor((i+1)/self.eval_freq).astype(int) -1 79 | self.eval_results[curr_step]=curr_loss 80 | if curr_step>early_stop_past_steps: 81 | compare_loss = np.min(self.eval_results[:curr_step-early_stop_past_steps+1]) 82 | if (curr_loss>compare_loss): 83 | stop_now=True 84 | print("Stopping early: curr_loss of {} exceeds compare_loss of {}".format(curr_loss, compare_loss)) 85 | if stop_now: 86 | if choose_best_eval: 87 | self.num_trees_for_prediction = (np.argmin(self.eval_results[:curr_step+1])+1)*eval_freq 88 | break 89 | 90 | # Get first and second derivatives 91 | y_g_vec = self.loss_fn_der_1(y_train, curr_answer) 92 | y_h_vec = self.loss_fn_der_2(y_train, curr_answer) 93 | 94 | 95 | # Sample the data to use for this tree 96 | 97 | num_rows = X_train.shape[0] 98 | rows_to_use = np.random.choice(range(num_rows), num_rows, replace=True) 99 | if type(X_train)==pd.DataFrame: 100 | X_train_to_use = X_train.iloc[rows_to_use] 101 | elif type(X_train)==np.ndarray: 102 | X_train_to_use = X_train[rows_to_use] 103 | else: 104 | print('unknown format for X_train') 105 | #y_original_train_to_use = y_train.sample(X_train.shape[0], random_state=rs, replace=True) 106 | if type(y_g_vec)==pd.Series: 107 | y_g_to_use = y_g_vec.iloc[rows_to_use] 108 | elif type(y_g_vec)==np.ndarray: 109 | y_g_to_use = y_g_vec[rows_to_use] 110 | else: 111 | print('unknown format for y_g_vec') 112 | 113 | if type(y_h_vec)==pd.Series: 114 | y_h_to_use = y_h_vec.iloc[rows_to_use] 115 | elif type(y_h_vec)==np.ndarray: 116 | y_h_to_use = y_h_vec[rows_to_use] 117 | else: 118 | print('unknown format for y_h_vec') 119 | 120 | self.dec_tree_list.append(DiscreteGraphDecisionTree(feature_graphs=self.feature_graphs,loss_fn = 'gh', 121 | min_size_split = self.min_size_split, min_leaf_size=self.min_leaf_size, 122 | gamma=self.gamma, max_depth=self.max_depth, 123 | node_summary_fn = self.node_summary_fn, 124 | max_splits_to_search = self.max_splits_to_search, msac=self.msac)) 125 | self.dec_tree_list[i].fit(X_train_to_use, y_g_to_use, y_h_to_use) 126 | 127 | 128 | def predict(self, X_test, num_trees_to_use=0): 129 | cdef int i 130 | if num_trees_to_use==0: 131 | num_trees_to_use=self.num_trees_for_prediction 132 | out_vec = self.initial_pred*np.ones(X_test.shape[0]) 133 | for i in range(num_trees_to_use): 134 | out_vec = out_vec + self.learning_rate * self.dec_tree_list[i].predict(X_test) 135 | if self.mode=='classification': 136 | return(1/(1+np.exp(-out_vec))) 137 | else: 138 | return(out_vec) 139 | 140 | def _entropy_der_1(y_true, y_pred, eps=1e-15): 141 | y_pred = np.maximum(y_pred, eps) 142 | y_pred = np.minimum(y_pred, 1-eps) 143 | return((-(y_true/y_pred) + (1-y_true)/(1-y_pred))) 144 | 145 | def _entropy_der_2(y_true, y_pred, eps=1e-15): 146 | y_pred = np.maximum(y_pred, eps) 147 | y_pred = np.minimum(y_pred, 1-eps) 148 | out_vec = (y_true)/(y_pred**2) + ((1-y_true)/((1-y_pred)**2)) 149 | return(out_vec) 150 
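# Illustrative sketch, not part of the original module: for loss_fn='entropy' the
# boosting loop above feeds each tree the gradient and hessian of the log-loss taken
# with respect to the raw score z, via _entropy_link_der_1 and _entropy_link_der_2
# defined below.  With p = 1/(1+exp(-z)) these reduce to the familiar forms
# g = p - y and h = p*(1-p).  The helper below is hypothetical (nothing in this file
# calls it); it only verifies that identity numerically and assumes numpy is
# available as np, as imported at the top of this file.
def _check_entropy_link_derivatives():
    y = np.array([0., 1., 1., 0.])
    z = np.array([-2., -0.5, 1.0, 3.0])
    p = 1.0 / (1.0 + np.exp(-z))
    # gradient of log-loss w.r.t. the raw score z is p - y
    assert np.allclose(_entropy_link_der_1(y, z), p - y)
    # hessian of log-loss w.r.t. the raw score z is p*(1-p)
    assert np.allclose(_entropy_link_der_2(y, z), p * (1 - p))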
| 151 | def _mse_der_1(y_true, y_pred, eps=1e-15): 152 | return(2*(y_pred-y_true)) 153 | 154 | def _mse_der_2(y_true, y_pred, eps=1e-15): 155 | return(pd.Series(2*np.ones(len(y_pred)))) 156 | 157 | def _entropy_link_der_1(y_true, z_pred, eps=1e-15): 158 | return(-y_true*(1/(1+np.exp(z_pred))) + (1-y_true) * (1/(1+np.exp(-z_pred))) ) 159 | 160 | def _entropy_link_der_2(y_true, z_pred, eps=1e-15): 161 | return(y_true*(np.exp(z_pred)/((1+np.exp(z_pred))**2)) + (1-y_true) * (np.exp(-z_pred)/((1+np.exp(-z_pred))**2)) ) 162 | 163 | -------------------------------------------------------------------------------- /extra/code/graphs.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=True 2 | 3 | 4 | import numpy as np 5 | import json 6 | 7 | class graph_undirected(object): 8 | """This is a class to handle undirected graphs. Still very much a work in progress. 9 | Defines a graph by a set of vertices and a set of "frozensets" representing the edges. 10 | """ 11 | 12 | def __init__(self, edges, vertices=set()): 13 | 14 | if vertices == set(): 15 | self.vertices = set([x for sublist in list(edges) for x in list(sublist) ]) 16 | else: 17 | self.vertices = set(vertices) 18 | new_edges = set() 19 | for edge in edges: 20 | new_edge = frozenset(edge) 21 | if len(new_edge)>1: 22 | new_edges.add(new_edge) 23 | self.edges = set(new_edges) 24 | self.mc_partitions = [] 25 | self.mc_partitions_max_size = 0 26 | self.all_connected_sets = [] 27 | self.vertex_to_neighbors_dict = {} 28 | 29 | def adjacent_edges(self, target_vertex): 30 | return set([x for x in self.edges if target_vertex in x]) 31 | 32 | def adjacent_vertices(self, target_vertex): 33 | if target_vertex in self.vertex_to_neighbors_dict.keys(): 34 | return self.vertex_to_neighbors_dict[target_vertex] 35 | else: 36 | neighbors_and_self = set([x for sublist in self.adjacent_edges(target_vertex) for x in sublist]) 37 | out_set = set(neighbors_and_self)-set([target_vertex]) 38 | self.vertex_to_neighbors_dict[target_vertex] = out_set 39 | return out_set 40 | 41 | def adjacent_vertices_to_set(self, target_vertex_set): 42 | templist = [list(self.adjacent_vertices(x)) for x in target_vertex_set] 43 | neighbors_and_self = [x for sublist in templist for x in sublist] 44 | return set(neighbors_and_self)-target_vertex_set 45 | 46 | def vertex_degree(self, target_vertex): 47 | return len(self.adjacent_vertices(target_vertex)) 48 | 49 | def contract_edge(self, edge, sep_str='_'): 50 | return contract_edge(self, edge, sep_str) 51 | 52 | def delete_vertex(self, vertex): 53 | return delete_vertex(self, vertex) 54 | 55 | def delete_vertices(self, vertex_set): 56 | return delete_vertices(self, vertex_set) 57 | 58 | def get_induced_subgraph(self, vertex_set): 59 | return get_induced_subgraph(self, vertex_set) 60 | 61 | def return_mc_partitions(self): 62 | if self.mc_partitions==[]: 63 | self.enumerate_mc_partitions() 64 | return(self.mc_partitions) 65 | 66 | 67 | def enumerate_mc_partitions(self, max_size=0, verbose=False): 68 | """This method will examine every connected set S of size up to max_size and 69 | determine whether or not the complement of the set is also connected. 
If the 70 | complement is also connected, then the partition {S, S^C} is added to the list 71 | self.mc_partitions""" 72 | 73 | # Default behavior is to find all maximally coarse partitions which 74 | # requires searching components up to size floor(n_vertices/2) 75 | if max_size==0: 76 | max_size=int(np.floor(len(self.vertices)/2)) 77 | 78 | # Initialize some variables 79 | # The two lists below are sets of sets by size. 80 | # i.e. conn_sets_with_conn_complements_by_size[5] will be a set that contains 81 | # the connected sets of size 5 whose complements are also connected 82 | conn_sets_with_conn_complements_by_size = [] 83 | conn_sets_with_disconn_complements_by_size = [] 84 | 85 | # These two contain the sizes of each entry in the above lists 86 | num_conn_sets_with_conn_complements_list = [] 87 | num_conn_sets_with_disconn_complements_list = [] 88 | 89 | # Initialize the list with an empty set 90 | conn_sets_with_conn_complements_by_size.append(set()) 91 | conn_sets_with_disconn_complements_by_size.append(set()) 92 | 93 | 94 | # Corner case handling 95 | if(len(self.vertices)<=1): 96 | self.mc_partitions = [] 97 | return [] 98 | if(len(self.vertices)==2): 99 | vert_list = list(self.vertices) 100 | set1 = set() 101 | set2 = set() 102 | set1.add(vert_list[0]) 103 | set2.add(vert_list[1]) 104 | self.mc_partitions = [frozenset([frozenset(set1),frozenset(set2)])] 105 | self.max_size = 1 106 | return None 107 | 108 | # The connected components of size 1 are exactly the vertices 109 | if verbose: 110 | print('Evaluating connected sets of size 1') 111 | for vert in self.vertices: 112 | if is_connected(delete_vertex(self, vert)): 113 | conn_sets_with_conn_complements_by_size[0].add(frozenset({vert})) 114 | else: 115 | conn_sets_with_disconn_complements_by_size[0].add(frozenset({vert})) 116 | num_conn_sets_with_conn_complements_list.append(len(conn_sets_with_conn_complements_by_size[0])) 117 | num_conn_sets_with_disconn_complements_list.append(len(conn_sets_with_disconn_complements_by_size[0])) 118 | if verbose: 119 | print('num conn sets of comp_size 1 with connected complements = {}'.format(num_conn_sets_with_conn_complements_list[0])) 120 | print('num conn sets of comp_size 1 with disconnected complements = {}'.format(num_conn_sets_with_disconn_complements_list[0])) 121 | print('Evaluating connected sets of size 2') 122 | conn_sets_with_conn_complements_by_size.append(set()) 123 | conn_sets_with_disconn_complements_by_size.append(set()) 124 | 125 | # The connected components of size 2 are exactly the edges 126 | for edge in self.edges: 127 | if is_connected(delete_vertices(self, edge)): 128 | conn_sets_with_conn_complements_by_size[1].add(edge) 129 | else: 130 | conn_sets_with_disconn_complements_by_size[1].add(edge) 131 | num_conn_sets_with_conn_complements_list.append(len(conn_sets_with_conn_complements_by_size[1])) 132 | num_conn_sets_with_disconn_complements_list.append(len(conn_sets_with_disconn_complements_by_size[1])) 133 | if verbose: 134 | print('num conn sets of comp_size 2 with connected complements = {}'.format(num_conn_sets_with_conn_complements_list[1])) 135 | print('num conn sets of comp_size 2 with disconnected complements = {}'.format(num_conn_sets_with_disconn_complements_list[1])) 136 | print('num conn sets of comp_size <=2 with connected complements = {}'.format(np.sum(num_conn_sets_with_conn_complements_list))) 137 | print('num conn sets of comp_size <=2 with disconnected complements = {}'.format(np.sum(num_conn_sets_with_disconn_complements_list))) 138 | 139 | 140 | 
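# Grow candidate sets incrementally: every connected set of size k is obtained by
# taking a connected set of size k-1 (from either bucket, since connectivity of the
# complement does not matter for growth) and adding one adjacent vertex.  Each new
# candidate is checked once for connectivity of its complement and filed into the
# corresponding bucket for size k.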
for comp_size in range(3, max_size+1): 141 | conn_sets_with_conn_complements_by_size.append(set()) 142 | conn_sets_with_disconn_complements_by_size.append(set()) 143 | 144 | if verbose: 145 | print('Evaluating connected sets of size {}'.format(comp_size)) 146 | base_components = conn_sets_with_conn_complements_by_size[comp_size-2].union(conn_sets_with_disconn_complements_by_size[comp_size-2]) 147 | for base_comp in base_components: 148 | neighbors_to_add = self.adjacent_vertices_to_set(base_comp) 149 | for neighbor in neighbors_to_add: 150 | new_comp = set(base_comp) 151 | new_comp.add(neighbor) 152 | new_comp = frozenset(new_comp) 153 | if ((not new_comp in conn_sets_with_conn_complements_by_size[comp_size-1]) and (not new_comp in conn_sets_with_disconn_complements_by_size[comp_size-1])): 154 | if is_connected(delete_vertices(self,new_comp)): 155 | conn_sets_with_conn_complements_by_size[comp_size-1].add(new_comp) 156 | else: 157 | conn_sets_with_disconn_complements_by_size[comp_size-1].add(new_comp) 158 | num_conn_sets_with_conn_complements_list.append(len(conn_sets_with_conn_complements_by_size[comp_size-1])) 159 | num_conn_sets_with_disconn_complements_list.append(len(conn_sets_with_disconn_complements_by_size[comp_size-1])) 160 | 161 | if verbose: 162 | print('num conn set of comp_size {} with connected complements= {}'.format(comp_size,num_conn_sets_with_conn_complements_list[comp_size-1])) 163 | print('num conn set of comp_size {} with discconnected complements= {}'.format(comp_size,num_conn_sets_with_disconn_complements_list[comp_size-1])) 164 | print('num conn set of comp_size <= {} with connected complements= {}'.format(comp_size, np.sum(num_conn_sets_with_conn_complements_list))) 165 | print('num conn set of comp_size <= {} with disconnected complements= {}'.format(comp_size, np.sum(num_conn_sets_with_disconn_complements_list))) 166 | 167 | self.mc_partitions = list(set([frozenset([conn_set, frozenset(self.vertices - conn_set)]) for templist in conn_sets_with_conn_complements_by_size for conn_set in templist])) 168 | #self.mc_partitions = [[conn_set, self.vertices - conn_set] for conn_set in conn_sets_with_conn_complements] 169 | self.mc_partitions_max_size = max_size 170 | 171 | 172 | def save_partitions_to_file(self, file_name): 173 | list_of_lists = [list(x) for x in self.all_partitions] 174 | with open(file_name, "w") as write_file: 175 | json.dump(list_of_lists, write_file) 176 | 177 | def load_partitions_from_file(self, file_name): 178 | with open(file_name, "r") as read_file: 179 | list_of_lists = json.load(read_file) 180 | self.all_partitions = [frozenset(x) for x in list_of_lists] 181 | 182 | 183 | def return_contracted_partitions(self, max_size_after_contraction = 13): 184 | # if len(self.contracted_partitions)>0: 185 | # return self.contracted_partitions 186 | # else: 187 | new_graph = graph_undirected(self.edges, self.vertices) 188 | while (len(new_graph.vertices)>max_size_after_contraction): 189 | vertex_list = list(new_graph.vertices) 190 | rand_vertex = vertex_list[np.random.randint(len(vertex_list))] 191 | rand_vertex_neighbor_list = list(new_graph.adjacent_vertices(rand_vertex)) 192 | rand_neighbor = rand_vertex_neighbor_list[np.random.randint(len(rand_vertex_neighbor_list))] 193 | new_graph = new_graph.contract_edge([rand_vertex, rand_neighbor],sep_str='_|_') 194 | 195 | new_graph.enumerate_mc_partitions() 196 | self.contracted_partitions = transform_partition_list(new_graph.mc_partitions, sep='_|_') 197 | return self.contracted_partitions 198 | 199 | 200 | 201 
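# Illustrative usage sketch, not part of the original class; the vertex names are
# made up.  For a path graph a-b-c-d the maximally coarse partitions are exactly the
# three splits obtained by cutting a single edge:
#
#     >>> g = graph_undirected({('a', 'b'), ('b', 'c'), ('c', 'd')})
#     >>> parts = g.return_mc_partitions()
#     >>> len(parts)
#     3
#     >>> frozenset([frozenset({'a', 'b'}), frozenset({'c', 'd'})]) in parts
#     True
#
# When the vertex set is too large to enumerate directly, return_contracted_partitions
# repeatedly contracts randomly chosen edges until at most max_size_after_contraction
# (default 13) super-vertices remain, enumerates the partitions of the contracted
# graph, and maps the resulting partitions back to the original vertex names.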
| def enumerate_connected_sets(self, max_size=-1, verbose=False): 202 | if self.all_connected_sets: 203 | return self.all_connected_sets 204 | if(len(self.vertices)==0): 205 | return [] 206 | if((len(self.vertices)>=1) and (len(self.vertices)<=2)): 207 | return [frozenset([list(self.vertices)[0]])] 208 | if max_size==(-1): 209 | max_size=len(self.vertices) 210 | connected_sets = [] 211 | connected_sets.append(set()) 212 | num_connected_sets_list = [] 213 | if verbose: 214 | print('Evaluating components of size 1') 215 | for vert in self.vertices: 216 | connected_sets[0].add(frozenset({vert})) 217 | num_connected_sets_list.append(len(connected_sets[0])) 218 | if verbose: 219 | print('num connected sets of size 1 = {}'.format(num_connected_sets_list[0])) 220 | print('Evaluating components of size 2') 221 | connected_sets.append(set()) 222 | for edge in self.edges: 223 | connected_sets[1].add(edge) 224 | num_connected_sets_list.append(len(connected_sets[1])) 225 | if verbose: 226 | print('num_connected_sets of size 2 = {}'.format(num_connected_sets_list[1])) 227 | print('num_connected_sets of size<=2 is {}'.format(np.sum(num_connected_sets_list))) 228 | 229 | 230 | for comp_size in range(3, max_size+1): 231 | connected_sets.append(set()) 232 | 233 | if verbose: 234 | print('Evaluating components of size {}'.format(comp_size)) 235 | base_components = connected_sets[comp_size-2] 236 | for base_comp in base_components: 237 | neighbors_to_add = self.adjacent_vertices_to_set(base_comp) 238 | for neighbor in neighbors_to_add: 239 | new_comp = set(base_comp) 240 | new_comp.add(neighbor) 241 | new_comp = frozenset(new_comp) 242 | connected_sets[comp_size-1].add(new_comp) 243 | num_connected_sets_list.append(len(connected_sets[comp_size-1])) 244 | # if memory_save: 245 | # good_partitions[comp_size-2]=set() 246 | # failed_partitions[comp_size-2]=set() 247 | 248 | if verbose: 249 | print('num_connected_sets of size {} = {}'.format(comp_size,num_connected_sets_list[comp_size-1])) 250 | print('num_total_connected_sets of size<={} is {}'.format(comp_size, np.sum(num_connected_sets_list))) 251 | 252 | connected_sets = [k for templist in connected_sets for k in templist] 253 | self.all_connected_sets = connected_sets 254 | return connected_sets 255 | 256 | def save_connected_sets_to_file(self, file_name): 257 | list_of_lists = [list(x) for x in self.all_connected_sets] 258 | with open(file_name, "w") as write_file: 259 | json.dump(list_of_lists, write_file) 260 | 261 | def load_connected_sets_from_file(self, file_name): 262 | with open(file_name, "r") as read_file: 263 | list_of_lists = json.load(read_file) 264 | self.all_connected_sets = [frozenset(x) for x in list_of_lists] 265 | 266 | def get_partitions_from_connected_sets(self, verbose=False, verbose_freq=1000): 267 | part_list = [] 268 | conn_set_list = self.all_connected_sets.copy() 269 | conn_set_set = set(self.all_connected_sets) 270 | if verbose: 271 | print('checking {} connected sets'.format(len(conn_set_list))) 272 | for i,conn_set in enumerate(conn_set_list): 273 | if len(conn_set) > (len(self.vertices)/2): 274 | break 275 | complement_set = frozenset(self.vertices - conn_set) 276 | if complement_set in conn_set_set: 277 | part_list.append(conn_set) 278 | conn_set_list.remove(complement_set) 279 | if ((((i+1) % verbose_freq)) ==0): 280 | if verbose: 281 | print('Checked {} sets'.format(i+1)) 282 | print('Found {} partitions'.format(len(part_list))) 283 | self.all_partitions = part_list 284 | 285 | 286 | def contract_edge(graph, edge, 
sep_str='_|_'): 287 | edge_alph = list(edge) 288 | edge_alph.sort() 289 | contracted_vertex = sep_str.join((edge_alph)) 290 | #new_vertices = (set(graph.vertices) - set(edge)).union({contracted_vertex}) 291 | new_edges = [[contracted_vertex if y==edge_alph[0] or y==edge_alph[1] else y for y in this_edge] 292 | if edge_alph[0] in this_edge or edge_alph[1] in this_edge else this_edge for this_edge in graph.edges] 293 | return graph_undirected(new_edges) 294 | 295 | def delete_vertex(graph, vertex): 296 | new_edges = set([edge for edge in graph.edges if vertex not in edge]) 297 | new_vertices = graph.vertices - {vertex} 298 | return graph_undirected(new_edges, new_vertices) 299 | 300 | def delete_vertices(graph, vertex_set): 301 | new_edges = set([edge for edge in graph.edges if not vertex_set.intersection(edge)]) 302 | new_vertices = graph.vertices - vertex_set 303 | return graph_undirected(new_edges, new_vertices) 304 | 305 | def get_induced_subgraph(graph, vertex_set): 306 | vertex_set = set(vertex_set) 307 | new_edges = set([edge for edge in graph.edges if edge <= vertex_set]) 308 | new_vertices = vertex_set 309 | new_graph = graph_undirected(new_edges, new_vertices) 310 | new_graph.all_connected_sets = [x for x in graph.all_connected_sets if new_vertices.issuperset(x)] 311 | return new_graph 312 | 313 | 314 | def is_connected(graph): 315 | initial_vertex = next(iter(graph.vertices)) 316 | visited_vertices = [initial_vertex] 317 | unexplored_vertices = list(graph.adjacent_vertices(initial_vertex)) 318 | while unexplored_vertices: 319 | curr_vertex = unexplored_vertices.pop() 320 | visited_vertices.append(curr_vertex) 321 | new_vertices = graph.adjacent_vertices(curr_vertex) 322 | unexplored_vertices = list(set(unexplored_vertices).union(new_vertices) - set(visited_vertices)) 323 | return len(set(visited_vertices)) == len(set(graph.vertices)) 324 | 325 | def num_connected_comp(graph): 326 | initial_vertex = list(graph.vertices)[0] 327 | visited_vertices = [initial_vertex] 328 | unexplored_vertices = list(graph.adjacent_vertices(initial_vertex)) 329 | while unexplored_vertices: 330 | curr_vertex = unexplored_vertices.pop(0) 331 | visited_vertices.append(curr_vertex) 332 | new_vertices = graph.adjacent_vertices(curr_vertex) 333 | unexplored_vertices = list(set(unexplored_vertices).union(new_vertices) - set(visited_vertices)) 334 | if len(set(visited_vertices)) == len(set(graph.vertices)): 335 | return 1 336 | else: 337 | remainder_vertices = list(set(graph.vertices)-set(visited_vertices)) 338 | remainder_edges = [edge for edge in graph.edges if edge.issubset(set(remainder_vertices))] 339 | return 1 + num_connected_comp(graph_undirected(remainder_edges, remainder_vertices)) 340 | 341 | def connected_comp_list(graph): 342 | initial_vertex = list(graph.vertices)[0] 343 | visited_vertices = [initial_vertex] 344 | unexplored_vertices = list(graph.adjacent_vertices(initial_vertex)) 345 | while unexplored_vertices: 346 | curr_vertex = unexplored_vertices.pop(0) 347 | visited_vertices.append(curr_vertex) 348 | new_vertices = graph.adjacent_vertices(curr_vertex) 349 | unexplored_vertices = list(set(unexplored_vertices).union(new_vertices) - set(visited_vertices)) 350 | if len(set(visited_vertices)) == len(set(graph.vertices)): 351 | return [graph] 352 | else: 353 | cc_vertices = set(visited_vertices) 354 | cc_edges = [edge for edge in graph.edges if edge.issubset(set(visited_vertices))] 355 | cc_graph = graph_undirected(cc_edges, cc_vertices) 356 | remainder_vertices = 
list(set(graph.vertices)-set(visited_vertices)) 357 | remainder_edges = [edge for edge in graph.edges if edge.issubset(set(remainder_vertices))] 358 | return [cc_graph] + connected_comp_list(graph_undirected(remainder_edges, remainder_vertices)) 359 | 360 | def get_all_distances_from_vertex(graph, start_vertex): 361 | vertex_path_dist_dict=set() 362 | vertex_path_dist_dict[start_vertex] = 0 363 | unexplored_vertices = list(graph.adjacent_vertices(start_vertex)) 364 | for vert in unexplored_vertices: 365 | vertex_path_dist_dict[vert]=1 366 | visited_vertices = [start_vertex] 367 | 368 | while unexplored_vertices and (len(vertex_path_dist_dict.keys())=1: 19 | new_edges.add(new_edge) 20 | if add_singletons: 21 | for vertex in self.vertices: 22 | new_edges.add(frozenset([vertex])) 23 | self.edges = set(new_edges) 24 | self.mc_partitions = [] 25 | self.mc_partitions_max_size = 0 26 | self.all_connected_sets = [] 27 | self.partition_dict = {} 28 | 29 | 30 | def adjacent_edges(self, target_vertex): 31 | return set([x for x in self.edges if target_vertex in x]) 32 | 33 | def sets_adjacent(self, frozenset vert_name_set_1, frozenset vert_name_set_2): 34 | return(bool(vert_name_set_1.intersection(vert_name_set_2))) 35 | 36 | 37 | def adjacent_vertices(self, target_vertex): 38 | neighbors_and_self = set([x for sublist in self.adjacent_edges(target_vertex) for x in sublist]) 39 | return set(neighbors_and_self)-set([target_vertex]) 40 | 41 | def adjacent_vertices_to_set(self, target_vertex_set): 42 | templist = [list(self.adjacent_vertices(x)) for x in target_vertex_set] 43 | neighbors_and_self = [x for sublist in templist for x in sublist] 44 | return set(neighbors_and_self)-target_vertex_set 45 | 46 | # def adjacent_vertices_to_set_2(self, target_vertex_set): 47 | # out_set = {x for subset in self.edges if self.sets_adjacent(subset, target_vertex_set) for x in subset} 48 | # return(out_set - set(target_vertex_set)) 49 | 50 | def vertex_degree(self, target_vertex): 51 | return len(self.adjacent_vertices(target_vertex)) 52 | 53 | # def contract_edge(self, edge, sep_str='_'): 54 | # return contract_edge(self, edge, sep_str) 55 | 56 | def delete_vertex(self, vertex): 57 | return delete_vertex(self, vertex) 58 | 59 | def delete_vertices(self, vertex_set): 60 | return delete_vertices(self, vertex_set) 61 | 62 | def get_induced_subgraph(self, vertex_set): 63 | return get_induced_subgraph(self, vertex_set) 64 | 65 | def generate_small_size_partitions(self, max_size=2): 66 | edge_list = list(self.edges) 67 | self.partition_dict[(1,1)] 68 | if (self.num_vertices, 2) not in self.partition_dict.keys(): 69 | self.partition_dict[(self.num_vertices, 2)] = {frozenset([i,j]) for i in edge_list for j in edge_list if (not i.intersection(j) and i.union(j)==self.vertices)} 70 | for curr_size in range(3, max_size): 71 | if (0,curr_size-1) not in self.partition_dict.keys(): 72 | self.partition_dict[(0, curr_size-1)] = {frozenset([i,j]) for i in edge_list for j in edge_list if (not i.intersection(j))} 73 | 74 | # def return_mc_partitions(self): 75 | # if self.mc_partitions==[]: 76 | # self.enumerate_mc_partitions() 77 | # return(self.mc_partitions) 78 | 79 | 80 | # def enumerate_mc_partitions(self, max_size=0, verbose=False): 81 | # """This method will examine every connected set S of size up to max_size and 82 | # determine whether or not the complement of the set is also connected. 
If the 83 | # complement is also connected, then the partition {S, S^C} is added to the list 84 | # self.mc_partitions""" 85 | 86 | # # Default behavior is to find all maximally coarse partitions which 87 | # # requires searching components up to size floor(n_vertices/2) 88 | # if max_size==0: 89 | # max_size=int(np.floor(len(self.vertices)/2)) 90 | 91 | # # Initialize some variables 92 | # # The two lists below are sets of sets by size. 93 | # # i.e. conn_sets_with_conn_complements_by_size[5] will be a set that contains 94 | # # the connected sets of size 5 whose complements are also connected 95 | # conn_sets_with_conn_complements_by_size = [] 96 | # conn_sets_with_disconn_complements_by_size = [] 97 | 98 | # # These two contain the sizes of each entry in the above lists 99 | # num_conn_sets_with_conn_complements_list = [] 100 | # num_conn_sets_with_disconn_complements_list = [] 101 | 102 | # # Initialize the list with an empty set 103 | # conn_sets_with_conn_complements_by_size.append(set()) 104 | # conn_sets_with_disconn_complements_by_size.append(set()) 105 | 106 | 107 | # # Corner case handling 108 | # if(len(self.vertices)==0): 109 | # return [] 110 | # if((len(self.vertices)>=1) and (len(self.vertices)<=2)): 111 | # return [frozenset([list(self.vertices)[0]])] 112 | 113 | # # The connected components of size 1 are exactly the vertices 114 | # if verbose: 115 | # print('Evaluating connected sets of size 1') 116 | # for vert in self.vertices: 117 | # if is_connected(delete_vertex(self, vert)): 118 | # conn_sets_with_conn_complements_by_size[0].add(frozenset({vert})) 119 | # else: 120 | # conn_sets_with_disconn_complements_by_size[0].add(frozenset({vert})) 121 | # num_conn_sets_with_conn_complements_list.append(len(conn_sets_with_conn_complements_by_size[0])) 122 | # num_conn_sets_with_disconn_complements_list.append(len(conn_sets_with_disconn_complements_by_size[0])) 123 | # if verbose: 124 | # print('num conn sets of comp_size 1 with connected complements = {}'.format(num_conn_sets_with_conn_complements_list[0])) 125 | # print('num conn sets of comp_size 1 with disconnected complements = {}'.format(num_conn_sets_with_disconn_complements_list[0])) 126 | # print('Evaluating connected sets of size 2') 127 | # conn_sets_with_conn_complements_by_size.append(set()) 128 | # conn_sets_with_disconn_complements_by_size.append(set()) 129 | 130 | # # The connected components of size 2 are exactly the edges 131 | # for edge in self.edges: 132 | # if is_connected(delete_vertices(self, edge)): 133 | # conn_sets_with_conn_complements_by_size[1].add(edge) 134 | # else: 135 | # conn_sets_with_disconn_complements_by_size[1].add(edge) 136 | # num_conn_sets_with_conn_complements_list.append(len(conn_sets_with_conn_complements_by_size[1])) 137 | # num_conn_sets_with_disconn_complements_list.append(len(conn_sets_with_disconn_complements_by_size[1])) 138 | # if verbose: 139 | # print('num conn sets of comp_size 2 with connected complements = {}'.format(num_conn_sets_with_conn_complements_list[1])) 140 | # print('num conn sets of comp_size 2 with disconnected complements = {}'.format(num_conn_sets_with_disconn_complements_list[1])) 141 | # print('num conn sets of comp_size <=2 with connected complements = {}'.format(np.sum(num_conn_sets_with_conn_complements_list))) 142 | # print('num conn sets of comp_size <=2 with disconnected complements = {}'.format(np.sum(num_conn_sets_with_disconn_complements_list))) 143 | 144 | 145 | # for comp_size in range(3, max_size+1): 146 | # 
conn_sets_with_conn_complements_by_size.append(set()) 147 | # conn_sets_with_disconn_complements_by_size.append(set()) 148 | 149 | # if verbose: 150 | # print('Evaluating connected sets of size {}'.format(comp_size)) 151 | # base_components = conn_sets_with_conn_complements_by_size[comp_size-2].union(conn_sets_with_disconn_complements_by_size[comp_size-2]) 152 | # for base_comp in base_components: 153 | # neighbors_to_add = self.adjacent_vertices_to_set(base_comp) 154 | # for neighbor in neighbors_to_add: 155 | # new_comp = set(base_comp) 156 | # new_comp.add(neighbor) 157 | # new_comp = frozenset(new_comp) 158 | # if ((not new_comp in conn_sets_with_conn_complements_by_size[comp_size-1]) and (not new_comp in conn_sets_with_disconn_complements_by_size[comp_size-1])): 159 | # if is_connected(delete_vertices(self,new_comp)): 160 | # conn_sets_with_conn_complements_by_size[comp_size-1].add(new_comp) 161 | # else: 162 | # conn_sets_with_disconn_complements_by_size[comp_size-1].add(new_comp) 163 | # num_conn_sets_with_conn_complements_list.append(len(conn_sets_with_conn_complements_by_size[comp_size-1])) 164 | # num_conn_sets_with_disconn_complements_list.append(len(conn_sets_with_disconn_complements_by_size[comp_size-1])) 165 | 166 | # if verbose: 167 | # print('num conn set of comp_size {} with connected complements= {}'.format(comp_size,num_conn_sets_with_conn_complements_list[comp_size-1])) 168 | # print('num conn set of comp_size {} with discconnected complements= {}'.format(comp_size,num_conn_sets_with_disconn_complements_list[comp_size-1])) 169 | # print('num conn set of comp_size <= {} with connected complements= {}'.format(comp_size, np.sum(num_conn_sets_with_conn_complements_list))) 170 | # print('num conn set of comp_size <= {} with disconnected complements= {}'.format(comp_size, np.sum(num_conn_sets_with_disconn_complements_list))) 171 | 172 | # self.mc_partitions = list(set([frozenset([conn_set, frozenset(self.vertices - conn_set)]) for templist in conn_sets_with_conn_complements_by_size for conn_set in templist])) 173 | # #self.mc_partitions = [[conn_set, self.vertices - conn_set] for conn_set in conn_sets_with_conn_complements] 174 | # self.mc_partitions_max_size = max_size 175 | 176 | 177 | # def save_partitions_to_file(self, file_name): 178 | # list_of_lists = [list(x) for x in self.all_partitions] 179 | # with open(file_name, "w") as write_file: 180 | # json.dump(list_of_lists, write_file) 181 | 182 | # def load_partitions_from_file(self, file_name): 183 | # with open(file_name, "r") as read_file: 184 | # list_of_lists = json.load(read_file) 185 | # self.all_partitions = [frozenset(x) for x in list_of_lists] 186 | 187 | # def enumerate_connected_sets(self, max_size=-1, verbose=False): 188 | # if self.all_connected_sets: 189 | # return self.all_connected_sets 190 | # if(len(self.vertices)==0): 191 | # return [] 192 | # if((len(self.vertices)>=1) and (len(self.vertices)<=2)): 193 | # return [frozenset([list(self.vertices)[0]])] 194 | # if max_size==(-1): 195 | # max_size=len(self.vertices) 196 | # connected_sets = [] 197 | # connected_sets.append(set()) 198 | # num_connected_sets_list = [] 199 | # if verbose: 200 | # print('Evaluating components of size 1') 201 | # for vert in self.vertices: 202 | # connected_sets[0].add(frozenset({vert})) 203 | # num_connected_sets_list.append(len(connected_sets[0])) 204 | # if verbose: 205 | # print('num connected sets of size 1 = {}'.format(num_connected_sets_list[0])) 206 | # print('Evaluating components of size 2') 207 | # 
connected_sets.append(set()) 208 | # for edge in self.edges: 209 | # connected_sets[1].add(edge) 210 | # num_connected_sets_list.append(len(connected_sets[1])) 211 | # if verbose: 212 | # print('num_connected_sets of size 2 = {}'.format(num_connected_sets_list[1])) 213 | # print('num_connected_sets of size<=2 is {}'.format(np.sum(num_connected_sets_list))) 214 | 215 | 216 | # for comp_size in range(3, max_size+1): 217 | # connected_sets.append(set()) 218 | 219 | # if verbose: 220 | # print('Evaluating components of size {}'.format(comp_size)) 221 | # base_components = connected_sets[comp_size-2] 222 | # for base_comp in base_components: 223 | # neighbors_to_add = self.adjacent_vertices_to_set(base_comp) 224 | # for neighbor in neighbors_to_add: 225 | # new_comp = set(base_comp) 226 | # new_comp.add(neighbor) 227 | # new_comp = frozenset(new_comp) 228 | # connected_sets[comp_size-1].add(new_comp) 229 | # num_connected_sets_list.append(len(connected_sets[comp_size-1])) 230 | # # if memory_save: 231 | # # good_partitions[comp_size-2]=set() 232 | # # failed_partitions[comp_size-2]=set() 233 | 234 | # if verbose: 235 | # print('num_connected_sets of size {} = {}'.format(comp_size,num_connected_sets_list[comp_size-1])) 236 | # print('num_total_connected_sets of size<={} is {}'.format(comp_size, np.sum(num_connected_sets_list))) 237 | 238 | # connected_sets = [k for templist in connected_sets for k in templist] 239 | # self.all_connected_sets = connected_sets 240 | # return connected_sets 241 | 242 | # def save_connected_sets_to_file(self, file_name): 243 | # list_of_lists = [list(x) for x in self.all_connected_sets] 244 | # with open(file_name, "w") as write_file: 245 | # json.dump(list_of_lists, write_file) 246 | 247 | # def load_connected_sets_from_file(self, file_name): 248 | # with open(file_name, "r") as read_file: 249 | # list_of_lists = json.load(read_file) 250 | # self.all_connected_sets = [frozenset(x) for x in list_of_lists] 251 | 252 | # def get_partitions_from_connected_sets(self, verbose=False, verbose_freq=1000): 253 | # part_list = [] 254 | # conn_set_list = self.all_connected_sets.copy() 255 | # conn_set_set = set(self.all_connected_sets) 256 | # if verbose: 257 | # print('checking {} connected sets'.format(len(conn_set_list))) 258 | # for i,conn_set in enumerate(conn_set_list): 259 | # if len(conn_set) > (len(self.vertices)/2): 260 | # break 261 | # complement_set = frozenset(self.vertices - conn_set) 262 | # if complement_set in conn_set_set: 263 | # part_list.append(conn_set) 264 | # conn_set_list.remove(complement_set) 265 | # if ((((i+1) % verbose_freq)) ==0): 266 | # if verbose: 267 | # print('Checked {} sets'.format(i+1)) 268 | # print('Found {} partitions'.format(len(part_list))) 269 | # self.all_partitions = part_list 270 | 271 | 272 | # def contract_edge(graph, edge, sep_str='_'): 273 | # edge_alph = list(edge) 274 | # edge_alph.sort() 275 | # contracted_vertex = sep_str.join((edge_alph)) 276 | # #new_vertices = (set(graph.vertices) - set(edge)).union({contracted_vertex}) 277 | # new_edges = [[contracted_vertex if y==edge_alph[0] or y==edge_alph[1] else y for y in this_edge] 278 | # if edge_alph[0] in this_edge or edge_alph[1] in this_edge else this_edge for this_edge in graph.edges] 279 | # return graph_undirected(new_edges) 280 | 281 | def delete_vertex(graph, vertex): 282 | new_edges = set([frozenset(edge - set(vertex)) for edge in graph.edges]) 283 | new_vertices = graph.vertices - {vertex} 284 | return hypergraph(new_edges, new_vertices) 285 | 286 | def 
delete_vertices(graph, vertex_set): 287 | 288 | new_edges = set([frozenset(edge-set(vertex_set)) for edge in graph.edges]) 289 | new_vertices = graph.vertices - vertex_set 290 | return hypergraph(new_edges, new_vertices) 291 | 292 | def get_induced_subgraph(graph, vertex_set): 293 | vertex_set = set(vertex_set) 294 | new_edges = set([edge for edge in graph.edges if edge <= vertex_set]) 295 | new_vertices = vertex_set 296 | new_graph = hypergraph(new_edges, new_vertices) 297 | new_graph.all_connected_sets = [x for x in graph.all_connected_sets if new_vertices.issuperset(x)] 298 | return new_graph 299 | 300 | 301 | # def is_connected(graph): 302 | # initial_vertex = next(iter(graph.vertices)) 303 | # visited_vertices = [initial_vertex] 304 | # unexplored_vertices = list(graph.adjacent_vertices(initial_vertex)) 305 | # while unexplored_vertices: 306 | # curr_vertex = unexplored_vertices.pop() 307 | # visited_vertices.append(curr_vertex) 308 | # new_vertices = graph.adjacent_vertices(curr_vertex) 309 | # unexplored_vertices = list(set(unexplored_vertices).union(new_vertices) - set(visited_vertices)) 310 | # return len(set(visited_vertices)) == len(set(graph.vertices)) 311 | 312 | # def num_connected_comp(graph): 313 | # initial_vertex = list(graph.vertices)[0] 314 | # visited_vertices = [initial_vertex] 315 | # unexplored_vertices = list(graph.adjacent_vertices(initial_vertex)) 316 | # while unexplored_vertices: 317 | # curr_vertex = unexplored_vertices.pop(0) 318 | # visited_vertices.append(curr_vertex) 319 | # new_vertices = graph.adjacent_vertices(curr_vertex) 320 | # unexplored_vertices = list(set(unexplored_vertices).union(new_vertices) - set(visited_vertices)) 321 | # if len(set(visited_vertices)) == len(set(graph.vertices)): 322 | # return 1 323 | # else: 324 | # remainder_vertices = list(set(graph.vertices)-set(visited_vertices)) 325 | # remainder_edges = [edge for edge in graph.edges if edge.issubset(set(remainder_vertices))] 326 | # return 1 + num_connected_comp(graph_undirected(remainder_edges, remainder_vertices)) 327 | 328 | # def connected_comp_list(graph): 329 | # initial_vertex = list(graph.vertices)[0] 330 | # visited_vertices = [initial_vertex] 331 | # unexplored_vertices = list(graph.adjacent_vertices(initial_vertex)) 332 | # while unexplored_vertices: 333 | # curr_vertex = unexplored_vertices.pop(0) 334 | # visited_vertices.append(curr_vertex) 335 | # new_vertices = graph.adjacent_vertices(curr_vertex) 336 | # unexplored_vertices = list(set(unexplored_vertices).union(new_vertices) - set(visited_vertices)) 337 | # if len(set(visited_vertices)) == len(set(graph.vertices)): 338 | # return [graph] 339 | # else: 340 | # cc_vertices = set(visited_vertices) 341 | # cc_edges = [edge for edge in graph.edges if edge.issubset(set(visited_vertices))] 342 | # cc_graph = graph_undirected(cc_edges, cc_vertices) 343 | # remainder_vertices = list(set(graph.vertices)-set(visited_vertices)) 344 | # remainder_edges = [edge for edge in graph.edges if edge.issubset(set(remainder_vertices))] 345 | # return [cc_graph] + connected_comp_list(graph_undirected(remainder_edges, remainder_vertices)) 346 | 347 | # def get_all_distances_from_vertex(graph, start_vertex): 348 | # vertex_path_dist_dict=set() 349 | # vertex_path_dist_dict[start_vertex] = 0 350 | # unexplored_vertices = list(graph.adjacent_vertices(start_vertex)) 351 | # for vert in unexplored_vertices: 352 | # vertex_path_dist_dict[vert]=1 353 | # visited_vertices = [start_vertex] 354 | 355 | # while unexplored_vertices and 
(len(vertex_path_dist_dict.keys())=self.max_depth): 130 | self._wrap_up_node(curr_node, g_train_node, h_train_node) 131 | return None 132 | 133 | # Determine which features are still "eligible" to be considered 134 | features_to_search = self.feature_configs.keys() 135 | 136 | # print('features_to_search') 137 | # print(features_to_search) 138 | # If no features are eligible (e.g. all x-values are identical in all features) 139 | # Then we similarly summarize the node and move on 140 | # if features_to_search==[]: 141 | # self._wrap_up_node(curr_node, g_train_node, h_train_node) 142 | # return None 143 | 144 | # best_split_dict holds all the necessary info about a potential split 145 | best_split_dict = _initialize_best_split_dict() 146 | 147 | # Main loop over features to find best split 148 | for feature in features_to_search: 149 | # print('evaluating feature {}'.format(feature)) 150 | best_split_for_feature = evaluate_feature(self.feature_configs[feature], 151 | curr_node['feature_graphs'], 152 | feature, 153 | X_train_node[feature].values, 154 | g_train_node, h_train_node, 155 | self.gamma, self.reg_lambda) 156 | if best_split_for_feature: 157 | best_split_for_feature['split_feature'] = feature 158 | if best_split_for_feature['loss_score']0: 299 | split_res = feature_config['split_res'] if 'split_res' in feature_config.keys() else np.Inf 300 | split_count = len(splits_to_eval) 301 | if split_res1: 316 | unique_splits = (unique_vals[1:]+unique_vals[:-1])/2 317 | return unique_splits 318 | else: 319 | return [] 320 | 321 | def _evaluate_numerical_splits(feature_vec, g_vec, h_vec, split_vec, gamma, reg_lambda): 322 | ## NOTE : need to incorporate min_leaf_size restriction 323 | 324 | bin_result_vec = np.searchsorted(split_vec, feature_vec, side='left') 325 | g_sum_bins, h_sum_bins = get_bin_sums_c(g_vec, h_vec, bin_result_vec, len(split_vec)+1) 326 | g_sum_total, g_sum_left, g_sum_right = get_left_right_sums(g_sum_bins) 327 | h_sum_total, h_sum_left, h_sum_right = get_left_right_sums(h_sum_bins) 328 | score_vec = (-1)*_get_gh_score_array(g_sum_left, g_sum_right, h_sum_left, h_sum_right, gamma, reg_lambda) 329 | # if (len(score_vec)!=len(split_vec)): 330 | # print('score_vec has length {}'.format(len(score_vec))) 331 | # print('split_vec has length {}'.format(len(split_vec))) 332 | 333 | best_loss, best_split_val = get_best_vals(score_vec, split_vec) 334 | return best_loss, best_split_val 335 | 336 | def get_best_vals(score_vec, split_vec): 337 | best_loss = np.min(score_vec) 338 | best_split_index = np.argmin(score_vec) 339 | best_split_val = split_vec[np.argmin(score_vec)] 340 | return best_loss, best_split_val 341 | 342 | def get_bin_sums(g_vec, h_vec, bin_result_vec, out_vec_size): 343 | g_sum_bins = np.zeros(out_vec_size) 344 | h_sum_bins = np.zeros(out_vec_size) 345 | for i,bin_ind in enumerate(bin_result_vec): 346 | g_sum_bins[bin_ind]+=g_vec[i] 347 | h_sum_bins[bin_ind]+=h_vec[i] 348 | return g_sum_bins, h_sum_bins 349 | 350 | def get_bin_sums_c(cnp.ndarray[double] g_vec, cnp.ndarray[double] h_vec, 351 | cnp.ndarray[long] bin_result_vec, long out_vec_size): 352 | cdef int i 353 | cdef int m = bin_result_vec.shape[0] 354 | 355 | cdef cnp.ndarray[double] g_sum_bins = np.zeros(out_vec_size) 356 | cdef cnp.ndarray[double] h_sum_bins = np.zeros(out_vec_size) 357 | 358 | for i in range(m): 359 | g_sum_bins[bin_result_vec[i]]+=g_vec[i] 360 | h_sum_bins[bin_result_vec[i]]+=h_vec[i] 361 | return g_sum_bins, h_sum_bins 362 | 363 | 364 | def get_left_right_sums(bin_sums): 365 | sum_total = 
np.sum(bin_sums) 366 | sum_left = (np.cumsum(bin_sums))[:-1] 367 | sum_right = sum_total - sum_left 368 | return sum_total, sum_left, sum_right 369 | 370 | def _evaluate_feature_graphical(feature_config, feature_graph, feature_vec_node, 371 | g_train_node, h_train_node, gamma, reg_lambda): 372 | # NOTE: need to incorporate min_leaf_size restriction 373 | 374 | msac = feature_config['msac'] 375 | msts = feature_config['split_res'] 376 | # Query the graph structure to get the possible splits 377 | # print('len(feature_graph.mc_partitions)={}'.format(len(feature_graph.mc_partitions))) 378 | if (len(feature_graph.mc_partitions)>0): 379 | possible_splits = feature_graph.return_mc_partitions() 380 | else: 381 | # print('vertices = {}'.format(feature_graph.vertices)) 382 | # print('edges = {}'.format(feature_graph.edges)) 383 | possible_splits = feature_graph.return_contracted_partitions(max_size_after_contraction=msac) 384 | nps = len(possible_splits) 385 | # print('nps={}'.format(nps)) 386 | if (nps>msts): 387 | # Randomly choose (with replacement) a subset of possible splits 388 | index_range = np.random.randint(0,nps,msts) 389 | else: 390 | index_range = range(nps) 391 | 392 | 393 | best_split_of_feat = {} 394 | best_split_of_feat['loss_score'] = np.Inf 395 | g_sum = np.sum(g_train_node) 396 | h_sum = np.sum(h_train_node) 397 | # Loop within values of each feature 398 | for index in index_range: 399 | curr_partition = list(possible_splits[index]) 400 | left_split = curr_partition[0] 401 | right_split = curr_partition[1] 402 | mask_left = np.array([x in left_split for x in feature_vec_node]) 403 | curr_loss = _score_split(mask_left, g_train_node, h_train_node, g_sum, h_sum, 404 | gamma, reg_lambda) 405 | # print('Evaluating split') 406 | # print(left_split) 407 | # print('vs') 408 | # print(right_split) 409 | # print('loss_score = {}'.format(curr_loss)) 410 | # print('----') 411 | 412 | if curr_loss < best_split_of_feat['loss_score']: 413 | best_split_of_feat['loss_score'] = curr_loss 414 | best_split_of_feat['left_split'] = left_split 415 | best_split_of_feat['feature_type'] = 'categ_graphical' 416 | return(best_split_of_feat) 417 | 418 | def _score_split(mask_left, g_train_node, h_train_node, g_sum, h_sum, gamma, reg_lambda): 419 | # cdef double loss_score, g_left, g_right, h_left, h_right, vec_len 420 | 421 | vec_len = len(g_train_node) 422 | g_left = np.sum(g_train_node[mask_left]) 423 | g_right = g_sum - g_left 424 | h_left = np.sum(h_train_node[mask_left]) 425 | h_right = h_sum - h_left 426 | loss_score = -1.0 * _get_gh_score_num(g_left, g_right, h_left, h_right, gamma, reg_lambda) 427 | if loss_score>=0: 428 | loss_score = np.inf 429 | return loss_score 430 | 431 | 432 | 433 | 434 | -------------------------------------------------------------------------------- /extra/code/structure_gb.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=True 2 | 3 | """Decision Tree Gradient Boosting based on Discrete Graph structure""" 4 | import numpy as np 5 | import pandas as pd 6 | cimport numpy as cnp 7 | from libc.math cimport log as clog 8 | from structure_dt import * 9 | from graphs import * 10 | from sklearn.metrics import log_loss, mean_squared_error 11 | 12 | 13 | class StructureBoost(object): 14 | 15 | def __init__(self, num_trees, feature_configs, feature_graphs, mode='classification', loss_fn = 'entropy', min_size_split=2, min_leaf_size = 1, max_depth=3, gamma=0, 16 | reg_lambda=1, node_summary_fn = np.mean, learning_rate=.1, 
max_splits_to_search=np.Inf, msac=100): 17 | self.num_trees = num_trees 18 | self.num_trees_for_prediction = num_trees 19 | self.dec_tree_list = [] 20 | self.feature_configs = feature_configs 21 | self.feature_graphs = feature_graphs 22 | self.min_size_split=min_size_split 23 | self.min_leaf_size=min_leaf_size 24 | self.max_depth=max_depth 25 | self.gamma=gamma 26 | self.reg_lambda=reg_lambda 27 | self.node_summary_fn=node_summary_fn 28 | self.learning_rate = learning_rate 29 | self.loss_fn = loss_fn 30 | self.max_splits_to_search = max_splits_to_search 31 | self.mode = mode 32 | if loss_fn == 'entropy': 33 | self.loss_fn_der_1 = _entropy_link_der_1 34 | self.loss_fn_der_2 = _entropy_link_der_2 35 | if loss_fn == 'mse': 36 | self.loss_fn_der_1 = _mse_der_1 37 | self.loss_fn_der_2 = _mse_der_2 38 | # if features=='auto': 39 | # self.features=list(self.dec_tree['feature_graphs'].keys()) 40 | 41 | def fit(self, X_train, y_train, eval_set = None, eval_freq=10, 42 | early_stop_past_steps=0, choose_best_eval=True): 43 | # cdef int i, n =self.num_trees 44 | self.eval_freq=eval_freq 45 | eval_len = np.floor(self.num_trees/self.eval_freq).astype(int) 46 | self.eval_results = np.zeros(eval_len) 47 | n =self.num_trees 48 | self.initial_pred = np.mean(y_train) 49 | stop_now=False 50 | if eval_set is not None: 51 | X_valid = eval_set[0] 52 | y_valid = eval_set[1] 53 | for i in range(n): 54 | # print('iteration number {}'.format(i)) 55 | # Get predictions of current model 56 | if i==0: 57 | curr_answer = self.initial_pred * np.ones(len(y_train)) 58 | if eval_set is not None: 59 | curr_test_answer = self.initial_pred * np.ones(len(y_valid)) 60 | if self.mode == 'classification': 61 | curr_loss= log_loss(y_valid, 1/(1+np.exp(-curr_test_answer))) 62 | print("i=0, test_set_log_loss = {}".format(curr_loss)) 63 | else: 64 | curr_loss= mean_squared_error(y_valid, curr_test_answer) 65 | print("i=0. 
test_set_mse = {}".format(curr_loss)) 66 | 67 | else: 68 | curr_answer = curr_answer + self.learning_rate * self.dec_tree_list[i-1].predict(X_train) 69 | if eval_set is not None: 70 | curr_test_answer = curr_test_answer + self.learning_rate * self.dec_tree_list[i-1].predict(X_valid) 71 | if ((i+1)%self.eval_freq==1): 72 | if self.mode == 'classification': 73 | curr_loss= log_loss(y_valid, 1/(1+np.exp(-curr_test_answer))) 74 | print("i={}, test_set_log_loss = {}".format(i,curr_loss)) 75 | else: 76 | curr_loss= mean_squared_error(y_valid, curr_test_answer) 77 | print("i={}, test_set_mse = {}".format(i,curr_loss)) 78 | 79 | curr_step=np.floor((i+1)/self.eval_freq).astype(int) -1 80 | self.eval_results[curr_step]=curr_loss 81 | if curr_step>early_stop_past_steps: 82 | compare_loss = np.min(self.eval_results[:curr_step-early_stop_past_steps+1]) 83 | if (curr_loss>compare_loss): 84 | stop_now=True 85 | print("Stopping early: curr_loss of {} exceeds compare_loss of {}".format(curr_loss, compare_loss)) 86 | if stop_now: 87 | if choose_best_eval: 88 | self.num_trees_for_prediction = (np.argmin(self.eval_results[:curr_step+1])+1)*eval_freq 89 | break 90 | 91 | # Get first and second derivatives 92 | y_g_vec = self.loss_fn_der_1(y_train, curr_answer) 93 | y_h_vec = self.loss_fn_der_2(y_train, curr_answer) 94 | 95 | 96 | # Sample the data to use for this tree 97 | 98 | num_rows = X_train.shape[0] 99 | rows_to_use = np.random.choice(range(num_rows), num_rows, replace=True) 100 | if type(X_train)==pd.DataFrame: 101 | X_train_to_use = X_train.iloc[rows_to_use] 102 | elif type(X_train)==np.ndarray: 103 | X_train_to_use = X_train[rows_to_use] 104 | else: 105 | print('unknown format for X_train') 106 | #y_original_train_to_use = y_train.sample(X_train.shape[0], random_state=rs, replace=True) 107 | if type(y_g_vec)==pd.Series: 108 | y_g_to_use = y_g_vec.iloc[rows_to_use] 109 | elif type(y_g_vec)==np.ndarray: 110 | y_g_to_use = y_g_vec[rows_to_use] 111 | else: 112 | print('unknown format for y_g_vec') 113 | 114 | if type(y_h_vec)==pd.Series: 115 | y_h_to_use = y_h_vec.iloc[rows_to_use] 116 | elif type(y_h_vec)==np.ndarray: 117 | y_h_to_use = y_h_vec[rows_to_use] 118 | else: 119 | print('unknown format for y_h_vec') 120 | 121 | #local_feature_configs = self.feature_configs.copy() 122 | self.dec_tree_list.append(StructureDecisionTree(feature_configs=self.feature_configs, 123 | feature_graphs=self.feature_graphs, 124 | min_size_split = self.min_size_split, min_leaf_size=self.min_leaf_size, 125 | gamma=self.gamma, max_depth=self.max_depth, reg_lambda=self.reg_lambda)) 126 | self.dec_tree_list[i].fit(X_train_to_use, y_g_to_use, y_h_to_use) 127 | 128 | 129 | def predict(self, X_test, num_trees_to_use=0): 130 | cdef int i 131 | if num_trees_to_use==0: 132 | num_trees_to_use=self.num_trees_for_prediction 133 | out_vec = self.initial_pred*np.ones(X_test.shape[0]) 134 | for i in range(num_trees_to_use): 135 | out_vec = out_vec + self.learning_rate * self.dec_tree_list[i].predict(X_test) 136 | if self.mode=='classification': 137 | return(1/(1+np.exp(-out_vec))) 138 | else: 139 | return(out_vec) 140 | 141 | def _entropy_der_1(y_true, y_pred, eps=1e-15): 142 | y_pred = np.maximum(y_pred, eps) 143 | y_pred = np.minimum(y_pred, 1-eps) 144 | return((-(y_true/y_pred) + (1-y_true)/(1-y_pred))) 145 | 146 | def _entropy_der_2(y_true, y_pred, eps=1e-15): 147 | y_pred = np.maximum(y_pred, eps) 148 | y_pred = np.minimum(y_pred, 1-eps) 149 | out_vec = (y_true)/(y_pred**2) + ((1-y_true)/((1-y_pred)**2)) 150 | return(out_vec) 151 | 
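# Illustrative usage sketch, not part of the original module.  The names X_tr, y_tr,
# X_va, y_va, X_te, my_feature_configs and my_feature_graphs are placeholders:
# feature_configs and feature_graphs are dicts keyed by column name whose exact
# schema is defined in structure_dt.pyx (e.g. 'split_res' and 'msac' for the
# graph-structured categorical features).
#
#     >>> model = StructureBoost(num_trees=500, feature_configs=my_feature_configs,
#     ...                        feature_graphs=my_feature_graphs, mode='classification',
#     ...                        max_depth=3, learning_rate=.1)
#     >>> model.fit(X_tr, y_tr, eval_set=(X_va, y_va), eval_freq=10,
#     ...           early_stop_past_steps=2)
#     >>> pred_probs = model.predict(X_te)   # probabilities when mode='classification'
#
# When an eval_set is supplied, the validation loss is printed every eval_freq trees;
# fitting stops early once the current loss exceeds the best loss recorded at least
# early_stop_past_steps evaluations earlier, and with choose_best_eval=True predict()
# defaults to the number of trees that achieved the best validation loss.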
152 | def _mse_der_1(y_true, y_pred, eps=1e-15): 153 | return(2*(y_pred-y_true)) 154 | 155 | def _mse_der_2(y_true, y_pred, eps=1e-15): 156 | return(pd.Series(2*np.ones(len(y_pred)))) 157 | 158 | def _entropy_link_der_1(y_true, z_pred, eps=1e-15): 159 | return(-y_true*(1/(1+np.exp(z_pred))) + (1-y_true) * (1/(1+np.exp(-z_pred))) ) 160 | 161 | def _entropy_link_der_2(y_true, z_pred, eps=1e-15): 162 | return(y_true*(np.exp(z_pred)/((1+np.exp(z_pred))**2)) + (1-y_true) * (np.exp(-z_pred)/((1+np.exp(-z_pred))**2)) ) 163 | 164 | -------------------------------------------------------------------------------- /ml_insights/CVModel.py: -------------------------------------------------------------------------------- 1 | """Cross-validated training and prediction.""" 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.base import BaseEstimator, ClassifierMixin, clone 5 | 6 | class CVModel(BaseEstimator, ClassifierMixin): 7 | 8 | def __init__(self, base_estimator=None): 9 | self.base_estimator = base_estimator 10 | 11 | def fit(self, X_train, y_train, fold_num, train_overall=True, **kwargs): 12 | """Fits a cross-validated model - a model for each left-out fold plus one overall model. 13 | 14 | X_train: the training predictors 15 | y_train: the training outcome 16 | fold_num: the indicator of which fold each row belongs to""" 17 | self.model_dict = {} 18 | self.fold_set = np.unique(fold_num) 19 | self.num_unique_y_values = len(np.unique(y_train)) 20 | self.num_features = X_train.shape[1] 21 | 22 | ## If a DataFrame is given (rather than just an array) then make a note of the column names. 23 | ## This way we can match up column names when we use predict_proba. 24 | if type(X_train) == pd.DataFrame: 25 | self.fit_columns = np.array(X_train.columns) 26 | else: 27 | self.fit_columns = None 28 | 29 | ## Make copies of the estimator for each of the fold models 30 | for fold in self.fold_set: 31 | self.model_dict[fold] = clone(self.base_estimator) 32 | 33 | ## Train the separate models, each one leaving out a particular fold in training 34 | for fold in self.fold_set: 35 | print("Leave out fold {} and train on the rest".format(fold)) 36 | X_tr = X_train[fold_num != fold] 37 | y_tr = y_train[fold_num != fold] 38 | self.model_dict[fold].fit(X_tr, y_tr, **kwargs) 39 | 40 | ## Train the overall model on all of the data 41 | if train_overall: 42 | print("Train the overall model".format(fold)) 43 | self.model_dict['overall_model'] = clone(self.base_estimator) 44 | self.model_dict['overall_model'].fit(X_train, y_train, **kwargs) 45 | return self 46 | 47 | def predict_proba(self, X_test, fold_num=None, **kwargs): 48 | """Predict probabilities in cross-validated fashion. 49 | 50 | X_test: the data to predict on 51 | fold_num: the indicator of which fold a row belongs to / which model variant to use. 
52 | If fold_num is not specified, it will default to use the overall_model 53 | """ 54 | ## If we have column names and X_test is a DataFrame, then subset X_test to those columns 55 | ## in the correct order, and error if those columns are not present 56 | if self.fit_columns is not None: 57 | if type(X_test) == pd.DataFrame: 58 | X_test = X_test.loc[:, self.fit_columns] 59 | 60 | if fold_num is None: 61 | #print("no folds specified, using overall_model") 62 | if 'overall_model' not in self.model_dict.keys(): 63 | #print("Error: overall_model not trained and fold_num not specified") 64 | return None 65 | else: 66 | results = self.model_dict['overall_model'].predict_proba(X_test, **kwargs) 67 | return results 68 | else: 69 | results = np.zeros((X_test.shape[0], self.num_unique_y_values)) 70 | fold_set = np.unique(fold_num) 71 | for fold in fold_set: 72 | X_te = X_test[fold_num == fold] 73 | fold_results = self.model_dict[fold].predict_proba(X_te, **kwargs) 74 | results[fold_num==fold] = fold_results 75 | return results 76 | 77 | def predict(self, X_test, fold_num=None, **kwargs): 78 | """Predict final values in cross-validated fashion. 79 | 80 | X_test: the data to predict on 81 | fold_num: the indicator of which fold a row belongs to / which model variant to use. 82 | If fold_num is not specified, it will default to use the overall_model 83 | 84 | """ 85 | 86 | ## If we have column names and X_test is a DataFrame, then subset X_test to those columns 87 | ## in the correct order, and error if those columns are not present 88 | if self.fit_columns is not None: 89 | if type(X_test) == pd.DataFrame: 90 | X_test = X_test.loc[:, self.fit_columns] 91 | 92 | if fold_num is None: 93 | #print("no folds specified, using overall_model") 94 | if 'overall_model' not in self.model_dict.keys(): 95 | print("Error: overall_model not trained and fold_num not specified") 96 | return None 97 | else: 98 | results = self.model_dict['overall_model'] 99 | return results 100 | else: 101 | results = np.zeros(X_test.shape[0]) 102 | fold_set = np.unique(fold_num) 103 | for fold in fold_set: 104 | X_te = X_test[fold_num == fold] 105 | fold_results = self.model_dict[fold].predict(X_te, **kwargs) 106 | results[fold_num==fold] = fold_results 107 | return results 108 | 109 | def grid_search(self, X, y, fold_ind, param_grid, score_fn, verbose=True): 110 | param_arg_list = _get_param_settings_from_grid(param_grid) 111 | num_settings = len(param_arg_list) 112 | print("Size of grid to search = {} different settings".format(num_settings)) 113 | param_list_scores = np.zeros(num_settings) 114 | old_self = clone(self.base_estimator) 115 | for i in range(num_settings): 116 | print("Fitting setting {} of {}".format(i+1,num_settings)) 117 | curr_param_dict = param_arg_list[i] 118 | if verbose: 119 | print(curr_param_dict) 120 | self.base_estimator.set_params(**curr_param_dict) 121 | self.fit(X, y, fold_ind, train_overall=False) 122 | curr_preds = self.predict_proba(X, fold_ind) 123 | if type(score_fn) == list: 124 | for j, fn in enumerate(score_fn): 125 | curr_score= fn(y, curr_preds) 126 | param_arg_list[i]['score_'+str(j)] = curr_score 127 | if verbose: 128 | print(curr_param_dict,'score function '+str(j)+':',curr_score) 129 | else: 130 | curr_score= score_fn(y, curr_preds) 131 | param_arg_list[i]['score'] = curr_score 132 | if verbose: 133 | print(curr_param_dict,'score function '+':',curr_score) 134 | param_list_scores[i]=curr_score 135 | self.base_estimator = old_self 136 | return param_arg_list 137 | 138 | 139 | def 
_get_param_settings_from_grid(param_grid): 140 | num_settings = np.prod([len(i) for i in param_grid.values()]) 141 | pg_tuple = tuple(param_grid.items()) 142 | param_names = [k[0] for k in pg_tuple] 143 | param_lists = [k[1] for k in pg_tuple] 144 | param_list_lengths = [len(k) for k in param_lists] 145 | param_dict_list = [] 146 | for i in range(num_settings): 147 | indices = _int_to_indices(i,param_list_lengths) 148 | curr_param_dict = {} 149 | for k in range(len(param_names)): 150 | curr_param_dict[param_names[k]]=param_lists[k][indices[k]] 151 | param_dict_list.append(curr_param_dict) 152 | return param_dict_list 153 | 154 | def _int_to_indices(j,lengths): 155 | out_list = [] 156 | for i in range(len(lengths)): 157 | curr_ind = j % lengths[i] 158 | out_list.append(curr_ind) 159 | j = j//lengths[i] 160 | return(out_list) 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /ml_insights/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Package Docuemntation 3 | """ 4 | # -*- coding: utf-8 -*- 5 | 6 | from .insights import ModelXRay, explain_prediction_difference, explain_prediction_difference_xgboost 7 | from splinecalib import SplineCalib 8 | from .modeling_utils import get_stratified_foldnums,cv_predictions 9 | from .modeling_utils import plot_pr_curve,plot_pr_curves,histogram_pair 10 | from .modeling_utils import ice_plot,get_range_dict, plot_reliability_diagram 11 | from .shap_insights import consolidate_reason_scores, get_reason_codes, cv_column_shap, predict_reason_strings, predict_reasons_cv, get_reason_score_matrix 12 | from .CVModel import CVModel 13 | 14 | __version__ = '1.1.0' 15 | -------------------------------------------------------------------------------- /ml_insights/calibration.py: -------------------------------------------------------------------------------- 1 | """Calibration of predicted probabilities.""" 2 | import numpy as np 3 | import sklearn 4 | import warnings 5 | from sklearn.base import BaseEstimator, ClassifierMixin, clone 6 | 7 | try: 8 | from sklearn.model_selection import StratifiedKFold 9 | except: 10 | from sklearn.cross_validation import StratifiedKFold 11 | 12 | from .calibration_utils import prob_calibration_function, compact_logit 13 | 14 | 15 | class SplineCalibratedClassifierCV(BaseEstimator, ClassifierMixin): 16 | """Probability calibration using cubic splines. 17 | 18 | With this class, the base_estimator is fit on each of the cross-validation 19 | training set folds in order to generate scores on the (cross-validated) 20 | test set folds. The test set scores are accumulated into a final vector 21 | (the size of the full set) which is used to calibrate the answers. 22 | The model is then fit on the full data set. The predict, and predict_proba 23 | methods are then updated to use the combination of the predictions from the 24 | full model and the calibration function computed as above. 25 | 26 | Parameters 27 | ---------- 28 | base_estimator : instance BaseEstimator 29 | The classifier whose output decision function needs to be calibrated 30 | to offer more accurate predict_proba outputs. If cv='prefit', the 31 | classifier must have been fit already on data. 32 | 33 | method : 'logistic' or 'ridge' 34 | The default is 'logistic', which is best if you plan to use log-loss as your 35 | performance metric. This method is relatively robust and will typically do 36 | well on brier score as well. 
The 'ridge' method calibrates using an L2 loss, 37 | and therefore should do better for brier score, but may do considerably worse 38 | on log-loss. 39 | 40 | cv : integer, cross-validation generator, iterable or "prefit", optional 41 | Determines the cross-validation splitting strategy. 42 | Possible inputs for cv are: 43 | 44 | - None, to use the default 5-fold cross-validation, 45 | - integer, to specify the number of folds. 46 | - 'prefit', if you wish to use the data only for calibration 47 | 48 | For integer/None inputs, if ``y`` is binary or multiclass, 49 | :class:`sklearn.model_selection.StratifiedKFold` is used. If ``y`` is 50 | neither binary nor multiclass, :class:`sklearn.model_selection.KFold` 51 | is used. 52 | 53 | Refer :ref:`User Guide ` for the various 54 | cross-validation strategies that can be used here. 55 | 56 | If "prefit" is passed, it is assumed that base_estimator has been 57 | fitted already and all data is used for calibration. 58 | 59 | Attributes 60 | ---------- 61 | uncalibrated_classifier: this gives the uncalibrated version of the classifier, fit on the entire data set 62 | 63 | calib_func: this is the calibration function that has been learned from the cross-validation. Applying this function 64 | to the results of the uncalibrated classifier (via model.predict_proba(X_test)[:,1]) gives the fully calibrated classifier 65 | 66 | References 67 | ---------- 68 | """ 69 | def __init__(self, base_estimator=None, method='logistic', cv=5, transform_type='none', cl_eps = .000001, **calib_kwargs): 70 | warn_msg = ('\nThis class is deprecated and will eventually be removed.' + 71 | '\nPlease use the SplineCalib class for calibration.') 72 | warnings.warn(warn_msg, FutureWarning) 73 | 74 | self.base_estimator = base_estimator 75 | self.uncalibrated_classifier = None 76 | self.calib_func = None 77 | self.method = method 78 | self.cv = cv 79 | self.cl_eps = cl_eps 80 | self.calib_kwargs = calib_kwargs 81 | self.fit_on_multiclass = False 82 | self.transform_type = transform_type 83 | self.pre_transform = lambda x: x 84 | if type(self.transform_type) == str: 85 | if self.transform_type == 'cl': 86 | self.pre_transform = lambda x: compact_logit(x, eps = self.cl_eps) 87 | if callable(self.transform_type): 88 | self.pre_transform = self.transform_type 89 | 90 | def fit(self, X, y, verbose=False): 91 | """Fit the calibrated model 92 | 93 | Parameters 94 | ---------- 95 | X : array-like, shape (n_samples, n_features) 96 | Training data. 97 | 98 | y : array-like, shape (n_samples,) 99 | Target values. 100 | 101 | Returns 102 | ------- 103 | self : object 104 | Returns an instance of self. 
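        Examples
        --------
        A minimal usage sketch (illustrative only; the synthetic dataset and
        the RandomForestClassifier base estimator are stand-ins, not part of
        this module):

        >>> from sklearn.datasets import make_classification
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> X, y = make_classification(n_samples=2000, random_state=0)
        >>> calib_clf = SplineCalibratedClassifierCV(RandomForestClassifier(), cv=5)
        >>> calib_clf = calib_clf.fit(X, y)
        >>> calib_probs = calib_clf.predict_proba(X)[:, 1]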
105 | """ 106 | 107 | 108 | if len(np.unique(y)) > 2: 109 | self.fit_on_multiclass = True 110 | return self._fit_multiclass(X, y, verbose=verbose) 111 | 112 | self.fit_on_multiclass=False 113 | if ((type(self.cv)==str) and (self.cv=='prefit')): 114 | self.uncalibrated_classifier = self.base_estimator 115 | y_pred = self.uncalibrated_classifier.predict_proba(X)[:,1] 116 | 117 | else: 118 | y_pred = np.zeros(len(y)) 119 | 120 | if sklearn.__version__ < '0.18': 121 | if type(self.cv)==int: 122 | skf = StratifiedKFold(y, n_folds=self.cv,shuffle=True) 123 | else: 124 | skf = self.cv 125 | else: 126 | if type(self.cv)==int: 127 | skf = StratifiedKFold(n_splits=self.cv, shuffle=True).split(X, y) 128 | else: 129 | skf = self.cv.split(X,y) 130 | for idx, (train_idx, test_idx) in enumerate(skf): 131 | if verbose: 132 | print("training fold {} of {}".format(idx+1, self.cv)) 133 | X_train = np.array(X)[train_idx,:] 134 | X_test = np.array(X)[test_idx,:] 135 | y_train = np.array(y)[train_idx] 136 | # We could also copy the model first and then fit it 137 | this_estimator = clone(self.base_estimator) 138 | this_estimator.fit(X_train,y_train) 139 | y_pred[test_idx] = this_estimator.predict_proba(X_test)[:,1] 140 | 141 | if verbose: 142 | print("Training Full Model") 143 | self.uncalibrated_classifier = clone(self.base_estimator) 144 | self.uncalibrated_classifier.fit(X, y) 145 | 146 | # calibrating function 147 | if verbose: 148 | print("Determining Calibration Function") 149 | if self.method=='logistic': 150 | self.calib_func = prob_calibration_function(y, self.pre_transform(y_pred), verbose=verbose, **self.calib_kwargs) 151 | if self.method=='ridge': 152 | self.calib_func = prob_calibration_function(y, self.pre_transform(y_pred), method='ridge', verbose=verbose, **self.calib_kwargs) 153 | # training full model 154 | 155 | return self 156 | 157 | def _fit_multiclass(self, X, y, verbose=False): 158 | """Fit the calibrated model in multiclass setting 159 | 160 | Parameters 161 | ---------- 162 | X : array-like, shape (n_samples, n_features) 163 | Training data. 164 | 165 | y : array-like, shape (n_samples,) 166 | Target values. 167 | 168 | Returns 169 | ------- 170 | self : object 171 | Returns an instance of self. 
172 | """ 173 | class_list = np.unique(y) 174 | num_classes = len(class_list) 175 | y_mod = np.zeros(len(y)) 176 | for i in range(num_classes): 177 | y_mod[y==class_list[i]]=i 178 | 179 | y_mod = y_mod.astype(int) 180 | if ((type(self.cv)==str) and (self.cv=='prefit')): 181 | self.uncalibrated_classifier = self.base_estimator 182 | y_pred = self.uncalibrated_classifier.predict_proba(X) 183 | 184 | else: 185 | y_pred = np.zeros((len(y_mod),num_classes)) 186 | if sklearn.__version__ < '0.18': 187 | skf = StratifiedKFold(y_mod, n_folds=self.cv,shuffle=True) 188 | else: 189 | skf = StratifiedKFold(n_splits=self.cv, shuffle=True).split(X, y) 190 | for idx, (train_idx, test_idx) in enumerate(skf): 191 | if verbose: 192 | print("training fold {} of {}".format(idx+1, self.cv)) 193 | X_train = np.array(X)[train_idx,:] 194 | X_test = np.array(X)[test_idx,:] 195 | y_train = np.array(y_mod)[train_idx] 196 | # We could also copy the model first and then fit it 197 | this_estimator = clone(self.base_estimator) 198 | this_estimator.fit(X_train,y_train) 199 | y_pred[test_idx,:] = this_estimator.predict_proba(X_test) 200 | 201 | if verbose: 202 | print("Training Full Model") 203 | self.uncalibrated_classifier = clone(self.base_estimator) 204 | self.uncalibrated_classifier.fit(X, y_mod) 205 | 206 | # calibrating function 207 | if verbose: 208 | print("Determining Calibration Function") 209 | if self.method=='logistic': 210 | self.calib_func, self.cf_list = prob_calibration_function_multiclass(y_mod, self.pre_transform(y_pred), verbose=verbose, **self.calib_kwargs) 211 | if self.method=='ridge': 212 | self.calib_func, self.cf_list = prob_calibration_function_multiclass(y_mod, self.pre_transform(y_pred), verbose=verbose, method='ridge', **self.calib_kwargs) 213 | # training full model 214 | 215 | return self 216 | 217 | 218 | def predict_proba(self, X): 219 | """Posterior probabilities of classification 220 | 221 | This function returns posterior probabilities of classification 222 | according to each class on an array of test vectors X. 223 | 224 | Parameters 225 | ---------- 226 | X : array-like, shape (n_samples, n_features) 227 | The samples. 228 | 229 | Returns 230 | ------- 231 | C : array, shape (n_samples, n_classes) 232 | The predicted probas. 233 | """ 234 | # check_is_fitted(self, ["classes_", "calibrated_classifier"]) 235 | if self.fit_on_multiclass: 236 | return self.calib_func(self.pre_transform(self.uncalibrated_classifier.predict_proba(X))) 237 | 238 | col_1 = self.calib_func(self.pre_transform(self.uncalibrated_classifier.predict_proba(X)[:,1])) 239 | col_0 = 1-col_1 240 | return np.vstack((col_0,col_1)).T 241 | 242 | 243 | 244 | def predict(self, X): 245 | """Predict the target of new samples. Can be different from the 246 | prediction of the uncalibrated classifier. 247 | 248 | Parameters 249 | ---------- 250 | X : array-like, shape (n_samples, n_features) 251 | The samples. 252 | 253 | Returns 254 | ------- 255 | C : array, shape (n_samples,) 256 | The predicted class. 
257 | """ 258 | # check_is_fitted(self, ["classes_", "calibrated_classifier"]) 259 | return self.uncalibrated_classifier.classes_[np.argmax(self.predict_proba(X), axis=1)] 260 | 261 | def classes_(self): 262 | return self.uncalibrated_classifier.classes_ 263 | 264 | 265 | 266 | """Calibration of predicted probabilities.""" 267 | import numpy as np 268 | import sklearn 269 | from sklearn.base import BaseEstimator, ClassifierMixin, clone 270 | 271 | try: 272 | from sklearn.model_selection import StratifiedKFold 273 | except: 274 | from sklearn.cross_validation import StratifiedKFold 275 | 276 | from .calibration_utils import prob_calibration_function_multiclass 277 | 278 | 279 | class SplineCalibratedClassifierMulticlassCV(BaseEstimator, ClassifierMixin): 280 | """Probability calibration using cubic splines. 281 | 282 | With this class, the base_estimator is fit on each of the cross-validation 283 | training set folds in order to generate scores on the (cross-validated) 284 | test set folds. The test set scores are accumulated into a final vector 285 | (the size of the full set) which is used to calibrate the answers. 286 | The model is then fit on the full data set. The predict, and predict_proba 287 | methods are then updated to use the combination of the predictions from the 288 | full model and the calibration function computed as above. 289 | 290 | Parameters 291 | ---------- 292 | base_estimator : instance BaseEstimator 293 | The classifier whose output decision function needs to be calibrated 294 | to offer more accurate predict_proba outputs. If cv='prefit', the 295 | classifier must have been fit already on data. 296 | 297 | method : 'logistic' or 'ridge' 298 | The default is 'logistic', which is best if you plan to use log-loss as your 299 | performance metric. This method is relatively robust and will typically do 300 | well on brier score as well. The 'ridge' method calibrates using an L2 loss, 301 | and therefore should do better for brier score, but may do considerably worse 302 | on log-loss. 303 | 304 | cv : integer, cross-validation generator, iterable or "prefit", optional 305 | Determines the cross-validation splitting strategy. 306 | Possible inputs for cv are: 307 | 308 | - None, to use the default 5-fold cross-validation, 309 | - integer, to specify the number of folds. 310 | - 'prefit', if you wish to use the data only for calibration 311 | 312 | For integer/None inputs, if ``y`` is binary or multiclass, 313 | :class:`sklearn.model_selection.StratifiedKFold` is used. If ``y`` is 314 | neither binary nor multiclass, :class:`sklearn.model_selection.KFold` 315 | is used. 316 | 317 | Refer :ref:`User Guide ` for the various 318 | cross-validation strategies that can be used here. 319 | 320 | If "prefit" is passed, it is assumed that base_estimator has been 321 | fitted already and all data is used for calibration. 322 | 323 | Attributes 324 | ---------- 325 | uncalibrated_classifier: this gives the uncalibrated version of the classifier, fit on the entire data set 326 | 327 | calib_func: this is the calibration function that has been learned from the cross-validation. Applying this function 328 | to the results of the uncalibrated classifier (via model.predict_proba(X_test)[:,1]) gives the fully calibrated classifier 329 | 330 | References 331 | ---------- 332 | """ 333 | def __init__(self, base_estimator=None, method='logistic', cv=5, **calib_kwargs): 334 | warn_msg = ('\nThis class is deprecated and will eventually be removed.' 
+ 335 | '\nPlease use the SplineCalib class for calibration.') 336 | warnings.warn(warn_msg, FutureWarning) 337 | 338 | self.base_estimator = base_estimator 339 | self.uncalibrated_classifier = None 340 | self.calib_func = None 341 | self.method = method 342 | self.cv = cv 343 | self.calib_kwargs = calib_kwargs 344 | 345 | def fit(self, X, y, verbose=False): 346 | """Fit the calibrated model 347 | 348 | Parameters 349 | ---------- 350 | X : array-like, shape (n_samples, n_features) 351 | Training data. 352 | 353 | y : array-like, shape (n_samples,) 354 | Target values. 355 | 356 | Returns 357 | ------- 358 | self : object 359 | Returns an instance of self. 360 | """ 361 | class_list = np.unique(y) 362 | num_classes = len(class_list) 363 | y_mod = np.zeros(len(y)) 364 | 365 | for i in range(num_classes): 366 | y_mod[np.where(y==class_list[i])]=i 367 | 368 | y_mod = y_mod.astype(int) 369 | if ((type(self.cv)==str) and (self.cv=='prefit')): 370 | self.uncalibrated_classifier = self.base_estimator 371 | y_pred = self.uncalibrated_classifier.predict_proba(X)[:,1] 372 | 373 | else: 374 | y_pred = np.zeros((len(y_mod),num_classes)) 375 | if sklearn.__version__ < '0.18': 376 | skf = StratifiedKFold(y_mod, n_folds=self.cv,shuffle=True) 377 | else: 378 | skf = StratifiedKFold(n_splits=self.cv, shuffle=True).split(X, y) 379 | for idx, (train_idx, test_idx) in enumerate(skf): 380 | if verbose: 381 | print("training fold {} of {}".format(idx+1, self.cv)) 382 | X_train = np.array(X)[train_idx,:] 383 | X_test = np.array(X)[test_idx,:] 384 | y_train = np.array(y_mod)[train_idx] 385 | # We could also copy the model first and then fit it 386 | this_estimator = clone(self.base_estimator) 387 | this_estimator.fit(X_train,y_train) 388 | y_pred[test_idx,:] = this_estimator.predict_proba(X_test) 389 | 390 | if verbose: 391 | print("Training Full Model") 392 | self.uncalibrated_classifier = clone(self.base_estimator) 393 | self.uncalibrated_classifier.fit(X, y_mod) 394 | 395 | # calibrating function 396 | if verbose: 397 | print("Determining Calibration Function") 398 | if self.method=='logistic': 399 | self.calib_func = prob_calibration_function_multiclass(y_mod, y_pred, verbose=verbose, **self.calib_kwargs) 400 | if self.method=='ridge': 401 | self.calib_func = prob_calibration_function_multiclass(y_mod, y_pred, verbose=verbose, method='ridge', **self.calib_kwargs) 402 | # training full model 403 | 404 | return self 405 | 406 | def predict_proba(self, X): 407 | """Posterior probabilities of classification 408 | 409 | This function returns posterior probabilities of classification 410 | according to each class on an array of test vectors X. 411 | 412 | Parameters 413 | ---------- 414 | X : array-like, shape (n_samples, n_features) 415 | The samples. 416 | 417 | Returns 418 | ------- 419 | C : array, shape (n_samples, n_classes) 420 | The predicted probas. 421 | """ 422 | # check_is_fitted(self, ["classes_", "calibrated_classifier"]) 423 | return self.calib_func(self.uncalibrated_classifier.predict_proba(X)) 424 | 425 | 426 | def predict(self, X): 427 | """Predict the target of new samples. Can be different from the 428 | prediction of the uncalibrated classifier. 429 | 430 | Parameters 431 | ---------- 432 | X : array-like, shape (n_samples, n_features) 433 | The samples. 434 | 435 | Returns 436 | ------- 437 | C : array, shape (n_samples,) 438 | The predicted class. 
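        Examples
        --------
        Illustrative sketch on a three-class problem (the iris data and the
        LogisticRegression base estimator are stand-ins, not part of this
        module):

        >>> from sklearn.datasets import load_iris
        >>> from sklearn.linear_model import LogisticRegression
        >>> X, y = load_iris(return_X_y=True)
        >>> calib_clf = SplineCalibratedClassifierMulticlassCV(
        ...     LogisticRegression(max_iter=1000), cv=5)
        >>> calib_clf = calib_clf.fit(X, y)
        >>> probs = calib_clf.predict_proba(X)   # one column per class
        >>> labels = calib_clf.predict(X)        # labels from the original classes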
439 | """ 440 | # check_is_fitted(self, ["classes_", "calibrated_classifier"]) 441 | return self.uncalibrated_classifier.classes_[np.argmax(self.predict_proba(X), axis=1)] 442 | 443 | def classes_(self): 444 | return self.uncalibrated_classifier.classes_ 445 | -------------------------------------------------------------------------------- /ml_insights/cross_validation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.model_selection import StratifiedKFold 4 | 5 | def cv_predict_proba(X, y,estimator, cv): 6 | 7 | ## Convert from Pandas to Numpy if necessary 8 | if (type(X)==pd.DataFrame) or (type(X)==pd.Series): 9 | X = X.values 10 | if (type(y)==pd.DataFrame) or (type(y)==pd.Series): 11 | y = y.values 12 | 13 | num_classes = len(np.unique(y)) 14 | out_vec=np.zeros((len(y),num_classes)) 15 | 16 | #Main loop to do cross-validated predict proba and construct output matrix 17 | for tr, te in cv.split(X,y): 18 | estimator.fit(X[tr],y[tr]) 19 | out_vals = estimator.predict_proba(X[te]) 20 | out_vec[te,:] = out_vals 21 | return out_vec 22 | 23 | 24 | def cv_score(X, y, estimator, cv, score_fn): 25 | return(score_fn(y,cv_predict_proba(X,y,estimator,cv))) 26 | 27 | def _get_param_settings_from_grid(param_grid): 28 | num_settings = np.prod([len(i) for i in param_grid.values()]) 29 | pg_tuple = tuple(param_grid.items()) 30 | param_names = [k[0] for k in pg_tuple] 31 | param_lists = [k[1] for k in pg_tuple] 32 | param_list_lengths = [len(k) for k in param_lists] 33 | param_dict_list = [] 34 | for i in range(num_settings): 35 | indices = _int_to_indices(i,param_list_lengths) 36 | curr_param_dict = {} 37 | for k in range(len(param_names)): 38 | curr_param_dict[param_names[k]]=param_lists[k][indices[k]] 39 | param_dict_list.append(curr_param_dict) 40 | return param_dict_list 41 | 42 | def _int_to_indices(j,lengths): 43 | out_list = [] 44 | for i in range(len(lengths)): 45 | curr_ind = j % lengths[i] 46 | out_list.append(curr_ind) 47 | j = j//lengths[i] 48 | return(out_list) 49 | 50 | def grid_search(X,y, model, param_grid, score_fn, verbose=True): 51 | param_arg_list = _get_param_settings_from_grid(param_grid) 52 | num_settings = len(param_arg_list) 53 | param_list_scores = np.zeros(num_settings) 54 | skf = StratifiedKFold(5, shuffle=True, random_state=42) 55 | for i in range(num_settings): 56 | curr_param_dict = param_arg_list[i] 57 | if verbose: 58 | print(curr_param_dict) 59 | model.set_params(**curr_param_dict) 60 | curr_score=cv_score(X,y,model,skf,score_fn) 61 | param_list_scores[i]=curr_score 62 | if verbose: 63 | print(curr_param_dict,curr_score) 64 | 65 | return(list(zip(param_arg_list,param_list_scores))) 66 | 67 | -------------------------------------------------------------------------------- /ml_insights/data/ortho.csv: -------------------------------------------------------------------------------- 1 | contrast1,contrast2,answer 2 | 0.59999999999999998,0.29999999999999999,1 3 | 0.65000000000000002,0.29999999999999999,1 4 | 0.69999999999999996,0.29999999999999999,1 5 | 0.5,0.29999999999999999,1 6 | 0.10000000000000001,0.29999999999999999,2 7 | 0.01,0.29999999999999999,2 8 | 0.20000000000000001,0.29999999999999999,2 9 | 0.5,0.29999999999999999,1 10 | 0.10000000000000001,0.29999999999999999,2 11 | 0.90000000000000002,0.29999999999999999,1 12 | 0.90000000000000002,0.29999999999999999,1 13 | 0.55000000000000004,0.29999999999999999,1 14 | 0.40000000000000002,0.29999999999999999,1 15 | 
0.20000000000000001,0.29999999999999999,2 16 | 0.20000000000000001,0.29999999999999999,2 17 | 0.29999999999999999,0.29999999999999999,2 18 | 0.65000000000000002,0.29999999999999999,1 19 | 0.20000000000000001,0.29999999999999999,2 20 | 0.5,0.29999999999999999,1 21 | 0.5,0.29999999999999999,1 22 | 0.01,0.29999999999999999,1 23 | 0.90000000000000002,0.29999999999999999,1 24 | 0.65000000000000002,0.29999999999999999,1 25 | 0.90000000000000002,0.29999999999999999,1 26 | 0.5,0.29999999999999999,1 27 | 0.01,0.29999999999999999,2 28 | 0.59999999999999998,0.29999999999999999,1 29 | 0.65000000000000002,0.29999999999999999,1 30 | 0.5,0.29999999999999999,2 31 | 0.01,0.29999999999999999,2 32 | 0.69999999999999996,0.29999999999999999,1 33 | 0.69999999999999996,0.29999999999999999,1 34 | 0.29999999999999999,0.29999999999999999,2 35 | 0.20000000000000001,0.29999999999999999,2 36 | 0.90000000000000002,0.29999999999999999,1 37 | 0.5,0.29999999999999999,2 38 | 0.10000000000000001,0.29999999999999999,1 39 | 0.01,0.29999999999999999,2 40 | 0.29999999999999999,0.29999999999999999,2 41 | 0.5,0.29999999999999999,2 42 | 0.69999999999999996,0.29999999999999999,1 43 | 0.69999999999999996,0.29999999999999999,1 44 | 0.90000000000000002,0.29999999999999999,1 45 | 0.90000000000000002,0.29999999999999999,1 46 | 0.65000000000000002,0.29999999999999999,1 47 | 0.40000000000000002,0.29999999999999999,1 48 | 0.65000000000000002,0.29999999999999999,1 49 | 0.01,0.29999999999999999,2 50 | 0.29999999999999999,0.29999999999999999,2 51 | 0.59999999999999998,0.29999999999999999,2 52 | 0.01,0.29999999999999999,2 53 | 0.20000000000000001,0.29999999999999999,2 54 | 0.10000000000000001,0.29999999999999999,2 55 | 0.5,0.29999999999999999,1 56 | 0.69999999999999996,0.29999999999999999,1 57 | 0.01,0.29999999999999999,2 58 | 0.10000000000000001,0.29999999999999999,2 59 | 0.55000000000000004,0.29999999999999999,1 60 | 0.01,0.29999999999999999,2 61 | 0.29999999999999999,0.29999999999999999,2 62 | 0.40000000000000002,0.29999999999999999,2 63 | 0.20000000000000001,0.29999999999999999,1 64 | 0.29999999999999999,0.29999999999999999,2 65 | 0.55000000000000004,0.29999999999999999,2 66 | 0.10000000000000001,0.29999999999999999,2 67 | 0.40000000000000002,0.29999999999999999,1 68 | 0.10000000000000001,0.29999999999999999,2 69 | 0.10000000000000001,0.29999999999999999,2 70 | 0.69999999999999996,0.29999999999999999,1 71 | 0.59999999999999998,0.29999999999999999,1 72 | 0.59999999999999998,0.29999999999999999,1 73 | 0.5,0.29999999999999999,1 74 | 0.40000000000000002,0.29999999999999999,2 75 | 0.55000000000000004,0.29999999999999999,1 76 | 0.29999999999999999,0.29999999999999999,2 77 | 0.55000000000000004,0.29999999999999999,2 78 | 0.10000000000000001,0.29999999999999999,2 79 | 0.65000000000000002,0.29999999999999999,1 80 | 0.5,0.29999999999999999,1 81 | 0.5,0.29999999999999999,2 82 | 0.65000000000000002,0.29999999999999999,1 83 | 0.55000000000000004,0.29999999999999999,2 84 | 0.29999999999999999,0.29999999999999999,2 85 | 0.55000000000000004,0.29999999999999999,1 86 | 0.55000000000000004,0.29999999999999999,1 87 | 0.65000000000000002,0.29999999999999999,1 88 | 0.5,0.29999999999999999,2 89 | 0.10000000000000001,0.29999999999999999,2 90 | 0.10000000000000001,0.29999999999999999,2 91 | 0.90000000000000002,0.29999999999999999,1 92 | 0.55000000000000004,0.29999999999999999,2 93 | 0.20000000000000001,0.29999999999999999,2 94 | 0.20000000000000001,0.29999999999999999,2 95 | 0.20000000000000001,0.29999999999999999,1 96 | 0.40000000000000002,0.29999999999999999,2 
97 | 0.65000000000000002,0.29999999999999999,1 98 | 0.65000000000000002,0.29999999999999999,1 99 | 0.5,0.29999999999999999,1 100 | 0.59999999999999998,0.29999999999999999,2 101 | 0.65000000000000002,0.29999999999999999,1 102 | 0.5,0.29999999999999999,2 103 | 0.59999999999999998,0.29999999999999999,2 104 | 0.59999999999999998,0.29999999999999999,1 105 | 0.59999999999999998,0.29999999999999999,1 106 | 0.40000000000000002,0.29999999999999999,2 107 | 0.90000000000000002,0.29999999999999999,1 108 | 0.90000000000000002,0.29999999999999999,1 109 | 0.29999999999999999,0.29999999999999999,2 110 | 0.40000000000000002,0.29999999999999999,2 111 | 0.40000000000000002,0.29999999999999999,2 112 | 0.59999999999999998,0.29999999999999999,1 113 | 0.69999999999999996,0.29999999999999999,1 114 | 0.10000000000000001,0.29999999999999999,2 115 | 0.59999999999999998,0.29999999999999999,1 116 | 0.5,0.29999999999999999,1 117 | 0.55000000000000004,0.29999999999999999,1 118 | 0.55000000000000004,0.29999999999999999,1 119 | 0.5,0.29999999999999999,1 120 | 0.55000000000000004,0.29999999999999999,1 121 | 0.40000000000000002,0.29999999999999999,1 122 | 0.59999999999999998,0.29999999999999999,1 123 | 0.10000000000000001,0.29999999999999999,2 124 | 0.59999999999999998,0.29999999999999999,1 125 | 0.59999999999999998,0.29999999999999999,1 126 | 0.20000000000000001,0.29999999999999999,2 127 | 0.01,0.29999999999999999,2 128 | 0.5,0.29999999999999999,1 129 | 0.90000000000000002,0.29999999999999999,1 130 | 0.65000000000000002,0.29999999999999999,2 131 | 0.65000000000000002,0.29999999999999999,1 132 | 0.29999999999999999,0.29999999999999999,2 133 | 0.55000000000000004,0.29999999999999999,1 134 | 0.5,0.29999999999999999,2 135 | 0.90000000000000002,0.29999999999999999,1 136 | 0.90000000000000002,0.29999999999999999,1 137 | 0.65000000000000002,0.29999999999999999,1 138 | 0.20000000000000001,0.29999999999999999,2 139 | 0.40000000000000002,0.29999999999999999,2 140 | 0.65000000000000002,0.29999999999999999,1 141 | 0.69999999999999996,0.29999999999999999,1 142 | 0.90000000000000002,0.29999999999999999,1 143 | 0.69999999999999996,0.29999999999999999,1 144 | 0.40000000000000002,0.29999999999999999,1 145 | 0.55000000000000004,0.29999999999999999,1 146 | 0.40000000000000002,0.29999999999999999,2 147 | 0.29999999999999999,0.29999999999999999,2 148 | 0.69999999999999996,0.29999999999999999,1 149 | 0.29999999999999999,0.29999999999999999,2 150 | 0.59999999999999998,0.29999999999999999,1 151 | 0.40000000000000002,0.29999999999999999,2 152 | 0.55000000000000004,0.29999999999999999,2 153 | 0.20000000000000001,0.29999999999999999,2 154 | 0.01,0.29999999999999999,2 155 | 0.10000000000000001,0.29999999999999999,2 156 | 0.90000000000000002,0.29999999999999999,1 157 | 0.29999999999999999,0.29999999999999999,2 158 | 0.10000000000000001,0.29999999999999999,2 159 | 0.59999999999999998,0.29999999999999999,1 160 | 0.01,0.29999999999999999,2 161 | 0.20000000000000001,0.29999999999999999,2 162 | 0.65000000000000002,0.29999999999999999,1 163 | 0.59999999999999998,0.29999999999999999,1 164 | 0.69999999999999996,0.29999999999999999,1 165 | 0.20000000000000001,0.29999999999999999,2 166 | 0.65000000000000002,0.29999999999999999,1 167 | 0.65000000000000002,0.29999999999999999,1 168 | 0.10000000000000001,0.29999999999999999,1 169 | 0.59999999999999998,0.29999999999999999,1 170 | 0.90000000000000002,0.29999999999999999,1 171 | 0.29999999999999999,0.29999999999999999,2 172 | 0.69999999999999996,0.29999999999999999,1 173 | 0.40000000000000002,0.29999999999999999,1 
174 | 0.20000000000000001,0.29999999999999999,2 175 | 0.69999999999999996,0.29999999999999999,1 176 | 0.01,0.29999999999999999,2 177 | 0.20000000000000001,0.29999999999999999,2 178 | 0.5,0.29999999999999999,2 179 | -------------------------------------------------------------------------------- /ml_insights/data/para.csv: -------------------------------------------------------------------------------- 1 | contrast1,contrast2,answer 2 | 0.10000000000000001,0.29999999999999999,2 3 | 0.90000000000000002,0.29999999999999999,1 4 | 0.90000000000000002,0.29999999999999999,1 5 | 0.40000000000000002,0.29999999999999999,2 6 | 0.01,0.29999999999999999,2 7 | 0.40000000000000002,0.29999999999999999,2 8 | 0.20000000000000001,0.29999999999999999,2 9 | 0.40000000000000002,0.29999999999999999,2 10 | 0.20000000000000001,0.29999999999999999,2 11 | 0.55000000000000004,0.29999999999999999,2 12 | 0.65000000000000002,0.29999999999999999,2 13 | 0.5,0.29999999999999999,2 14 | 0.29999999999999999,0.29999999999999999,2 15 | 0.20000000000000001,0.29999999999999999,2 16 | 0.90000000000000002,0.29999999999999999,1 17 | 0.5,0.29999999999999999,1 18 | 0.69999999999999996,0.29999999999999999,2 19 | 0.59999999999999998,0.29999999999999999,2 20 | 0.90000000000000002,0.29999999999999999,1 21 | 0.55000000000000004,0.29999999999999999,2 22 | 0.55000000000000004,0.29999999999999999,2 23 | 0.5,0.29999999999999999,2 24 | 0.20000000000000001,0.29999999999999999,2 25 | 0.01,0.29999999999999999,2 26 | 0.29999999999999999,0.29999999999999999,2 27 | 0.29999999999999999,0.29999999999999999,2 28 | 0.20000000000000001,0.29999999999999999,2 29 | 0.69999999999999996,0.29999999999999999,1 30 | 0.69999999999999996,0.29999999999999999,1 31 | 0.5,0.29999999999999999,2 32 | 0.10000000000000001,0.29999999999999999,2 33 | 0.59999999999999998,0.29999999999999999,2 34 | 0.20000000000000001,0.29999999999999999,2 35 | 0.59999999999999998,0.29999999999999999,2 36 | 0.65000000000000002,0.29999999999999999,1 37 | 0.55000000000000004,0.29999999999999999,1 38 | 0.01,0.29999999999999999,2 39 | 0.55000000000000004,0.29999999999999999,2 40 | 0.29999999999999999,0.29999999999999999,2 41 | 0.01,0.29999999999999999,2 42 | 0.5,0.29999999999999999,2 43 | 0.20000000000000001,0.29999999999999999,2 44 | 0.69999999999999996,0.29999999999999999,1 45 | 0.5,0.29999999999999999,1 46 | 0.90000000000000002,0.29999999999999999,1 47 | 0.55000000000000004,0.29999999999999999,1 48 | 0.59999999999999998,0.29999999999999999,1 49 | 0.59999999999999998,0.29999999999999999,2 50 | 0.01,0.29999999999999999,2 51 | 0.65000000000000002,0.29999999999999999,1 52 | 0.90000000000000002,0.29999999999999999,1 53 | 0.55000000000000004,0.29999999999999999,1 54 | 0.59999999999999998,0.29999999999999999,2 55 | 0.5,0.29999999999999999,2 56 | 0.55000000000000004,0.29999999999999999,2 57 | 0.5,0.29999999999999999,2 58 | 0.29999999999999999,0.29999999999999999,2 59 | 0.01,0.29999999999999999,2 60 | 0.55000000000000004,0.29999999999999999,2 61 | 0.59999999999999998,0.29999999999999999,1 62 | 0.01,0.29999999999999999,2 63 | 0.10000000000000001,0.29999999999999999,2 64 | 0.90000000000000002,0.29999999999999999,1 65 | 0.10000000000000001,0.29999999999999999,2 66 | 0.29999999999999999,0.29999999999999999,2 67 | 0.20000000000000001,0.29999999999999999,2 68 | 0.40000000000000002,0.29999999999999999,2 69 | 0.5,0.29999999999999999,2 70 | 0.65000000000000002,0.29999999999999999,1 71 | 0.69999999999999996,0.29999999999999999,1 72 | 0.5,0.29999999999999999,2 73 | 0.90000000000000002,0.29999999999999999,1 74 | 
0.65000000000000002,0.29999999999999999,1 75 | 0.10000000000000001,0.29999999999999999,2 76 | 0.90000000000000002,0.29999999999999999,1 77 | 0.59999999999999998,0.29999999999999999,1 78 | 0.69999999999999996,0.29999999999999999,1 79 | 0.5,0.29999999999999999,1 80 | 0.20000000000000001,0.29999999999999999,2 81 | 0.10000000000000001,0.29999999999999999,2 82 | 0.55000000000000004,0.29999999999999999,1 83 | 0.10000000000000001,0.29999999999999999,2 84 | 0.59999999999999998,0.29999999999999999,1 85 | 0.90000000000000002,0.29999999999999999,1 86 | 0.5,0.29999999999999999,2 87 | 0.5,0.29999999999999999,2 88 | 0.5,0.29999999999999999,2 89 | 0.40000000000000002,0.29999999999999999,2 90 | 0.69999999999999996,0.29999999999999999,1 91 | 0.55000000000000004,0.29999999999999999,2 92 | 0.90000000000000002,0.29999999999999999,1 93 | 0.5,0.29999999999999999,1 94 | 0.01,0.29999999999999999,2 95 | 0.65000000000000002,0.29999999999999999,1 96 | 0.20000000000000001,0.29999999999999999,2 97 | 0.55000000000000004,0.29999999999999999,1 98 | 0.59999999999999998,0.29999999999999999,1 99 | 0.40000000000000002,0.29999999999999999,2 100 | 0.10000000000000001,0.29999999999999999,2 101 | 0.90000000000000002,0.29999999999999999,1 102 | 0.40000000000000002,0.29999999999999999,2 103 | 0.65000000000000002,0.29999999999999999,1 104 | 0.90000000000000002,0.29999999999999999,1 105 | 0.29999999999999999,0.29999999999999999,2 106 | 0.59999999999999998,0.29999999999999999,1 107 | 0.01,0.29999999999999999,2 108 | 0.29999999999999999,0.29999999999999999,2 109 | 0.65000000000000002,0.29999999999999999,1 110 | 0.10000000000000001,0.29999999999999999,2 111 | 0.29999999999999999,0.29999999999999999,2 112 | 0.59999999999999998,0.29999999999999999,1 113 | 0.01,0.29999999999999999,2 114 | 0.01,0.29999999999999999,2 115 | 0.40000000000000002,0.29999999999999999,2 116 | 0.90000000000000002,0.29999999999999999,1 117 | 0.29999999999999999,0.29999999999999999,2 118 | 0.20000000000000001,0.29999999999999999,2 119 | 0.40000000000000002,0.29999999999999999,2 120 | 0.20000000000000001,0.29999999999999999,1 121 | 0.5,0.29999999999999999,2 122 | 0.90000000000000002,0.29999999999999999,1 123 | 0.20000000000000001,0.29999999999999999,2 124 | 0.59999999999999998,0.29999999999999999,2 125 | 0.10000000000000001,0.29999999999999999,2 126 | 0.40000000000000002,0.29999999999999999,2 127 | 0.29999999999999999,0.29999999999999999,2 128 | 0.20000000000000001,0.29999999999999999,2 129 | 0.90000000000000002,0.29999999999999999,1 130 | 0.01,0.29999999999999999,2 131 | 0.65000000000000002,0.29999999999999999,2 132 | 0.01,0.29999999999999999,2 133 | 0.29999999999999999,0.29999999999999999,2 134 | 0.5,0.29999999999999999,1 135 | 0.69999999999999996,0.29999999999999999,2 136 | 0.10000000000000001,0.29999999999999999,2 137 | 0.01,0.29999999999999999,2 138 | 0.59999999999999998,0.29999999999999999,2 139 | 0.65000000000000002,0.29999999999999999,1 140 | 0.10000000000000001,0.29999999999999999,1 141 | 0.5,0.29999999999999999,1 142 | 0.55000000000000004,0.29999999999999999,2 143 | 0.90000000000000002,0.29999999999999999,1 144 | 0.55000000000000004,0.29999999999999999,1 145 | 0.20000000000000001,0.29999999999999999,2 146 | 0.69999999999999996,0.29999999999999999,1 147 | 0.29999999999999999,0.29999999999999999,2 148 | 0.40000000000000002,0.29999999999999999,2 149 | 0.01,0.29999999999999999,2 150 | 0.5,0.29999999999999999,2 151 | 0.59999999999999998,0.29999999999999999,1 152 | 0.01,0.29999999999999999,2 153 | 0.40000000000000002,0.29999999999999999,2 154 | 
0.10000000000000001,0.29999999999999999,2 155 | 0.5,0.29999999999999999,1 156 | 0.69999999999999996,0.29999999999999999,2 157 | 0.90000000000000002,0.29999999999999999,1 158 | 0.10000000000000001,0.29999999999999999,2 159 | 0.40000000000000002,0.29999999999999999,2 160 | 0.59999999999999998,0.29999999999999999,1 161 | 0.29999999999999999,0.29999999999999999,2 162 | 0.20000000000000001,0.29999999999999999,2 163 | 0.40000000000000002,0.29999999999999999,2 164 | 0.40000000000000002,0.29999999999999999,2 165 | 0.55000000000000004,0.29999999999999999,2 166 | 0.40000000000000002,0.29999999999999999,2 167 | 0.55000000000000004,0.29999999999999999,1 168 | -------------------------------------------------------------------------------- /ml_insights/insights.py: -------------------------------------------------------------------------------- 1 | import math 2 | import warnings 3 | import numpy as np 4 | import pandas as pd 5 | from .utils import _gca, is_classifier, is_regressor 6 | 7 | 8 | class ModelXRay(object): 9 | """This class executes a model over a broad range of modified data points to analyze aspects of its performance. 10 | 11 | For each point in the data set, and for every feature involved of the prediction of the model, a new set of data 12 | points is created where the chosen feature is varied across its (empirical) range. These modified data points are 13 | fed into the model to get a set of model predictions for each feature-data point combination. 14 | 15 | It is desirable that the "data" object passed in be relatively large in size, since the algorithm will make 16 | some heuristic choices based on the ranges of values it sees. We suggest using at least 100 data points and preferably 17 | more than 500. 18 | 19 | It returns a results object, which can then be passed to functions such as feature_effect_summary and 20 | feature_dependence_plots to gain insight on the how the various features affect the target. The results 21 | object can also be used directly by a user who wants to operate at a low-level. 22 | 23 | Parameters 24 | ---------- 25 | 26 | 27 | model : A model object from sklearn or similar styled objects. The `predict` method will be used if it is 28 | a regression model, while `predict_proba` will be used if it is a (binary) classification model. Multi-class 29 | classifiers are not supported at this time. 30 | 31 | data : A DataFrame possessing the sameucture that the model would take as an argument. These methods are designed 32 | to be used on "test" data (i.e. data that was not used in the training of the model). However, there is nothing 33 | structural to prevent it from being used on training data, and there may be some insight gained by doing so. 34 | 35 | columns : a specific subset of columns to be used. Default is None, which means to use all available columns in *data* 36 | 37 | resolution : how many different "grid points" to use for each feature. The algorithm will use only the unique values 38 | it sees in *data* if there are fewer than *resolution* unique values. Otherwise it will use *resolution* linearly spaced 39 | values ranging from the min observed value to the max observed value. 
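    Examples
    --------
    A short sketch (``model`` stands for any fitted sklearn-style regressor or
    binary classifier, ``X_test`` for a held-out DataFrame, and ``'age'`` and
    ``'bmi'`` for two of its column names; all are placeholders rather than
    objects defined in this module):

    >>> xray = ModelXRay(model, X_test, columns=['age', 'bmi'], resolution=50)
    >>> xray.feature_effect_summary()
    >>> chosen_rows = xray.feature_dependence_plots(num_pts=5)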
40 | """ 41 | 42 | def __init__(self, model, data, columns=None, resolution=100, normalize_loc=None, pred_col_name = None, pred_col_index=1): 43 | 44 | self.model = model 45 | self.data = data 46 | self.pred_col_index = pred_col_index 47 | if type(data) == pd.DataFrame: 48 | if (pred_col_name != None) and (is_classifier(self.model)): 49 | self.pred_col_index = np.where(self.model.classes_ == pred_col_name)[0][0] 50 | self.pred_col_name = data.columns[self.pred_col_index] 51 | self.data_values = data.values 52 | 53 | 54 | else: 55 | self.data_values = data 56 | 57 | self.columns = columns 58 | self.results = self._model_xray(columns, resolution, normalize_loc) 59 | 60 | 61 | def _get_data_rows(self, row_nums): 62 | if type(self.data) == pd.DataFrame: 63 | return self.data.iloc[row_nums] 64 | else: 65 | return self.data[row_nums, :] 66 | 67 | 68 | def _get_predictions(self, rows): 69 | # Catch deprecated warnings from Predict call 70 | with warnings.catch_warnings(): 71 | warnings.simplefilter("ignore") 72 | 73 | if is_classifier(self.model): 74 | y_pred = self.model.predict_proba(rows)[:,self.pred_col_index] 75 | else: 76 | #print('off') 77 | y_pred = self.model.predict(rows) 78 | return y_pred 79 | 80 | 81 | def gen_model_pred(self, row, col_idx, values): 82 | rows = [] 83 | for val in values: 84 | sim_row = row.copy() 85 | sim_row[col_idx] = val 86 | rows.append(sim_row) 87 | # If the row is a Series, make it into a DF 88 | if type(rows[0]) == pd.Series: 89 | rows = pd.DataFrame(rows) 90 | y_pred = self._get_predictions(rows) 91 | return y_pred 92 | 93 | def _model_xray(self, columns, resolution, normalize_loc): 94 | '''This function executes a model over a broad range of conditions to analyze aspects of its performance. 95 | 96 | For each point in the data set, and for every feature involved of the prediction of the model, a new set of data 97 | points is created where the chosen feature is varied across its (empirical) range. These modified data points are 98 | fed into the model to get a set of model predictions for each feature-data point combination. 99 | 100 | It is desirable that the "data" object passed in be relatively large in size, since the algorithm will make 101 | some heuristic choices based on the ranges of values it sees. We suggest using at least 100 data points and preferably 102 | more than 500. 103 | 104 | It returns a results object, which can then be passed to functions such as feature_effect_summary and 105 | feature_dependence_plots to gain insight on the how the various features affect the target. The results 106 | object can also be used directly by a user who wants to operate at a low-level. 107 | 108 | Parameters 109 | ---------- 110 | 111 | model : A model object from sklearn or similar styled objects. The `predict` method will be used if it is 112 | a regression model, while `predict_proba` will be used if it is a (binary) classification model. Multi-class 113 | classifiers are not supported at this time. 114 | 115 | data : A DataFrame possessing the same structure that the model would take as an argument. These methods are designed 116 | to be used on "test" data (i.e. data that was not used in the training of the model). However, there is nothing 117 | structural to prevent it from being used on training data, and there may be some insight gained by doing so. 118 | 119 | columns : a specific subset of columns to be used. 
Default is None, which means to use all available columns in *data* 120 | 121 | resolution : how many different "grid points" to use for each feature. The algorithm will use only the unique values 122 | it sees in *data* if there are fewer than *resolution* unique values. Otherwise it will use *resolution* linearly spaced 123 | values ranging from the min observed value to the max observed value. 124 | 125 | Returns 126 | ------- 127 | 128 | results : The "results" object is a dictionary where the keys are the feature names and the values are a 2-tuple. This 129 | object is intended primarily to be passed to other functions to interact with and display the data. However, advanced 130 | users may wish to understand and/or use the object directly. 131 | 132 | The first element in the tuple is the set of different feature values that were substituted in for each data point. The 133 | second element in the tuple is matrix where the number of rows is the number of data points and the number of columns 134 | is the number of different feature values. The (i,j)th element of the matrix is the result of the model prediction when 135 | data point i has the feature in question set to jth value. 136 | ''' 137 | ## Convert Pandas DataFrame to nparray explicitly to make life easier 138 | #print('hello!!!') 139 | 140 | 141 | ## Determine the range of values to plot for the chosen column 142 | if columns is None: 143 | if type(self.data) == pd.DataFrame: 144 | columns = self.data.columns 145 | if type(self.data)==np.ndarray: 146 | columns = range(len(self.data[0])) # Assuming a 2-D Dataset 147 | else: 148 | # Verify that columns is an iterable 149 | try: 150 | iterator = iter(columns) 151 | except TypeError: 152 | # not iterable 153 | columns = [columns] 154 | else: 155 | # iterable 156 | pass 157 | 158 | # Build Column Index 159 | column_nums = [] 160 | if type(self.data) == pd.DataFrame: 161 | for column in columns: 162 | try: 163 | column_nums.append(self.data.columns.get_loc(column)) 164 | except KeyError: 165 | ## TODO 166 | pass 167 | else: 168 | # Column Index and Column Names are the same 169 | if type(columns[0]) == int: 170 | column_nums = columns 171 | else: 172 | column_nums = range(len(columns)) 173 | 174 | # Use the Numpy array of data values to ease indexing by col. 
numbers 175 | results = {} 176 | num_pts = len(self.data_values) 177 | for column_num, column_name in zip(column_nums, columns): 178 | if (len(np.unique(self.data_values[:,column_num])) > resolution): 179 | col_values = np.linspace(np.nanmin(self.data_values[:,column_num]), 180 | np.nanmax(self.data_values[:,column_num]),resolution) 181 | else: 182 | col_values = np.sort(np.unique(self.data_values[:,column_num])) 183 | ## Define the empty data structure to output 184 | out_matrix = np.zeros([num_pts,len(col_values)]) 185 | 186 | ## Generate predictions 187 | if type(self.data) == pd.DataFrame: 188 | rows = self.data.iterrows() 189 | else: 190 | rows = enumerate(self.data) 191 | for loop_idx, (row_idx, row) in enumerate(rows): 192 | y_pred = self.gen_model_pred(row, column_num, col_values) 193 | if normalize_loc=='start': 194 | y_pred = y_pred - y_pred[0] 195 | if normalize_loc=='end': 196 | y_pred = y_pred - y_pred[-1] 197 | if (type(normalize_loc)==int and normalize_loc>=0 and normalize_loc 0: 247 | num_features = min(num_features, len(columns)) 248 | else: 249 | num_features = len(columns) 250 | plot_data = [result_data[idx] for idx in sortind][-num_features:] 251 | 252 | if ax is None: 253 | ax = _gca() 254 | fig = ax.get_figure() 255 | fig.set_figwidth(10) 256 | fig.set_figheight(max(6, int(math.ceil(num_features*0.5)))) 257 | ax.boxplot(plot_data, notch=0, sym='+', vert=0, whis=1.5) 258 | ax.set_yticklabels([columns[idx] for idx in sortind][-num_features:]); 259 | 260 | 261 | def feature_dependence_plots(self, y_scaling='none', show_base_points=True, pts_selected='sample', 262 | columns = None, num_pts=5, figsize=None): 263 | '''This function visualizes the effect of a single variable in models with complicated dependencies. 264 | Given a dataset, it will select points in that dataset, and then change the select column across 265 | different values to view the effect of the model prediction given that variable. These have been called 266 | Individual Conditional Expectation plots (or ICE-plots), see Goldstein, Kapelner, Bleich, 267 | Pitkin. Peeking Inside the Black Box: Visualizing Statistical Learning With Plots of Individual 268 | Conditional Expectation. 
Journal of Computational and Graphical Statistics (March 2014) 269 | ''' 270 | 271 | import matplotlib.pyplot as plt 272 | 273 | if columns == None: 274 | columns = sorted(list(self.results.keys())) 275 | num_rows = len(self.results[columns[0]][1]) # Get number of sample rows 276 | if (type(pts_selected)==str and pts_selected=='sample'): 277 | row_indexes = np.random.choice(np.arange(num_rows), num_pts) 278 | else: 279 | row_indexes = pts_selected 280 | 281 | if show_base_points: 282 | base_rows = self._get_data_rows(row_indexes) 283 | y_base_points = self._get_predictions(base_rows) 284 | if y_scaling=='logit': 285 | y_base_points = np.log(y_base_points/(1-y_base_points)) 286 | if y_scaling=='logit10': 287 | y_base_points = np.log10(y_base_points/(1-y_base_points)) 288 | if y_scaling=='logit2': 289 | y_base_points = np.log2(y_base_points/(1-y_base_points)) 290 | else: 291 | y_base_points = None 292 | 293 | n_cols = min(3, len(columns)) 294 | n_rows = int(math.ceil(len(columns) / n_cols)) 295 | figsize = (n_cols * 4, n_rows * 4) 296 | fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize) 297 | for col_name, ax in zip(columns, axes.flatten()): 298 | x = self.results[col_name][0] 299 | y_values = self.results[col_name][1][row_indexes] 300 | y_plot = y_values 301 | if y_scaling=='logit': 302 | y_plot = np.log(y_values/(1-y_values)) 303 | if y_scaling=='logit10': 304 | y_plot = np.log10(y_values/(1-y_values)) 305 | if y_scaling=='logit2': 306 | y_plot = np.log2(y_values/(1-y_values)) 307 | for y in y_plot: 308 | ax.plot(x, y) 309 | # Plot Base Points 310 | if y_base_points is not None: 311 | ax.scatter(base_rows[col_name], y_base_points) 312 | ax.set_title(col_name[:30]) 313 | plt.tight_layout() 314 | return row_indexes 315 | 316 | 317 | def explain_prediction_difference(self, index_1, index_2, tol=.03, verbose=True, decimals=4): 318 | '''Given the indices of two points in the "xray"-ed data set, this function gives an explanation 319 | of the factors contributing to the difference in the predictions. 320 | 321 | Starting with the first point given, the considers changing each feature from its current value to that 322 | possessed by the second point. The function evaluates the target in both scenarios and determines the 323 | feature value change that creates the biggest (absolute) change in the target. This change is selected 324 | and the current point becomes the new point with the new feature value. This is repeated until the new 325 | target value is within a factor of 1+tol of the second point. 326 | ''' 327 | data_row_1 = self._get_data_rows(index_1) 328 | data_row_2 = self._get_data_rows(index_2) 329 | return explain_prediction_difference(self.model, data_row_1, data_row_2, tol, verbose, decimals, self.pred_col_index) 330 | 331 | 332 | def importance_distribution_of_variable(model_result_array): 333 | max_result_vec = np.array(list(map(np.max,model_result_array))) 334 | min_result_vec = np.array(list(map(np.min,model_result_array))) 335 | return max_result_vec - min_result_vec 336 | 337 | 338 | def explain_prediction_difference(model, data_row_1, data_row_2, tol=.03, verbose=True, decimals = 4, pred_col_index=1): 339 | '''Given a model and two single row data frames, this function gives an explanation 340 | of the factors contributing to the difference in the predictions. 341 | 342 | Starting with the first point given, the considers changing each feature from its current value to that 343 | possessed by the second point. 
The function evaluates the target in both scenarios and determines the 344 | feature value change that creates the biggest (absolute) change in the target. This change is selected 345 | and the current point becomes the new point with the new feature value. This is repeated until the new 346 | target value is within a factor of 1+tol of the second point's target value. 347 | ''' 348 | column_names = data_row_1.index 349 | num_columns = len(column_names) 350 | 351 | dr_1 = data_row_1.values.reshape(1,-1) 352 | dr_2 = data_row_2.values.reshape(1,-1) 353 | column_list = list(range(num_columns)) 354 | curr_pt = np.copy(dr_1) 355 | if is_classifier(model): 356 | val1 = model.predict_proba(dr_1)[0,pred_col_index] 357 | val2 = model.predict_proba(dr_2)[0,pred_col_index] 358 | else: 359 | val1 = model.predict(dr_1)[0] 360 | val2 = model.predict(dr_2)[0] 361 | if verbose: 362 | print(val1, val2) 363 | print('Your initial point has a target value of {}'.format(np.round(val1,decimals=decimals))) 364 | print('Your final point has a target value of {}'.format(np.round(val2,decimals=decimals))) 365 | pt_list = [dr_1] 366 | val_list = [val1] 367 | curr_val = val1 368 | final_val = val2 369 | feat_list =[] 370 | move_list = [] 371 | feat_val_change_list = [] 372 | #for num_steps in range(4): 373 | while (((curr_val/final_val) >(1+tol)) or ((curr_val/final_val) <(1-tol))): 374 | biggest_move = 0 375 | best_column = -1 376 | best_val = curr_val 377 | for i in column_list: 378 | test_pt = np.copy(curr_pt) 379 | prev_feat_val = test_pt[0,i] 380 | subst_val = dr_2[0,i] 381 | test_pt[0,i] = subst_val 382 | if is_classifier(model): 383 | test_val = model.predict_proba(test_pt)[0,pred_col_index] 384 | else: 385 | test_val = model.predict(test_pt)[0] 386 | move_size = (test_val - curr_val) 387 | if(np.abs(move_size)>=np.abs(biggest_move)): 388 | biggest_move = move_size 389 | best_column = i 390 | best_val = test_val 391 | old_feat_val = prev_feat_val 392 | new_feat_val = subst_val 393 | subst_val = dr_2[0,best_column] 394 | curr_pt[0,best_column] = subst_val 395 | val_list.append(best_val) 396 | curr_val = best_val 397 | if verbose: 398 | print('Changing {} from {} to {}'.format(column_names[best_column],np.round(old_feat_val,decimals=decimals),np.round(new_feat_val,decimals=decimals))) 399 | print('\t\tchanges your target by {} to {}'.format(np.round(biggest_move,decimals=decimals), np.round(best_val,decimals=decimals))) 400 | print('----------') 401 | if not (((curr_val/final_val) >(1+tol)) or ((curr_val/final_val) <(1-tol))): 402 | print('Tolerance of {} reached'.format(tol)) 403 | print('Current value of {} is within {}% of {}'.format(np.round(curr_val,decimals=decimals),(100*tol),np.round(final_val,decimals=decimals))) 404 | feat_list.append(column_names[best_column]) 405 | column_list.remove(best_column) 406 | move_list.append(biggest_move) 407 | feat_val_change_list.append((old_feat_val, new_feat_val)) 408 | return feat_list, feat_val_change_list, move_list, val_list 409 | 410 | 411 | def explain_prediction_difference_xgboost(model, data_row_1, data_row_2, tol=.03, verbose=True, decimals = 4, pred_col_index=1): 412 | '''Given a model and two single-row data frames, this function gives an explanation 413 | of the factors contributing to the difference in the predictions. 414 | 415 | Starting with the first point given, the function considers changing each feature from its current value to that 416 | possessed by the second point.
The function evaluates the target in both scenarios and determines the 417 | feature value change that creates the biggest (absolute) change in the target. This change is selected 418 | and the current point becomes the new point with the new feature value. This is repeated until the new 419 | target value is within a factor of 1+tol of the second point. 420 | ''' 421 | column_names = data_row_1.columns 422 | num_columns = len(column_names) 423 | 424 | #dr_1 = data_row_1.values.reshape(1,-1) 425 | #dr_2 = data_row_2.values.reshape(1,-1) 426 | dr_1 = data_row_1 427 | dr_2 = data_row_2 428 | column_list = list(range(num_columns)) 429 | curr_pt = (dr_1).copy() 430 | if is_classifier(model): 431 | val1 = model.predict_proba(dr_1)[0,pred_col_index] 432 | val2 = model.predict_proba(dr_2)[0,pred_col_index] 433 | else: 434 | val1 = model.predict(dr_1)[0] 435 | val2 = model.predict(dr_2)[0] 436 | if verbose: 437 | print(val1, val2) 438 | print('Your initial point has a target value of {}'.format(np.round(val1,decimals=decimals))) 439 | print('Your final point has a target value of {}'.format(np.round(val2,decimals=decimals))) 440 | pt_list = [dr_1] 441 | val_list = [val1] 442 | curr_val = val1 443 | final_val = val2 444 | feat_list =[] 445 | move_list = [] 446 | feat_val_change_list = [] 447 | #for num_steps in range(4): 448 | while (((curr_val/final_val) >(1+tol)) or ((curr_val/final_val) <(1-tol))): 449 | biggest_move = 0 450 | best_column = -1 451 | best_val = curr_val 452 | for i in column_list: 453 | test_pt = (curr_pt).copy() 454 | prev_feat_val = test_pt.iloc[0,i] 455 | subst_val = dr_2.iloc[0,i] 456 | test_pt.iloc[0,i] = subst_val 457 | if is_classifier(model): 458 | test_val = model.predict_proba(test_pt)[0,pred_col_index] 459 | else: 460 | test_val = model.predict(test_pt)[0] 461 | move_size = (test_val - curr_val) 462 | if(np.abs(move_size)>=np.abs(biggest_move)): 463 | biggest_move = move_size 464 | best_column = i 465 | best_val = test_val 466 | old_feat_val = prev_feat_val 467 | new_feat_val = subst_val 468 | subst_val = dr_2.iloc[0,best_column] 469 | curr_pt.iloc[0,best_column] = subst_val 470 | val_list.append(best_val) 471 | curr_val = best_val 472 | if verbose: 473 | print('Changing {} from {} to {}'.format(column_names[best_column],np.round(old_feat_val,decimals=decimals),np.round(new_feat_val,decimals=decimals))) 474 | print('\t\tchanges your target by {} to {}'.format(np.round(biggest_move,decimals=decimals), np.round(best_val,decimals=decimals))) 475 | print('----------') 476 | if not (((curr_val/final_val) >(1+tol)) or ((curr_val/final_val) <(1-tol))): 477 | print('Tolerance of {} reached'.format(tol)) 478 | print('Current value of {} is within {}% of {}'.format(np.round(curr_val,decimals=decimals),(100*tol),np.round(final_val,decimals=decimals))) 479 | feat_list.append(column_names[best_column]) 480 | column_list.remove(best_column) 481 | move_list.append(biggest_move) 482 | feat_val_change_list.append((old_feat_val, new_feat_val)) 483 | return feat_list, feat_val_change_list, move_list, val_list 484 | 485 | 486 | 487 | 488 | -------------------------------------------------------------------------------- /ml_insights/shap_insights.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | try: 5 | import xgboost as xgb 6 | 7 | except ImportError: 8 | xgb_installed = False 9 | 10 | def consolidate_reason_scores(df_ind_expl, dict_map): 11 | reason_list = dict_map.keys() 12 | df_rsn = 
pd.DataFrame(columns = reason_list) 13 | for reason in reason_list: 14 | df_rsn[reason] = np.sum(df_ind_expl.loc[:,dict_map[reason]], axis=1) 15 | return df_rsn 16 | 17 | def get_reason_codes(df_rsn, thresh, direction='greater', delimiter=';'): 18 | nr, nc = df_rsn.shape 19 | argsort_mat = np.argsort(-df_rsn.values) 20 | if (direction=='lesser'): 21 | num_exceeding_thresh_vec = np.sum(df_rsn.values<=thresh, axis=1) 22 | else: 23 | num_exceeding_thresh_vec = np.sum(df_rsn.values>=thresh, axis=1) 24 | reason_mat = np.array([df_rsn.columns[i] for row in argsort_mat for i in row ]).reshape(nr,nc) 25 | reason_vec = np.array([delimiter.join(list(reason_mat[j][:num_exceeding_thresh_vec[j]])) for j in range(nr)]) 26 | return reason_vec 27 | 28 | def cv_column_shap(xgbcv, X_pr, fn): 29 | results = np.zeros((X_pr.shape[0], xgbcv.num_features+1)) 30 | fold_set = np.unique(fn) 31 | for fold in fold_set: 32 | X_te = xgb.DMatrix(X_pr[fn == fold].values) 33 | fold_results = xgbcv.model_dict[fold].get_booster().predict(X_te, pred_contribs=True, validate_features=False) 34 | results[fn==fold] = fold_results 35 | return results 36 | 37 | def predict_reasons_cv(xgbcv, X_pr, fn, reason_map, thresh, delimiter=';'): 38 | shap_val_mat = cv_column_shap(xgbcv, X_pr, fn) 39 | df_shap_val = pd.DataFrame(shap_val_mat[:,:-1], columns = X_pr.columns) 40 | df_reason_scores = consolidate_reason_scores(df_shap_val,reason_map) 41 | reason_list_vec = get_reason_codes(df_reason_scores, thresh, delimiter=delimiter) 42 | return(reason_list_vec) 43 | 44 | def predict_reason_strings(xgbmodel, X_pr, reason_map, thresh, delimiter=';', direction='greater'): 45 | X_pr_dmat = xgb.DMatrix(X_pr) 46 | shap_val_mat = xgbmodel.get_booster().predict(X_pr_dmat, pred_contribs=True, validate_features=False) 47 | df_shap_val = pd.DataFrame(shap_val_mat[:,:-1], columns = X_pr.columns) 48 | df_reason_scores = consolidate_reason_scores(df_shap_val,reason_map) 49 | reason_list_vec = get_reason_codes(df_reason_scores, thresh, direction=direction, delimiter=delimiter) 50 | return(reason_list_vec) 51 | 52 | def get_reason_score_matrix(xgbmodel, X_pr, validate=False): 53 | if (type(X_pr)==pd.DataFrame): 54 | X_test_dmat = xgb.DMatrix(X_pr) 55 | reason_list = list(X_pr.columns) + ['Intercept'] 56 | reas_mat = xgbmodel.get_booster().predict(X_test_dmat, pred_contribs=True, validate_features=validate) 57 | else: 58 | reason_list = ['f'+str(i) for i in range(X_pr.shape[1])] + ['Intercept'] 59 | X_test_dmat = xgb.DMatrix(X_pr, feature_names = reason_list[:-1]) 60 | reas_mat = xgbmodel.get_booster().predict(X_test_dmat, pred_contribs=True, validate_features=validate) 61 | return(pd.DataFrame(reas_mat, columns=reason_list)) 62 | 63 | # def augment_tree(tree_dict): 64 | # if 'leaf' in tree_dict.keys(): 65 | # value = tree_dict['leaf'] 66 | # tree_dict['value_at_node'] = value 67 | # return value 68 | # else: 69 | # a0 = tree_dict['children'][0]['cover'] 70 | # a1 = tree_dict['children'][1]['cover'] 71 | # value = (a0 * augment_tree(tree_dict['children'][0]) + a1 * augment_tree(tree_dict['children'][1]))/(a0 + a1) 72 | 73 | # tree_dict['value_at_node'] = value 74 | # return value 75 | 76 | -------------------------------------------------------------------------------- /ml_insights/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numeristical/introspective/dbe96f7fc4dfd24d7ed6a6982d661426d74ee172/ml_insights/tests/__init__.py 
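A note on usage: the reason-code helpers in ml_insights/shap_insights.py above (get_reason_score_matrix, consolidate_reason_scores, get_reason_codes, predict_reason_strings) are easiest to follow with a small end-to-end sketch. The snippet below is illustrative only and is not part of the repository: the feature names, the reason_map grouping, and the 0.1 threshold are made-up values, and it assumes xgboost is installed and that an sklearn-API XGBClassifier is used (so that get_booster() is available).

import numpy as np
import pandas as pd
import xgboost as xgb
from ml_insights.shap_insights import get_reason_score_matrix, predict_reason_strings

# Toy data with hypothetical feature names.
rng = np.random.default_rng(0)
X = pd.DataFrame({'age': rng.integers(18, 90, 500),
                  'heart_rate': rng.normal(80, 15, 500),
                  'lactate': rng.gamma(2.0, 1.0, 500)})
y = (X['lactate'] + 0.02 * X['age'] + rng.normal(0, 1, 500) > 4).astype(int)

model = xgb.XGBClassifier(n_estimators=50, max_depth=3).fit(X, y)

# One SHAP-style contribution column per feature, plus an 'Intercept' column.
score_df = get_reason_score_matrix(model, X)

# Group raw features into human-readable "reasons" (the grouping is illustrative).
reason_map = {'Vital signs': ['heart_rate'], 'Labs': ['lactate'], 'Demographics': ['age']}

# For each row, report the reason groups whose summed contribution exceeds 0.1,
# joined into a single delimited string such as 'Labs;Vital signs'.
reasons = predict_reason_strings(model, X, reason_map, thresh=0.1)
print(score_df.head())
print(reasons[:5])

Here predict_reason_strings recomputes the contributions internally; get_reason_score_matrix is shown only to illustrate the per-feature scores that get consolidated into reason groups.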
-------------------------------------------------------------------------------- /ml_insights/tests/test_example.py: -------------------------------------------------------------------------------- 1 | import os.path as op 2 | import numpy as np 3 | import pandas as pd 4 | import numpy.testing as npt 5 | import ml_insights as mli 6 | from sklearn.metrics import roc_auc_score, log_loss 7 | 8 | data_path = op.join(mli.__path__[0], 'data') 9 | 10 | 11 | def test_1(): 12 | assert(True) 13 | 14 | 15 | -------------------------------------------------------------------------------- /ml_insights/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import os.path as op 2 | import numpy as np 3 | import pandas as pd 4 | import numpy.testing as npt 5 | import ml_insights as mli 6 | from sklearn.metrics import roc_auc_score, log_loss 7 | 8 | data_path = op.join(mli.__path__[0], 'data') 9 | 10 | 11 | def test_get_range_dict(): 12 | """ 13 | Test the get_range_dict function on a sample dataframe. 14 | 15 | """ 16 | df = pd.read_csv(op.join(data_path,'faux_data.csv')) 17 | rd = mli.get_range_dict(df, max_pts=172) 18 | t1 = (len(rd['int1']), len(rd['int2'])) == (100, 172)  # compare as a tuple, not (int, bool) 19 | t2 = len(rd['float1']) == 172 20 | t3 = (len(rd['str1']), len(rd['str2'])) == (172, 50) 21 | assert t1 and t2 and t3 22 | 23 | 24 | -------------------------------------------------------------------------------- /ml_insights/utils.py: -------------------------------------------------------------------------------- 1 | def _gca(): 2 | import matplotlib.pyplot as plt 3 | return plt.gca() 4 | 5 | 6 | def is_classifier(estimator): 7 | """Returns True if the given estimator is (probably) a classifier.""" 8 | return getattr(estimator, "_estimator_type", None) == "classifier" 9 | 10 | 11 | def is_regressor(estimator): 12 | """Returns True if the given estimator is (probably) a regressor.""" 13 | return getattr(estimator, "_estimator_type", None) == "regressor" 14 | -------------------------------------------------------------------------------- /mli_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numeristical/introspective/dbe96f7fc4dfd24d7ed6a6982d661426d74ee172/mli_screenshot.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 77.0.3"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "ml_insights" 7 | version = "1.1.0" 8 | dependencies = [ 9 | "pandas>=0.23", 10 | "numpy>=1.23.5", 11 | "matplotlib>=2.0.0", 12 | "scikit-learn>=0.24.2", 13 | "scipy>=1.6.0", 14 | "splinecalib>=0.0.13" 15 | ] 16 | authors = [ 17 | { name="Brian Lucena / Ramesh Sampath", email="brian@numeristical.com" }, 18 | ] 19 | description = "Package to calibrate and understand ML Models" 20 | readme = "README.md" 21 | requires-python = ">=3.8" 22 | classifiers = [ 23 | "Programming Language :: Python :: 3", 24 | "Operating System :: OS Independent", 25 | ] 26 | license = "MIT" 27 | license-files = ["LICEN[CS]E*"] 28 | 29 | [project.urls] 30 | Homepage = "https://github.com/numeristical/introspective" 31 | Issues = "https://github.com/numeristical/introspective/issues" -------------------------------------------------------------------------------- /requirements.txt:
-------------------------------------------------------------------------------- 1 | sphinx_rtd_theme 2 | mkdocs -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Setup for the ml_insights package.""" 2 | 3 | from setuptools import setup 4 | 5 | setup( 6 | packages=[ 7 | 'ml_insights', 8 | ], 9 | 10 | ) 11 | --------------------------------------------------------------------------------
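Finally, a sketch of how the prediction-difference explanation in ml_insights/insights.py can be called directly. This is an illustrative example, not part of the repository: the dataset, column names, and model are hypothetical, and it assumes scikit-learn is available and that explain_prediction_difference is imported from the ml_insights.insights module (it takes the two points as pandas Series, reading feature names from their .index).

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from ml_insights.insights import explain_prediction_difference

# Toy regression data with hypothetical column names.
rng = np.random.default_rng(1)
X = pd.DataFrame({'sqft': rng.uniform(500, 4000, 300),
                  'beds': rng.integers(1, 6, 300),
                  'age_years': rng.uniform(0, 100, 300)})
y = 100 * X['sqft'] + 10000 * X['beds'] + rng.normal(0, 5000, 300)

model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X.values, y)

# Two individual rows (as Series); the function greedily swaps one feature at a
# time from the first row toward the second until the predictions are within tol.
row_a, row_b = X.iloc[17], X.iloc[42]
feats, value_changes, moves, vals = explain_prediction_difference(
    model, row_a, row_b, tol=0.03, verbose=True)

With verbose=True the function prints each swap and its effect; the returned lists give the ordered feature names, the (old, new) value pairs, the per-step change in the prediction, and the running prediction values.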