├── .coveragerc ├── .github └── ISSUE_TEMPLATE.md ├── .gitignore ├── .travis.yml ├── CONTRIBUTING.rst ├── LICENSE ├── ODSC_East_2019 └── ODSC_XGBoost_Interpretability.ipynb ├── ODSC_East_2020 └── ODSC_East_2020_XGBoost_Interpretability.ipynb ├── ODSC_West_2019 └── ODSC_West_2019_XGBoost_Interpretability.ipynb ├── README.md ├── README.rst ├── deploy_steps.md ├── docs-mkdocs ├── docs │ └── index.md └── mkdocs.yml ├── docs ├── Makefile ├── calib_overview.rst ├── conf.py ├── index.rst ├── install.rst ├── interp_overview.rst ├── make.bat ├── rtfd-requirements.txt ├── splinecalib_class.rst └── splinecalib_examples.rst ├── examples ├── Ames_Housing_Analysis.ipynb ├── Calibration_Example_ICU_MIMIC.ipynb ├── Calibration_Example_ICU_MIMIC_Short.ipynb ├── ICU_Mortality_MIMIC.ipynb ├── SplineCalib_Details.ipynb ├── SplineCalib_Multiclass_MNIST.ipynb ├── SplineCalib_Tutorial.ipynb └── data │ ├── Ames_Housing_Data.tsv │ ├── cal_housing_data.csv │ └── lab_vital_icu_table.csv ├── extra └── code │ ├── discrete_dt.c │ ├── discrete_dt.pyx │ ├── discrete_gb.c │ ├── discrete_gb.pyx │ ├── graphs.c │ ├── graphs.pyx │ ├── hypergraphs.c │ ├── hypergraphs.pyx │ ├── setup.py │ ├── structure_dt.c │ ├── structure_dt.pyx │ ├── structure_gb.c │ └── structure_gb.pyx ├── ml_insights ├── CVModel.py ├── __init__.py ├── calibration.py ├── cross_validation.py ├── data │ ├── faux_data.csv │ ├── ortho.csv │ └── para.csv ├── insights.py ├── modeling_utils.py ├── shap_insights.py ├── tests │ ├── __init__.py │ ├── test_example.py │ └── test_utils.py └── utils.py ├── mli_screenshot.png ├── pyproject.toml ├── requirements.txt └── setup.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = ml_insights/* 4 | include = ml_insights/* 5 | omit = */setup.py 6 | [report] 7 | include = ml_insights/* 8 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * ML Inspector version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 15 | ``` 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | .DS_Store 7 | 8 | #Ipython Notebook 9 | .ipynb_checkpoints 10 | 11 | #Ipython Notebook Temp files 12 | *copy*.ipynb 13 | untitled* 14 | untitled*.ipynb 15 | *Copy*.ipynb 16 | Untitled* 17 | Untitled*.ipynb 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | env/ 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | .eggs/ 31 | lib/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | *.log 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *,cover 60 | .hypothesis/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # mkdocs 67 | docs-mkdocs/site/ 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # local 76 | local/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | 3 | env: 4 | global: 5 | - CONDA_DEPS="pip flake8 pytest numpy scipy matplotlib pandas" PIP_DEPS="coveralls pytest-cov" 6 | 7 | matrix: 8 | include: 9 | - os: osx 10 | env: 11 | - PYTHON_VERSION=2.7 12 | - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh" 13 | - os: osx 14 | env: 15 | - PYTHON_VERSION=3.5 16 | - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh" 17 | - os: linux 18 | env: 19 | - PYTHON_VERSION=2.7 20 | - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" 21 | - os: linux 22 | env: 23 | - PYTHON_VERSION=3.5 24 | - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" 25 | 26 | 27 | before_install: 28 | - export MINICONDA=$HOME/miniconda 29 | - export PATH="$MINICONDA/bin:$PATH" 30 | - hash -r 31 | - echo $MINICONDA_URL 32 | - wget $MINICONDA_URL -O miniconda.sh; 33 | - bash miniconda.sh -b -f -p $MINICONDA; 34 | - conda config --set always_yes yes 35 | - conda update conda 36 | - conda info -a 37 | - conda config --add channels conda-forge 38 | - conda install python=$PYTHON_VERSION $CONDA_DEPS 39 | - travis_retry pip install $PIP_DEPS 40 | 41 | install: 42 | - python setup.py install --record installed_files.txt 43 | 44 | script: 45 | - flake8 --ignore N802,N806 `find . -name \*.py | grep -v setup.py | grep -v /doc/` 46 | 47 | - mkdir for_test 48 | - cd for_test 49 | - py.test --pyargs ml_insights --cov-report term-missing --cov=ml_insights 50 | 51 | after_success: 52 | - coveralls 53 | 54 | before_cache: 55 | # clean unused packages & installed files from conda cache 56 | # this makes the cache rebuilt less frequently 57 | - conda clean --tarballs --packages --index-cache 58 | - rm -rf $HOME/miniconda/pkgs/cache 59 | - xargs rm >> import ml_insights as mli 24 | >>> xray = mli.ModelXRay(model, data.sample(500)) 25 | >>> xray.feature_dependence_plots() 26 | 27 | ![mli_screenshot](mli_screenshot.png) 28 | 29 | Find more detailed examples here: 30 | [https://github.com/numeristical/introspective/tree/master/examples](https://github.com/numeristical/introspective/tree/master/examples) 31 | 32 | 33 | ## Other Documentation 34 | 35 | [https://ml-insights.readthedocs.io](https://ml-insights.readthedocs.io) 36 | 37 | Disclaimer 38 | ========== 39 | 40 | We have tested this tool to the best of our ability, but understand that it may have bugs. It was developed on Python 3. Use at your own risk, but feel free to report any bugs to our github. 41 | 42 | Installation 43 | ============= 44 | 45 | $ pip install ml_insights 46 | 47 | 48 | Source 49 | ====== 50 | 51 | Find the latest version on github: https://github.com/numeristical/introspective 52 | 53 | Feel free to fork and contribute! 
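Calibration
===========

ML-Insights also includes the `SplineCalib` class for post-hoc probability calibration (described in more detail in README.rst and the documentation). Below is a minimal sketch mirroring that usage; `model`, `X_train`, `y_train`, `X_valid`, `y_valid`, and `X_test` are placeholders for your own classifier and data, not objects provided by this package.

```
>>> import ml_insights as mli
>>> model.fit(X_train, y_train)                  # train your classifier as usual
>>> sc = mli.SplineCalib()                       # create a calibration object
>>> sc.fit(X_valid, y_valid)                     # fit the calibrator on a validation set (see README.rst)
>>> uncalib_preds = model.predict_proba(X_test)  # raw (possibly miscalibrated) scores
>>> calib_preds = sc.calibrate(uncalib_preds)    # calibrated probabilities
```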
54 | 55 | License 56 | ======= 57 | 58 | Free software: `MIT license `_ 59 | 60 | Developed By 61 | ============ 62 | 63 | - Brian Lucena 64 | - Ramesh Sampath 65 | 66 | References 67 | ========== 68 | 69 | Lucena, B. 2018. Spline-Based Probability Calibration. https://arxiv.org/abs/1809.07751 70 | 71 | Alex Goldstein, Adam Kapelner, Justin Bleich, and Emil Pitkin. 2014. Peeking Inside the Black Box: Visualizing Statistical Learning With Plots of Individual Conditional Expectation. Journal of Computational and Graphical Statistics (March 2014) 72 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ML Insights 2 | =========== 3 | 4 | Welcome to ML-Insights! 5 | 6 | This package contains two core sets of functions: 7 | 8 | 1) Calibration 9 | 2) Interpreting Models 10 | 11 | For probability calibration, the main class is `SplineCalib`. Given a set of model outputs and the "true" classes, you can `fit` a SplineCalib object. That object can then be used to `calibrate` future model predictions post-hoc. 12 | 13 | .. code-block:: python 14 | 15 | >>> model.fit(X_train, y_train) 16 | >>> sc = mli.SplineCalib() 17 | >>> sc.fit(X_valid, y_valid) 18 | >>> uncalib_preds = model.predict_proba(X_test) 19 | >>> calib_preds = sc.calibrate(uncalib_preds) 20 | 21 | 22 | .. code-block:: python 23 | 24 | >>> cv_preds = mli.cv_predictions(model, X_train, y_train) 25 | >>> model.fit(X_train, y_train) 26 | >>> sc = mli.SplineCalib() 27 | >>> sc.fit(cv_preds, y_train) 28 | >>> uncalib_preds = model.predict_proba(X_test) 29 | >>> calib_preds = sc.calibrate(uncalib_preds) 30 | 31 | 32 | 33 | For model interpretability, we provide the `ice_plot` and `histogram_pair` functions as well as other tools. 34 | 35 | 36 | .. code-block:: python 37 | 38 | >>> rd = mli.get_range_dict(X_train) 39 | >>> mli.ice_plot(model, X_test.sample(3), X_train.columns, rd) 40 | 41 | .. code-block:: python 42 | 43 | >>> mli.histogram_pair(df.outcome, df.feature, bins=np.linspace(0,100,11)) 44 | 45 | Please see the documentation and examples at the links below. 46 | 47 | 48 | - `Documentation `_ 49 | - `Notebook Examples and Usage `_ 50 | 51 | 52 | Python 53 | ------ 54 | Python 3.4+ 55 | 56 | 57 | Disclaimer 58 | ========== 59 | 60 | We have tested this tool to the best of our ability, but understand that it may have bugs. It was most recently developed on Python 3.7.3. Use at your own risk, but feel free to report any bugs to our github. 61 | 62 | 63 | Installation 64 | ============= 65 | 66 | .. code-block:: bash 67 | 68 | $ pip install ml_insights 69 | 70 | 71 | Usage 72 | ====== 73 | 74 | .. code-block:: python 75 | 76 | >>> import ml_insights as mli 77 | >>> xray = mli.ModelXRay(model, data) 78 | 79 | .. code-block:: python 80 | 81 | >>> rfm = RandomForestClassifier(n_estimators = 500, class_weight='balanced_subsample') 82 | >>> rfm_cv = mli.SplineCalibratedClassifierCV(rfm) 83 | >>> rfm_cv.fit(X_train,y_train) 84 | >>> test_res_calib_cv = rfm_cv.predict_proba(X_test)[:,1] 85 | >>> log_loss(y_test,test_res_calib_cv) 86 | 87 | Source 88 | ====== 89 | 90 | Find the latest version on github: https://github.com/numeristical/introspective 91 | 92 | Feel free to fork and contribute! 
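Checking Calibration
====================

A rough sketch of how you might verify that calibration helped, reusing the variables from the calibration examples above; it assumes a binary problem, held-out labels `y_test`, and `log_loss` from scikit-learn (not part of this package).

.. code-block:: python

    >>> from sklearn.metrics import log_loss
    >>> log_loss(y_test, uncalib_preds)  # log loss of the raw model probabilities
    >>> log_loss(y_test, calib_preds)    # typically lower once the scores are calibrated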
93 | 94 | License 95 | ======= 96 | 97 | Free software: `MIT license `_ 98 | 99 | Developed By 100 | ============ 101 | 102 | - Brian Lucena 103 | - Ramesh Sampath 104 | 105 | References 106 | ========== 107 | 108 | Alex Goldstein, Adam Kapelner, Justin Bleich, and Emil Pitkin. 2014. Peeking Inside the Black Box: Visualizing Statistical Learning With Plots of Individual Conditional Expectation. Journal of Computational and Graphical Statistics (March 2014) -------------------------------------------------------------------------------- /deploy_steps.md: -------------------------------------------------------------------------------- 1 | To Develop Locally: 2 | 3 | 1. Run `pip install -e .` 4 | 5 | 6 | To Deploy to Test Server: 7 | 8 | 1. Run `python setup.py sdist bdist_wheel` 9 | 10 | 2. Upload using twine - `twine upload -r test dist/ml_insights-0.0.*` 11 | 3. Install from Test PyPi - `pip install -i https://testpypi.python.org/pypi ml_insights --upgrade` 12 | 13 | To Deploy to PyPi Server: 14 | 15 | 1. Run `python setup.py sdist bdist_wheel` 16 | 17 | 2. Upload using twine - `twine upload dist/ml_insights-0.0.*` 18 | 3. Install from Test PyPi - `pip install ml_insights --upgrade` 19 | 20 | 21 | **Note: When uploading to TestPyPi or PyPi, we cannot update the versions. Version numbers need to be updated in setup.py and ml_insights/__init__.py 22 | 23 | 24 | To Upgrade Documentation: 25 | 26 | -------------------------------------------------------------------------------- /docs-mkdocs/docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to MkDocs 2 | 3 | For full documentation visit [mkdocs.org](http://mkdocs.org). 4 | 5 | ## Commands 6 | 7 | * `mkdocs new [dir-name]` - Create a new project. 8 | * `mkdocs serve` - Start the live-reloading docs server. 9 | * `mkdocs build` - Build the documentation site. 10 | * `mkdocs help` - Print this help message. 11 | 12 | ## Project layout 13 | 14 | mkdocs.yml # The configuration file. 15 | docs/ 16 | index.md # The documentation homepage. 17 | ... # Other markdown pages, images and other files. 18 | -------------------------------------------------------------------------------- /docs-mkdocs/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: ML Insights 2 | theme: readthedocs 3 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
16 | 17 | .PHONY: help 18 | help: 19 | @echo "Please use \`make ' where is one of" 20 | @echo " html to make standalone HTML files" 21 | @echo " dirhtml to make HTML files named index.html in directories" 22 | @echo " singlehtml to make a single large HTML file" 23 | @echo " pickle to make pickle files" 24 | @echo " json to make JSON files" 25 | @echo " htmlhelp to make HTML files and a HTML help project" 26 | @echo " qthelp to make HTML files and a qthelp project" 27 | @echo " applehelp to make an Apple Help Book" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " epub3 to make an epub3" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 34 | @echo " text to make text files" 35 | @echo " man to make manual pages" 36 | @echo " texinfo to make Texinfo files" 37 | @echo " info to make Texinfo files and run them through makeinfo" 38 | @echo " gettext to make PO message catalogs" 39 | @echo " changes to make an overview of all changed/added/deprecated items" 40 | @echo " xml to make Docutils-native XML files" 41 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 42 | @echo " linkcheck to check all external links for integrity" 43 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 44 | @echo " coverage to run coverage check of the documentation (if enabled)" 45 | @echo " dummy to check syntax errors of document sources" 46 | 47 | .PHONY: clean 48 | clean: 49 | rm -rf $(BUILDDIR)/* 50 | 51 | .PHONY: html 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | .PHONY: dirhtml 58 | dirhtml: 59 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 60 | @echo 61 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 62 | 63 | .PHONY: singlehtml 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | .PHONY: pickle 70 | pickle: 71 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 72 | @echo 73 | @echo "Build finished; now you can process the pickle files." 74 | 75 | .PHONY: json 76 | json: 77 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 78 | @echo 79 | @echo "Build finished; now you can process the JSON files." 80 | 81 | .PHONY: htmlhelp 82 | htmlhelp: 83 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 84 | @echo 85 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 86 | ".hhp project file in $(BUILDDIR)/htmlhelp." 87 | 88 | .PHONY: qthelp 89 | qthelp: 90 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 91 | @echo 92 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 93 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 94 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/MLInsights.qhcp" 95 | @echo "To view the help file:" 96 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/MLInsights.qhc" 97 | 98 | .PHONY: applehelp 99 | applehelp: 100 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 101 | @echo 102 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 103 | @echo "N.B. 
You won't be able to view it unless you put it in" \ 104 | "~/Library/Documentation/Help or install it in your application" \ 105 | "bundle." 106 | 107 | .PHONY: devhelp 108 | devhelp: 109 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 110 | @echo 111 | @echo "Build finished." 112 | @echo "To view the help file:" 113 | @echo "# mkdir -p $$HOME/.local/share/devhelp/MLInsights" 114 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/MLInsights" 115 | @echo "# devhelp" 116 | 117 | .PHONY: epub 118 | epub: 119 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 120 | @echo 121 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 122 | 123 | .PHONY: epub3 124 | epub3: 125 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 126 | @echo 127 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 128 | 129 | .PHONY: latex 130 | latex: 131 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 132 | @echo 133 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 134 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 135 | "(use \`make latexpdf' here to do that automatically)." 136 | 137 | .PHONY: latexpdf 138 | latexpdf: 139 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 140 | @echo "Running LaTeX files through pdflatex..." 141 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 142 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 143 | 144 | .PHONY: latexpdfja 145 | latexpdfja: 146 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 147 | @echo "Running LaTeX files through platex and dvipdfmx..." 148 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 149 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 150 | 151 | .PHONY: text 152 | text: 153 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 154 | @echo 155 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 156 | 157 | .PHONY: man 158 | man: 159 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 160 | @echo 161 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 162 | 163 | .PHONY: texinfo 164 | texinfo: 165 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 166 | @echo 167 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 168 | @echo "Run \`make' in that directory to run these through makeinfo" \ 169 | "(use \`make info' here to do that automatically)." 170 | 171 | .PHONY: info 172 | info: 173 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 174 | @echo "Running Texinfo files through makeinfo..." 175 | make -C $(BUILDDIR)/texinfo info 176 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 177 | 178 | .PHONY: gettext 179 | gettext: 180 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 181 | @echo 182 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 183 | 184 | .PHONY: changes 185 | changes: 186 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 187 | @echo 188 | @echo "The overview file is in $(BUILDDIR)/changes." 189 | 190 | .PHONY: linkcheck 191 | linkcheck: 192 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 193 | @echo 194 | @echo "Link check complete; look for any errors in the above output " \ 195 | "or in $(BUILDDIR)/linkcheck/output.txt." 
196 | 197 | .PHONY: doctest 198 | doctest: 199 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 200 | @echo "Testing of doctests in the sources finished, look at the " \ 201 | "results in $(BUILDDIR)/doctest/output.txt." 202 | 203 | .PHONY: coverage 204 | coverage: 205 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 206 | @echo "Testing of coverage in the sources finished, look at the " \ 207 | "results in $(BUILDDIR)/coverage/python.txt." 208 | 209 | .PHONY: xml 210 | xml: 211 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 212 | @echo 213 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 214 | 215 | .PHONY: pseudoxml 216 | pseudoxml: 217 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 218 | @echo 219 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 220 | 221 | .PHONY: dummy 222 | dummy: 223 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 224 | @echo 225 | @echo "Build finished. Dummy builder generates no files." 226 | -------------------------------------------------------------------------------- /docs/calib_overview.rst: -------------------------------------------------------------------------------- 1 | Probability Calibration with SplineCalib 2 | ======================================== 3 | 4 | SplineCalib is a tool for probability calibration contained in the ML-Insights package. Often, classification models may have good *discriminative* performance, but have poor *calibration*. SplineCalib post-processes the model scores so that they are better calibrated. 5 | 6 | .. toctree:: 7 | :maxdepth: 3 8 | 9 | splinecalib_examples 10 | splinecalib_class 11 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ML Insights documentation build configuration file, created by 5 | # sphinx-quickstart on Wed Nov 9 13:32:07 2016. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | sys.path.insert(0, os.path.abspath('../ml_insights')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | # 28 | # needs_sphinx = '1.0' 29 | import sphinx_rtd_theme 30 | 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [ 36 | 'sphinx.ext.autodoc', 37 | 'sphinx.ext.mathjax', 38 | 'sphinx.ext.viewcode', 39 | 'sphinx.ext.coverage', 40 | 'sphinx.ext.napoleon', 41 | ] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The suffix(es) of source filenames. 
47 | # You can specify multiple suffix as a list of string: 48 | # 49 | # source_suffix = ['.rst', '.md'] 50 | source_suffix = '.rst' 51 | 52 | # The encoding of source files. 53 | # 54 | # source_encoding = 'utf-8-sig' 55 | 56 | # The master toctree document. 57 | master_doc = 'index' 58 | 59 | # General information about the project. 60 | project = 'ML Insights' 61 | copyright = '2020, Brian Lucena and Ramesh Sampath' 62 | author = 'Brian Lucena and Ramesh Sampath' 63 | 64 | # The version info for the project you're documenting, acts as replacement for 65 | # |version| and |release|, also used in various other places throughout the 66 | # built documents. 67 | # 68 | # The short X.Y version. 69 | version = '0.1.0' 70 | # The full version, including alpha/beta/rc tags. 71 | release = '0.1.0' 72 | 73 | # The language for content autogenerated by Sphinx. Refer to documentation 74 | # for a list of supported languages. 75 | # 76 | # This is also used if you do content translation via gettext catalogs. 77 | # Usually you set "language" from the command line for these cases. 78 | language = None 79 | 80 | # There are two options for replacing |today|: either, you set today to some 81 | # non-false value, then it is used: 82 | # 83 | # today = '' 84 | # 85 | # Else, today_fmt is used as the format for a strftime call. 86 | # 87 | # today_fmt = '%B %d, %Y' 88 | 89 | # List of patterns, relative to source directory, that match files and 90 | # directories to ignore when looking for source files. 91 | # This patterns also effect to html_static_path and html_extra_path 92 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 93 | 94 | # The reST default role (used for this markup: `text`) to use for all 95 | # documents. 96 | # 97 | # default_role = None 98 | 99 | # If true, '()' will be appended to :func: etc. cross-reference text. 100 | # 101 | # add_function_parentheses = True 102 | 103 | # If true, the current module name will be prepended to all description 104 | # unit titles (such as .. function::). 105 | # 106 | # add_module_names = True 107 | 108 | # If true, sectionauthor and moduleauthor directives will be shown in the 109 | # output. They are ignored by default. 110 | # 111 | # show_authors = False 112 | 113 | # The name of the Pygments (syntax highlighting) style to use. 114 | pygments_style = 'sphinx' 115 | 116 | # A list of ignored prefixes for module index sorting. 117 | # modindex_common_prefix = [] 118 | 119 | # If true, keep warnings as "system message" paragraphs in the built documents. 120 | # keep_warnings = False 121 | 122 | # If true, `todo` and `todoList` produce output, else they produce nothing. 123 | todo_include_todos = False 124 | 125 | 126 | # -- Options for HTML output ---------------------------------------------- 127 | 128 | # The theme to use for HTML and HTML Help pages. See the documentation for 129 | # a list of builtin themes. 130 | # 131 | # html_theme = 'alabaster' 132 | html_theme = "sphinx_rtd_theme" 133 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 134 | 135 | # Theme options are theme-specific and customize the look and feel of a theme 136 | # further. For a list of options available for each theme, see the 137 | # documentation. 138 | # 139 | # html_theme_options = {} 140 | 141 | # Add any paths that contain custom themes here, relative to this directory. 142 | # html_theme_path = [] 143 | 144 | # The name for this set of Sphinx documents. 145 | # " v documentation" by default. 
146 | # 147 | # html_title = 'ML Insights v0.0.2' 148 | 149 | # A shorter title for the navigation bar. Default is the same as html_title. 150 | # 151 | # html_short_title = None 152 | 153 | # The name of an image file (relative to this directory) to place at the top 154 | # of the sidebar. 155 | # 156 | # html_logo = None 157 | 158 | # The name of an image file (relative to this directory) to use as a favicon of 159 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 160 | # pixels large. 161 | # 162 | # html_favicon = None 163 | 164 | # Add any paths that contain custom static files (such as style sheets) here, 165 | # relative to this directory. They are copied after the builtin static files, 166 | # so a file named "default.css" will overwrite the builtin "default.css". 167 | # html_static_path = ['_static'] 168 | 169 | # Add any extra paths that contain custom files (such as robots.txt or 170 | # .htaccess) here, relative to this directory. These files are copied 171 | # directly to the root of the documentation. 172 | # 173 | # html_extra_path = [] 174 | 175 | # If not None, a 'Last updated on:' timestamp is inserted at every page 176 | # bottom, using the given strftime format. 177 | # The empty string is equivalent to '%b %d, %Y'. 178 | # 179 | # html_last_updated_fmt = None 180 | 181 | # If true, SmartyPants will be used to convert quotes and dashes to 182 | # typographically correct entities. 183 | # 184 | # html_use_smartypants = True 185 | 186 | # Custom sidebar templates, maps document names to template names. 187 | # 188 | # html_sidebars = {} 189 | 190 | # Additional templates that should be rendered to pages, maps page names to 191 | # template names. 192 | # 193 | # html_additional_pages = {} 194 | 195 | # If false, no module index is generated. 196 | # 197 | # html_domain_indices = True 198 | 199 | # If false, no index is generated. 200 | # 201 | # html_use_index = True 202 | 203 | # If true, the index is split into individual pages for each letter. 204 | # 205 | # html_split_index = False 206 | 207 | # If true, links to the reST sources are added to the pages. 208 | # 209 | # html_show_sourcelink = True 210 | 211 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 212 | # 213 | # html_show_sphinx = True 214 | 215 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 216 | # 217 | # html_show_copyright = True 218 | 219 | # If true, an OpenSearch description file will be output, and all pages will 220 | # contain a tag referring to it. The value of this option must be the 221 | # base URL from which the finished HTML is served. 222 | # 223 | # html_use_opensearch = '' 224 | 225 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 226 | # html_file_suffix = None 227 | 228 | # Language to be used for generating the HTML full-text search index. 229 | # Sphinx supports the following languages: 230 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' 231 | # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh' 232 | # 233 | # html_search_language = 'en' 234 | 235 | # A dictionary with options for the search language support, empty by default. 236 | # 'ja' uses this config value. 237 | # 'zh' user can custom change `jieba` dictionary path. 238 | # 239 | # html_search_options = {'type': 'default'} 240 | 241 | # The name of a javascript file (relative to the configuration directory) that 242 | # implements a search results scorer. If empty, the default will be used. 
243 | # 244 | # html_search_scorer = 'scorer.js' 245 | 246 | # Output file base name for HTML help builder. 247 | htmlhelp_basename = 'MLInsightsdoc' 248 | 249 | # -- Options for LaTeX output --------------------------------------------- 250 | 251 | latex_elements = { 252 | # The paper size ('letterpaper' or 'a4paper'). 253 | # 254 | # 'papersize': 'letterpaper', 255 | 256 | # The font size ('10pt', '11pt' or '12pt'). 257 | # 258 | # 'pointsize': '10pt', 259 | 260 | # Additional stuff for the LaTeX preamble. 261 | # 262 | # 'preamble': '', 263 | 264 | # Latex figure (float) alignment 265 | # 266 | # 'figure_align': 'htbp', 267 | } 268 | 269 | # Grouping the document tree into LaTeX files. List of tuples 270 | # (source start file, target name, title, 271 | # author, documentclass [howto, manual, or own class]). 272 | latex_documents = [ 273 | (master_doc, 'MLInsights.tex', 'ML Insights Documentation', 274 | 'Brian Lucena and Ramesh Sampath', 'manual'), 275 | ] 276 | 277 | # The name of an image file (relative to this directory) to place at the top of 278 | # the title page. 279 | # 280 | # latex_logo = None 281 | 282 | # For "manual" documents, if this is true, then toplevel headings are parts, 283 | # not chapters. 284 | # 285 | # latex_use_parts = False 286 | 287 | # If true, show page references after internal links. 288 | # 289 | # latex_show_pagerefs = False 290 | 291 | # If true, show URL addresses after external links. 292 | # 293 | # latex_show_urls = False 294 | 295 | # Documents to append as an appendix to all manuals. 296 | # 297 | # latex_appendices = [] 298 | 299 | # It false, will not define \strong, \code, itleref, \crossref ... but only 300 | # \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added 301 | # packages. 302 | # 303 | # latex_keep_old_macro_names = True 304 | 305 | # If false, no module index is generated. 306 | # 307 | # latex_domain_indices = True 308 | 309 | 310 | # -- Options for manual page output --------------------------------------- 311 | 312 | # One entry per manual page. List of tuples 313 | # (source start file, name, description, authors, manual section). 314 | man_pages = [ 315 | (master_doc, 'mlinsights', 'ML Insights Documentation', 316 | [author], 1) 317 | ] 318 | 319 | # If true, show URL addresses after external links. 320 | # 321 | # man_show_urls = False 322 | 323 | # mock imports 324 | autodoc_mock_imports = ["sklearn"] 325 | 326 | 327 | # -- Options for Texinfo output ------------------------------------------- 328 | 329 | # Grouping the document tree into Texinfo files. List of tuples 330 | # (source start file, target name, title, author, 331 | # dir menu entry, description, category) 332 | texinfo_documents = [ 333 | (master_doc, 'MLInsights', 'ML Insights Documentation', 334 | author, 'MLInsights', 'One line description of project.', 335 | 'Miscellaneous'), 336 | ] 337 | 338 | # Documents to append as an appendix to all manuals. 339 | # 340 | # texinfo_appendices = [] 341 | 342 | # If false, no module index is generated. 343 | # 344 | # texinfo_domain_indices = True 345 | 346 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 347 | # 348 | # texinfo_show_urls = 'footnote' 349 | 350 | # If true, do not generate a @detailmenu in the "Top" node's menu. 
351 | # 352 | # texinfo_no_detailmenu = False 353 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ML_Insights documentation! 2 | ========================== 3 | 4 | Welcome to ML_Insights, home to SplineCalib and the ModelXRay. 5 | 6 | This package contains two main capabilities: 7 | 8 | * SplineCalib: Spline-based probability calibration 9 | * ModelXRay: Tool for model interpretability 10 | 11 | .. toctree:: 12 | :maxdepth: 3 13 | 14 | install 15 | calib_overview 16 | interp_overview 17 | 18 | 19 | Indices 20 | ======= 21 | * :ref:`genindex` 22 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ------------ 3 | 4 | .. code-block:: bash 5 | 6 | $ pip install ml_insights 7 | -------------------------------------------------------------------------------- /docs/interp_overview.rst: -------------------------------------------------------------------------------- 1 | Model Interpretation with ModelXRay 2 | =================================== 3 | 4 | ModelXRay is a tool for model interpretability contained in the ML-Insights package. It provides the capability to easily do Individual Conditional Expectation plots (ICE-plots). 5 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. epub3 to make an epub3 31 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 32 | echo. text to make text files 33 | echo. man to make manual pages 34 | echo. texinfo to make Texinfo files 35 | echo. gettext to make PO message catalogs 36 | echo. changes to make an overview over all changed/added/deprecated items 37 | echo. xml to make Docutils-native XML files 38 | echo. pseudoxml to make pseudoxml-XML files for display purposes 39 | echo. linkcheck to check all external links for integrity 40 | echo. doctest to run all doctests embedded in the documentation if enabled 41 | echo. coverage to run coverage check of the documentation if enabled 42 | echo. 
dummy to check syntax errors of document sources 43 | goto end 44 | ) 45 | 46 | if "%1" == "clean" ( 47 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 48 | del /q /s %BUILDDIR%\* 49 | goto end 50 | ) 51 | 52 | 53 | REM Check if sphinx-build is available and fallback to Python version if any 54 | %SPHINXBUILD% 1>NUL 2>NUL 55 | if errorlevel 9009 goto sphinx_python 56 | goto sphinx_ok 57 | 58 | :sphinx_python 59 | 60 | set SPHINXBUILD=python -m sphinx.__init__ 61 | %SPHINXBUILD% 2> nul 62 | if errorlevel 9009 ( 63 | echo. 64 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 65 | echo.installed, then set the SPHINXBUILD environment variable to point 66 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 67 | echo.may add the Sphinx directory to PATH. 68 | echo. 69 | echo.If you don't have Sphinx installed, grab it from 70 | echo.http://sphinx-doc.org/ 71 | exit /b 1 72 | ) 73 | 74 | :sphinx_ok 75 | 76 | 77 | if "%1" == "html" ( 78 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 79 | if errorlevel 1 exit /b 1 80 | echo. 81 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 82 | goto end 83 | ) 84 | 85 | if "%1" == "dirhtml" ( 86 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 87 | if errorlevel 1 exit /b 1 88 | echo. 89 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 90 | goto end 91 | ) 92 | 93 | if "%1" == "singlehtml" ( 94 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 95 | if errorlevel 1 exit /b 1 96 | echo. 97 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 98 | goto end 99 | ) 100 | 101 | if "%1" == "pickle" ( 102 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 103 | if errorlevel 1 exit /b 1 104 | echo. 105 | echo.Build finished; now you can process the pickle files. 106 | goto end 107 | ) 108 | 109 | if "%1" == "json" ( 110 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 111 | if errorlevel 1 exit /b 1 112 | echo. 113 | echo.Build finished; now you can process the JSON files. 114 | goto end 115 | ) 116 | 117 | if "%1" == "htmlhelp" ( 118 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 119 | if errorlevel 1 exit /b 1 120 | echo. 121 | echo.Build finished; now you can run HTML Help Workshop with the ^ 122 | .hhp project file in %BUILDDIR%/htmlhelp. 123 | goto end 124 | ) 125 | 126 | if "%1" == "qthelp" ( 127 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 128 | if errorlevel 1 exit /b 1 129 | echo. 130 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 131 | .qhcp project file in %BUILDDIR%/qthelp, like this: 132 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\MLInsights.qhcp 133 | echo.To view the help file: 134 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\MLInsights.ghc 135 | goto end 136 | ) 137 | 138 | if "%1" == "devhelp" ( 139 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 140 | if errorlevel 1 exit /b 1 141 | echo. 142 | echo.Build finished. 143 | goto end 144 | ) 145 | 146 | if "%1" == "epub" ( 147 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 148 | if errorlevel 1 exit /b 1 149 | echo. 150 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 151 | goto end 152 | ) 153 | 154 | if "%1" == "epub3" ( 155 | %SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3 156 | if errorlevel 1 exit /b 1 157 | echo. 158 | echo.Build finished. The epub3 file is in %BUILDDIR%/epub3. 
159 | goto end 160 | ) 161 | 162 | if "%1" == "latex" ( 163 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 164 | if errorlevel 1 exit /b 1 165 | echo. 166 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdf" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "latexpdfja" ( 181 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 182 | cd %BUILDDIR%/latex 183 | make all-pdf-ja 184 | cd %~dp0 185 | echo. 186 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 187 | goto end 188 | ) 189 | 190 | if "%1" == "text" ( 191 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 192 | if errorlevel 1 exit /b 1 193 | echo. 194 | echo.Build finished. The text files are in %BUILDDIR%/text. 195 | goto end 196 | ) 197 | 198 | if "%1" == "man" ( 199 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 200 | if errorlevel 1 exit /b 1 201 | echo. 202 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 203 | goto end 204 | ) 205 | 206 | if "%1" == "texinfo" ( 207 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 208 | if errorlevel 1 exit /b 1 209 | echo. 210 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 211 | goto end 212 | ) 213 | 214 | if "%1" == "gettext" ( 215 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 216 | if errorlevel 1 exit /b 1 217 | echo. 218 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 219 | goto end 220 | ) 221 | 222 | if "%1" == "changes" ( 223 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 224 | if errorlevel 1 exit /b 1 225 | echo. 226 | echo.The overview file is in %BUILDDIR%/changes. 227 | goto end 228 | ) 229 | 230 | if "%1" == "linkcheck" ( 231 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 232 | if errorlevel 1 exit /b 1 233 | echo. 234 | echo.Link check complete; look for any errors in the above output ^ 235 | or in %BUILDDIR%/linkcheck/output.txt. 236 | goto end 237 | ) 238 | 239 | if "%1" == "doctest" ( 240 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 241 | if errorlevel 1 exit /b 1 242 | echo. 243 | echo.Testing of doctests in the sources finished, look at the ^ 244 | results in %BUILDDIR%/doctest/output.txt. 245 | goto end 246 | ) 247 | 248 | if "%1" == "coverage" ( 249 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 250 | if errorlevel 1 exit /b 1 251 | echo. 252 | echo.Testing of coverage in the sources finished, look at the ^ 253 | results in %BUILDDIR%/coverage/python.txt. 254 | goto end 255 | ) 256 | 257 | if "%1" == "xml" ( 258 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 259 | if errorlevel 1 exit /b 1 260 | echo. 261 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 262 | goto end 263 | ) 264 | 265 | if "%1" == "pseudoxml" ( 266 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 267 | if errorlevel 1 exit /b 1 268 | echo. 269 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 270 | goto end 271 | ) 272 | 273 | if "%1" == "dummy" ( 274 | %SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy 275 | if errorlevel 1 exit /b 1 276 | echo. 277 | echo.Build finished. Dummy builder generates no files. 
278 | goto end 279 | ) 280 | 281 | :end 282 | -------------------------------------------------------------------------------- /docs/rtfd-requirements.txt: -------------------------------------------------------------------------------- 1 | ml_insights -------------------------------------------------------------------------------- /docs/splinecalib_class.rst: -------------------------------------------------------------------------------- 1 | SplineCalib Class 2 | ----------------- 3 | 4 | .. autoclass:: ml_insights.SplineCalib 5 | :members: fit, calibrate 6 | -------------------------------------------------------------------------------- /docs/splinecalib_examples.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ======== 3 | 4 | The best way to learn about SplineCalib is to work through some examples. We have provided a few below. 5 | 6 | #. `SplineCalib_Tutorial`_: The most basic introduction to calibration and the SplineCalib class. 7 | #. `SplineCalib_Details`_: A deeper dive into the various settings and parameters of the SplineCalib class. 8 | #. `SplineCalib_Multiclass_MNIST`_: A multiclass calibration example using the MNIST digit data. 9 | 10 | 11 | .. _SplineCalib_Tutorial: https://github.com/numeristical/introspective/tree/master/examples/SplineCalib_Tutorial.ipynb 12 | .. _SplineCalib_Details: https://github.com/numeristical/introspective/tree/master/examples/SplineCalib_Details.ipynb 13 | .. _SplineCalib_Multiclass_MNIST: https://github.com/numeristical/introspective/tree/master/examples/SplineCalib_Multiclass_MNIST.ipynb 14 | -------------------------------------------------------------------------------- /extra/code/discrete_dt.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=True 2 | 3 | """Decision Tree based on Discrete Graph structure""" 4 | import numpy as np 5 | import pandas as pd 6 | import random 7 | cimport numpy as cnp 8 | from libc.math cimport log as clog 9 | from graphs import * 10 | 11 | 12 | class DiscreteGraphDecisionTree(object): 13 | """This class represents a tree built on categorical features, each of which contains 14 | a graph to represent the associated terrain. Splits will be tried according to the 15 | *maximally coarse partitions* returned from the graph class. 16 | 17 | feature_graphs: a dictionary which maps the column names to a graph_undirected object. 18 | The graph_undirected must contain vertices for every possible value of that column 19 | If the graph contains no edges, it will be treated as one-hot encoded. 20 | 21 | loss_fn: Currently there are three options: 22 | 'entropy': will use the information gain to choose the best split (target nust be [0,1]) 23 | 'mse': will use (minimum) mean squared error to choose the best split (target must be numeric) 24 | 'gh': This uses the XGBoost method where the first derivative (g) and second derivative (h) of the 25 | custom loss function must be provided. In this case, the 'g' values should be passed as y_train 26 | and the 'h' values passed as y_train_2 27 | 28 | min_size_split: The size, below which, the tree will not consider splitting further. Default is 2. 29 | 30 | min_leaf_size: The minimum permitted size of a split. Splits will not be considered if they result 31 | in a leaf smaller than min_leaf_size 32 | 33 | max_depth: The maximum depth permitted for the tree. Setting to 1 means creating 'stumps' (a single split). 
34 | 35 | gamma: The minimum improvement required to execute a split (for regularization purposes). 36 | If the improvement of a split does not exceed gamma, then the node will not be split. 37 | 38 | reg_lambda: The L1 shrinkage applied to the coefficients, as in XGBoost. 39 | 40 | node_summary_fn: Given a collection of points at the node, what should be the value of the node. Default is 41 | to take the mean. 42 | 43 | max_splits_to_search: For a feature, what is the maximum number of splits we should search. Categorical features 44 | may have prohibitively many possible splits. If the number exceeds max_splits_to_search, we randomly choose 45 | only max_splits_to_search of them to evaluate. Default is infinity (search all splits) 46 | """ 47 | 48 | def __init__(self, feature_graphs, loss_fn = 'entropy', min_size_split=2, min_leaf_size = 2, max_depth=3, gamma=0, 49 | reg_lambda=1, node_summary_fn = np.mean, max_splits_to_search = np.Inf, msac=13): 50 | self.dec_tree={} 51 | self.dec_tree['feature_graphs'] = feature_graphs 52 | self.num_leafs = 0 53 | self.min_size_split=min_size_split 54 | self.min_leaf_size=min_leaf_size 55 | self.max_depth=max_depth 56 | self.gamma=gamma 57 | self.node_summary_fn=node_summary_fn 58 | self.reg_lambda=reg_lambda 59 | self.max_splits_to_search = max_splits_to_search 60 | self.msac = msac 61 | if loss_fn == 'gh': 62 | self.loss_fn='gh' 63 | self.node_summary_fn=_node_summary_gh 64 | self.split_scorer = _score_data_split_gh 65 | if loss_fn == 'entropy': 66 | self.loss_fn='entropy' 67 | self.split_scorer = _score_data_split_entropy 68 | if loss_fn == 'mse': 69 | self.loss_fn='mse' 70 | self.split_scorer = _score_data_split_mse 71 | 72 | def fit(self, X_train, y_train, y_train_2=None): 73 | # Tree fitting works through a queue of nodes to process (node_to_proc_list) 74 | # The initial node is just the root of the tree 75 | self.node_to_proc_list = [self.dec_tree] 76 | 77 | # Initialize values to what they are at the root of the tree 78 | self.dec_tree['depth']=0 79 | self.dec_tree['mask'] = np.ones(len(y_train)) 80 | self.X_train = X_train 81 | self.y_train = y_train 82 | 83 | # Special handling for 'gh' loss function 84 | if self.loss_fn == 'gh': 85 | self.y_train_2 = y_train_2 86 | self.node_summary_fn = _node_summary_gh 87 | 88 | # Process nodes until none are left to process 89 | while self.node_to_proc_list: 90 | node_to_process = self.node_to_proc_list.pop() 91 | self._process_tree_node(node_to_process) 92 | 93 | def predict(self, X_test): 94 | cdef int i, n=X_test.shape[0] 95 | cdef dict data_row_dict, pointer, col_to_int_dict 96 | cdef frozenset left_set 97 | 98 | col_list = list(X_test.columns) 99 | data_np = X_test.values 100 | col_to_int_dict = {col_list[i]:i for i in range(len(col_list))} 101 | 102 | # Initialize the output vector to all zeros 103 | out_vec = np.zeros(X_test.shape[0]) 104 | 105 | # This iterates through each data point in test set and follows the tree until it 106 | # reaches a leaf node 107 | for i in range(n): 108 | # Put the relevant values for current test point into a dict for quick lookup 109 | data_row_dict = {colname:data_np[i,col_to_int_dict[colname]] for colname in col_list} 110 | pointer = self.dec_tree 111 | while pointer['node_type']=='interior': 112 | curr_element = data_row_dict[pointer['split_feature']] 113 | left_set = pointer['left_split'] 114 | if curr_element in left_set: 115 | pointer = pointer['left_child'] 116 | else: 117 | pointer = pointer['right_child'] 118 | out_vec[i] = pointer['node_summary_val'] 119 | 
return(out_vec) 120 | 121 | def _process_tree_node(self, curr_node): 122 | # Restrict to relevant data for the node in question 123 | X_train_node = self.X_train[curr_node['mask']>0] 124 | 125 | # Get the associated y-values (or g,h values) 126 | # and save information about the current node 127 | if self.loss_fn != 'gh': 128 | y_train_node = self.y_train[curr_node['mask']>0] 129 | curr_node['node_summary_val'] = self.node_summary_fn(y_train_node) 130 | curr_node['num_data_points'] = len(y_train_node) 131 | else: 132 | y_train_g = self.y_train[curr_node['mask']>0] 133 | y_train_h = self.y_train_2[curr_node['mask']>0] 134 | curr_node['node_summary_val'] = _node_summary_gh(y_train_g, y_train_h, self.gamma) 135 | curr_node['num_data_points'] = len(y_train_g) 136 | g_sum = np.sum(y_train_g) 137 | h_sum = np.sum(y_train_h) 138 | 139 | # If we are guaranteed not to split this node further, then mark it as such and move on 140 | if (curr_node['num_data_points']=self.max_depth): 141 | if self.loss_fn != 'gh': 142 | self._wrap_up_node(curr_node, y_train_node) 143 | else: 144 | self._wrap_up_node(curr_node, y_train_g, y_train_h) 145 | return None 146 | 147 | # Determine which features are still "eligible" to be considered 148 | features_to_search = _get_features_to_search(X_train_node, curr_node['feature_graphs']) 149 | 150 | # If no features are eligible (e.g. all x-values are identical in all features) 151 | # Then we similarly summarize the node and move on 152 | if features_to_search==[]: 153 | if self.loss_fn != 'gh': 154 | self._wrap_up_node(curr_node, y_train_node) 155 | else: 156 | self._wrap_up_node(curr_node, y_train_g, y_train_h) 157 | return None 158 | 159 | # best_split_dict holds all the necessary info about a potential split 160 | best_split_dict = _initialize_best_split_dict() 161 | 162 | # Main loop over features to find best split 163 | for feature in features_to_search: 164 | feature_graph = curr_node['feature_graphs'][feature] 165 | if len(feature_graph.edges)==0: # This means to treat the feature as one-hot encoded 166 | possible_splits = [] 167 | vert_list = list(feature_graph.vertices) 168 | # Make a list of splits that are one feature vs the rest (as in one-hot-encoding) 169 | for i in range(len(vert_list)): 170 | tfset = frozenset(vert_list[i:i+1]) 171 | possible_splits.append(frozenset([tfset,frozenset(vert_list)-tfset])) 172 | #print(possible_splits) 173 | index_range = range(len(possible_splits)) 174 | else: 175 | # Query the graph structure to get the possible splits 176 | if (len(feature_graph.mc_partitions)>0): 177 | possible_splits = feature_graph.return_mc_partitions() 178 | else: 179 | possible_splits = feature_graph.return_contracted_partitions(max_size_after_contraction=self.msac) 180 | #print('# possible splits = {}'.format(len(possible_splits))) 181 | #possible_splits = feature_graph.return_mc_partitions() 182 | if (len(possible_splits)>self.max_splits_to_search): 183 | # Randomly choose (with replacement) a subset of possible splits 184 | index_range = np.random.randint(0,len(possible_splits),self.max_splits_to_search) 185 | #print('index_range_len={} msts={}'.format(len(index_range),self.max_splits_to_search)) 186 | else: 187 | index_range = range(len(possible_splits)) 188 | 189 | curr_feature_vec = X_train_node[feature].values 190 | 191 | # Loop within values of each feature 192 | for index in index_range: 193 | curr_partition = list(possible_splits[index]) 194 | left_split = curr_partition[0] 195 | if self.loss_fn != 'gh': 196 | curr_split_dict = 
_eval_curr_split_dict(curr_feature_vec, y_train_node, curr_node['feature_graphs'], 197 | feature, left_split, self.split_scorer, self.min_leaf_size, self.gamma) 198 | else: 199 | curr_split_dict = _eval_curr_split_dict(curr_feature_vec, y_train_g, curr_node['feature_graphs'], 200 | feature, left_split, self.split_scorer, self.min_leaf_size, self.gamma, 201 | y_train_2_node = y_train_h, is_gh=True, g_sum=g_sum, h_sum=h_sum) 202 | 203 | best_split_dict = _compare_curr_to_best(curr_split_dict, best_split_dict) 204 | 205 | 206 | if best_split_dict['best_loss_score'] < np.inf: 207 | # Execute the split 208 | left_mask = self.X_train[best_split_dict['best_split_feature']].isin(best_split_dict['best_left_split']).values 209 | right_mask = self.X_train[best_split_dict['best_split_feature']].isin(best_split_dict['best_right_split']).values 210 | self.perform_split_on_node(curr_node, best_split_dict, curr_node['feature_graphs'], left_mask, right_mask) 211 | else: 212 | if self.loss_fn != 'gh': 213 | self._wrap_up_node(curr_node, y_train_node) 214 | else: 215 | self._wrap_up_node(curr_node, y_train_g, y_train_h) 216 | 217 | return None 218 | 219 | def _wrap_up_node(self, curr_node, y_train_node, y_train_2_node=None): 220 | # Compute summary stats of node and mark it as a leaf 221 | if self.loss_fn!='gh': 222 | curr_node['node_summary_val'] = self.node_summary_fn(y_train_node) 223 | else: 224 | curr_node['node_summary_val'] = _node_summary_gh(y_train_node, y_train_2_node, self.reg_lambda) 225 | curr_node['num_data_points'] = len(y_train_node) 226 | curr_node['node_type'] = 'leaf' 227 | self.num_leafs+=1 228 | curr_node.pop('mask') 229 | 230 | def perform_split_on_node(self, curr_node, best_split_dict, feature_graphs_node, left_mask, right_mask): 231 | # record info about current node 232 | curr_node['left_split'] = best_split_dict['best_left_split'] 233 | curr_node['right_split'] = best_split_dict['best_right_split'] 234 | curr_node['loss_score'] = best_split_dict['best_loss_score'] 235 | curr_node['split_feature'] = best_split_dict['best_split_feature'] 236 | curr_node['node_type'] = 'interior' 237 | curr_mask = curr_node.pop('mask') 238 | 239 | # Create feature graphs for children 240 | feature_graphs_left = feature_graphs_node.copy() 241 | feature_graphs_left[curr_node['split_feature']] = get_induced_subgraph(feature_graphs_left[curr_node['split_feature']], 242 | curr_node['left_split']) 243 | feature_graphs_right = feature_graphs_node.copy() 244 | feature_graphs_right[curr_node['split_feature']] = get_induced_subgraph(feature_graphs_right[curr_node['split_feature']], 245 | curr_node['right_split']) 246 | # Create left and right children 247 | curr_node['left_child'] = {} 248 | curr_node['left_child']['depth'] = curr_node['depth'] + 1 249 | curr_node['left_child']['mask'] = curr_mask * left_mask 250 | curr_node['left_child']['feature_graphs'] = feature_graphs_left 251 | 252 | curr_node['right_child'] = {} 253 | curr_node['right_child']['depth'] = curr_node['depth'] + 1 254 | curr_node['right_child']['mask'] = curr_mask * right_mask 255 | curr_node['right_child']['feature_graphs'] = feature_graphs_right 256 | 257 | # Add left and right children to queue 258 | self.node_to_proc_list.append(curr_node['left_child']) 259 | self.node_to_proc_list.append(curr_node['right_child']) 260 | 261 | 262 | def _get_features_to_search(X_train_node, feature_graphs_node): 263 | num_distinct_values = {} 264 | for feature,graph in feature_graphs_node.items(): 265 | num_distinct_values[feature] = 
len(np.unique(X_train_node[feature])) 266 | 267 | ## Remove features from consideration if they only have <=1 distinct values in the current data 268 | features_to_search = [feature for feature in X_train_node.columns if num_distinct_values[feature]>1] 269 | return(features_to_search) 270 | 271 | 272 | def _initialize_best_split_dict(): 273 | out_dict = {} 274 | out_dict['best_loss_score'] = np.inf 275 | out_dict['best_left_split'] = None 276 | out_dict['best_right_split'] = None 277 | out_dict['best_split_feature'] = None 278 | return(out_dict) 279 | 280 | def _eval_curr_split_dict(curr_feature_vec, y_train_node, feature_graphs_node, feature, frozenset left_split, split_scorer, min_leaf_size, gamma, 281 | y_train_2_node=None, is_gh=False, g_sum=0, h_sum=0): 282 | cdef dict out_dict 283 | cdef frozenset temp_set 284 | cdef list temp_list 285 | 286 | out_dict = {} 287 | out_dict['left_split'] = left_split 288 | out_dict['feature'] = feature 289 | out_dict['right_split'] = frozenset(feature_graphs_node[feature].vertices - left_split) 290 | temp_set = out_dict['left_split'] 291 | temp_list = [x in temp_set for x in curr_feature_vec] 292 | out_dict['mask_left'] = np.array(temp_list) 293 | out_dict['mask_right'] = np.logical_not(out_dict['mask_left']) 294 | if is_gh==False: 295 | out_dict['loss_score'] = split_scorer(out_dict['mask_left'], out_dict['mask_right'], y_train_node.values, min_leaf_size, gamma) 296 | else: 297 | out_dict['loss_score'] = split_scorer(out_dict['mask_left'], out_dict['mask_right'], y_train_node.values, y_train_2_node.values, 298 | min_leaf_size, gamma, g_sum, h_sum) 299 | 300 | return(out_dict) 301 | 302 | def _compare_curr_to_best(curr_split_dict, best_split_dict): 303 | if (curr_split_dict['loss_score'] < best_split_dict['best_loss_score']): 304 | best_split_dict['best_loss_score'] = curr_split_dict['loss_score'] 305 | best_split_dict['best_split_feature'] = curr_split_dict['feature'] 306 | best_split_dict['best_left_split'] = curr_split_dict['left_split'] 307 | best_split_dict['best_right_split'] = curr_split_dict['right_split'] 308 | return(best_split_dict) 309 | 310 | def root_mean_squared_error(vec1, vec2): 311 | return np.sqrt(np.mean((vec1-vec2)**2)) 312 | 313 | 314 | def _score_data_split_mse(mask_left, mask_right, outcome_vec, min_leaf_size, gamma, eps=.0001): 315 | 316 | cdef double mean_left, mean_right,mean_overall,loss_score, n1, n2 317 | 318 | n1 = np.sum(mask_left) 319 | n2 = np.sum(mask_right) 320 | if np.minimum(n1, n2)=0: 332 | loss_score = np.inf 333 | return loss_score 334 | 335 | 336 | 337 | def _score_data_split_entropy(mask_left, mask_right, outcome_vec, min_leaf_size, gamma, eps=.0001): 338 | 339 | cdef double m1,n1,m2,n2,num1,num1a,num2,num2a,lik_rat,loss_score 340 | 341 | m1 = np.sum(outcome_vec[mask_left])+eps 342 | n1 = np.sum(mask_left)+eps 343 | m2 = np.sum(outcome_vec[mask_right])+eps 344 | n2 = np.sum(mask_right)+eps 345 | if np.minimum(n1, n2)=0: 350 | loss_score = np.inf 351 | return loss_score 352 | 353 | cdef double get_lik_rat(double m1, double n1, double m2, double n2, eps): 354 | cdef double num1, num2 355 | num1 = m1*clog(((m1/n1)/((m1+m2)/(n1+n2)))+eps) + (n1-m1+eps) * clog((((n1-m1)/n1)/((((n1+n2)-(m1+m2)))/(n1+n2)))) 356 | num2 = m2*clog(((m2/n2)/((m1+m2)/(n1+n2)))) + (n2-m2+eps) * clog((((n2-m2)/n2)/((((n1+n2)-(m1+m2)))/(n1+n2)))) 357 | return num1+num2 358 | 359 | def _score_data_split_gh(mask_left, mask_right, outcome_vec_g, outcome_vec_h, min_leaf_size, gamma, g_sum, h_sum): 360 | cdef double loss_score, g_left, g_right, 
h_left, h_right, n_left, n_right, vec_len 361 | 362 | vec_len = len(outcome_vec_g) 363 | g_left = np.sum(outcome_vec_g[mask_left]) 364 | g_right = g_sum - g_left 365 | #g_right = np.sum(outcome_vec_g[mask_right]) 366 | h_left = np.sum(outcome_vec_h[mask_left]) 367 | h_right = h_sum - h_left 368 | #h_right = np.sum(outcome_vec_h[mask_right]) 369 | n_left = np.sum(mask_left) 370 | n_right = vec_len - n_left 371 | #n_right = np.sum(mask_right) 372 | if np.minimum(n_left, n_right)=0: 376 | loss_score = np.inf 377 | return loss_score 378 | 379 | cdef double _get_gh_score(double g_left, double g_right, double h_left, double h_right, double gamma): 380 | return(.5*( ((g_left**2)/(h_left+gamma)) + ((g_right**2)/(h_right+gamma)) - (((g_left + g_right)**2)/(h_left + h_right+gamma)))-gamma) 381 | 382 | def _node_summary_gh(y_vec_g, y_vec_h, reg_lambda): 383 | out_val = -np.sum(y_vec_g)/(np.sum(y_vec_h)+reg_lambda) 384 | return(out_val) 385 | -------------------------------------------------------------------------------- /extra/code/discrete_gb.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=True 2 | 3 | """Decision Tree Gradient Boosting based on Discrete Graph structure""" 4 | import numpy as np 5 | import pandas as pd 6 | cimport numpy as cnp 7 | from libc.math cimport log as clog 8 | from discrete_dt import * 9 | from graphs import * 10 | from sklearn.metrics import log_loss, mean_squared_error 11 | 12 | 13 | class DiscreteGraphGB(object): 14 | 15 | def __init__(self, num_trees, feature_graphs, mode='classification', loss_fn = 'entropy', min_size_split=2, min_leaf_size = 1, max_depth=3, gamma=0, 16 | reg_lambda=1, node_summary_fn = np.mean, learning_rate=.1, max_splits_to_search=np.Inf, msac=100): 17 | self.num_trees = num_trees 18 | self.num_trees_for_prediction = num_trees 19 | self.dec_tree_list = [] 20 | self.feature_graphs = feature_graphs 21 | self.min_size_split=min_size_split 22 | self.min_leaf_size=min_leaf_size 23 | self.max_depth=max_depth 24 | self.gamma=gamma 25 | self.node_summary_fn=node_summary_fn 26 | self.learning_rate = learning_rate 27 | self.loss_fn = loss_fn 28 | self.max_splits_to_search = max_splits_to_search 29 | self.msac = msac 30 | self.mode = mode 31 | if loss_fn == 'entropy': 32 | self.loss_fn_der_1 = _entropy_link_der_1 33 | self.loss_fn_der_2 = _entropy_link_der_2 34 | if loss_fn == 'mse': 35 | self.loss_fn_der_1 = _mse_der_1 36 | self.loss_fn_der_2 = _mse_der_2 37 | # if features=='auto': 38 | # self.features=list(self.dec_tree['feature_graphs'].keys()) 39 | 40 | def fit(self, X_train, y_train, eval_set = None, eval_freq=10, 41 | early_stop_past_steps=0, choose_best_eval=True): 42 | # cdef int i, n =self.num_trees 43 | self.eval_freq=eval_freq 44 | eval_len = np.floor(self.num_trees/self.eval_freq).astype(int) 45 | self.eval_results = np.zeros(eval_len) 46 | n =self.num_trees 47 | self.initial_pred = np.mean(y_train) 48 | stop_now=False 49 | if eval_set is not None: 50 | X_valid = eval_set[0] 51 | y_valid = eval_set[1] 52 | for i in range(n): 53 | 54 | # Get predictions of current model 55 | if i==0: 56 | curr_answer = self.initial_pred * np.ones(len(y_train)) 57 | if eval_set is not None: 58 | curr_test_answer = self.initial_pred * np.ones(len(y_valid)) 59 | if self.mode == 'classification': 60 | curr_loss= log_loss(y_valid, 1/(1+np.exp(-curr_test_answer))) 61 | print("i=0, test_set_log_loss = {}".format(curr_loss)) 62 | else: 63 | curr_loss= mean_squared_error(y_valid, curr_test_answer) 64 | print("i=0. 
test_set_mse = {}".format(curr_loss)) 65 | 66 | else: 67 | curr_answer = curr_answer + self.learning_rate * self.dec_tree_list[i-1].predict(X_train) 68 | if eval_set is not None: 69 | curr_test_answer = curr_test_answer + self.learning_rate * self.dec_tree_list[i-1].predict(X_valid) 70 | if ((i+1)%self.eval_freq==1): 71 | if self.mode == 'classification': 72 | curr_loss= log_loss(y_valid, 1/(1+np.exp(-curr_test_answer))) 73 | print("i={}, test_set_log_loss = {}".format(i,curr_loss)) 74 | else: 75 | curr_loss= mean_squared_error(y_valid, curr_test_answer) 76 | print("i={}, test_set_mse = {}".format(i,curr_loss)) 77 | 78 | curr_step=np.floor((i+1)/self.eval_freq).astype(int) -1 79 | self.eval_results[curr_step]=curr_loss 80 | if curr_step>early_stop_past_steps: 81 | compare_loss = np.min(self.eval_results[:curr_step-early_stop_past_steps+1]) 82 | if (curr_loss>compare_loss): 83 | stop_now=True 84 | print("Stopping early: curr_loss of {} exceeds compare_loss of {}".format(curr_loss, compare_loss)) 85 | if stop_now: 86 | if choose_best_eval: 87 | self.num_trees_for_prediction = (np.argmin(self.eval_results[:curr_step+1])+1)*eval_freq 88 | break 89 | 90 | # Get first and second derivatives 91 | y_g_vec = self.loss_fn_der_1(y_train, curr_answer) 92 | y_h_vec = self.loss_fn_der_2(y_train, curr_answer) 93 | 94 | 95 | # Sample the data to use for this tree 96 | 97 | num_rows = X_train.shape[0] 98 | rows_to_use = np.random.choice(range(num_rows), num_rows, replace=True) 99 | if type(X_train)==pd.DataFrame: 100 | X_train_to_use = X_train.iloc[rows_to_use] 101 | elif type(X_train)==np.ndarray: 102 | X_train_to_use = X_train[rows_to_use] 103 | else: 104 | print('unknown format for X_train') 105 | #y_original_train_to_use = y_train.sample(X_train.shape[0], random_state=rs, replace=True) 106 | if type(y_g_vec)==pd.Series: 107 | y_g_to_use = y_g_vec.iloc[rows_to_use] 108 | elif type(y_g_vec)==np.ndarray: 109 | y_g_to_use = y_g_vec[rows_to_use] 110 | else: 111 | print('unknown format for y_g_vec') 112 | 113 | if type(y_h_vec)==pd.Series: 114 | y_h_to_use = y_h_vec.iloc[rows_to_use] 115 | elif type(y_h_vec)==np.ndarray: 116 | y_h_to_use = y_h_vec[rows_to_use] 117 | else: 118 | print('unknown format for y_h_vec') 119 | 120 | self.dec_tree_list.append(DiscreteGraphDecisionTree(feature_graphs=self.feature_graphs,loss_fn = 'gh', 121 | min_size_split = self.min_size_split, min_leaf_size=self.min_leaf_size, 122 | gamma=self.gamma, max_depth=self.max_depth, 123 | node_summary_fn = self.node_summary_fn, 124 | max_splits_to_search = self.max_splits_to_search, msac=self.msac)) 125 | self.dec_tree_list[i].fit(X_train_to_use, y_g_to_use, y_h_to_use) 126 | 127 | 128 | def predict(self, X_test, num_trees_to_use=0): 129 | cdef int i 130 | if num_trees_to_use==0: 131 | num_trees_to_use=self.num_trees_for_prediction 132 | out_vec = self.initial_pred*np.ones(X_test.shape[0]) 133 | for i in range(num_trees_to_use): 134 | out_vec = out_vec + self.learning_rate * self.dec_tree_list[i].predict(X_test) 135 | if self.mode=='classification': 136 | return(1/(1+np.exp(-out_vec))) 137 | else: 138 | return(out_vec) 139 | 140 | def _entropy_der_1(y_true, y_pred, eps=1e-15): 141 | y_pred = np.maximum(y_pred, eps) 142 | y_pred = np.minimum(y_pred, 1-eps) 143 | return((-(y_true/y_pred) + (1-y_true)/(1-y_pred))) 144 | 145 | def _entropy_der_2(y_true, y_pred, eps=1e-15): 146 | y_pred = np.maximum(y_pred, eps) 147 | y_pred = np.minimum(y_pred, 1-eps) 148 | out_vec = (y_true)/(y_pred**2) + ((1-y_true)/((1-y_pred)**2)) 149 | return(out_vec) 150 
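# Illustrative sketch, not part of the original module: for loss_fn='entropy' the
# boosting loop above feeds each tree the gradient and hessian of the log-loss taken
# with respect to the raw score z, via _entropy_link_der_1 and _entropy_link_der_2
# defined below.  With p = 1/(1+exp(-z)) these reduce to the familiar forms
# g = p - y and h = p*(1-p).  The helper below is hypothetical (nothing in this file
# calls it); it only verifies that identity numerically and assumes numpy is
# available as np, as imported at the top of this file.
def _check_entropy_link_derivatives():
    y = np.array([0., 1., 1., 0.])
    z = np.array([-2., -0.5, 1.0, 3.0])
    p = 1.0 / (1.0 + np.exp(-z))
    # gradient of log-loss w.r.t. the raw score z is p - y
    assert np.allclose(_entropy_link_der_1(y, z), p - y)
    # hessian of log-loss w.r.t. the raw score z is p*(1-p)
    assert np.allclose(_entropy_link_der_2(y, z), p * (1 - p))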
| 151 | def _mse_der_1(y_true, y_pred, eps=1e-15): 152 | return(2*(y_pred-y_true)) 153 | 154 | def _mse_der_2(y_true, y_pred, eps=1e-15): 155 | return(pd.Series(2*np.ones(len(y_pred)))) 156 | 157 | def _entropy_link_der_1(y_true, z_pred, eps=1e-15): 158 | return(-y_true*(1/(1+np.exp(z_pred))) + (1-y_true) * (1/(1+np.exp(-z_pred))) ) 159 | 160 | def _entropy_link_der_2(y_true, z_pred, eps=1e-15): 161 | return(y_true*(np.exp(z_pred)/((1+np.exp(z_pred))**2)) + (1-y_true) * (np.exp(-z_pred)/((1+np.exp(-z_pred))**2)) ) 162 | 163 | -------------------------------------------------------------------------------- /extra/code/graphs.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=True 2 | 3 | 4 | import numpy as np 5 | import json 6 | 7 | class graph_undirected(object): 8 | """This is a class to handle undirected graphs. Still very much a work in progress. 9 | Defines a graph by a set of vertices and a set of "frozensets" representing the edges. 10 | """ 11 | 12 | def __init__(self, edges, vertices=set()): 13 | 14 | if vertices == set(): 15 | self.vertices = set([x for sublist in list(edges) for x in list(sublist) ]) 16 | else: 17 | self.vertices = set(vertices) 18 | new_edges = set() 19 | for edge in edges: 20 | new_edge = frozenset(edge) 21 | if len(new_edge)>1: 22 | new_edges.add(new_edge) 23 | self.edges = set(new_edges) 24 | self.mc_partitions = [] 25 | self.mc_partitions_max_size = 0 26 | self.all_connected_sets = [] 27 | self.vertex_to_neighbors_dict = {} 28 | 29 | def adjacent_edges(self, target_vertex): 30 | return set([x for x in self.edges if target_vertex in x]) 31 | 32 | def adjacent_vertices(self, target_vertex): 33 | if target_vertex in self.vertex_to_neighbors_dict.keys(): 34 | return self.vertex_to_neighbors_dict[target_vertex] 35 | else: 36 | neighbors_and_self = set([x for sublist in self.adjacent_edges(target_vertex) for x in sublist]) 37 | out_set = set(neighbors_and_self)-set([target_vertex]) 38 | self.vertex_to_neighbors_dict[target_vertex] = out_set 39 | return out_set 40 | 41 | def adjacent_vertices_to_set(self, target_vertex_set): 42 | templist = [list(self.adjacent_vertices(x)) for x in target_vertex_set] 43 | neighbors_and_self = [x for sublist in templist for x in sublist] 44 | return set(neighbors_and_self)-target_vertex_set 45 | 46 | def vertex_degree(self, target_vertex): 47 | return len(self.adjacent_vertices(target_vertex)) 48 | 49 | def contract_edge(self, edge, sep_str='_'): 50 | return contract_edge(self, edge, sep_str) 51 | 52 | def delete_vertex(self, vertex): 53 | return delete_vertex(self, vertex) 54 | 55 | def delete_vertices(self, vertex_set): 56 | return delete_vertices(self, vertex_set) 57 | 58 | def get_induced_subgraph(self, vertex_set): 59 | return get_induced_subgraph(self, vertex_set) 60 | 61 | def return_mc_partitions(self): 62 | if self.mc_partitions==[]: 63 | self.enumerate_mc_partitions() 64 | return(self.mc_partitions) 65 | 66 | 67 | def enumerate_mc_partitions(self, max_size=0, verbose=False): 68 | """This method will examine every connected set S of size up to max_size and 69 | determine whether or not the complement of the set is also connected. 
If the 70 | complement is also connected, then the partition {S, S^C} is added to the list 71 | self.mc_partitions""" 72 | 73 | # Default behavior is to find all maximally coarse partitions which 74 | # requires searching components up to size floor(n_vertices/2) 75 | if max_size==0: 76 | max_size=int(np.floor(len(self.vertices)/2)) 77 | 78 | # Initialize some variables 79 | # The two lists below are sets of sets by size. 80 | # i.e. conn_sets_with_conn_complements_by_size[5] will be a set that contains 81 | # the connected sets of size 5 whose complements are also connected 82 | conn_sets_with_conn_complements_by_size = [] 83 | conn_sets_with_disconn_complements_by_size = [] 84 | 85 | # These two contain the sizes of each entry in the above lists 86 | num_conn_sets_with_conn_complements_list = [] 87 | num_conn_sets_with_disconn_complements_list = [] 88 | 89 | # Initialize the list with an empty set 90 | conn_sets_with_conn_complements_by_size.append(set()) 91 | conn_sets_with_disconn_complements_by_size.append(set()) 92 | 93 | 94 | # Corner case handling 95 | if(len(self.vertices)<=1): 96 | self.mc_partitions = [] 97 | return [] 98 | if(len(self.vertices)==2): 99 | vert_list = list(self.vertices) 100 | set1 = set() 101 | set2 = set() 102 | set1.add(vert_list[0]) 103 | set2.add(vert_list[1]) 104 | self.mc_partitions = [frozenset([frozenset(set1),frozenset(set2)])] 105 | self.max_size = 1 106 | return None 107 | 108 | # The connected components of size 1 are exactly the vertices 109 | if verbose: 110 | print('Evaluating connected sets of size 1') 111 | for vert in self.vertices: 112 | if is_connected(delete_vertex(self, vert)): 113 | conn_sets_with_conn_complements_by_size[0].add(frozenset({vert})) 114 | else: 115 | conn_sets_with_disconn_complements_by_size[0].add(frozenset({vert})) 116 | num_conn_sets_with_conn_complements_list.append(len(conn_sets_with_conn_complements_by_size[0])) 117 | num_conn_sets_with_disconn_complements_list.append(len(conn_sets_with_disconn_complements_by_size[0])) 118 | if verbose: 119 | print('num conn sets of comp_size 1 with connected complements = {}'.format(num_conn_sets_with_conn_complements_list[0])) 120 | print('num conn sets of comp_size 1 with disconnected complements = {}'.format(num_conn_sets_with_disconn_complements_list[0])) 121 | print('Evaluating connected sets of size 2') 122 | conn_sets_with_conn_complements_by_size.append(set()) 123 | conn_sets_with_disconn_complements_by_size.append(set()) 124 | 125 | # The connected components of size 2 are exactly the edges 126 | for edge in self.edges: 127 | if is_connected(delete_vertices(self, edge)): 128 | conn_sets_with_conn_complements_by_size[1].add(edge) 129 | else: 130 | conn_sets_with_disconn_complements_by_size[1].add(edge) 131 | num_conn_sets_with_conn_complements_list.append(len(conn_sets_with_conn_complements_by_size[1])) 132 | num_conn_sets_with_disconn_complements_list.append(len(conn_sets_with_disconn_complements_by_size[1])) 133 | if verbose: 134 | print('num conn sets of comp_size 2 with connected complements = {}'.format(num_conn_sets_with_conn_complements_list[1])) 135 | print('num conn sets of comp_size 2 with disconnected complements = {}'.format(num_conn_sets_with_disconn_complements_list[1])) 136 | print('num conn sets of comp_size <=2 with connected complements = {}'.format(np.sum(num_conn_sets_with_conn_complements_list))) 137 | print('num conn sets of comp_size <=2 with disconnected complements = {}'.format(np.sum(num_conn_sets_with_disconn_complements_list))) 138 | 139 | 140 | 
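# Grow candidate sets incrementally: every connected set of size k is obtained by
# taking a connected set of size k-1 (from either bucket, since connectivity of the
# complement does not matter for growth) and adding one adjacent vertex.  Each new
# candidate is checked once for connectivity of its complement and filed into the
# corresponding bucket for size k.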
for comp_size in range(3, max_size+1): 141 | conn_sets_with_conn_complements_by_size.append(set()) 142 | conn_sets_with_disconn_complements_by_size.append(set()) 143 | 144 | if verbose: 145 | print('Evaluating connected sets of size {}'.format(comp_size)) 146 | base_components = conn_sets_with_conn_complements_by_size[comp_size-2].union(conn_sets_with_disconn_complements_by_size[comp_size-2]) 147 | for base_comp in base_components: 148 | neighbors_to_add = self.adjacent_vertices_to_set(base_comp) 149 | for neighbor in neighbors_to_add: 150 | new_comp = set(base_comp) 151 | new_comp.add(neighbor) 152 | new_comp = frozenset(new_comp) 153 | if ((not new_comp in conn_sets_with_conn_complements_by_size[comp_size-1]) and (not new_comp in conn_sets_with_disconn_complements_by_size[comp_size-1])): 154 | if is_connected(delete_vertices(self,new_comp)): 155 | conn_sets_with_conn_complements_by_size[comp_size-1].add(new_comp) 156 | else: 157 | conn_sets_with_disconn_complements_by_size[comp_size-1].add(new_comp) 158 | num_conn_sets_with_conn_complements_list.append(len(conn_sets_with_conn_complements_by_size[comp_size-1])) 159 | num_conn_sets_with_disconn_complements_list.append(len(conn_sets_with_disconn_complements_by_size[comp_size-1])) 160 | 161 | if verbose: 162 | print('num conn set of comp_size {} with connected complements= {}'.format(comp_size,num_conn_sets_with_conn_complements_list[comp_size-1])) 163 | print('num conn set of comp_size {} with discconnected complements= {}'.format(comp_size,num_conn_sets_with_disconn_complements_list[comp_size-1])) 164 | print('num conn set of comp_size <= {} with connected complements= {}'.format(comp_size, np.sum(num_conn_sets_with_conn_complements_list))) 165 | print('num conn set of comp_size <= {} with disconnected complements= {}'.format(comp_size, np.sum(num_conn_sets_with_disconn_complements_list))) 166 | 167 | self.mc_partitions = list(set([frozenset([conn_set, frozenset(self.vertices - conn_set)]) for templist in conn_sets_with_conn_complements_by_size for conn_set in templist])) 168 | #self.mc_partitions = [[conn_set, self.vertices - conn_set] for conn_set in conn_sets_with_conn_complements] 169 | self.mc_partitions_max_size = max_size 170 | 171 | 172 | def save_partitions_to_file(self, file_name): 173 | list_of_lists = [list(x) for x in self.all_partitions] 174 | with open(file_name, "w") as write_file: 175 | json.dump(list_of_lists, write_file) 176 | 177 | def load_partitions_from_file(self, file_name): 178 | with open(file_name, "r") as read_file: 179 | list_of_lists = json.load(read_file) 180 | self.all_partitions = [frozenset(x) for x in list_of_lists] 181 | 182 | 183 | def return_contracted_partitions(self, max_size_after_contraction = 13): 184 | # if len(self.contracted_partitions)>0: 185 | # return self.contracted_partitions 186 | # else: 187 | new_graph = graph_undirected(self.edges, self.vertices) 188 | while (len(new_graph.vertices)>max_size_after_contraction): 189 | vertex_list = list(new_graph.vertices) 190 | rand_vertex = vertex_list[np.random.randint(len(vertex_list))] 191 | rand_vertex_neighbor_list = list(new_graph.adjacent_vertices(rand_vertex)) 192 | rand_neighbor = rand_vertex_neighbor_list[np.random.randint(len(rand_vertex_neighbor_list))] 193 | new_graph = new_graph.contract_edge([rand_vertex, rand_neighbor],sep_str='_|_') 194 | 195 | new_graph.enumerate_mc_partitions() 196 | self.contracted_partitions = transform_partition_list(new_graph.mc_partitions, sep='_|_') 197 | return self.contracted_partitions 198 | 199 | 200 | 201 
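# Illustrative usage sketch, not part of the original class; the vertex names are
# made up.  For a path graph a-b-c-d the maximally coarse partitions are exactly the
# three splits obtained by cutting a single edge:
#
#     >>> g = graph_undirected({('a', 'b'), ('b', 'c'), ('c', 'd')})
#     >>> parts = g.return_mc_partitions()
#     >>> len(parts)
#     3
#     >>> frozenset([frozenset({'a', 'b'}), frozenset({'c', 'd'})]) in parts
#     True
#
# When the vertex set is too large to enumerate directly, return_contracted_partitions
# repeatedly contracts randomly chosen edges until at most max_size_after_contraction
# (default 13) super-vertices remain, enumerates the partitions of the contracted
# graph, and maps the resulting partitions back to the original vertex names.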
| def enumerate_connected_sets(self, max_size=-1, verbose=False): 202 | if self.all_connected_sets: 203 | return self.all_connected_sets 204 | if(len(self.vertices)==0): 205 | return [] 206 | if((len(self.vertices)>=1) and (len(self.vertices)<=2)): 207 | return [frozenset([list(self.vertices)[0]])] 208 | if max_size==(-1): 209 | max_size=len(self.vertices) 210 | connected_sets = [] 211 | connected_sets.append(set()) 212 | num_connected_sets_list = [] 213 | if verbose: 214 | print('Evaluating components of size 1') 215 | for vert in self.vertices: 216 | connected_sets[0].add(frozenset({vert})) 217 | num_connected_sets_list.append(len(connected_sets[0])) 218 | if verbose: 219 | print('num connected sets of size 1 = {}'.format(num_connected_sets_list[0])) 220 | print('Evaluating components of size 2') 221 | connected_sets.append(set()) 222 | for edge in self.edges: 223 | connected_sets[1].add(edge) 224 | num_connected_sets_list.append(len(connected_sets[1])) 225 | if verbose: 226 | print('num_connected_sets of size 2 = {}'.format(num_connected_sets_list[1])) 227 | print('num_connected_sets of size<=2 is {}'.format(np.sum(num_connected_sets_list))) 228 | 229 | 230 | for comp_size in range(3, max_size+1): 231 | connected_sets.append(set()) 232 | 233 | if verbose: 234 | print('Evaluating components of size {}'.format(comp_size)) 235 | base_components = connected_sets[comp_size-2] 236 | for base_comp in base_components: 237 | neighbors_to_add = self.adjacent_vertices_to_set(base_comp) 238 | for neighbor in neighbors_to_add: 239 | new_comp = set(base_comp) 240 | new_comp.add(neighbor) 241 | new_comp = frozenset(new_comp) 242 | connected_sets[comp_size-1].add(new_comp) 243 | num_connected_sets_list.append(len(connected_sets[comp_size-1])) 244 | # if memory_save: 245 | # good_partitions[comp_size-2]=set() 246 | # failed_partitions[comp_size-2]=set() 247 | 248 | if verbose: 249 | print('num_connected_sets of size {} = {}'.format(comp_size,num_connected_sets_list[comp_size-1])) 250 | print('num_total_connected_sets of size<={} is {}'.format(comp_size, np.sum(num_connected_sets_list))) 251 | 252 | connected_sets = [k for templist in connected_sets for k in templist] 253 | self.all_connected_sets = connected_sets 254 | return connected_sets 255 | 256 | def save_connected_sets_to_file(self, file_name): 257 | list_of_lists = [list(x) for x in self.all_connected_sets] 258 | with open(file_name, "w") as write_file: 259 | json.dump(list_of_lists, write_file) 260 | 261 | def load_connected_sets_from_file(self, file_name): 262 | with open(file_name, "r") as read_file: 263 | list_of_lists = json.load(read_file) 264 | self.all_connected_sets = [frozenset(x) for x in list_of_lists] 265 | 266 | def get_partitions_from_connected_sets(self, verbose=False, verbose_freq=1000): 267 | part_list = [] 268 | conn_set_list = self.all_connected_sets.copy() 269 | conn_set_set = set(self.all_connected_sets) 270 | if verbose: 271 | print('checking {} connected sets'.format(len(conn_set_list))) 272 | for i,conn_set in enumerate(conn_set_list): 273 | if len(conn_set) > (len(self.vertices)/2): 274 | break 275 | complement_set = frozenset(self.vertices - conn_set) 276 | if complement_set in conn_set_set: 277 | part_list.append(conn_set) 278 | conn_set_list.remove(complement_set) 279 | if ((((i+1) % verbose_freq)) ==0): 280 | if verbose: 281 | print('Checked {} sets'.format(i+1)) 282 | print('Found {} partitions'.format(len(part_list))) 283 | self.all_partitions = part_list 284 | 285 | 286 | def contract_edge(graph, edge, 
sep_str='_|_'): 287 | edge_alph = list(edge) 288 | edge_alph.sort() 289 | contracted_vertex = sep_str.join((edge_alph)) 290 | #new_vertices = (set(graph.vertices) - set(edge)).union({contracted_vertex}) 291 | new_edges = [[contracted_vertex if y==edge_alph[0] or y==edge_alph[1] else y for y in this_edge] 292 | if edge_alph[0] in this_edge or edge_alph[1] in this_edge else this_edge for this_edge in graph.edges] 293 | return graph_undirected(new_edges) 294 | 295 | def delete_vertex(graph, vertex): 296 | new_edges = set([edge for edge in graph.edges if vertex not in edge]) 297 | new_vertices = graph.vertices - {vertex} 298 | return graph_undirected(new_edges, new_vertices) 299 | 300 | def delete_vertices(graph, vertex_set): 301 | new_edges = set([edge for edge in graph.edges if not vertex_set.intersection(edge)]) 302 | new_vertices = graph.vertices - vertex_set 303 | return graph_undirected(new_edges, new_vertices) 304 | 305 | def get_induced_subgraph(graph, vertex_set): 306 | vertex_set = set(vertex_set) 307 | new_edges = set([edge for edge in graph.edges if edge <= vertex_set]) 308 | new_vertices = vertex_set 309 | new_graph = graph_undirected(new_edges, new_vertices) 310 | new_graph.all_connected_sets = [x for x in graph.all_connected_sets if new_vertices.issuperset(x)] 311 | return new_graph 312 | 313 | 314 | def is_connected(graph): 315 | initial_vertex = next(iter(graph.vertices)) 316 | visited_vertices = [initial_vertex] 317 | unexplored_vertices = list(graph.adjacent_vertices(initial_vertex)) 318 | while unexplored_vertices: 319 | curr_vertex = unexplored_vertices.pop() 320 | visited_vertices.append(curr_vertex) 321 | new_vertices = graph.adjacent_vertices(curr_vertex) 322 | unexplored_vertices = list(set(unexplored_vertices).union(new_vertices) - set(visited_vertices)) 323 | return len(set(visited_vertices)) == len(set(graph.vertices)) 324 | 325 | def num_connected_comp(graph): 326 | initial_vertex = list(graph.vertices)[0] 327 | visited_vertices = [initial_vertex] 328 | unexplored_vertices = list(graph.adjacent_vertices(initial_vertex)) 329 | while unexplored_vertices: 330 | curr_vertex = unexplored_vertices.pop(0) 331 | visited_vertices.append(curr_vertex) 332 | new_vertices = graph.adjacent_vertices(curr_vertex) 333 | unexplored_vertices = list(set(unexplored_vertices).union(new_vertices) - set(visited_vertices)) 334 | if len(set(visited_vertices)) == len(set(graph.vertices)): 335 | return 1 336 | else: 337 | remainder_vertices = list(set(graph.vertices)-set(visited_vertices)) 338 | remainder_edges = [edge for edge in graph.edges if edge.issubset(set(remainder_vertices))] 339 | return 1 + num_connected_comp(graph_undirected(remainder_edges, remainder_vertices)) 340 | 341 | def connected_comp_list(graph): 342 | initial_vertex = list(graph.vertices)[0] 343 | visited_vertices = [initial_vertex] 344 | unexplored_vertices = list(graph.adjacent_vertices(initial_vertex)) 345 | while unexplored_vertices: 346 | curr_vertex = unexplored_vertices.pop(0) 347 | visited_vertices.append(curr_vertex) 348 | new_vertices = graph.adjacent_vertices(curr_vertex) 349 | unexplored_vertices = list(set(unexplored_vertices).union(new_vertices) - set(visited_vertices)) 350 | if len(set(visited_vertices)) == len(set(graph.vertices)): 351 | return [graph] 352 | else: 353 | cc_vertices = set(visited_vertices) 354 | cc_edges = [edge for edge in graph.edges if edge.issubset(set(visited_vertices))] 355 | cc_graph = graph_undirected(cc_edges, cc_vertices) 356 | remainder_vertices = 
list(set(graph.vertices)-set(visited_vertices)) 357 | remainder_edges = [edge for edge in graph.edges if edge.issubset(set(remainder_vertices))] 358 | return [cc_graph] + connected_comp_list(graph_undirected(remainder_edges, remainder_vertices)) 359 | 360 | def get_all_distances_from_vertex(graph, start_vertex): 361 | vertex_path_dist_dict=set() 362 | vertex_path_dist_dict[start_vertex] = 0 363 | unexplored_vertices = list(graph.adjacent_vertices(start_vertex)) 364 | for vert in unexplored_vertices: 365 | vertex_path_dist_dict[vert]=1 366 | visited_vertices = [start_vertex] 367 | 368 | while unexplored_vertices and (len(vertex_path_dist_dict.keys())=1: 19 | new_edges.add(new_edge) 20 | if add_singletons: 21 | for vertex in self.vertices: 22 | new_edges.add(frozenset([vertex])) 23 | self.edges = set(new_edges) 24 | self.mc_partitions = [] 25 | self.mc_partitions_max_size = 0 26 | self.all_connected_sets = [] 27 | self.partition_dict = {} 28 | 29 | 30 | def adjacent_edges(self, target_vertex): 31 | return set([x for x in self.edges if target_vertex in x]) 32 | 33 | def sets_adjacent(self, frozenset vert_name_set_1, frozenset vert_name_set_2): 34 | return(bool(vert_name_set_1.intersection(vert_name_set_2))) 35 | 36 | 37 | def adjacent_vertices(self, target_vertex): 38 | neighbors_and_self = set([x for sublist in self.adjacent_edges(target_vertex) for x in sublist]) 39 | return set(neighbors_and_self)-set([target_vertex]) 40 | 41 | def adjacent_vertices_to_set(self, target_vertex_set): 42 | templist = [list(self.adjacent_vertices(x)) for x in target_vertex_set] 43 | neighbors_and_self = [x for sublist in templist for x in sublist] 44 | return set(neighbors_and_self)-target_vertex_set 45 | 46 | # def adjacent_vertices_to_set_2(self, target_vertex_set): 47 | # out_set = {x for subset in self.edges if self.sets_adjacent(subset, target_vertex_set) for x in subset} 48 | # return(out_set - set(target_vertex_set)) 49 | 50 | def vertex_degree(self, target_vertex): 51 | return len(self.adjacent_vertices(target_vertex)) 52 | 53 | # def contract_edge(self, edge, sep_str='_'): 54 | # return contract_edge(self, edge, sep_str) 55 | 56 | def delete_vertex(self, vertex): 57 | return delete_vertex(self, vertex) 58 | 59 | def delete_vertices(self, vertex_set): 60 | return delete_vertices(self, vertex_set) 61 | 62 | def get_induced_subgraph(self, vertex_set): 63 | return get_induced_subgraph(self, vertex_set) 64 | 65 | def generate_small_size_partitions(self, max_size=2): 66 | edge_list = list(self.edges) 67 | self.partition_dict[(1,1)] 68 | if (self.num_vertices, 2) not in self.partition_dict.keys(): 69 | self.partition_dict[(self.num_vertices, 2)] = {frozenset([i,j]) for i in edge_list for j in edge_list if (not i.intersection(j) and i.union(j)==self.vertices)} 70 | for curr_size in range(3, max_size): 71 | if (0,curr_size-1) not in self.partition_dict.keys(): 72 | self.partition_dict[(0, curr_size-1)] = {frozenset([i,j]) for i in edge_list for j in edge_list if (not i.intersection(j))} 73 | 74 | # def return_mc_partitions(self): 75 | # if self.mc_partitions==[]: 76 | # self.enumerate_mc_partitions() 77 | # return(self.mc_partitions) 78 | 79 | 80 | # def enumerate_mc_partitions(self, max_size=0, verbose=False): 81 | # """This method will examine every connected set S of size up to max_size and 82 | # determine whether or not the complement of the set is also connected. 
If the 83 | # complement is also connected, then the partition {S, S^C} is added to the list 84 | # self.mc_partitions""" 85 | 86 | # # Default behavior is to find all maximally coarse partitions which 87 | # # requires searching components up to size floor(n_vertices/2) 88 | # if max_size==0: 89 | # max_size=int(np.floor(len(self.vertices)/2)) 90 | 91 | # # Initialize some variables 92 | # # The two lists below are sets of sets by size. 93 | # # i.e. conn_sets_with_conn_complements_by_size[5] will be a set that contains 94 | # # the connected sets of size 5 whose complements are also connected 95 | # conn_sets_with_conn_complements_by_size = [] 96 | # conn_sets_with_disconn_complements_by_size = [] 97 | 98 | # # These two contain the sizes of each entry in the above lists 99 | # num_conn_sets_with_conn_complements_list = [] 100 | # num_conn_sets_with_disconn_complements_list = [] 101 | 102 | # # Initialize the list with an empty set 103 | # conn_sets_with_conn_complements_by_size.append(set()) 104 | # conn_sets_with_disconn_complements_by_size.append(set()) 105 | 106 | 107 | # # Corner case handling 108 | # if(len(self.vertices)==0): 109 | # return [] 110 | # if((len(self.vertices)>=1) and (len(self.vertices)<=2)): 111 | # return [frozenset([list(self.vertices)[0]])] 112 | 113 | # # The connected components of size 1 are exactly the vertices 114 | # if verbose: 115 | # print('Evaluating connected sets of size 1') 116 | # for vert in self.vertices: 117 | # if is_connected(delete_vertex(self, vert)): 118 | # conn_sets_with_conn_complements_by_size[0].add(frozenset({vert})) 119 | # else: 120 | # conn_sets_with_disconn_complements_by_size[0].add(frozenset({vert})) 121 | # num_conn_sets_with_conn_complements_list.append(len(conn_sets_with_conn_complements_by_size[0])) 122 | # num_conn_sets_with_disconn_complements_list.append(len(conn_sets_with_disconn_complements_by_size[0])) 123 | # if verbose: 124 | # print('num conn sets of comp_size 1 with connected complements = {}'.format(num_conn_sets_with_conn_complements_list[0])) 125 | # print('num conn sets of comp_size 1 with disconnected complements = {}'.format(num_conn_sets_with_disconn_complements_list[0])) 126 | # print('Evaluating connected sets of size 2') 127 | # conn_sets_with_conn_complements_by_size.append(set()) 128 | # conn_sets_with_disconn_complements_by_size.append(set()) 129 | 130 | # # The connected components of size 2 are exactly the edges 131 | # for edge in self.edges: 132 | # if is_connected(delete_vertices(self, edge)): 133 | # conn_sets_with_conn_complements_by_size[1].add(edge) 134 | # else: 135 | # conn_sets_with_disconn_complements_by_size[1].add(edge) 136 | # num_conn_sets_with_conn_complements_list.append(len(conn_sets_with_conn_complements_by_size[1])) 137 | # num_conn_sets_with_disconn_complements_list.append(len(conn_sets_with_disconn_complements_by_size[1])) 138 | # if verbose: 139 | # print('num conn sets of comp_size 2 with connected complements = {}'.format(num_conn_sets_with_conn_complements_list[1])) 140 | # print('num conn sets of comp_size 2 with disconnected complements = {}'.format(num_conn_sets_with_disconn_complements_list[1])) 141 | # print('num conn sets of comp_size <=2 with connected complements = {}'.format(np.sum(num_conn_sets_with_conn_complements_list))) 142 | # print('num conn sets of comp_size <=2 with disconnected complements = {}'.format(np.sum(num_conn_sets_with_disconn_complements_list))) 143 | 144 | 145 | # for comp_size in range(3, max_size+1): 146 | # 
conn_sets_with_conn_complements_by_size.append(set()) 147 | # conn_sets_with_disconn_complements_by_size.append(set()) 148 | 149 | # if verbose: 150 | # print('Evaluating connected sets of size {}'.format(comp_size)) 151 | # base_components = conn_sets_with_conn_complements_by_size[comp_size-2].union(conn_sets_with_disconn_complements_by_size[comp_size-2]) 152 | # for base_comp in base_components: 153 | # neighbors_to_add = self.adjacent_vertices_to_set(base_comp) 154 | # for neighbor in neighbors_to_add: 155 | # new_comp = set(base_comp) 156 | # new_comp.add(neighbor) 157 | # new_comp = frozenset(new_comp) 158 | # if ((not new_comp in conn_sets_with_conn_complements_by_size[comp_size-1]) and (not new_comp in conn_sets_with_disconn_complements_by_size[comp_size-1])): 159 | # if is_connected(delete_vertices(self,new_comp)): 160 | # conn_sets_with_conn_complements_by_size[comp_size-1].add(new_comp) 161 | # else: 162 | # conn_sets_with_disconn_complements_by_size[comp_size-1].add(new_comp) 163 | # num_conn_sets_with_conn_complements_list.append(len(conn_sets_with_conn_complements_by_size[comp_size-1])) 164 | # num_conn_sets_with_disconn_complements_list.append(len(conn_sets_with_disconn_complements_by_size[comp_size-1])) 165 | 166 | # if verbose: 167 | # print('num conn set of comp_size {} with connected complements= {}'.format(comp_size,num_conn_sets_with_conn_complements_list[comp_size-1])) 168 | # print('num conn set of comp_size {} with discconnected complements= {}'.format(comp_size,num_conn_sets_with_disconn_complements_list[comp_size-1])) 169 | # print('num conn set of comp_size <= {} with connected complements= {}'.format(comp_size, np.sum(num_conn_sets_with_conn_complements_list))) 170 | # print('num conn set of comp_size <= {} with disconnected complements= {}'.format(comp_size, np.sum(num_conn_sets_with_disconn_complements_list))) 171 | 172 | # self.mc_partitions = list(set([frozenset([conn_set, frozenset(self.vertices - conn_set)]) for templist in conn_sets_with_conn_complements_by_size for conn_set in templist])) 173 | # #self.mc_partitions = [[conn_set, self.vertices - conn_set] for conn_set in conn_sets_with_conn_complements] 174 | # self.mc_partitions_max_size = max_size 175 | 176 | 177 | # def save_partitions_to_file(self, file_name): 178 | # list_of_lists = [list(x) for x in self.all_partitions] 179 | # with open(file_name, "w") as write_file: 180 | # json.dump(list_of_lists, write_file) 181 | 182 | # def load_partitions_from_file(self, file_name): 183 | # with open(file_name, "r") as read_file: 184 | # list_of_lists = json.load(read_file) 185 | # self.all_partitions = [frozenset(x) for x in list_of_lists] 186 | 187 | # def enumerate_connected_sets(self, max_size=-1, verbose=False): 188 | # if self.all_connected_sets: 189 | # return self.all_connected_sets 190 | # if(len(self.vertices)==0): 191 | # return [] 192 | # if((len(self.vertices)>=1) and (len(self.vertices)<=2)): 193 | # return [frozenset([list(self.vertices)[0]])] 194 | # if max_size==(-1): 195 | # max_size=len(self.vertices) 196 | # connected_sets = [] 197 | # connected_sets.append(set()) 198 | # num_connected_sets_list = [] 199 | # if verbose: 200 | # print('Evaluating components of size 1') 201 | # for vert in self.vertices: 202 | # connected_sets[0].add(frozenset({vert})) 203 | # num_connected_sets_list.append(len(connected_sets[0])) 204 | # if verbose: 205 | # print('num connected sets of size 1 = {}'.format(num_connected_sets_list[0])) 206 | # print('Evaluating components of size 2') 207 | # 
connected_sets.append(set()) 208 | # for edge in self.edges: 209 | # connected_sets[1].add(edge) 210 | # num_connected_sets_list.append(len(connected_sets[1])) 211 | # if verbose: 212 | # print('num_connected_sets of size 2 = {}'.format(num_connected_sets_list[1])) 213 | # print('num_connected_sets of size<=2 is {}'.format(np.sum(num_connected_sets_list))) 214 | 215 | 216 | # for comp_size in range(3, max_size+1): 217 | # connected_sets.append(set()) 218 | 219 | # if verbose: 220 | # print('Evaluating components of size {}'.format(comp_size)) 221 | # base_components = connected_sets[comp_size-2] 222 | # for base_comp in base_components: 223 | # neighbors_to_add = self.adjacent_vertices_to_set(base_comp) 224 | # for neighbor in neighbors_to_add: 225 | # new_comp = set(base_comp) 226 | # new_comp.add(neighbor) 227 | # new_comp = frozenset(new_comp) 228 | # connected_sets[comp_size-1].add(new_comp) 229 | # num_connected_sets_list.append(len(connected_sets[comp_size-1])) 230 | # # if memory_save: 231 | # # good_partitions[comp_size-2]=set() 232 | # # failed_partitions[comp_size-2]=set() 233 | 234 | # if verbose: 235 | # print('num_connected_sets of size {} = {}'.format(comp_size,num_connected_sets_list[comp_size-1])) 236 | # print('num_total_connected_sets of size<={} is {}'.format(comp_size, np.sum(num_connected_sets_list))) 237 | 238 | # connected_sets = [k for templist in connected_sets for k in templist] 239 | # self.all_connected_sets = connected_sets 240 | # return connected_sets 241 | 242 | # def save_connected_sets_to_file(self, file_name): 243 | # list_of_lists = [list(x) for x in self.all_connected_sets] 244 | # with open(file_name, "w") as write_file: 245 | # json.dump(list_of_lists, write_file) 246 | 247 | # def load_connected_sets_from_file(self, file_name): 248 | # with open(file_name, "r") as read_file: 249 | # list_of_lists = json.load(read_file) 250 | # self.all_connected_sets = [frozenset(x) for x in list_of_lists] 251 | 252 | # def get_partitions_from_connected_sets(self, verbose=False, verbose_freq=1000): 253 | # part_list = [] 254 | # conn_set_list = self.all_connected_sets.copy() 255 | # conn_set_set = set(self.all_connected_sets) 256 | # if verbose: 257 | # print('checking {} connected sets'.format(len(conn_set_list))) 258 | # for i,conn_set in enumerate(conn_set_list): 259 | # if len(conn_set) > (len(self.vertices)/2): 260 | # break 261 | # complement_set = frozenset(self.vertices - conn_set) 262 | # if complement_set in conn_set_set: 263 | # part_list.append(conn_set) 264 | # conn_set_list.remove(complement_set) 265 | # if ((((i+1) % verbose_freq)) ==0): 266 | # if verbose: 267 | # print('Checked {} sets'.format(i+1)) 268 | # print('Found {} partitions'.format(len(part_list))) 269 | # self.all_partitions = part_list 270 | 271 | 272 | # def contract_edge(graph, edge, sep_str='_'): 273 | # edge_alph = list(edge) 274 | # edge_alph.sort() 275 | # contracted_vertex = sep_str.join((edge_alph)) 276 | # #new_vertices = (set(graph.vertices) - set(edge)).union({contracted_vertex}) 277 | # new_edges = [[contracted_vertex if y==edge_alph[0] or y==edge_alph[1] else y for y in this_edge] 278 | # if edge_alph[0] in this_edge or edge_alph[1] in this_edge else this_edge for this_edge in graph.edges] 279 | # return graph_undirected(new_edges) 280 | 281 | def delete_vertex(graph, vertex): 282 | new_edges = set([frozenset(edge - set(vertex)) for edge in graph.edges]) 283 | new_vertices = graph.vertices - {vertex} 284 | return hypergraph(new_edges, new_vertices) 285 | 286 | def 
delete_vertices(graph, vertex_set): 287 | 288 | new_edges = set([frozenset(edge-set(vertex_set)) for edge in graph.edges]) 289 | new_vertices = graph.vertices - vertex_set 290 | return hypergraph(new_edges, new_vertices) 291 | 292 | def get_induced_subgraph(graph, vertex_set): 293 | vertex_set = set(vertex_set) 294 | new_edges = set([edge for edge in graph.edges if edge <= vertex_set]) 295 | new_vertices = vertex_set 296 | new_graph = hypergraph(new_edges, new_vertices) 297 | new_graph.all_connected_sets = [x for x in graph.all_connected_sets if new_vertices.issuperset(x)] 298 | return new_graph 299 | 300 | 301 | # def is_connected(graph): 302 | # initial_vertex = next(iter(graph.vertices)) 303 | # visited_vertices = [initial_vertex] 304 | # unexplored_vertices = list(graph.adjacent_vertices(initial_vertex)) 305 | # while unexplored_vertices: 306 | # curr_vertex = unexplored_vertices.pop() 307 | # visited_vertices.append(curr_vertex) 308 | # new_vertices = graph.adjacent_vertices(curr_vertex) 309 | # unexplored_vertices = list(set(unexplored_vertices).union(new_vertices) - set(visited_vertices)) 310 | # return len(set(visited_vertices)) == len(set(graph.vertices)) 311 | 312 | # def num_connected_comp(graph): 313 | # initial_vertex = list(graph.vertices)[0] 314 | # visited_vertices = [initial_vertex] 315 | # unexplored_vertices = list(graph.adjacent_vertices(initial_vertex)) 316 | # while unexplored_vertices: 317 | # curr_vertex = unexplored_vertices.pop(0) 318 | # visited_vertices.append(curr_vertex) 319 | # new_vertices = graph.adjacent_vertices(curr_vertex) 320 | # unexplored_vertices = list(set(unexplored_vertices).union(new_vertices) - set(visited_vertices)) 321 | # if len(set(visited_vertices)) == len(set(graph.vertices)): 322 | # return 1 323 | # else: 324 | # remainder_vertices = list(set(graph.vertices)-set(visited_vertices)) 325 | # remainder_edges = [edge for edge in graph.edges if edge.issubset(set(remainder_vertices))] 326 | # return 1 + num_connected_comp(graph_undirected(remainder_edges, remainder_vertices)) 327 | 328 | # def connected_comp_list(graph): 329 | # initial_vertex = list(graph.vertices)[0] 330 | # visited_vertices = [initial_vertex] 331 | # unexplored_vertices = list(graph.adjacent_vertices(initial_vertex)) 332 | # while unexplored_vertices: 333 | # curr_vertex = unexplored_vertices.pop(0) 334 | # visited_vertices.append(curr_vertex) 335 | # new_vertices = graph.adjacent_vertices(curr_vertex) 336 | # unexplored_vertices = list(set(unexplored_vertices).union(new_vertices) - set(visited_vertices)) 337 | # if len(set(visited_vertices)) == len(set(graph.vertices)): 338 | # return [graph] 339 | # else: 340 | # cc_vertices = set(visited_vertices) 341 | # cc_edges = [edge for edge in graph.edges if edge.issubset(set(visited_vertices))] 342 | # cc_graph = graph_undirected(cc_edges, cc_vertices) 343 | # remainder_vertices = list(set(graph.vertices)-set(visited_vertices)) 344 | # remainder_edges = [edge for edge in graph.edges if edge.issubset(set(remainder_vertices))] 345 | # return [cc_graph] + connected_comp_list(graph_undirected(remainder_edges, remainder_vertices)) 346 | 347 | # def get_all_distances_from_vertex(graph, start_vertex): 348 | # vertex_path_dist_dict=set() 349 | # vertex_path_dist_dict[start_vertex] = 0 350 | # unexplored_vertices = list(graph.adjacent_vertices(start_vertex)) 351 | # for vert in unexplored_vertices: 352 | # vertex_path_dist_dict[vert]=1 353 | # visited_vertices = [start_vertex] 354 | 355 | # while unexplored_vertices and 
(len(vertex_path_dist_dict.keys())=self.max_depth): 130 | self._wrap_up_node(curr_node, g_train_node, h_train_node) 131 | return None 132 | 133 | # Determine which features are still "eligible" to be considered 134 | features_to_search = self.feature_configs.keys() 135 | 136 | # print('features_to_search') 137 | # print(features_to_search) 138 | # If no features are eligible (e.g. all x-values are identical in all features) 139 | # Then we similarly summarize the node and move on 140 | # if features_to_search==[]: 141 | # self._wrap_up_node(curr_node, g_train_node, h_train_node) 142 | # return None 143 | 144 | # best_split_dict holds all the necessary info about a potential split 145 | best_split_dict = _initialize_best_split_dict() 146 | 147 | # Main loop over features to find best split 148 | for feature in features_to_search: 149 | # print('evaluating feature {}'.format(feature)) 150 | best_split_for_feature = evaluate_feature(self.feature_configs[feature], 151 | curr_node['feature_graphs'], 152 | feature, 153 | X_train_node[feature].values, 154 | g_train_node, h_train_node, 155 | self.gamma, self.reg_lambda) 156 | if best_split_for_feature: 157 | best_split_for_feature['split_feature'] = feature 158 | if best_split_for_feature['loss_score']0: 299 | split_res = feature_config['split_res'] if 'split_res' in feature_config.keys() else np.Inf 300 | split_count = len(splits_to_eval) 301 | if split_res1: 316 | unique_splits = (unique_vals[1:]+unique_vals[:-1])/2 317 | return unique_splits 318 | else: 319 | return [] 320 | 321 | def _evaluate_numerical_splits(feature_vec, g_vec, h_vec, split_vec, gamma, reg_lambda): 322 | ## NOTE : need to incorporate min_leaf_size restriction 323 | 324 | bin_result_vec = np.searchsorted(split_vec, feature_vec, side='left') 325 | g_sum_bins, h_sum_bins = get_bin_sums_c(g_vec, h_vec, bin_result_vec, len(split_vec)+1) 326 | g_sum_total, g_sum_left, g_sum_right = get_left_right_sums(g_sum_bins) 327 | h_sum_total, h_sum_left, h_sum_right = get_left_right_sums(h_sum_bins) 328 | score_vec = (-1)*_get_gh_score_array(g_sum_left, g_sum_right, h_sum_left, h_sum_right, gamma, reg_lambda) 329 | # if (len(score_vec)!=len(split_vec)): 330 | # print('score_vec has length {}'.format(len(score_vec))) 331 | # print('split_vec has length {}'.format(len(split_vec))) 332 | 333 | best_loss, best_split_val = get_best_vals(score_vec, split_vec) 334 | return best_loss, best_split_val 335 | 336 | def get_best_vals(score_vec, split_vec): 337 | best_loss = np.min(score_vec) 338 | best_split_index = np.argmin(score_vec) 339 | best_split_val = split_vec[np.argmin(score_vec)] 340 | return best_loss, best_split_val 341 | 342 | def get_bin_sums(g_vec, h_vec, bin_result_vec, out_vec_size): 343 | g_sum_bins = np.zeros(out_vec_size) 344 | h_sum_bins = np.zeros(out_vec_size) 345 | for i,bin_ind in enumerate(bin_result_vec): 346 | g_sum_bins[bin_ind]+=g_vec[i] 347 | h_sum_bins[bin_ind]+=h_vec[i] 348 | return g_sum_bins, h_sum_bins 349 | 350 | def get_bin_sums_c(cnp.ndarray[double] g_vec, cnp.ndarray[double] h_vec, 351 | cnp.ndarray[long] bin_result_vec, long out_vec_size): 352 | cdef int i 353 | cdef int m = bin_result_vec.shape[0] 354 | 355 | cdef cnp.ndarray[double] g_sum_bins = np.zeros(out_vec_size) 356 | cdef cnp.ndarray[double] h_sum_bins = np.zeros(out_vec_size) 357 | 358 | for i in range(m): 359 | g_sum_bins[bin_result_vec[i]]+=g_vec[i] 360 | h_sum_bins[bin_result_vec[i]]+=h_vec[i] 361 | return g_sum_bins, h_sum_bins 362 | 363 | 364 | def get_left_right_sums(bin_sums): 365 | sum_total = 
np.sum(bin_sums) 366 | sum_left = (np.cumsum(bin_sums))[:-1] 367 | sum_right = sum_total - sum_left 368 | return sum_total, sum_left, sum_right 369 | 370 | def _evaluate_feature_graphical(feature_config, feature_graph, feature_vec_node, 371 | g_train_node, h_train_node, gamma, reg_lambda): 372 | # NOTE: need to incorporate min_leaf_size restriction 373 | 374 | msac = feature_config['msac'] 375 | msts = feature_config['split_res'] 376 | # Query the graph structure to get the possible splits 377 | # print('len(feature_graph.mc_partitions)={}'.format(len(feature_graph.mc_partitions))) 378 | if (len(feature_graph.mc_partitions)>0): 379 | possible_splits = feature_graph.return_mc_partitions() 380 | else: 381 | # print('vertices = {}'.format(feature_graph.vertices)) 382 | # print('edges = {}'.format(feature_graph.edges)) 383 | possible_splits = feature_graph.return_contracted_partitions(max_size_after_contraction=msac) 384 | nps = len(possible_splits) 385 | # print('nps={}'.format(nps)) 386 | if (nps>msts): 387 | # Randomly choose (with replacement) a subset of possible splits 388 | index_range = np.random.randint(0,nps,msts) 389 | else: 390 | index_range = range(nps) 391 | 392 | 393 | best_split_of_feat = {} 394 | best_split_of_feat['loss_score'] = np.Inf 395 | g_sum = np.sum(g_train_node) 396 | h_sum = np.sum(h_train_node) 397 | # Loop within values of each feature 398 | for index in index_range: 399 | curr_partition = list(possible_splits[index]) 400 | left_split = curr_partition[0] 401 | right_split = curr_partition[1] 402 | mask_left = np.array([x in left_split for x in feature_vec_node]) 403 | curr_loss = _score_split(mask_left, g_train_node, h_train_node, g_sum, h_sum, 404 | gamma, reg_lambda) 405 | # print('Evaluating split') 406 | # print(left_split) 407 | # print('vs') 408 | # print(right_split) 409 | # print('loss_score = {}'.format(curr_loss)) 410 | # print('----') 411 | 412 | if curr_loss < best_split_of_feat['loss_score']: 413 | best_split_of_feat['loss_score'] = curr_loss 414 | best_split_of_feat['left_split'] = left_split 415 | best_split_of_feat['feature_type'] = 'categ_graphical' 416 | return(best_split_of_feat) 417 | 418 | def _score_split(mask_left, g_train_node, h_train_node, g_sum, h_sum, gamma, reg_lambda): 419 | # cdef double loss_score, g_left, g_right, h_left, h_right, vec_len 420 | 421 | vec_len = len(g_train_node) 422 | g_left = np.sum(g_train_node[mask_left]) 423 | g_right = g_sum - g_left 424 | h_left = np.sum(h_train_node[mask_left]) 425 | h_right = h_sum - h_left 426 | loss_score = -1.0 * _get_gh_score_num(g_left, g_right, h_left, h_right, gamma, reg_lambda) 427 | if loss_score>=0: 428 | loss_score = np.inf 429 | return loss_score 430 | 431 | 432 | 433 | 434 | -------------------------------------------------------------------------------- /extra/code/structure_gb.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=True 2 | 3 | """Decision Tree Gradient Boosting based on Discrete Graph structure""" 4 | import numpy as np 5 | import pandas as pd 6 | cimport numpy as cnp 7 | from libc.math cimport log as clog 8 | from structure_dt import * 9 | from graphs import * 10 | from sklearn.metrics import log_loss, mean_squared_error 11 | 12 | 13 | class StructureBoost(object): 14 | 15 | def __init__(self, num_trees, feature_configs, feature_graphs, mode='classification', loss_fn = 'entropy', min_size_split=2, min_leaf_size = 1, max_depth=3, gamma=0, 16 | reg_lambda=1, node_summary_fn = np.mean, learning_rate=.1, 
max_splits_to_search=np.Inf, msac=100): 17 | self.num_trees = num_trees 18 | self.num_trees_for_prediction = num_trees 19 | self.dec_tree_list = [] 20 | self.feature_configs = feature_configs 21 | self.feature_graphs = feature_graphs 22 | self.min_size_split=min_size_split 23 | self.min_leaf_size=min_leaf_size 24 | self.max_depth=max_depth 25 | self.gamma=gamma 26 | self.reg_lambda=reg_lambda 27 | self.node_summary_fn=node_summary_fn 28 | self.learning_rate = learning_rate 29 | self.loss_fn = loss_fn 30 | self.max_splits_to_search = max_splits_to_search 31 | self.mode = mode 32 | if loss_fn == 'entropy': 33 | self.loss_fn_der_1 = _entropy_link_der_1 34 | self.loss_fn_der_2 = _entropy_link_der_2 35 | if loss_fn == 'mse': 36 | self.loss_fn_der_1 = _mse_der_1 37 | self.loss_fn_der_2 = _mse_der_2 38 | # if features=='auto': 39 | # self.features=list(self.dec_tree['feature_graphs'].keys()) 40 | 41 | def fit(self, X_train, y_train, eval_set = None, eval_freq=10, 42 | early_stop_past_steps=0, choose_best_eval=True): 43 | # cdef int i, n =self.num_trees 44 | self.eval_freq=eval_freq 45 | eval_len = np.floor(self.num_trees/self.eval_freq).astype(int) 46 | self.eval_results = np.zeros(eval_len) 47 | n =self.num_trees 48 | self.initial_pred = np.mean(y_train) 49 | stop_now=False 50 | if eval_set is not None: 51 | X_valid = eval_set[0] 52 | y_valid = eval_set[1] 53 | for i in range(n): 54 | # print('iteration number {}'.format(i)) 55 | # Get predictions of current model 56 | if i==0: 57 | curr_answer = self.initial_pred * np.ones(len(y_train)) 58 | if eval_set is not None: 59 | curr_test_answer = self.initial_pred * np.ones(len(y_valid)) 60 | if self.mode == 'classification': 61 | curr_loss= log_loss(y_valid, 1/(1+np.exp(-curr_test_answer))) 62 | print("i=0, test_set_log_loss = {}".format(curr_loss)) 63 | else: 64 | curr_loss= mean_squared_error(y_valid, curr_test_answer) 65 | print("i=0. 
test_set_mse = {}".format(curr_loss)) 66 | 67 | else: 68 | curr_answer = curr_answer + self.learning_rate * self.dec_tree_list[i-1].predict(X_train) 69 | if eval_set is not None: 70 | curr_test_answer = curr_test_answer + self.learning_rate * self.dec_tree_list[i-1].predict(X_valid) 71 | if ((i+1)%self.eval_freq==1): 72 | if self.mode == 'classification': 73 | curr_loss= log_loss(y_valid, 1/(1+np.exp(-curr_test_answer))) 74 | print("i={}, test_set_log_loss = {}".format(i,curr_loss)) 75 | else: 76 | curr_loss= mean_squared_error(y_valid, curr_test_answer) 77 | print("i={}, test_set_mse = {}".format(i,curr_loss)) 78 | 79 | curr_step=np.floor((i+1)/self.eval_freq).astype(int) -1 80 | self.eval_results[curr_step]=curr_loss 81 | if curr_step>early_stop_past_steps: 82 | compare_loss = np.min(self.eval_results[:curr_step-early_stop_past_steps+1]) 83 | if (curr_loss>compare_loss): 84 | stop_now=True 85 | print("Stopping early: curr_loss of {} exceeds compare_loss of {}".format(curr_loss, compare_loss)) 86 | if stop_now: 87 | if choose_best_eval: 88 | self.num_trees_for_prediction = (np.argmin(self.eval_results[:curr_step+1])+1)*eval_freq 89 | break 90 | 91 | # Get first and second derivatives 92 | y_g_vec = self.loss_fn_der_1(y_train, curr_answer) 93 | y_h_vec = self.loss_fn_der_2(y_train, curr_answer) 94 | 95 | 96 | # Sample the data to use for this tree 97 | 98 | num_rows = X_train.shape[0] 99 | rows_to_use = np.random.choice(range(num_rows), num_rows, replace=True) 100 | if type(X_train)==pd.DataFrame: 101 | X_train_to_use = X_train.iloc[rows_to_use] 102 | elif type(X_train)==np.ndarray: 103 | X_train_to_use = X_train[rows_to_use] 104 | else: 105 | print('unknown format for X_train') 106 | #y_original_train_to_use = y_train.sample(X_train.shape[0], random_state=rs, replace=True) 107 | if type(y_g_vec)==pd.Series: 108 | y_g_to_use = y_g_vec.iloc[rows_to_use] 109 | elif type(y_g_vec)==np.ndarray: 110 | y_g_to_use = y_g_vec[rows_to_use] 111 | else: 112 | print('unknown format for y_g_vec') 113 | 114 | if type(y_h_vec)==pd.Series: 115 | y_h_to_use = y_h_vec.iloc[rows_to_use] 116 | elif type(y_h_vec)==np.ndarray: 117 | y_h_to_use = y_h_vec[rows_to_use] 118 | else: 119 | print('unknown format for y_h_vec') 120 | 121 | #local_feature_configs = self.feature_configs.copy() 122 | self.dec_tree_list.append(StructureDecisionTree(feature_configs=self.feature_configs, 123 | feature_graphs=self.feature_graphs, 124 | min_size_split = self.min_size_split, min_leaf_size=self.min_leaf_size, 125 | gamma=self.gamma, max_depth=self.max_depth, reg_lambda=self.reg_lambda)) 126 | self.dec_tree_list[i].fit(X_train_to_use, y_g_to_use, y_h_to_use) 127 | 128 | 129 | def predict(self, X_test, num_trees_to_use=0): 130 | cdef int i 131 | if num_trees_to_use==0: 132 | num_trees_to_use=self.num_trees_for_prediction 133 | out_vec = self.initial_pred*np.ones(X_test.shape[0]) 134 | for i in range(num_trees_to_use): 135 | out_vec = out_vec + self.learning_rate * self.dec_tree_list[i].predict(X_test) 136 | if self.mode=='classification': 137 | return(1/(1+np.exp(-out_vec))) 138 | else: 139 | return(out_vec) 140 | 141 | def _entropy_der_1(y_true, y_pred, eps=1e-15): 142 | y_pred = np.maximum(y_pred, eps) 143 | y_pred = np.minimum(y_pred, 1-eps) 144 | return((-(y_true/y_pred) + (1-y_true)/(1-y_pred))) 145 | 146 | def _entropy_der_2(y_true, y_pred, eps=1e-15): 147 | y_pred = np.maximum(y_pred, eps) 148 | y_pred = np.minimum(y_pred, 1-eps) 149 | out_vec = (y_true)/(y_pred**2) + ((1-y_true)/((1-y_pred)**2)) 150 | return(out_vec) 151 | 
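# Illustrative usage sketch, not part of the original module.  The names X_tr, y_tr,
# X_va, y_va, X_te, my_feature_configs and my_feature_graphs are placeholders:
# feature_configs and feature_graphs are dicts keyed by column name whose exact
# schema is defined in structure_dt.pyx (e.g. 'split_res' and 'msac' for the
# graph-structured categorical features).
#
#     >>> model = StructureBoost(num_trees=500, feature_configs=my_feature_configs,
#     ...                        feature_graphs=my_feature_graphs, mode='classification',
#     ...                        max_depth=3, learning_rate=.1)
#     >>> model.fit(X_tr, y_tr, eval_set=(X_va, y_va), eval_freq=10,
#     ...           early_stop_past_steps=2)
#     >>> pred_probs = model.predict(X_te)   # probabilities when mode='classification'
#
# When an eval_set is supplied, the validation loss is printed every eval_freq trees;
# fitting stops early once the current loss exceeds the best loss recorded at least
# early_stop_past_steps evaluations earlier, and with choose_best_eval=True predict()
# defaults to the number of trees that achieved the best validation loss.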
152 | def _mse_der_1(y_true, y_pred, eps=1e-15): 153 | return(2*(y_pred-y_true)) 154 | 155 | def _mse_der_2(y_true, y_pred, eps=1e-15): 156 | return(pd.Series(2*np.ones(len(y_pred)))) 157 | 158 | def _entropy_link_der_1(y_true, z_pred, eps=1e-15): 159 | return(-y_true*(1/(1+np.exp(z_pred))) + (1-y_true) * (1/(1+np.exp(-z_pred))) ) 160 | 161 | def _entropy_link_der_2(y_true, z_pred, eps=1e-15): 162 | return(y_true*(np.exp(z_pred)/((1+np.exp(z_pred))**2)) + (1-y_true) * (np.exp(-z_pred)/((1+np.exp(-z_pred))**2)) ) 163 | 164 | -------------------------------------------------------------------------------- /ml_insights/CVModel.py: -------------------------------------------------------------------------------- 1 | """Cross-validated training and prediction.""" 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.base import BaseEstimator, ClassifierMixin, clone 5 | 6 | class CVModel(BaseEstimator, ClassifierMixin): 7 | 8 | def __init__(self, base_estimator=None): 9 | self.base_estimator = base_estimator 10 | 11 | def fit(self, X_train, y_train, fold_num, train_overall=True, **kwargs): 12 | """Fits a cross-validated model - a model for each left-out fold plus one overall model. 13 | 14 | X_train: the training predictors 15 | y_train: the training outcome 16 | fold_num: the indicator of which fold each row belongs to""" 17 | self.model_dict = {} 18 | self.fold_set = np.unique(fold_num) 19 | self.num_unique_y_values = len(np.unique(y_train)) 20 | self.num_features = X_train.shape[1] 21 | 22 | ## If a DataFrame is given (rather than just an array) then make a note of the column names. 23 | ## This way we can match up column names when we use predict_proba. 24 | if type(X_train) == pd.DataFrame: 25 | self.fit_columns = np.array(X_train.columns) 26 | else: 27 | self.fit_columns = None 28 | 29 | ## Make copies of the estimator for each of the fold models 30 | for fold in self.fold_set: 31 | self.model_dict[fold] = clone(self.base_estimator) 32 | 33 | ## Train the separate models, each one leaving out a particular fold in training 34 | for fold in self.fold_set: 35 | print("Leave out fold {} and train on the rest".format(fold)) 36 | X_tr = X_train[fold_num != fold] 37 | y_tr = y_train[fold_num != fold] 38 | self.model_dict[fold].fit(X_tr, y_tr, **kwargs) 39 | 40 | ## Train the overall model on all of the data 41 | if train_overall: 42 | print("Train the overall model".format(fold)) 43 | self.model_dict['overall_model'] = clone(self.base_estimator) 44 | self.model_dict['overall_model'].fit(X_train, y_train, **kwargs) 45 | return self 46 | 47 | def predict_proba(self, X_test, fold_num=None, **kwargs): 48 | """Predict probabilities in cross-validated fashion. 49 | 50 | X_test: the data to predict on 51 | fold_num: the indicator of which fold a row belongs to / which model variant to use. 
52 | If fold_num is not specified, it will default to use the overall_model 53 | """ 54 | ## If we have column names and X_test is a DataFrame, then subset X_test to those columns 55 | ## in the correct order, and error if those columns are not present 56 | if self.fit_columns is not None: 57 | if type(X_test) == pd.DataFrame: 58 | X_test = X_test.loc[:, self.fit_columns] 59 | 60 | if fold_num is None: 61 | #print("no folds specified, using overall_model") 62 | if 'overall_model' not in self.model_dict.keys(): 63 | #print("Error: overall_model not trained and fold_num not specified") 64 | return None 65 | else: 66 | results = self.model_dict['overall_model'].predict_proba(X_test, **kwargs) 67 | return results 68 | else: 69 | results = np.zeros((X_test.shape[0], self.num_unique_y_values)) 70 | fold_set = np.unique(fold_num) 71 | for fold in fold_set: 72 | X_te = X_test[fold_num == fold] 73 | fold_results = self.model_dict[fold].predict_proba(X_te, **kwargs) 74 | results[fold_num==fold] = fold_results 75 | return results 76 | 77 | def predict(self, X_test, fold_num=None, **kwargs): 78 | """Predict final values in cross-validated fashion. 79 | 80 | X_test: the data to predict on 81 | fold_num: the indicator of which fold a row belongs to / which model variant to use. 82 | If fold_num is not specified, it will default to use the overall_model 83 | 84 | """ 85 | 86 | ## If we have column names and X_test is a DataFrame, then subset X_test to those columns 87 | ## in the correct order, and error if those columns are not present 88 | if self.fit_columns is not None: 89 | if type(X_test) == pd.DataFrame: 90 | X_test = X_test.loc[:, self.fit_columns] 91 | 92 | if fold_num is None: 93 | #print("no folds specified, using overall_model") 94 | if 'overall_model' not in self.model_dict.keys(): 95 | print("Error: overall_model not trained and fold_num not specified") 96 | return None 97 | else: 98 | results = self.model_dict['overall_model'] 99 | return results 100 | else: 101 | results = np.zeros(X_test.shape[0]) 102 | fold_set = np.unique(fold_num) 103 | for fold in fold_set: 104 | X_te = X_test[fold_num == fold] 105 | fold_results = self.model_dict[fold].predict(X_te, **kwargs) 106 | results[fold_num==fold] = fold_results 107 | return results 108 | 109 | def grid_search(self, X, y, fold_ind, param_grid, score_fn, verbose=True): 110 | param_arg_list = _get_param_settings_from_grid(param_grid) 111 | num_settings = len(param_arg_list) 112 | print("Size of grid to search = {} different settings".format(num_settings)) 113 | param_list_scores = np.zeros(num_settings) 114 | old_self = clone(self.base_estimator) 115 | for i in range(num_settings): 116 | print("Fitting setting {} of {}".format(i+1,num_settings)) 117 | curr_param_dict = param_arg_list[i] 118 | if verbose: 119 | print(curr_param_dict) 120 | self.base_estimator.set_params(**curr_param_dict) 121 | self.fit(X, y, fold_ind, train_overall=False) 122 | curr_preds = self.predict_proba(X, fold_ind) 123 | if type(score_fn) == list: 124 | for j, fn in enumerate(score_fn): 125 | curr_score= fn(y, curr_preds) 126 | param_arg_list[i]['score_'+str(j)] = curr_score 127 | if verbose: 128 | print(curr_param_dict,'score function '+str(j)+':',curr_score) 129 | else: 130 | curr_score= score_fn(y, curr_preds) 131 | param_arg_list[i]['score'] = curr_score 132 | if verbose: 133 | print(curr_param_dict,'score function '+':',curr_score) 134 | param_list_scores[i]=curr_score 135 | self.base_estimator = old_self 136 | return param_arg_list 137 | 138 | 139 | def 
_get_param_settings_from_grid(param_grid): 140 | num_settings = np.prod([len(i) for i in param_grid.values()]) 141 | pg_tuple = tuple(param_grid.items()) 142 | param_names = [k[0] for k in pg_tuple] 143 | param_lists = [k[1] for k in pg_tuple] 144 | param_list_lengths = [len(k) for k in param_lists] 145 | param_dict_list = [] 146 | for i in range(num_settings): 147 | indices = _int_to_indices(i,param_list_lengths) 148 | curr_param_dict = {} 149 | for k in range(len(param_names)): 150 | curr_param_dict[param_names[k]]=param_lists[k][indices[k]] 151 | param_dict_list.append(curr_param_dict) 152 | return param_dict_list 153 | 154 | def _int_to_indices(j,lengths): 155 | out_list = [] 156 | for i in range(len(lengths)): 157 | curr_ind = j % lengths[i] 158 | out_list.append(curr_ind) 159 | j = j//lengths[i] 160 | return(out_list) 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /ml_insights/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Package Docuemntation 3 | """ 4 | # -*- coding: utf-8 -*- 5 | 6 | from .insights import ModelXRay, explain_prediction_difference, explain_prediction_difference_xgboost 7 | from splinecalib import SplineCalib 8 | from .modeling_utils import get_stratified_foldnums,cv_predictions 9 | from .modeling_utils import plot_pr_curve,plot_pr_curves,histogram_pair 10 | from .modeling_utils import ice_plot,get_range_dict, plot_reliability_diagram 11 | from .shap_insights import consolidate_reason_scores, get_reason_codes, cv_column_shap, predict_reason_strings, predict_reasons_cv, get_reason_score_matrix 12 | from .CVModel import CVModel 13 | 14 | __version__ = '1.1.0' 15 | -------------------------------------------------------------------------------- /ml_insights/calibration.py: -------------------------------------------------------------------------------- 1 | """Calibration of predicted probabilities.""" 2 | import numpy as np 3 | import sklearn 4 | import warnings 5 | from sklearn.base import BaseEstimator, ClassifierMixin, clone 6 | 7 | try: 8 | from sklearn.model_selection import StratifiedKFold 9 | except: 10 | from sklearn.cross_validation import StratifiedKFold 11 | 12 | from .calibration_utils import prob_calibration_function, compact_logit 13 | 14 | 15 | class SplineCalibratedClassifierCV(BaseEstimator, ClassifierMixin): 16 | """Probability calibration using cubic splines. 17 | 18 | With this class, the base_estimator is fit on each of the cross-validation 19 | training set folds in order to generate scores on the (cross-validated) 20 | test set folds. The test set scores are accumulated into a final vector 21 | (the size of the full set) which is used to calibrate the answers. 22 | The model is then fit on the full data set. The predict, and predict_proba 23 | methods are then updated to use the combination of the predictions from the 24 | full model and the calibration function computed as above. 25 | 26 | Parameters 27 | ---------- 28 | base_estimator : instance BaseEstimator 29 | The classifier whose output decision function needs to be calibrated 30 | to offer more accurate predict_proba outputs. If cv='prefit', the 31 | classifier must have been fit already on data. 32 | 33 | method : 'logistic' or 'ridge' 34 | The default is 'logistic', which is best if you plan to use log-loss as your 35 | performance metric. This method is relatively robust and will typically do 36 | well on brier score as well. 
The 'ridge' method calibrates using an L2 loss, 37 | and therefore should do better for brier score, but may do considerably worse 38 | on log-loss. 39 | 40 | cv : integer, cross-validation generator, iterable or "prefit", optional 41 | Determines the cross-validation splitting strategy. 42 | Possible inputs for cv are: 43 | 44 | - None, to use the default 5-fold cross-validation, 45 | - integer, to specify the number of folds. 46 | - 'prefit', if you wish to use the data only for calibration 47 | 48 | For integer/None inputs, if ``y`` is binary or multiclass, 49 | :class:`sklearn.model_selection.StratifiedKFold` is used. If ``y`` is 50 | neither binary nor multiclass, :class:`sklearn.model_selection.KFold` 51 | is used. 52 | 53 | Refer :ref:`User Guide ` for the various 54 | cross-validation strategies that can be used here. 55 | 56 | If "prefit" is passed, it is assumed that base_estimator has been 57 | fitted already and all data is used for calibration. 58 | 59 | Attributes 60 | ---------- 61 | uncalibrated_classifier: this gives the uncalibrated version of the classifier, fit on the entire data set 62 | 63 | calib_func: this is the calibration function that has been learned from the cross-validation. Applying this function 64 | to the results of the uncalibrated classifier (via model.predict_proba(X_test)[:,1]) gives the fully calibrated classifier 65 | 66 | References 67 | ---------- 68 | """ 69 | def __init__(self, base_estimator=None, method='logistic', cv=5, transform_type='none', cl_eps = .000001, **calib_kwargs): 70 | warn_msg = ('\nThis class is deprecated and will eventually be removed.' + 71 | '\nPlease use the SplineCalib class for calibration.') 72 | warnings.warn(warn_msg, FutureWarning) 73 | 74 | self.base_estimator = base_estimator 75 | self.uncalibrated_classifier = None 76 | self.calib_func = None 77 | self.method = method 78 | self.cv = cv 79 | self.cl_eps = cl_eps 80 | self.calib_kwargs = calib_kwargs 81 | self.fit_on_multiclass = False 82 | self.transform_type = transform_type 83 | self.pre_transform = lambda x: x 84 | if type(self.transform_type) == str: 85 | if self.transform_type == 'cl': 86 | self.pre_transform = lambda x: compact_logit(x, eps = self.cl_eps) 87 | if callable(self.transform_type): 88 | self.pre_transform = self.transform_type 89 | 90 | def fit(self, X, y, verbose=False): 91 | """Fit the calibrated model 92 | 93 | Parameters 94 | ---------- 95 | X : array-like, shape (n_samples, n_features) 96 | Training data. 97 | 98 | y : array-like, shape (n_samples,) 99 | Target values. 100 | 101 | Returns 102 | ------- 103 | self : object 104 | Returns an instance of self. 
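        Examples
        --------
        A minimal usage sketch (illustrative only; the synthetic dataset and
        the RandomForestClassifier base estimator are stand-ins, not part of
        this module):

        >>> from sklearn.datasets import make_classification
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> X, y = make_classification(n_samples=2000, random_state=0)
        >>> calib_clf = SplineCalibratedClassifierCV(RandomForestClassifier(), cv=5)
        >>> calib_clf = calib_clf.fit(X, y)
        >>> calib_probs = calib_clf.predict_proba(X)[:, 1]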
105 | """ 106 | 107 | 108 | if len(np.unique(y)) > 2: 109 | self.fit_on_multiclass = True 110 | return self._fit_multiclass(X, y, verbose=verbose) 111 | 112 | self.fit_on_multiclass=False 113 | if ((type(self.cv)==str) and (self.cv=='prefit')): 114 | self.uncalibrated_classifier = self.base_estimator 115 | y_pred = self.uncalibrated_classifier.predict_proba(X)[:,1] 116 | 117 | else: 118 | y_pred = np.zeros(len(y)) 119 | 120 | if sklearn.__version__ < '0.18': 121 | if type(self.cv)==int: 122 | skf = StratifiedKFold(y, n_folds=self.cv,shuffle=True) 123 | else: 124 | skf = self.cv 125 | else: 126 | if type(self.cv)==int: 127 | skf = StratifiedKFold(n_splits=self.cv, shuffle=True).split(X, y) 128 | else: 129 | skf = self.cv.split(X,y) 130 | for idx, (train_idx, test_idx) in enumerate(skf): 131 | if verbose: 132 | print("training fold {} of {}".format(idx+1, self.cv)) 133 | X_train = np.array(X)[train_idx,:] 134 | X_test = np.array(X)[test_idx,:] 135 | y_train = np.array(y)[train_idx] 136 | # We could also copy the model first and then fit it 137 | this_estimator = clone(self.base_estimator) 138 | this_estimator.fit(X_train,y_train) 139 | y_pred[test_idx] = this_estimator.predict_proba(X_test)[:,1] 140 | 141 | if verbose: 142 | print("Training Full Model") 143 | self.uncalibrated_classifier = clone(self.base_estimator) 144 | self.uncalibrated_classifier.fit(X, y) 145 | 146 | # calibrating function 147 | if verbose: 148 | print("Determining Calibration Function") 149 | if self.method=='logistic': 150 | self.calib_func = prob_calibration_function(y, self.pre_transform(y_pred), verbose=verbose, **self.calib_kwargs) 151 | if self.method=='ridge': 152 | self.calib_func = prob_calibration_function(y, self.pre_transform(y_pred), method='ridge', verbose=verbose, **self.calib_kwargs) 153 | # training full model 154 | 155 | return self 156 | 157 | def _fit_multiclass(self, X, y, verbose=False): 158 | """Fit the calibrated model in multiclass setting 159 | 160 | Parameters 161 | ---------- 162 | X : array-like, shape (n_samples, n_features) 163 | Training data. 164 | 165 | y : array-like, shape (n_samples,) 166 | Target values. 167 | 168 | Returns 169 | ------- 170 | self : object 171 | Returns an instance of self. 
172 | """ 173 | class_list = np.unique(y) 174 | num_classes = len(class_list) 175 | y_mod = np.zeros(len(y)) 176 | for i in range(num_classes): 177 | y_mod[y==class_list[i]]=i 178 | 179 | y_mod = y_mod.astype(int) 180 | if ((type(self.cv)==str) and (self.cv=='prefit')): 181 | self.uncalibrated_classifier = self.base_estimator 182 | y_pred = self.uncalibrated_classifier.predict_proba(X) 183 | 184 | else: 185 | y_pred = np.zeros((len(y_mod),num_classes)) 186 | if sklearn.__version__ < '0.18': 187 | skf = StratifiedKFold(y_mod, n_folds=self.cv,shuffle=True) 188 | else: 189 | skf = StratifiedKFold(n_splits=self.cv, shuffle=True).split(X, y) 190 | for idx, (train_idx, test_idx) in enumerate(skf): 191 | if verbose: 192 | print("training fold {} of {}".format(idx+1, self.cv)) 193 | X_train = np.array(X)[train_idx,:] 194 | X_test = np.array(X)[test_idx,:] 195 | y_train = np.array(y_mod)[train_idx] 196 | # We could also copy the model first and then fit it 197 | this_estimator = clone(self.base_estimator) 198 | this_estimator.fit(X_train,y_train) 199 | y_pred[test_idx,:] = this_estimator.predict_proba(X_test) 200 | 201 | if verbose: 202 | print("Training Full Model") 203 | self.uncalibrated_classifier = clone(self.base_estimator) 204 | self.uncalibrated_classifier.fit(X, y_mod) 205 | 206 | # calibrating function 207 | if verbose: 208 | print("Determining Calibration Function") 209 | if self.method=='logistic': 210 | self.calib_func, self.cf_list = prob_calibration_function_multiclass(y_mod, self.pre_transform(y_pred), verbose=verbose, **self.calib_kwargs) 211 | if self.method=='ridge': 212 | self.calib_func, self.cf_list = prob_calibration_function_multiclass(y_mod, self.pre_transform(y_pred), verbose=verbose, method='ridge', **self.calib_kwargs) 213 | # training full model 214 | 215 | return self 216 | 217 | 218 | def predict_proba(self, X): 219 | """Posterior probabilities of classification 220 | 221 | This function returns posterior probabilities of classification 222 | according to each class on an array of test vectors X. 223 | 224 | Parameters 225 | ---------- 226 | X : array-like, shape (n_samples, n_features) 227 | The samples. 228 | 229 | Returns 230 | ------- 231 | C : array, shape (n_samples, n_classes) 232 | The predicted probas. 233 | """ 234 | # check_is_fitted(self, ["classes_", "calibrated_classifier"]) 235 | if self.fit_on_multiclass: 236 | return self.calib_func(self.pre_transform(self.uncalibrated_classifier.predict_proba(X))) 237 | 238 | col_1 = self.calib_func(self.pre_transform(self.uncalibrated_classifier.predict_proba(X)[:,1])) 239 | col_0 = 1-col_1 240 | return np.vstack((col_0,col_1)).T 241 | 242 | 243 | 244 | def predict(self, X): 245 | """Predict the target of new samples. Can be different from the 246 | prediction of the uncalibrated classifier. 247 | 248 | Parameters 249 | ---------- 250 | X : array-like, shape (n_samples, n_features) 251 | The samples. 252 | 253 | Returns 254 | ------- 255 | C : array, shape (n_samples,) 256 | The predicted class. 
257 | """ 258 | # check_is_fitted(self, ["classes_", "calibrated_classifier"]) 259 | return self.uncalibrated_classifier.classes_[np.argmax(self.predict_proba(X), axis=1)] 260 | 261 | def classes_(self): 262 | return self.uncalibrated_classifier.classes_ 263 | 264 | 265 | 266 | """Calibration of predicted probabilities.""" 267 | import numpy as np 268 | import sklearn 269 | from sklearn.base import BaseEstimator, ClassifierMixin, clone 270 | 271 | try: 272 | from sklearn.model_selection import StratifiedKFold 273 | except: 274 | from sklearn.cross_validation import StratifiedKFold 275 | 276 | from .calibration_utils import prob_calibration_function_multiclass 277 | 278 | 279 | class SplineCalibratedClassifierMulticlassCV(BaseEstimator, ClassifierMixin): 280 | """Probability calibration using cubic splines. 281 | 282 | With this class, the base_estimator is fit on each of the cross-validation 283 | training set folds in order to generate scores on the (cross-validated) 284 | test set folds. The test set scores are accumulated into a final vector 285 | (the size of the full set) which is used to calibrate the answers. 286 | The model is then fit on the full data set. The predict, and predict_proba 287 | methods are then updated to use the combination of the predictions from the 288 | full model and the calibration function computed as above. 289 | 290 | Parameters 291 | ---------- 292 | base_estimator : instance BaseEstimator 293 | The classifier whose output decision function needs to be calibrated 294 | to offer more accurate predict_proba outputs. If cv='prefit', the 295 | classifier must have been fit already on data. 296 | 297 | method : 'logistic' or 'ridge' 298 | The default is 'logistic', which is best if you plan to use log-loss as your 299 | performance metric. This method is relatively robust and will typically do 300 | well on brier score as well. The 'ridge' method calibrates using an L2 loss, 301 | and therefore should do better for brier score, but may do considerably worse 302 | on log-loss. 303 | 304 | cv : integer, cross-validation generator, iterable or "prefit", optional 305 | Determines the cross-validation splitting strategy. 306 | Possible inputs for cv are: 307 | 308 | - None, to use the default 5-fold cross-validation, 309 | - integer, to specify the number of folds. 310 | - 'prefit', if you wish to use the data only for calibration 311 | 312 | For integer/None inputs, if ``y`` is binary or multiclass, 313 | :class:`sklearn.model_selection.StratifiedKFold` is used. If ``y`` is 314 | neither binary nor multiclass, :class:`sklearn.model_selection.KFold` 315 | is used. 316 | 317 | Refer :ref:`User Guide ` for the various 318 | cross-validation strategies that can be used here. 319 | 320 | If "prefit" is passed, it is assumed that base_estimator has been 321 | fitted already and all data is used for calibration. 322 | 323 | Attributes 324 | ---------- 325 | uncalibrated_classifier: this gives the uncalibrated version of the classifier, fit on the entire data set 326 | 327 | calib_func: this is the calibration function that has been learned from the cross-validation. Applying this function 328 | to the results of the uncalibrated classifier (via model.predict_proba(X_test)[:,1]) gives the fully calibrated classifier 329 | 330 | References 331 | ---------- 332 | """ 333 | def __init__(self, base_estimator=None, method='logistic', cv=5, **calib_kwargs): 334 | warn_msg = ('\nThis class is deprecated and will eventually be removed.' 
+ 335 | '\nPlease use the SplineCalib class for calibration.') 336 | warnings.warn(warn_msg, FutureWarning) 337 | 338 | self.base_estimator = base_estimator 339 | self.uncalibrated_classifier = None 340 | self.calib_func = None 341 | self.method = method 342 | self.cv = cv 343 | self.calib_kwargs = calib_kwargs 344 | 345 | def fit(self, X, y, verbose=False): 346 | """Fit the calibrated model 347 | 348 | Parameters 349 | ---------- 350 | X : array-like, shape (n_samples, n_features) 351 | Training data. 352 | 353 | y : array-like, shape (n_samples,) 354 | Target values. 355 | 356 | Returns 357 | ------- 358 | self : object 359 | Returns an instance of self. 360 | """ 361 | class_list = np.unique(y) 362 | num_classes = len(class_list) 363 | y_mod = np.zeros(len(y)) 364 | 365 | for i in range(num_classes): 366 | y_mod[np.where(y==class_list[i])]=i 367 | 368 | y_mod = y_mod.astype(int) 369 | if ((type(self.cv)==str) and (self.cv=='prefit')): 370 | self.uncalibrated_classifier = self.base_estimator 371 | y_pred = self.uncalibrated_classifier.predict_proba(X)[:,1] 372 | 373 | else: 374 | y_pred = np.zeros((len(y_mod),num_classes)) 375 | if sklearn.__version__ < '0.18': 376 | skf = StratifiedKFold(y_mod, n_folds=self.cv,shuffle=True) 377 | else: 378 | skf = StratifiedKFold(n_splits=self.cv, shuffle=True).split(X, y) 379 | for idx, (train_idx, test_idx) in enumerate(skf): 380 | if verbose: 381 | print("training fold {} of {}".format(idx+1, self.cv)) 382 | X_train = np.array(X)[train_idx,:] 383 | X_test = np.array(X)[test_idx,:] 384 | y_train = np.array(y_mod)[train_idx] 385 | # We could also copy the model first and then fit it 386 | this_estimator = clone(self.base_estimator) 387 | this_estimator.fit(X_train,y_train) 388 | y_pred[test_idx,:] = this_estimator.predict_proba(X_test) 389 | 390 | if verbose: 391 | print("Training Full Model") 392 | self.uncalibrated_classifier = clone(self.base_estimator) 393 | self.uncalibrated_classifier.fit(X, y_mod) 394 | 395 | # calibrating function 396 | if verbose: 397 | print("Determining Calibration Function") 398 | if self.method=='logistic': 399 | self.calib_func = prob_calibration_function_multiclass(y_mod, y_pred, verbose=verbose, **self.calib_kwargs) 400 | if self.method=='ridge': 401 | self.calib_func = prob_calibration_function_multiclass(y_mod, y_pred, verbose=verbose, method='ridge', **self.calib_kwargs) 402 | # training full model 403 | 404 | return self 405 | 406 | def predict_proba(self, X): 407 | """Posterior probabilities of classification 408 | 409 | This function returns posterior probabilities of classification 410 | according to each class on an array of test vectors X. 411 | 412 | Parameters 413 | ---------- 414 | X : array-like, shape (n_samples, n_features) 415 | The samples. 416 | 417 | Returns 418 | ------- 419 | C : array, shape (n_samples, n_classes) 420 | The predicted probas. 421 | """ 422 | # check_is_fitted(self, ["classes_", "calibrated_classifier"]) 423 | return self.calib_func(self.uncalibrated_classifier.predict_proba(X)) 424 | 425 | 426 | def predict(self, X): 427 | """Predict the target of new samples. Can be different from the 428 | prediction of the uncalibrated classifier. 429 | 430 | Parameters 431 | ---------- 432 | X : array-like, shape (n_samples, n_features) 433 | The samples. 434 | 435 | Returns 436 | ------- 437 | C : array, shape (n_samples,) 438 | The predicted class. 
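        Examples
        --------
        Illustrative sketch on a three-class problem (the iris data and the
        LogisticRegression base estimator are stand-ins, not part of this
        module):

        >>> from sklearn.datasets import load_iris
        >>> from sklearn.linear_model import LogisticRegression
        >>> X, y = load_iris(return_X_y=True)
        >>> calib_clf = SplineCalibratedClassifierMulticlassCV(
        ...     LogisticRegression(max_iter=1000), cv=5)
        >>> calib_clf = calib_clf.fit(X, y)
        >>> probs = calib_clf.predict_proba(X)   # one column per class
        >>> labels = calib_clf.predict(X)        # labels from the original classes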
439 | """ 440 | # check_is_fitted(self, ["classes_", "calibrated_classifier"]) 441 | return self.uncalibrated_classifier.classes_[np.argmax(self.predict_proba(X), axis=1)] 442 | 443 | def classes_(self): 444 | return self.uncalibrated_classifier.classes_ 445 | -------------------------------------------------------------------------------- /ml_insights/cross_validation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.model_selection import StratifiedKFold 4 | 5 | def cv_predict_proba(X, y,estimator, cv): 6 | 7 | ## Convert from Pandas to Numpy if necessary 8 | if (type(X)==pd.DataFrame) or (type(X)==pd.Series): 9 | X = X.values 10 | if (type(y)==pd.DataFrame) or (type(y)==pd.Series): 11 | y = y.values 12 | 13 | num_classes = len(np.unique(y)) 14 | out_vec=np.zeros((len(y),num_classes)) 15 | 16 | #Main loop to do cross-validated predict proba and construct output matrix 17 | for tr, te in cv.split(X,y): 18 | estimator.fit(X[tr],y[tr]) 19 | out_vals = estimator.predict_proba(X[te]) 20 | out_vec[te,:] = out_vals 21 | return out_vec 22 | 23 | 24 | def cv_score(X, y, estimator, cv, score_fn): 25 | return(score_fn(y,cv_predict_proba(X,y,estimator,cv))) 26 | 27 | def _get_param_settings_from_grid(param_grid): 28 | num_settings = np.prod([len(i) for i in param_grid.values()]) 29 | pg_tuple = tuple(param_grid.items()) 30 | param_names = [k[0] for k in pg_tuple] 31 | param_lists = [k[1] for k in pg_tuple] 32 | param_list_lengths = [len(k) for k in param_lists] 33 | param_dict_list = [] 34 | for i in range(num_settings): 35 | indices = _int_to_indices(i,param_list_lengths) 36 | curr_param_dict = {} 37 | for k in range(len(param_names)): 38 | curr_param_dict[param_names[k]]=param_lists[k][indices[k]] 39 | param_dict_list.append(curr_param_dict) 40 | return param_dict_list 41 | 42 | def _int_to_indices(j,lengths): 43 | out_list = [] 44 | for i in range(len(lengths)): 45 | curr_ind = j % lengths[i] 46 | out_list.append(curr_ind) 47 | j = j//lengths[i] 48 | return(out_list) 49 | 50 | def grid_search(X,y, model, param_grid, score_fn, verbose=True): 51 | param_arg_list = _get_param_settings_from_grid(param_grid) 52 | num_settings = len(param_arg_list) 53 | param_list_scores = np.zeros(num_settings) 54 | skf = StratifiedKFold(5, shuffle=True, random_state=42) 55 | for i in range(num_settings): 56 | curr_param_dict = param_arg_list[i] 57 | if verbose: 58 | print(curr_param_dict) 59 | model.set_params(**curr_param_dict) 60 | curr_score=cv_score(X,y,model,skf,score_fn) 61 | param_list_scores[i]=curr_score 62 | if verbose: 63 | print(curr_param_dict,curr_score) 64 | 65 | return(list(zip(param_arg_list,param_list_scores))) 66 | 67 | -------------------------------------------------------------------------------- /ml_insights/data/ortho.csv: -------------------------------------------------------------------------------- 1 | contrast1,contrast2,answer 2 | 0.59999999999999998,0.29999999999999999,1 3 | 0.65000000000000002,0.29999999999999999,1 4 | 0.69999999999999996,0.29999999999999999,1 5 | 0.5,0.29999999999999999,1 6 | 0.10000000000000001,0.29999999999999999,2 7 | 0.01,0.29999999999999999,2 8 | 0.20000000000000001,0.29999999999999999,2 9 | 0.5,0.29999999999999999,1 10 | 0.10000000000000001,0.29999999999999999,2 11 | 0.90000000000000002,0.29999999999999999,1 12 | 0.90000000000000002,0.29999999999999999,1 13 | 0.55000000000000004,0.29999999999999999,1 14 | 0.40000000000000002,0.29999999999999999,1 15 | 
0.20000000000000001,0.29999999999999999,2 16 | 0.20000000000000001,0.29999999999999999,2 17 | 0.29999999999999999,0.29999999999999999,2 18 | 0.65000000000000002,0.29999999999999999,1 19 | 0.20000000000000001,0.29999999999999999,2 20 | 0.5,0.29999999999999999,1 21 | 0.5,0.29999999999999999,1 22 | 0.01,0.29999999999999999,1 23 | 0.90000000000000002,0.29999999999999999,1 24 | 0.65000000000000002,0.29999999999999999,1 25 | 0.90000000000000002,0.29999999999999999,1 26 | 0.5,0.29999999999999999,1 27 | 0.01,0.29999999999999999,2 28 | 0.59999999999999998,0.29999999999999999,1 29 | 0.65000000000000002,0.29999999999999999,1 30 | 0.5,0.29999999999999999,2 31 | 0.01,0.29999999999999999,2 32 | 0.69999999999999996,0.29999999999999999,1 33 | 0.69999999999999996,0.29999999999999999,1 34 | 0.29999999999999999,0.29999999999999999,2 35 | 0.20000000000000001,0.29999999999999999,2 36 | 0.90000000000000002,0.29999999999999999,1 37 | 0.5,0.29999999999999999,2 38 | 0.10000000000000001,0.29999999999999999,1 39 | 0.01,0.29999999999999999,2 40 | 0.29999999999999999,0.29999999999999999,2 41 | 0.5,0.29999999999999999,2 42 | 0.69999999999999996,0.29999999999999999,1 43 | 0.69999999999999996,0.29999999999999999,1 44 | 0.90000000000000002,0.29999999999999999,1 45 | 0.90000000000000002,0.29999999999999999,1 46 | 0.65000000000000002,0.29999999999999999,1 47 | 0.40000000000000002,0.29999999999999999,1 48 | 0.65000000000000002,0.29999999999999999,1 49 | 0.01,0.29999999999999999,2 50 | 0.29999999999999999,0.29999999999999999,2 51 | 0.59999999999999998,0.29999999999999999,2 52 | 0.01,0.29999999999999999,2 53 | 0.20000000000000001,0.29999999999999999,2 54 | 0.10000000000000001,0.29999999999999999,2 55 | 0.5,0.29999999999999999,1 56 | 0.69999999999999996,0.29999999999999999,1 57 | 0.01,0.29999999999999999,2 58 | 0.10000000000000001,0.29999999999999999,2 59 | 0.55000000000000004,0.29999999999999999,1 60 | 0.01,0.29999999999999999,2 61 | 0.29999999999999999,0.29999999999999999,2 62 | 0.40000000000000002,0.29999999999999999,2 63 | 0.20000000000000001,0.29999999999999999,1 64 | 0.29999999999999999,0.29999999999999999,2 65 | 0.55000000000000004,0.29999999999999999,2 66 | 0.10000000000000001,0.29999999999999999,2 67 | 0.40000000000000002,0.29999999999999999,1 68 | 0.10000000000000001,0.29999999999999999,2 69 | 0.10000000000000001,0.29999999999999999,2 70 | 0.69999999999999996,0.29999999999999999,1 71 | 0.59999999999999998,0.29999999999999999,1 72 | 0.59999999999999998,0.29999999999999999,1 73 | 0.5,0.29999999999999999,1 74 | 0.40000000000000002,0.29999999999999999,2 75 | 0.55000000000000004,0.29999999999999999,1 76 | 0.29999999999999999,0.29999999999999999,2 77 | 0.55000000000000004,0.29999999999999999,2 78 | 0.10000000000000001,0.29999999999999999,2 79 | 0.65000000000000002,0.29999999999999999,1 80 | 0.5,0.29999999999999999,1 81 | 0.5,0.29999999999999999,2 82 | 0.65000000000000002,0.29999999999999999,1 83 | 0.55000000000000004,0.29999999999999999,2 84 | 0.29999999999999999,0.29999999999999999,2 85 | 0.55000000000000004,0.29999999999999999,1 86 | 0.55000000000000004,0.29999999999999999,1 87 | 0.65000000000000002,0.29999999999999999,1 88 | 0.5,0.29999999999999999,2 89 | 0.10000000000000001,0.29999999999999999,2 90 | 0.10000000000000001,0.29999999999999999,2 91 | 0.90000000000000002,0.29999999999999999,1 92 | 0.55000000000000004,0.29999999999999999,2 93 | 0.20000000000000001,0.29999999999999999,2 94 | 0.20000000000000001,0.29999999999999999,2 95 | 0.20000000000000001,0.29999999999999999,1 96 | 0.40000000000000002,0.29999999999999999,2 
97 | 0.65000000000000002,0.29999999999999999,1 98 | 0.65000000000000002,0.29999999999999999,1 99 | 0.5,0.29999999999999999,1 100 | 0.59999999999999998,0.29999999999999999,2 101 | 0.65000000000000002,0.29999999999999999,1 102 | 0.5,0.29999999999999999,2 103 | 0.59999999999999998,0.29999999999999999,2 104 | 0.59999999999999998,0.29999999999999999,1 105 | 0.59999999999999998,0.29999999999999999,1 106 | 0.40000000000000002,0.29999999999999999,2 107 | 0.90000000000000002,0.29999999999999999,1 108 | 0.90000000000000002,0.29999999999999999,1 109 | 0.29999999999999999,0.29999999999999999,2 110 | 0.40000000000000002,0.29999999999999999,2 111 | 0.40000000000000002,0.29999999999999999,2 112 | 0.59999999999999998,0.29999999999999999,1 113 | 0.69999999999999996,0.29999999999999999,1 114 | 0.10000000000000001,0.29999999999999999,2 115 | 0.59999999999999998,0.29999999999999999,1 116 | 0.5,0.29999999999999999,1 117 | 0.55000000000000004,0.29999999999999999,1 118 | 0.55000000000000004,0.29999999999999999,1 119 | 0.5,0.29999999999999999,1 120 | 0.55000000000000004,0.29999999999999999,1 121 | 0.40000000000000002,0.29999999999999999,1 122 | 0.59999999999999998,0.29999999999999999,1 123 | 0.10000000000000001,0.29999999999999999,2 124 | 0.59999999999999998,0.29999999999999999,1 125 | 0.59999999999999998,0.29999999999999999,1 126 | 0.20000000000000001,0.29999999999999999,2 127 | 0.01,0.29999999999999999,2 128 | 0.5,0.29999999999999999,1 129 | 0.90000000000000002,0.29999999999999999,1 130 | 0.65000000000000002,0.29999999999999999,2 131 | 0.65000000000000002,0.29999999999999999,1 132 | 0.29999999999999999,0.29999999999999999,2 133 | 0.55000000000000004,0.29999999999999999,1 134 | 0.5,0.29999999999999999,2 135 | 0.90000000000000002,0.29999999999999999,1 136 | 0.90000000000000002,0.29999999999999999,1 137 | 0.65000000000000002,0.29999999999999999,1 138 | 0.20000000000000001,0.29999999999999999,2 139 | 0.40000000000000002,0.29999999999999999,2 140 | 0.65000000000000002,0.29999999999999999,1 141 | 0.69999999999999996,0.29999999999999999,1 142 | 0.90000000000000002,0.29999999999999999,1 143 | 0.69999999999999996,0.29999999999999999,1 144 | 0.40000000000000002,0.29999999999999999,1 145 | 0.55000000000000004,0.29999999999999999,1 146 | 0.40000000000000002,0.29999999999999999,2 147 | 0.29999999999999999,0.29999999999999999,2 148 | 0.69999999999999996,0.29999999999999999,1 149 | 0.29999999999999999,0.29999999999999999,2 150 | 0.59999999999999998,0.29999999999999999,1 151 | 0.40000000000000002,0.29999999999999999,2 152 | 0.55000000000000004,0.29999999999999999,2 153 | 0.20000000000000001,0.29999999999999999,2 154 | 0.01,0.29999999999999999,2 155 | 0.10000000000000001,0.29999999999999999,2 156 | 0.90000000000000002,0.29999999999999999,1 157 | 0.29999999999999999,0.29999999999999999,2 158 | 0.10000000000000001,0.29999999999999999,2 159 | 0.59999999999999998,0.29999999999999999,1 160 | 0.01,0.29999999999999999,2 161 | 0.20000000000000001,0.29999999999999999,2 162 | 0.65000000000000002,0.29999999999999999,1 163 | 0.59999999999999998,0.29999999999999999,1 164 | 0.69999999999999996,0.29999999999999999,1 165 | 0.20000000000000001,0.29999999999999999,2 166 | 0.65000000000000002,0.29999999999999999,1 167 | 0.65000000000000002,0.29999999999999999,1 168 | 0.10000000000000001,0.29999999999999999,1 169 | 0.59999999999999998,0.29999999999999999,1 170 | 0.90000000000000002,0.29999999999999999,1 171 | 0.29999999999999999,0.29999999999999999,2 172 | 0.69999999999999996,0.29999999999999999,1 173 | 0.40000000000000002,0.29999999999999999,1 
174 | 0.20000000000000001,0.29999999999999999,2 175 | 0.69999999999999996,0.29999999999999999,1 176 | 0.01,0.29999999999999999,2 177 | 0.20000000000000001,0.29999999999999999,2 178 | 0.5,0.29999999999999999,2 179 | -------------------------------------------------------------------------------- /ml_insights/data/para.csv: -------------------------------------------------------------------------------- 1 | contrast1,contrast2,answer 2 | 0.10000000000000001,0.29999999999999999,2 3 | 0.90000000000000002,0.29999999999999999,1 4 | 0.90000000000000002,0.29999999999999999,1 5 | 0.40000000000000002,0.29999999999999999,2 6 | 0.01,0.29999999999999999,2 7 | 0.40000000000000002,0.29999999999999999,2 8 | 0.20000000000000001,0.29999999999999999,2 9 | 0.40000000000000002,0.29999999999999999,2 10 | 0.20000000000000001,0.29999999999999999,2 11 | 0.55000000000000004,0.29999999999999999,2 12 | 0.65000000000000002,0.29999999999999999,2 13 | 0.5,0.29999999999999999,2 14 | 0.29999999999999999,0.29999999999999999,2 15 | 0.20000000000000001,0.29999999999999999,2 16 | 0.90000000000000002,0.29999999999999999,1 17 | 0.5,0.29999999999999999,1 18 | 0.69999999999999996,0.29999999999999999,2 19 | 0.59999999999999998,0.29999999999999999,2 20 | 0.90000000000000002,0.29999999999999999,1 21 | 0.55000000000000004,0.29999999999999999,2 22 | 0.55000000000000004,0.29999999999999999,2 23 | 0.5,0.29999999999999999,2 24 | 0.20000000000000001,0.29999999999999999,2 25 | 0.01,0.29999999999999999,2 26 | 0.29999999999999999,0.29999999999999999,2 27 | 0.29999999999999999,0.29999999999999999,2 28 | 0.20000000000000001,0.29999999999999999,2 29 | 0.69999999999999996,0.29999999999999999,1 30 | 0.69999999999999996,0.29999999999999999,1 31 | 0.5,0.29999999999999999,2 32 | 0.10000000000000001,0.29999999999999999,2 33 | 0.59999999999999998,0.29999999999999999,2 34 | 0.20000000000000001,0.29999999999999999,2 35 | 0.59999999999999998,0.29999999999999999,2 36 | 0.65000000000000002,0.29999999999999999,1 37 | 0.55000000000000004,0.29999999999999999,1 38 | 0.01,0.29999999999999999,2 39 | 0.55000000000000004,0.29999999999999999,2 40 | 0.29999999999999999,0.29999999999999999,2 41 | 0.01,0.29999999999999999,2 42 | 0.5,0.29999999999999999,2 43 | 0.20000000000000001,0.29999999999999999,2 44 | 0.69999999999999996,0.29999999999999999,1 45 | 0.5,0.29999999999999999,1 46 | 0.90000000000000002,0.29999999999999999,1 47 | 0.55000000000000004,0.29999999999999999,1 48 | 0.59999999999999998,0.29999999999999999,1 49 | 0.59999999999999998,0.29999999999999999,2 50 | 0.01,0.29999999999999999,2 51 | 0.65000000000000002,0.29999999999999999,1 52 | 0.90000000000000002,0.29999999999999999,1 53 | 0.55000000000000004,0.29999999999999999,1 54 | 0.59999999999999998,0.29999999999999999,2 55 | 0.5,0.29999999999999999,2 56 | 0.55000000000000004,0.29999999999999999,2 57 | 0.5,0.29999999999999999,2 58 | 0.29999999999999999,0.29999999999999999,2 59 | 0.01,0.29999999999999999,2 60 | 0.55000000000000004,0.29999999999999999,2 61 | 0.59999999999999998,0.29999999999999999,1 62 | 0.01,0.29999999999999999,2 63 | 0.10000000000000001,0.29999999999999999,2 64 | 0.90000000000000002,0.29999999999999999,1 65 | 0.10000000000000001,0.29999999999999999,2 66 | 0.29999999999999999,0.29999999999999999,2 67 | 0.20000000000000001,0.29999999999999999,2 68 | 0.40000000000000002,0.29999999999999999,2 69 | 0.5,0.29999999999999999,2 70 | 0.65000000000000002,0.29999999999999999,1 71 | 0.69999999999999996,0.29999999999999999,1 72 | 0.5,0.29999999999999999,2 73 | 0.90000000000000002,0.29999999999999999,1 74 | 
0.65000000000000002,0.29999999999999999,1 75 | 0.10000000000000001,0.29999999999999999,2 76 | 0.90000000000000002,0.29999999999999999,1 77 | 0.59999999999999998,0.29999999999999999,1 78 | 0.69999999999999996,0.29999999999999999,1 79 | 0.5,0.29999999999999999,1 80 | 0.20000000000000001,0.29999999999999999,2 81 | 0.10000000000000001,0.29999999999999999,2 82 | 0.55000000000000004,0.29999999999999999,1 83 | 0.10000000000000001,0.29999999999999999,2 84 | 0.59999999999999998,0.29999999999999999,1 85 | 0.90000000000000002,0.29999999999999999,1 86 | 0.5,0.29999999999999999,2 87 | 0.5,0.29999999999999999,2 88 | 0.5,0.29999999999999999,2 89 | 0.40000000000000002,0.29999999999999999,2 90 | 0.69999999999999996,0.29999999999999999,1 91 | 0.55000000000000004,0.29999999999999999,2 92 | 0.90000000000000002,0.29999999999999999,1 93 | 0.5,0.29999999999999999,1 94 | 0.01,0.29999999999999999,2 95 | 0.65000000000000002,0.29999999999999999,1 96 | 0.20000000000000001,0.29999999999999999,2 97 | 0.55000000000000004,0.29999999999999999,1 98 | 0.59999999999999998,0.29999999999999999,1 99 | 0.40000000000000002,0.29999999999999999,2 100 | 0.10000000000000001,0.29999999999999999,2 101 | 0.90000000000000002,0.29999999999999999,1 102 | 0.40000000000000002,0.29999999999999999,2 103 | 0.65000000000000002,0.29999999999999999,1 104 | 0.90000000000000002,0.29999999999999999,1 105 | 0.29999999999999999,0.29999999999999999,2 106 | 0.59999999999999998,0.29999999999999999,1 107 | 0.01,0.29999999999999999,2 108 | 0.29999999999999999,0.29999999999999999,2 109 | 0.65000000000000002,0.29999999999999999,1 110 | 0.10000000000000001,0.29999999999999999,2 111 | 0.29999999999999999,0.29999999999999999,2 112 | 0.59999999999999998,0.29999999999999999,1 113 | 0.01,0.29999999999999999,2 114 | 0.01,0.29999999999999999,2 115 | 0.40000000000000002,0.29999999999999999,2 116 | 0.90000000000000002,0.29999999999999999,1 117 | 0.29999999999999999,0.29999999999999999,2 118 | 0.20000000000000001,0.29999999999999999,2 119 | 0.40000000000000002,0.29999999999999999,2 120 | 0.20000000000000001,0.29999999999999999,1 121 | 0.5,0.29999999999999999,2 122 | 0.90000000000000002,0.29999999999999999,1 123 | 0.20000000000000001,0.29999999999999999,2 124 | 0.59999999999999998,0.29999999999999999,2 125 | 0.10000000000000001,0.29999999999999999,2 126 | 0.40000000000000002,0.29999999999999999,2 127 | 0.29999999999999999,0.29999999999999999,2 128 | 0.20000000000000001,0.29999999999999999,2 129 | 0.90000000000000002,0.29999999999999999,1 130 | 0.01,0.29999999999999999,2 131 | 0.65000000000000002,0.29999999999999999,2 132 | 0.01,0.29999999999999999,2 133 | 0.29999999999999999,0.29999999999999999,2 134 | 0.5,0.29999999999999999,1 135 | 0.69999999999999996,0.29999999999999999,2 136 | 0.10000000000000001,0.29999999999999999,2 137 | 0.01,0.29999999999999999,2 138 | 0.59999999999999998,0.29999999999999999,2 139 | 0.65000000000000002,0.29999999999999999,1 140 | 0.10000000000000001,0.29999999999999999,1 141 | 0.5,0.29999999999999999,1 142 | 0.55000000000000004,0.29999999999999999,2 143 | 0.90000000000000002,0.29999999999999999,1 144 | 0.55000000000000004,0.29999999999999999,1 145 | 0.20000000000000001,0.29999999999999999,2 146 | 0.69999999999999996,0.29999999999999999,1 147 | 0.29999999999999999,0.29999999999999999,2 148 | 0.40000000000000002,0.29999999999999999,2 149 | 0.01,0.29999999999999999,2 150 | 0.5,0.29999999999999999,2 151 | 0.59999999999999998,0.29999999999999999,1 152 | 0.01,0.29999999999999999,2 153 | 0.40000000000000002,0.29999999999999999,2 154 | 
0.10000000000000001,0.29999999999999999,2 155 | 0.5,0.29999999999999999,1 156 | 0.69999999999999996,0.29999999999999999,2 157 | 0.90000000000000002,0.29999999999999999,1 158 | 0.10000000000000001,0.29999999999999999,2 159 | 0.40000000000000002,0.29999999999999999,2 160 | 0.59999999999999998,0.29999999999999999,1 161 | 0.29999999999999999,0.29999999999999999,2 162 | 0.20000000000000001,0.29999999999999999,2 163 | 0.40000000000000002,0.29999999999999999,2 164 | 0.40000000000000002,0.29999999999999999,2 165 | 0.55000000000000004,0.29999999999999999,2 166 | 0.40000000000000002,0.29999999999999999,2 167 | 0.55000000000000004,0.29999999999999999,1 168 | -------------------------------------------------------------------------------- /ml_insights/insights.py: -------------------------------------------------------------------------------- 1 | import math 2 | import warnings 3 | import numpy as np 4 | import pandas as pd 5 | from .utils import _gca, is_classifier, is_regressor 6 | 7 | 8 | class ModelXRay(object): 9 | """This class executes a model over a broad range of modified data points to analyze aspects of its performance. 10 | 11 | For each point in the data set, and for every feature involved of the prediction of the model, a new set of data 12 | points is created where the chosen feature is varied across its (empirical) range. These modified data points are 13 | fed into the model to get a set of model predictions for each feature-data point combination. 14 | 15 | It is desirable that the "data" object passed in be relatively large in size, since the algorithm will make 16 | some heuristic choices based on the ranges of values it sees. We suggest using at least 100 data points and preferably 17 | more than 500. 18 | 19 | It returns a results object, which can then be passed to functions such as feature_effect_summary and 20 | feature_dependence_plots to gain insight on the how the various features affect the target. The results 21 | object can also be used directly by a user who wants to operate at a low-level. 22 | 23 | Parameters 24 | ---------- 25 | 26 | 27 | model : A model object from sklearn or similar styled objects. The `predict` method will be used if it is 28 | a regression model, while `predict_proba` will be used if it is a (binary) classification model. Multi-class 29 | classifiers are not supported at this time. 30 | 31 | data : A DataFrame possessing the sameucture that the model would take as an argument. These methods are designed 32 | to be used on "test" data (i.e. data that was not used in the training of the model). However, there is nothing 33 | structural to prevent it from being used on training data, and there may be some insight gained by doing so. 34 | 35 | columns : a specific subset of columns to be used. Default is None, which means to use all available columns in *data* 36 | 37 | resolution : how many different "grid points" to use for each feature. The algorithm will use only the unique values 38 | it sees in *data* if there are fewer than *resolution* unique values. Otherwise it will use *resolution* linearly spaced 39 | values ranging from the min observed value to the max observed value. 
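    Examples
    --------
    A short sketch (``model`` stands for any fitted sklearn-style regressor or
    binary classifier, ``X_test`` for a held-out DataFrame, and ``'age'`` and
    ``'bmi'`` for two of its column names; all are placeholders rather than
    objects defined in this module):

    >>> xray = ModelXRay(model, X_test, columns=['age', 'bmi'], resolution=50)
    >>> xray.feature_effect_summary()
    >>> chosen_rows = xray.feature_dependence_plots(num_pts=5)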
40 | """ 41 | 42 | def __init__(self, model, data, columns=None, resolution=100, normalize_loc=None, pred_col_name = None, pred_col_index=1): 43 | 44 | self.model = model 45 | self.data = data 46 | self.pred_col_index = pred_col_index 47 | if type(data) == pd.DataFrame: 48 | if (pred_col_name != None) and (is_classifier(self.model)): 49 | self.pred_col_index = np.where(self.model.classes_ == pred_col_name)[0][0] 50 | self.pred_col_name = data.columns[self.pred_col_index] 51 | self.data_values = data.values 52 | 53 | 54 | else: 55 | self.data_values = data 56 | 57 | self.columns = columns 58 | self.results = self._model_xray(columns, resolution, normalize_loc) 59 | 60 | 61 | def _get_data_rows(self, row_nums): 62 | if type(self.data) == pd.DataFrame: 63 | return self.data.iloc[row_nums] 64 | else: 65 | return self.data[row_nums, :] 66 | 67 | 68 | def _get_predictions(self, rows): 69 | # Catch deprecated warnings from Predict call 70 | with warnings.catch_warnings(): 71 | warnings.simplefilter("ignore") 72 | 73 | if is_classifier(self.model): 74 | y_pred = self.model.predict_proba(rows)[:,self.pred_col_index] 75 | else: 76 | #print('off') 77 | y_pred = self.model.predict(rows) 78 | return y_pred 79 | 80 | 81 | def gen_model_pred(self, row, col_idx, values): 82 | rows = [] 83 | for val in values: 84 | sim_row = row.copy() 85 | sim_row[col_idx] = val 86 | rows.append(sim_row) 87 | # If the row is a Series, make it into a DF 88 | if type(rows[0]) == pd.Series: 89 | rows = pd.DataFrame(rows) 90 | y_pred = self._get_predictions(rows) 91 | return y_pred 92 | 93 | def _model_xray(self, columns, resolution, normalize_loc): 94 | '''This function executes a model over a broad range of conditions to analyze aspects of its performance. 95 | 96 | For each point in the data set, and for every feature involved of the prediction of the model, a new set of data 97 | points is created where the chosen feature is varied across its (empirical) range. These modified data points are 98 | fed into the model to get a set of model predictions for each feature-data point combination. 99 | 100 | It is desirable that the "data" object passed in be relatively large in size, since the algorithm will make 101 | some heuristic choices based on the ranges of values it sees. We suggest using at least 100 data points and preferably 102 | more than 500. 103 | 104 | It returns a results object, which can then be passed to functions such as feature_effect_summary and 105 | feature_dependence_plots to gain insight on the how the various features affect the target. The results 106 | object can also be used directly by a user who wants to operate at a low-level. 107 | 108 | Parameters 109 | ---------- 110 | 111 | model : A model object from sklearn or similar styled objects. The `predict` method will be used if it is 112 | a regression model, while `predict_proba` will be used if it is a (binary) classification model. Multi-class 113 | classifiers are not supported at this time. 114 | 115 | data : A DataFrame possessing the same structure that the model would take as an argument. These methods are designed 116 | to be used on "test" data (i.e. data that was not used in the training of the model). However, there is nothing 117 | structural to prevent it from being used on training data, and there may be some insight gained by doing so. 118 | 119 | columns : a specific subset of columns to be used. 
Default is None, which means to use all available columns in *data* 120 | 121 | resolution : how many different "grid points" to use for each feature. The algorithm will use only the unique values 122 | it sees in *data* if there are fewer than *resolution* unique values. Otherwise it will use *resolution* linearly spaced 123 | values ranging from the min observed value to the max observed value. 124 | 125 | Returns 126 | ------- 127 | 128 | results : The "results" object is a dictionary where the keys are the feature names and the values are a 2-tuple. This 129 | object is intended primarily to be passed to other functions to interact with and display the data. However, advanced 130 | users may wish to understand and/or use the object directly. 131 | 132 | The first element in the tuple is the set of different feature values that were substituted in for each data point. The 133 | second element in the tuple is matrix where the number of rows is the number of data points and the number of columns 134 | is the number of different feature values. The (i,j)th element of the matrix is the result of the model prediction when 135 | data point i has the feature in question set to jth value. 136 | ''' 137 | ## Convert Pandas DataFrame to nparray explicitly to make life easier 138 | #print('hello!!!') 139 | 140 | 141 | ## Determine the range of values to plot for the chosen column 142 | if columns is None: 143 | if type(self.data) == pd.DataFrame: 144 | columns = self.data.columns 145 | if type(self.data)==np.ndarray: 146 | columns = range(len(self.data[0])) # Assuming a 2-D Dataset 147 | else: 148 | # Verify that columns is an iterable 149 | try: 150 | iterator = iter(columns) 151 | except TypeError: 152 | # not iterable 153 | columns = [columns] 154 | else: 155 | # iterable 156 | pass 157 | 158 | # Build Column Index 159 | column_nums = [] 160 | if type(self.data) == pd.DataFrame: 161 | for column in columns: 162 | try: 163 | column_nums.append(self.data.columns.get_loc(column)) 164 | except KeyError: 165 | ## TODO 166 | pass 167 | else: 168 | # Column Index and Column Names are the same 169 | if type(columns[0]) == int: 170 | column_nums = columns 171 | else: 172 | column_nums = range(len(columns)) 173 | 174 | # Use the Numpy array of data values to ease indexing by col. 
numbers 175 | results = {} 176 | num_pts = len(self.data_values) 177 | for column_num, column_name in zip(column_nums, columns): 178 | if (len(np.unique(self.data_values[:,column_num])) > resolution): 179 | col_values = np.linspace(np.nanmin(self.data_values[:,column_num]), 180 | np.nanmax(self.data_values[:,column_num]),resolution) 181 | else: 182 | col_values = np.sort(np.unique(self.data_values[:,column_num])) 183 | ## Define the empty data structure to output 184 | out_matrix = np.zeros([num_pts,len(col_values)]) 185 | 186 | ## Generate predictions 187 | if type(self.data) == pd.DataFrame: 188 | rows = self.data.iterrows() 189 | else: 190 | rows = enumerate(self.data) 191 | for loop_idx, (row_idx, row) in enumerate(rows): 192 | y_pred = self.gen_model_pred(row, column_num, col_values) 193 | if normalize_loc=='start': 194 | y_pred = y_pred - y_pred[0] 195 | if normalize_loc=='end': 196 | y_pred = y_pred - y_pred[-1] 197 | if (type(normalize_loc)==int and normalize_loc>=0 and normalize_loc 0: 247 | num_features = min(num_features, len(columns)) 248 | else: 249 | num_features = len(columns) 250 | plot_data = [result_data[idx] for idx in sortind][-num_features:] 251 | 252 | if ax is None: 253 | ax = _gca() 254 | fig = ax.get_figure() 255 | fig.set_figwidth(10) 256 | fig.set_figheight(max(6, int(math.ceil(num_features*0.5)))) 257 | ax.boxplot(plot_data, notch=0, sym='+', vert=0, whis=1.5) 258 | ax.set_yticklabels([columns[idx] for idx in sortind][-num_features:]); 259 | 260 | 261 | def feature_dependence_plots(self, y_scaling='none', show_base_points=True, pts_selected='sample', 262 | columns = None, num_pts=5, figsize=None): 263 | '''This function visualizes the effect of a single variable in models with complicated dependencies. 264 | Given a dataset, it will select points in that dataset, and then change the select column across 265 | different values to view the effect of the model prediction given that variable. These have been called 266 | Individual Conditional Expectation plots (or ICE-plots), see Goldstein, Kapelner, Bleich, 267 | Pitkin. Peeking Inside the Black Box: Visualizing Statistical Learning With Plots of Individual 268 | Conditional Expectation. 
Journal of Computational and Graphical Statistics (March 2014) 269 | ''' 270 | 271 | import matplotlib.pyplot as plt 272 | 273 | if columns == None: 274 | columns = sorted(list(self.results.keys())) 275 | num_rows = len(self.results[columns[0]][1]) # Get number of sample rows 276 | if (type(pts_selected)==str and pts_selected=='sample'): 277 | row_indexes = np.random.choice(np.arange(num_rows), num_pts) 278 | else: 279 | row_indexes = pts_selected 280 | 281 | if show_base_points: 282 | base_rows = self._get_data_rows(row_indexes) 283 | y_base_points = self._get_predictions(base_rows) 284 | if y_scaling=='logit': 285 | y_base_points = np.log(y_base_points/(1-y_base_points)) 286 | if y_scaling=='logit10': 287 | y_base_points = np.log10(y_base_points/(1-y_base_points)) 288 | if y_scaling=='logit2': 289 | y_base_points = np.log2(y_base_points/(1-y_base_points)) 290 | else: 291 | y_base_points = None 292 | 293 | n_cols = min(3, len(columns)) 294 | n_rows = int(math.ceil(len(columns) / n_cols)) 295 | figsize = (n_cols * 4, n_rows * 4) 296 | fig, axes = plt.subplots(n_rows, n_cols, figsize=figsize) 297 | for col_name, ax in zip(columns, axes.flatten()): 298 | x = self.results[col_name][0] 299 | y_values = self.results[col_name][1][row_indexes] 300 | y_plot = y_values 301 | if y_scaling=='logit': 302 | y_plot = np.log(y_values/(1-y_values)) 303 | if y_scaling=='logit10': 304 | y_plot = np.log10(y_values/(1-y_values)) 305 | if y_scaling=='logit2': 306 | y_plot = np.log2(y_values/(1-y_values)) 307 | for y in y_plot: 308 | ax.plot(x, y) 309 | # Plot Base Points 310 | if y_base_points is not None: 311 | ax.scatter(base_rows[col_name], y_base_points) 312 | ax.set_title(col_name[:30]) 313 | plt.tight_layout() 314 | return row_indexes 315 | 316 | 317 | def explain_prediction_difference(self, index_1, index_2, tol=.03, verbose=True, decimals=4): 318 | '''Given the indices of two points in the "xray"-ed data set, this function gives an explanation 319 | of the factors contributing to the difference in the predictions. 320 | 321 | Starting with the first point given, the considers changing each feature from its current value to that 322 | possessed by the second point. The function evaluates the target in both scenarios and determines the 323 | feature value change that creates the biggest (absolute) change in the target. This change is selected 324 | and the current point becomes the new point with the new feature value. This is repeated until the new 325 | target value is within a factor of 1+tol of the second point. 326 | ''' 327 | data_row_1 = self._get_data_rows(index_1) 328 | data_row_2 = self._get_data_rows(index_2) 329 | return explain_prediction_difference(self.model, data_row_1, data_row_2, tol, verbose, decimals, self.pred_col_index) 330 | 331 | 332 | def importance_distribution_of_variable(model_result_array): 333 | max_result_vec = np.array(list(map(np.max,model_result_array))) 334 | min_result_vec = np.array(list(map(np.min,model_result_array))) 335 | return max_result_vec - min_result_vec 336 | 337 | 338 | def explain_prediction_difference(model, data_row_1, data_row_2, tol=.03, verbose=True, decimals = 4, pred_col_index=1): 339 | '''Given a model and two single row data frames, this function gives an explanation 340 | of the factors contributing to the difference in the predictions. 341 | 342 | Starting with the first point given, the considers changing each feature from its current value to that 343 | possessed by the second point. 
The function evaluates the target in both scenarios and determines the 344 | feature value change that creates the biggest (absolute) change in the target. This change is selected 345 | and the current point becomes the new point with the new feature value. This is repeated until the new 346 | target value is within a factor of 1+tol of the second point's target value. 347 | ''' 348 | column_names = data_row_1.index 349 | num_columns = len(column_names) 350 | 351 | dr_1 = data_row_1.values.reshape(1,-1) 352 | dr_2 = data_row_2.values.reshape(1,-1) 353 | column_list = list(range(num_columns)) 354 | curr_pt = np.copy(dr_1) 355 | if is_classifier(model): 356 | val1 = model.predict_proba(dr_1)[0,pred_col_index] 357 | val2 = model.predict_proba(dr_2)[0,pred_col_index] 358 | else: 359 | val1 = model.predict(dr_1)[0] 360 | val2 = model.predict(dr_2)[0] 361 | if verbose: 362 | print(val1, val2) 363 | print('Your initial point has a target value of {}'.format(np.round(val1,decimals=decimals))) 364 | print('Your final point has a target value of {}'.format(np.round(val2,decimals=decimals))) 365 | pt_list = [dr_1] 366 | val_list = [val1] 367 | curr_val = val1 368 | final_val = val2 369 | feat_list =[] 370 | move_list = [] 371 | feat_val_change_list = [] 372 | #for num_steps in range(4): 373 | while (((curr_val/final_val) >(1+tol)) or ((curr_val/final_val) <(1-tol))): 374 | biggest_move = 0 375 | best_column = -1 376 | best_val = curr_val 377 | for i in column_list: 378 | test_pt = np.copy(curr_pt) 379 | prev_feat_val = test_pt[0,i] 380 | subst_val = dr_2[0,i] 381 | test_pt[0,i] = subst_val 382 | if is_classifier(model): 383 | test_val = model.predict_proba(test_pt)[0,pred_col_index] 384 | else: 385 | test_val = model.predict(test_pt)[0] 386 | move_size = (test_val - curr_val) 387 | if(np.abs(move_size)>=np.abs(biggest_move)): 388 | biggest_move = move_size 389 | best_column = i 390 | best_val = test_val 391 | old_feat_val = prev_feat_val 392 | new_feat_val = subst_val 393 | subst_val = dr_2[0,best_column] 394 | curr_pt[0,best_column] = subst_val 395 | val_list.append(best_val) 396 | curr_val = best_val 397 | if verbose: 398 | print('Changing {} from {} to {}'.format(column_names[best_column],np.round(old_feat_val,decimals=decimals),np.round(new_feat_val,decimals=decimals))) 399 | print('\t\tchanges your target by {} to {}'.format(np.round(biggest_move,decimals=decimals), np.round(best_val,decimals=decimals))) 400 | print('----------') 401 | if not (((curr_val/final_val) >(1+tol)) or ((curr_val/final_val) <(1-tol))): 402 | print('Tolerance of {} reached'.format(tol)) 403 | print('Current value of {} is within {}% of {}'.format(np.round(curr_val,decimals=decimals),(100*tol),np.round(final_val,decimals=decimals))) 404 | feat_list.append(column_names[best_column]) 405 | column_list.remove(best_column) 406 | move_list.append(biggest_move) 407 | feat_val_change_list.append((old_feat_val, new_feat_val)) 408 | return feat_list, feat_val_change_list, move_list, val_list 409 | 410 | 411 | def explain_prediction_difference_xgboost(model, data_row_1, data_row_2, tol=.03, verbose=True, decimals = 4, pred_col_index=1): 412 | '''Given a model and two single-row data frames, this function gives an explanation 413 | of the factors contributing to the difference in the predictions. 414 | 415 | Starting with the first point given, the function considers changing each feature from its current value to that 416 | possessed by the second point.
The function evaluates the target in both scenarios and determines the 417 | feature value change that creates the biggest (absolute) change in the target. This change is selected 418 | and the current point becomes the new point with the new feature value. This is repeated until the new 419 | target value is within a factor of 1+tol of the second point. 420 | ''' 421 | column_names = data_row_1.columns 422 | num_columns = len(column_names) 423 | 424 | #dr_1 = data_row_1.values.reshape(1,-1) 425 | #dr_2 = data_row_2.values.reshape(1,-1) 426 | dr_1 = data_row_1 427 | dr_2 = data_row_2 428 | column_list = list(range(num_columns)) 429 | curr_pt = (dr_1).copy() 430 | if is_classifier(model): 431 | val1 = model.predict_proba(dr_1)[0,pred_col_index] 432 | val2 = model.predict_proba(dr_2)[0,pred_col_index] 433 | else: 434 | val1 = model.predict(dr_1)[0] 435 | val2 = model.predict(dr_2)[0] 436 | if verbose: 437 | print(val1, val2) 438 | print('Your initial point has a target value of {}'.format(np.round(val1,decimals=decimals))) 439 | print('Your final point has a target value of {}'.format(np.round(val2,decimals=decimals))) 440 | pt_list = [dr_1] 441 | val_list = [val1] 442 | curr_val = val1 443 | final_val = val2 444 | feat_list =[] 445 | move_list = [] 446 | feat_val_change_list = [] 447 | #for num_steps in range(4): 448 | while (((curr_val/final_val) >(1+tol)) or ((curr_val/final_val) <(1-tol))): 449 | biggest_move = 0 450 | best_column = -1 451 | best_val = curr_val 452 | for i in column_list: 453 | test_pt = (curr_pt).copy() 454 | prev_feat_val = test_pt.iloc[0,i] 455 | subst_val = dr_2.iloc[0,i] 456 | test_pt.iloc[0,i] = subst_val 457 | if is_classifier(model): 458 | test_val = model.predict_proba(test_pt)[0,pred_col_index] 459 | else: 460 | test_val = model.predict(test_pt)[0] 461 | move_size = (test_val - curr_val) 462 | if(np.abs(move_size)>=np.abs(biggest_move)): 463 | biggest_move = move_size 464 | best_column = i 465 | best_val = test_val 466 | old_feat_val = prev_feat_val 467 | new_feat_val = subst_val 468 | subst_val = dr_2.iloc[0,best_column] 469 | curr_pt.iloc[0,best_column] = subst_val 470 | val_list.append(best_val) 471 | curr_val = best_val 472 | if verbose: 473 | print('Changing {} from {} to {}'.format(column_names[best_column],np.round(old_feat_val,decimals=decimals),np.round(new_feat_val,decimals=decimals))) 474 | print('\t\tchanges your target by {} to {}'.format(np.round(biggest_move,decimals=decimals), np.round(best_val,decimals=decimals))) 475 | print('----------') 476 | if not (((curr_val/final_val) >(1+tol)) or ((curr_val/final_val) <(1-tol))): 477 | print('Tolerance of {} reached'.format(tol)) 478 | print('Current value of {} is within {}% of {}'.format(np.round(curr_val,decimals=decimals),(100*tol),np.round(final_val,decimals=decimals))) 479 | feat_list.append(column_names[best_column]) 480 | column_list.remove(best_column) 481 | move_list.append(biggest_move) 482 | feat_val_change_list.append((old_feat_val, new_feat_val)) 483 | return feat_list, feat_val_change_list, move_list, val_list 484 | 485 | 486 | 487 | 488 | -------------------------------------------------------------------------------- /ml_insights/shap_insights.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | try: 5 | import xgboost as xgb 6 | 7 | except ImportError: 8 | xgb_installed = False 9 | 10 | def consolidate_reason_scores(df_ind_expl, dict_map): 11 | reason_list = dict_map.keys() 12 | df_rsn = 
pd.DataFrame(columns = reason_list) 13 | for reason in reason_list: 14 | df_rsn[reason] = np.sum(df_ind_expl.loc[:,dict_map[reason]], axis=1) 15 | return df_rsn 16 | 17 | def get_reason_codes(df_rsn, thresh, direction='greater', delimiter=';'): 18 | nr, nc = df_rsn.shape 19 | argsort_mat = np.argsort(-df_rsn.values) 20 | if (direction=='lesser'): 21 | num_exceeding_thresh_vec = np.sum(df_rsn.values<=thresh, axis=1) 22 | else: 23 | num_exceeding_thresh_vec = np.sum(df_rsn.values>=thresh, axis=1) 24 | reason_mat = np.array([df_rsn.columns[i] for row in argsort_mat for i in row ]).reshape(nr,nc) 25 | reason_vec = np.array([delimiter.join(list(reason_mat[j][:num_exceeding_thresh_vec[j]])) for j in range(nr)]) 26 | return reason_vec 27 | 28 | def cv_column_shap(xgbcv, X_pr, fn): 29 | results = np.zeros((X_pr.shape[0], xgbcv.num_features+1)) 30 | fold_set = np.unique(fn) 31 | for fold in fold_set: 32 | X_te = xgb.DMatrix(X_pr[fn == fold].values) 33 | fold_results = xgbcv.model_dict[fold].get_booster().predict(X_te, pred_contribs=True, validate_features=False) 34 | results[fn==fold] = fold_results 35 | return results 36 | 37 | def predict_reasons_cv(xgbcv, X_pr, fn, reason_map, thresh, delimiter=';'): 38 | shap_val_mat = cv_column_shap(xgbcv, X_pr, fn) 39 | df_shap_val = pd.DataFrame(shap_val_mat[:,:-1], columns = X_pr.columns) 40 | df_reason_scores = consolidate_reason_scores(df_shap_val,reason_map) 41 | reason_list_vec = get_reason_codes(df_reason_scores, thresh, delimiter=delimiter) 42 | return(reason_list_vec) 43 | 44 | def predict_reason_strings(xgbmodel, X_pr, reason_map, thresh, delimiter=';', direction='greater'): 45 | X_pr_dmat = xgb.DMatrix(X_pr) 46 | shap_val_mat = xgbmodel.get_booster().predict(X_pr_dmat, pred_contribs=True, validate_features=False) 47 | df_shap_val = pd.DataFrame(shap_val_mat[:,:-1], columns = X_pr.columns) 48 | df_reason_scores = consolidate_reason_scores(df_shap_val,reason_map) 49 | reason_list_vec = get_reason_codes(df_reason_scores, thresh, direction=direction, delimiter=delimiter) 50 | return(reason_list_vec) 51 | 52 | def get_reason_score_matrix(xgbmodel, X_pr, validate=False): 53 | if (type(X_pr)==pd.DataFrame): 54 | X_test_dmat = xgb.DMatrix(X_pr) 55 | reason_list = list(X_pr.columns) + ['Intercept'] 56 | reas_mat = xgbmodel.get_booster().predict(X_test_dmat, pred_contribs=True, validate_features=validate) 57 | else: 58 | reason_list = ['f'+str(i) for i in range(X_pr.shape[1])] + ['Intercept'] 59 | X_test_dmat = xgb.DMatrix(X_pr, feature_names = reason_list[:-1]) 60 | reas_mat = xgbmodel.get_booster().predict(X_test_dmat, pred_contribs=True, validate_features=validate) 61 | return(pd.DataFrame(reas_mat, columns=reason_list)) 62 | 63 | # def augment_tree(tree_dict): 64 | # if 'leaf' in tree_dict.keys(): 65 | # value = tree_dict['leaf'] 66 | # tree_dict['value_at_node'] = value 67 | # return value 68 | # else: 69 | # a0 = tree_dict['children'][0]['cover'] 70 | # a1 = tree_dict['children'][1]['cover'] 71 | # value = (a0 * augment_tree(tree_dict['children'][0]) + a1 * augment_tree(tree_dict['children'][1]))/(a0 + a1) 72 | 73 | # tree_dict['value_at_node'] = value 74 | # return value 75 | 76 | -------------------------------------------------------------------------------- /ml_insights/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numeristical/introspective/dbe96f7fc4dfd24d7ed6a6982d661426d74ee172/ml_insights/tests/__init__.py 
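A note on usage: the reason-code helpers in ml_insights/shap_insights.py above (get_reason_score_matrix, consolidate_reason_scores, get_reason_codes, predict_reason_strings) are easiest to follow with a small end-to-end sketch. The snippet below is illustrative only and is not part of the repository: the feature names, the reason_map grouping, and the 0.1 threshold are made-up values, and it assumes xgboost is installed and that an sklearn-API XGBClassifier is used (so that get_booster() is available).

import numpy as np
import pandas as pd
import xgboost as xgb
from ml_insights.shap_insights import get_reason_score_matrix, predict_reason_strings

# Toy data with hypothetical feature names.
rng = np.random.default_rng(0)
X = pd.DataFrame({'age': rng.integers(18, 90, 500),
                  'heart_rate': rng.normal(80, 15, 500),
                  'lactate': rng.gamma(2.0, 1.0, 500)})
y = (X['lactate'] + 0.02 * X['age'] + rng.normal(0, 1, 500) > 4).astype(int)

model = xgb.XGBClassifier(n_estimators=50, max_depth=3).fit(X, y)

# One SHAP-style contribution column per feature, plus an 'Intercept' column.
score_df = get_reason_score_matrix(model, X)

# Group raw features into human-readable "reasons" (the grouping is illustrative).
reason_map = {'Vital signs': ['heart_rate'], 'Labs': ['lactate'], 'Demographics': ['age']}

# For each row, report the reason groups whose summed contribution exceeds 0.1,
# joined into a single delimited string such as 'Labs;Vital signs'.
reasons = predict_reason_strings(model, X, reason_map, thresh=0.1)
print(score_df.head())
print(reasons[:5])

Here predict_reason_strings recomputes the contributions internally; get_reason_score_matrix is shown only to illustrate the per-feature scores that get consolidated into reason groups.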
-------------------------------------------------------------------------------- /ml_insights/tests/test_example.py: -------------------------------------------------------------------------------- 1 | import os.path as op 2 | import numpy as np 3 | import pandas as pd 4 | import numpy.testing as npt 5 | import ml_insights as mli 6 | from sklearn.metrics import roc_auc_score, log_loss 7 | 8 | data_path = op.join(mli.__path__[0], 'data') 9 | 10 | 11 | def test_1(): 12 | assert(True) 13 | 14 | 15 | -------------------------------------------------------------------------------- /ml_insights/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import os.path as op 2 | import numpy as np 3 | import pandas as pd 4 | import numpy.testing as npt 5 | import ml_insights as mli 6 | from sklearn.metrics import roc_auc_score, log_loss 7 | 8 | data_path = op.join(mli.__path__[0], 'data') 9 | 10 | 11 | def test_get_range_dict(): 12 | """ 13 | Test the get_range_dict function on a sample dataframe. 14 | 15 | """ 16 | df = pd.read_csv(op.join(data_path,'faux_data.csv')) 17 | rd = mli.get_range_dict(df, max_pts=172) 18 | t1 = (len(rd['int1']), len(rd['int2'])) == (100, 172)  # compare as a tuple, not (int, bool) 19 | t2 = len(rd['float1']) == 172 20 | t3 = (len(rd['str1']), len(rd['str2'])) == (172, 50) 21 | assert t1 and t2 and t3 22 | 23 | 24 | -------------------------------------------------------------------------------- /ml_insights/utils.py: -------------------------------------------------------------------------------- 1 | def _gca(): 2 | import matplotlib.pyplot as plt 3 | return plt.gca() 4 | 5 | 6 | def is_classifier(estimator): 7 | """Returns True if the given estimator is (probably) a classifier.""" 8 | return getattr(estimator, "_estimator_type", None) == "classifier" 9 | 10 | 11 | def is_regressor(estimator): 12 | """Returns True if the given estimator is (probably) a regressor.""" 13 | return getattr(estimator, "_estimator_type", None) == "regressor" 14 | -------------------------------------------------------------------------------- /mli_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/numeristical/introspective/dbe96f7fc4dfd24d7ed6a6982d661426d74ee172/mli_screenshot.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 77.0.3"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "ml_insights" 7 | version = "1.1.0" 8 | dependencies = [ 9 | "pandas>=0.23", 10 | "numpy>=1.23.5", 11 | "matplotlib>=2.0.0", 12 | "scikit-learn>=0.24.2", 13 | "scipy>=1.6.0", 14 | "splinecalib>=0.0.13" 15 | ] 16 | authors = [ 17 | { name="Brian Lucena / Ramesh Sampath", email="brian@numeristical.com" }, 18 | ] 19 | description = "Package to calibrate and understand ML Models" 20 | readme = "README.md" 21 | requires-python = ">=3.8" 22 | classifiers = [ 23 | "Programming Language :: Python :: 3", 24 | "Operating System :: OS Independent", 25 | ] 26 | license = "MIT" 27 | license-files = ["LICEN[CS]E*"] 28 | 29 | [project.urls] 30 | Homepage = "https://github.com/numeristical/introspective" 31 | Issues = "https://github.com/numeristical/introspective/issues" -------------------------------------------------------------------------------- /requirements.txt:
-------------------------------------------------------------------------------- 1 | sphinx_rtd_theme 2 | mkdocs -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Setup for the ml_insights package.""" 2 | 3 | from setuptools import setup 4 | 5 | setup( 6 | packages=[ 7 | 'ml_insights', 8 | ], 9 | 10 | ) 11 | --------------------------------------------------------------------------------
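Finally, a sketch of how the prediction-difference explanation in ml_insights/insights.py can be called directly. This is an illustrative example, not part of the repository: the dataset, column names, and model are hypothetical, and it assumes scikit-learn is available and that explain_prediction_difference is imported from the ml_insights.insights module (it takes the two points as pandas Series, reading feature names from their .index).

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from ml_insights.insights import explain_prediction_difference

# Toy regression data with hypothetical column names.
rng = np.random.default_rng(1)
X = pd.DataFrame({'sqft': rng.uniform(500, 4000, 300),
                  'beds': rng.integers(1, 6, 300),
                  'age_years': rng.uniform(0, 100, 300)})
y = 100 * X['sqft'] + 10000 * X['beds'] + rng.normal(0, 5000, 300)

model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X.values, y)

# Two individual rows (as Series); the function greedily swaps one feature at a
# time from the first row toward the second until the predictions are within tol.
row_a, row_b = X.iloc[17], X.iloc[42]
feats, value_changes, moves, vals = explain_prediction_difference(
    model, row_a, row_b, tol=0.03, verbose=True)

With verbose=True the function prints each swap and its effect; the returned lists give the ordered feature names, the (old, new) value pairs, the per-step change in the prediction, and the running prediction values.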