├── .github └── workflows │ └── test_package.yml ├── .gitignore ├── .travis.yml ├── CHANGELOG.rst ├── CONTRIBUTORS.txt ├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── README.md ├── documents ├── pylogit_computation.pdf └── pylogit_computation.tex ├── examples ├── .ipynb_checkpoints │ ├── Main PyLogit Example-checkpoint.ipynb │ ├── Python Biogeme Benchmark--01Logit-checkpoint.ipynb │ ├── mlogit Benchmark--Train and Fishing-checkpoint.ipynb │ └── mlogit_Benchmark--Heating-checkpoint.ipynb ├── data │ ├── electricity_r_data_long.csv │ ├── fishing_data_r.csv │ ├── heating_data_r.csv │ ├── long_swiss_metro_data.csv │ ├── swissmetro.dat │ └── train_data_r.csv └── notebooks │ ├── .ipynb_checkpoints │ ├── Asymmetric Choice Models Example-checkpoint.ipynb │ ├── Converting Long-Format to Wide-Format-checkpoint.ipynb │ ├── Main PyLogit Example-checkpoint.ipynb │ ├── Mixed Logit Example--mlogit Benchmark--Electricity-checkpoint.ipynb │ ├── More Mixed Logit--Heteroskedasticity and Nesting-checkpoint.ipynb │ ├── Nested Logit Example--Python Biogeme benchmark--09NestedLogit-checkpoint.ipynb │ ├── Prediction with PyLogit-checkpoint.ipynb │ ├── Python Biogeme Benchmark--01Logit-checkpoint.ipynb │ ├── mlogit Benchmark--Train and Fishing-checkpoint.ipynb │ └── mlogit_Benchmark--Heating-checkpoint.ipynb │ ├── Asymmetric Choice Models Example.ipynb │ ├── Converting Long-Format to Wide-Format.ipynb │ ├── Main PyLogit Example.ipynb │ ├── Mixed Logit Example--mlogit Benchmark--Electricity.ipynb │ ├── More Mixed Logit--Heteroskedasticity and Nesting.ipynb │ ├── Nested Logit Example--Python Biogeme benchmark--09NestedLogit.ipynb │ ├── Prediction with PyLogit.ipynb │ ├── Python Biogeme Benchmark--01Logit.ipynb │ ├── mlogit Benchmark--Train and Fishing.ipynb │ └── mlogit_Benchmark--Heating.ipynb ├── images └── PyLogit_Final-small-04.png ├── pyproject.toml ├── requirements.in ├── requirements.txt ├── src └── pylogit │ ├── __init__.py │ ├── asym_logit.py │ ├── base_multinomial_cm_v2.py │ ├── bootstrap.py │ ├── bootstrap_abc.py │ ├── bootstrap_calcs.py │ ├── bootstrap_mle.py │ ├── bootstrap_sampler.py │ ├── bootstrap_utils.py │ ├── choice_calcs.py │ ├── choice_tools.py │ ├── clog_log.py │ ├── conditional_logit.py │ ├── construct_estimator.py │ ├── display_names.py │ ├── estimation.py │ ├── mixed_logit.py │ ├── mixed_logit_calcs.py │ ├── nested_choice_calcs.py │ ├── nested_logit.py │ ├── newsfragments │ └── .gitignore │ ├── pylogit.py │ ├── scobit.py │ └── uneven_logit.py └── tests ├── __init__.py ├── test_asym_logit.py ├── test_base_cm_predict.py ├── test_base_multinomial_cm.py ├── test_bootstrap_abc.py ├── test_bootstrap_calcs.py ├── test_bootstrap_controller.py ├── test_bootstrap_mle.py ├── test_bootstrap_sampler.py ├── test_bootstrap_utils.py ├── test_choice_calcs.py ├── test_choice_tools.py ├── test_clog_log.py ├── test_conditional_logit.py ├── test_construct_estimator.py ├── test_estimation.py ├── test_mixed_logit.py ├── test_nested_choice_calcs.py ├── test_nested_logit.py ├── test_pylogit.py ├── test_scobit.py └── test_uneven_logit.py /.github/workflows/test_package.yml: -------------------------------------------------------------------------------- 1 | # Build and test the package 2 | name: Testing 3 | 4 | # Run this workflow every time a new commit is pushed or a pull-request is 5 | # merged to your repository's master branch 6 | on: 7 | push: 8 | branches: 9 | - master 10 | - develop 11 | pull_request: 12 | branches: 13 | - master 14 | - develop 15 | 16 | jobs: 17 | # Set the job key. The key is displayed as the job name 18 | # when a job name is not provided 19 | project-workflow: 20 | # Name the Job 21 | name: Build and test the package 22 | # Set the type of machine to run on 23 | runs-on: ubuntu-latest 24 | # Set the python versions to use 25 | strategy: 26 | matrix: 27 | python: [3.6] 28 | 29 | steps: 30 | # Step 1: Check out a copy of your repository on the ubuntu-latest machine 31 | - name: Checkout repository 32 | uses: actions/checkout@v2 33 | 34 | # Step 2: Make sure conda is installed, with mamba for speed 35 | - name: Setup Python with Conda 36 | uses: conda-incubator/setup-miniconda@v2 37 | with: 38 | python-version: ${{ matrix.python }} 39 | mamba-version: "*" 40 | channels: conda-forge,anaconda,defaults 41 | channel-priority: true 42 | activate-environment: pylogit 43 | auto-activate-base: false 44 | 45 | # Step 3: Make sure the project is installed locally & can commit if needed 46 | - name: Install package locally 47 | shell: bash -l {0} 48 | run: | 49 | make install 50 | 51 | # Step 4: Run the project's tests 52 | - name: Run project tests 53 | shell: bash -l {0} 54 | run: | 55 | tox 56 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled python modules. 2 | *.pyc 3 | 4 | # Setuptools distribution folder. 5 | /dist/ 6 | ./dist 7 | dist 8 | 9 | # Inscrutable cache folder 10 | .cache 11 | 12 | # Python egg metadata, regenerated from source files by setuptools. 13 | /*.egg-info 14 | 15 | # setup.py working directory 16 | build 17 | 18 | # Mac OS binary file 19 | .DS_Store 20 | 21 | # Files produced when tracking statement coverage of code. 22 | .coverage 23 | 24 | # Files produced when testing code 25 | .tox 26 | 27 | # Folder that stores deprecated code. 28 | deprecated_code 29 | deprecated_code/ 30 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.4" 5 | - "3.5" 6 | - "3.6" 7 | 8 | # command to install dependencies 9 | before_install: 10 | - "pip install --upgrade pip setuptools wheel" 11 | - "pip install --only-binary=numpy,scipy numpy scipy" 12 | - "pip install pandas" 13 | - "pip install coveralls" 14 | - "pip install pytest-cov" 15 | - "pip install future" 16 | - "pip install tqdm" 17 | - "pip install mock" 18 | 19 | # command to execute test suite 20 | script: py.test --cov-report= --cov=pylogit/ tests/ 21 | branches: 22 | only: 23 | - master 24 | - develop 25 | 26 | # Send results of tests to coveralls 27 | after_success: 28 | - coveralls 29 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Changelog 3 | ========= 4 | 5 | The format is based on [Keep-a-Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | .. towncrier release notes start 9 | 10 | Pylogit 1.0.1 (2020-12-27) 11 | ========================== 12 | 13 | Trivial/Internal Changes 14 | ------------------------ 15 | 16 | - Removed setup.py from repository in favor of pyproject.toml. (#68) 17 | 18 | 19 | Pylogit 1.0.0 (2020-12-27) 20 | ========================== 21 | 22 | Removed from package 23 | -------------------- 24 | 25 | - Support for python2.7 or any python 3 version below 3.6. (#67) 26 | 27 | 28 | Bug fixes 29 | --------- 30 | 31 | - Resolving import issues with the pylogit.bootstrap submodule. (#27) 32 | - Fixed flaky tests causing continuous integration build errors. (#29) 33 | - Fixed Hessian calculation so only the diagonal is penalized during ridge 34 | regression. (#33) 35 | 36 | 37 | Improved Documentation 38 | ---------------------- 39 | 40 | - Made example notebooks py2 and py3 compatible. (#28) 41 | 42 | 43 | Trivial/Internal Changes 44 | ------------------------ 45 | 46 | - Included license file in source distribution. (#18) 47 | - Refactored the Hessian calculation to use less memory-intensive operations 48 | based on linear-algebra decompositions. (#30) 49 | - Added journal reference for the accompanying paper in the project README. 50 | (#35) 51 | - Added project logo to the repository. (#46) 52 | - Switched to pip-tools for specifying development dependencies. (#58) 53 | - Added Makefile to standardize development installation. (#59) 54 | - Switched to flit for packaging. (#60) 55 | - Added towncrier to repository. (#61) 56 | - Added tox to the repository for cross-version testing of PyLogit. (#63) 57 | - Added GitHub Actions workflow for Continuous Integration. (#64) 58 | - Converted the README.rst file to README.md. (#65) 59 | - Adding bump2version to development requirements. (#66) 60 | 61 | 62 | Pylogit 0.2.2 (2017-12-11) 63 | ========================== 64 | 65 | Bug fixes 66 | --------- 67 | 68 | - Changed tqdm dependency to allow for anaconda compatibility. 69 | 70 | 71 | Pylogit 0.2.1 (2017-12-11) 72 | ========================== 73 | 74 | Bug fixes 75 | --------- 76 | 77 | - Added statsmodels and tqdm as package dependencies to fix errors with 0.2.0. 78 | 79 | 80 | Pylogit 0.2.0 (2017-12-10) 81 | ========================== 82 | 83 | Added new features 84 | ------------------ 85 | 86 | - Added support for Python 3.4 - 3.6 87 | - Added AIC and BIC to summary tables of all models. 88 | - Added support for bootstrapping and calculation of bootstrap confidence intervals: 89 | 90 | - percentile intervals, 91 | - bias-corrected and accelerated (BCa) bootstrap confidence intervals, and 92 | - approximate bootstrap confidence (ABC) intervals. 93 | 94 | - Changed sparse matrix creation to enable estimation of larger datasets. 95 | 96 | 97 | Trivial/Internal Changes 98 | ------------------------ 99 | 100 | - Refactored internal code organization and classes for estimation. 101 | 102 | 103 | Pylogit 0.1.2 (2016-12-04) 104 | ========================== 105 | 106 | Added new features 107 | ------------------ 108 | 109 | - Added support to all logit-type models for parameter constraints during model estimation. 110 | All models now support the use of the constrained_pos keyword argument. 111 | - Added new argument checks to provide user-friendly error messages. 112 | - Created more than 175 tests, bringing statement coverage to 99%. 113 | - Updated the underflow and overflow protections to make use of L’Hopital’s rule where appropriate. 114 | 115 | 116 | Bug fixes 117 | --------- 118 | 119 | - Fixed bugs with the nested logit model. 120 | In particular, the predict function, the BHHH approximation to the Fisher Information Matrix, and the ridge regression penalty in the log-likelihood, gradient, and hessian functions have been fixed. 121 | 122 | 123 | Improved Documentation 124 | ---------------------- 125 | 126 | - Added new example notebooks demonstrating prediction, mixed logit, and converting long-format datasets to wide-format. 127 | - Edited docstrings for clarity throughout the library. 128 | 129 | 130 | Trivial/Internal Changes 131 | ------------------------ 132 | 133 | - Extensively refactored codebase. 134 | 135 | 136 | Pylogit 0.1.1 (2016-08-30) 137 | ========================== 138 | 139 | Improved Documentation 140 | ---------------------- 141 | - Added python notebook examples demonstrating how to estimate the asymmetric choice models and the nested logit model. 142 | - Corrected the docstrings in various places. 143 | - Added new datasets to the github repo. 144 | 145 | 146 | Pylogit 0.1.0 (2016-08-29) 147 | ========================== 148 | 149 | Added new features 150 | ------------------ 151 | 152 | - Added asymmetric choice models. 153 | - Added nested logit and mixed logit models. 154 | - Added tests for mixed logit models. 155 | - Added an example notebook demonstrating how to estimate the mixed logit model. 156 | 157 | 158 | Improved Documentation 159 | ---------------------- 160 | 161 | - Changed documentation to numpy doctoring standard. 162 | 163 | 164 | Trivial/Internal Changes 165 | ------------------------ 166 | 167 | - Made print statements compatible with python3. 168 | - Fixed typos in library documentation. 169 | - Internal refactoring. 170 | 171 | 172 | Pylogit 0.0.0 (2016-03-15) 173 | ========================== 174 | 175 | Added new features 176 | ------------------ 177 | 178 | - Initial package release with support for the conditional logit (MNL) model. 179 | -------------------------------------------------------------------------------- /CONTRIBUTORS.txt: -------------------------------------------------------------------------------- 1 | This project has been assisted by numerous individuals and organizations. 2 | 3 | Dr. Akshay Vij contributed code that was the basis for the conditional logit 4 | code, and he provided the motivation and guidance behind the use of "mapping 5 | matrices" that is standard throughout PyLogit. 6 | 7 | Professor John Canny advised Vij on optimizing the code used for the 8 | conditional logit, and as such he also helped the development of this package. 9 | 10 | Professor Paul Waddell has graciously provided the financial support that 11 | helped ensure continued progress on this package. 12 | 13 | Oleksandr Lysenko ported PyLogit to Python >= 3.4. 14 | 15 | Eunice Poon designed the logo. 16 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Timothy A. Brathwaite 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of PyLogit nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include MANIFEST.in 2 | include README.rst 3 | include LICENSE.txt 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ## install : Install project package locally and install pre-commit. 2 | .PHONY : install 3 | install : 4 | pip install pip-tools 5 | pip-compile requirements.in 6 | pip install -r requirements.txt 7 | 8 | ## help : Documentation for make targets. 9 | .PHONY : help 10 | help : Makefile 11 | @sed -n 's/^##//p' $< 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![PyLogit Logo](./images/PyLogit_Final-small-04.png) 2 | 3 | ![Tests](https://github.com/timothyb0912/pylogit/workflows/Testing/badge.svg) 4 | 5 | # PyLogit 6 | PyLogit is a Python package for performing maximum likelihood estimation of conditional logit models and similar discrete choice models. 7 | 8 | ## Main Features 9 | - It supports 10 | - Conditional Logit (Type) Models 11 | - Multinomial Logit Models 12 | - Multinomial Asymmetric Models 13 | - Multinomial Clog-log Model 14 | - Multinomial Scobit Model 15 | - Multinomial Uneven Logit Model 16 | - Multinomial Asymmetric Logit Model 17 | - Nested Logit Models 18 | - Mixed Logit Models (with Normal mixing distributions) 19 | - It supports datasets where the choice set differs across observations 20 | - It supports model specifications where the coefficient for a given variable may be 21 | - completely alternative-specific 22 | (i.e. one coefficient per alternative, subject to identification of the coefficients), 23 | - subset-specific 24 | (i.e. one coefficient per subset of alternatives, where each alternative belongs to only one subset, and there are more than 1 but less than J subsets, where J is the maximum number of available alternatives in the dataset), 25 | - completely generic 26 | (i.e. one coefficient across all alternatives). 27 | 28 | ## Installation 29 | Available from [PyPi](https://pypi.python.org/pypi/pylogit): 30 | ``` 31 | pip install pylogit 32 | ``` 33 | 34 | Available through [Anaconda](https://anaconda.org/conda-forge/pylogit): 35 | ``` 36 | conda install -c conda-forge pylogit 37 | ``` 38 | 39 | or 40 | 41 | ``` 42 | conda install -c timothyb0912 pylogit 43 | ``` 44 | 45 | ## Usage 46 | For Jupyter notebooks filled with examples, see [examples](./examples/). 47 | 48 | 49 | ## For More Information 50 | For more information about the asymmetric models that can be estimated with PyLogit, see the following paper 51 | 52 | > Brathwaite, T., & Walker, J. L. (2018). Asymmetric, closed-form, finite-parameter models of multinomial choice. Journal of Choice Modelling, 29, 78–112. https://doi.org/10.1016/j.jocm.2018.01.002 53 | 54 | A free and better formatted version is available at [ArXiv](http://arxiv.org/abs/1606.05900). 55 | 56 | ## Attribution 57 | If PyLogit (or its constituent models) is useful in your research or work, please cite this package by citing the paper above. 58 | 59 | ## License 60 | Modified BSD (3-clause). See [here](./LICENSE.txt). 61 | 62 | ## Changelog 63 | See [here](./CHANGELOG.rst). 64 | -------------------------------------------------------------------------------- /documents/pylogit_computation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/timothyb0912/pylogit/cffc9c523b5368966ef2481c7dc30f0a5d296de8/documents/pylogit_computation.pdf -------------------------------------------------------------------------------- /examples/notebooks/Converting Long-Format to Wide-Format.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Converting long-format dataframes to wide-format\n", 8 | "The purpose of this notebook is to demonstrate the conversion of long-format data into wide-format. Long-format data contains one row per available alternative per choice situation. In contrast, wide-format data contains one row per choice situation. PyLogit and other software packages (e.g. mlogit in R) use data that is in long-format. However, other software packages, such as Statsmodels in Python or Python BIOGEME, use data that is in wide-format.\n", 9 | "\n", 10 | "Because different software packages have different data format requirements, it is useful to be able to convert one's data from one format to another. Other PyLogit example notebooks (such as the \"Main PyLogit Example\") demonstrate how to take data from wide-format and convert it into long-format. This notebook will demonstrate the reverse process: taking data from long-format and converting it into wide-format.\n", 11 | "\n", 12 | "The dataset being used in this example is the \"Travel Mode Choice\" dataset from Greene and Hensher. It is described on the statsmodels website, and their description is reproduced below in full.\n", 13 | "\n", 14 | "
\n",
 15 |     "    The data, collected as part of a 1987 intercity mode choice study, are a sub-sample of 210 non-business\n",
 16 |     "    trips between Sydney, Canberra and Melbourne in which the traveler chooses a mode from four alternatives\n",
 17 |     "    (plane, car, bus and train). The sample, 840 observations, is choice based with over-sampling of the\n",
 18 |     "    less popular modes (plane, train and bus) and under-sampling of the more popular mode, car. The level of\n",
 19 |     "    service data was derived from highway and transport networks in Sydney, Melbourne, non-metropolitan N.S.W.\n",
 20 |     "    and Victoria, including the Australian Capital Territory.\n",
 21 |     "    \n",
 22 |     "    Number of observations: 840 Observations On 4 Modes for 210 Individuals.\n",
 23 |     "    Number of variables: 8\n",
 24 |     "    Variable name definitions::\n",
 25 |     "\n",
 26 |     "        individual = 1 to 210\n",
 27 |     "        mode =\n",
 28 |     "            1 - air\n",
 29 |     "            2 - train\n",
 30 |     "            3 - bus\n",
 31 |     "            4 - car\n",
 32 |     "        choice =\n",
 33 |     "            0 - no\n",
 34 |     "            1 - yes\n",
 35 |     "        ttme = terminal waiting time for plane, train and bus (minutes); 0\n",
 36 |     "               for car.\n",
 37 |     "        invc = in vehicle cost for all stages (dollars).\n",
 38 |     "        invt = travel time (in-vehicle time) for all stages (minutes).\n",
 39 |     "        gc = generalized cost measure:invc+(invt*value of travel time savings)\n",
 40 |     "            (dollars).\n",
 41 |     "        hinc = household income ($1000s).\n",
 42 |     "        psize = traveling group size in mode chosen (number).\n",
 43 |     "        \n",
 44 |     "    \n",
 45 |     "    Source\n",
 46 |     "\n",
 47 |     "    Greene, W.H. and D. Hensher (1997) Multinomial logit and discrete choice models in Greene, W. H. (1997)\n",
 48 |     "    LIMDEP version 7.0 user’s manual revised, Plainview, New York econometric software, Inc. Download from\n",
 49 |     "    on-line complements to Greene, W.H. (2011) Econometric Analysis, Prentice Hall, 7th Edition (data table\n",
 50 |     "    F18-2) http://people.stern.nyu.edu/wgreene/Text/Edition7/TableF18-2.csv\n",
 51 |     "\n",
 52 |     "
" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 1, 58 | "metadata": { 59 | "collapsed": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "# To access the Travel Mode Choice data\n", 64 | "import statsmodels.datasets\n", 65 | "\n", 66 | "# To perform the dataset conversion\n", 67 | "import pylogit as pl" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "### Load the needed dataset" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 3, 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/html": [ 87 | "
\n", 88 | "\n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | "
individualmodechoicettmeinvcinvtgchincpsize
0110695910070351
1120343137271351
2130352541770351
314101018030351
421064586868302
\n", 166 | "
" 167 | ], 168 | "text/plain": [ 169 | " individual mode choice ttme invc invt gc hinc psize\n", 170 | "0 1 1 0 69 59 100 70 35 1\n", 171 | "1 1 2 0 34 31 372 71 35 1\n", 172 | "2 1 3 0 35 25 417 70 35 1\n", 173 | "3 1 4 1 0 10 180 30 35 1\n", 174 | "4 2 1 0 64 58 68 68 30 2" 175 | ] 176 | }, 177 | "execution_count": 3, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "# Access the dataset\n", 184 | "mode_data = statsmodels.datasets.modechoice.load_pandas()\n", 185 | "# Get a pandas dataframe of the mode choice data\n", 186 | "long_df = mode_data[\"data\"]\n", 187 | "# Look at the dataframe to ensure that it loaded correctly\n", 188 | "long_df.head()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "### Create the needed variables for the conversion function.\n", 196 | "The function in PyLogit that is used to convert long-format data to wide-format data is \"convert_long_to_wide,\" and it can be accessed through \"pl.convert_long_to_wide\". The docstring for the function contains all of the information necessary to perform the conversion, but we will leave it to readers to view the docstring at their own leisure. For now, we will simply create the needed objects/arguments for the function.\n", 197 | "\n", 198 | "In particular, we will need the following 7 objects:\n", 199 | "1. ind_vars\n", 200 | "2. alt_specific_vars\n", 201 | "3. subset_specific_vars\n", 202 | "4. obs_id_col\n", 203 | "5. alt_id_col\n", 204 | "6. choice_col\n", 205 | "7. alt_name_dict\n", 206 | "\n", 207 | "The cells below will show exactly what these objects are." 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 10, 213 | "metadata": { 214 | "collapsed": true 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "# ind_vars is a list of strings denoting the column\n", 219 | "# headings of data that varies across choice situations,\n", 220 | "# but not across alternatives. In our data, this is\n", 221 | "# the household income and party size.\n", 222 | "individual_specific_variables = [\"hinc\", \"psize\"]\n", 223 | "\n", 224 | "# alt_specific_vaars is a list of strings denoting the\n", 225 | "# column headings of data that vary not only across\n", 226 | "# choice situations but also across all alternatives.\n", 227 | "# These are columns such as the \"level of service\"\n", 228 | "# variables.\n", 229 | "alternative_specific_variables = [\"invc\", \"invt\", \"gc\"]\n", 230 | "\n", 231 | "# subset_specific_vars is a dictionary. Each key is a\n", 232 | "# string that denotes a variable that is subset specific.\n", 233 | "# Each value is a list of alternative ids, over which the\n", 234 | "# variable actually varies. Note that subset specific\n", 235 | "# variables vary across choice situations and across some\n", 236 | "# (but not all) alternatives. This is most common when\n", 237 | "# using variables that are not meaningfully defined for\n", 238 | "# all alternatives. An example of this in our dataset is\n", 239 | "# terminal time (\"ttme\"). This variable is not meaningfully\n", 240 | "# defined for the \"car\" alternative. Therefore, it is always\n", 241 | "# zero. Note \"4\" is the id for the \"car\" alternative\n", 242 | "subset_specific_variables = {\"ttme\": [1, 2, 3]}\n", 243 | "\n", 244 | "# obs_id_col is the column denoting the id of the choice\n", 245 | "# situation. If one was using a panel dataset, with multiple\n", 246 | "# choice situations per unit of observation, the column\n", 247 | "# denoting the unit of observation would be listed in\n", 248 | "# ind_vars (i.e. with the individual specific variables)\n", 249 | "observation_id_column = \"individual\"\n", 250 | "\n", 251 | "# alt_id_col is the column denoting the id of the alternative\n", 252 | "# corresponding to a given row.\n", 253 | "alternative_id_column = \"mode\"\n", 254 | "\n", 255 | "# choice_col is the column denoting whether the alternative\n", 256 | "# on a given row was chosen in the corresponding choice situation\n", 257 | "choice_column = \"choice\"\n", 258 | "\n", 259 | "# Lastly, alt_name_dict is not necessary. However, it is useful.\n", 260 | "# It records the names corresponding to each alternative, if there\n", 261 | "# are any, and allows for the creation of meaningful column names\n", 262 | "# in the wide-format data (such as when creating the columns\n", 263 | "# denoting the available alternatives in each choice situation).\n", 264 | "# The keys of alt_name_dict are the unique alternative ids, and\n", 265 | "# the values are the names of each alternative.\n", 266 | "alternative_name_dict = {1: \"air\",\n", 267 | " 2: \"train\",\n", 268 | " 3: \"bus\",\n", 269 | " 4: \"car\"}" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "### Create the wide-format dataframe" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 12, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/html": [ 289 | "
\n", 290 | "\n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | "
01234
individual12345
choice44444
availability_air11111
availability_train11111
availability_bus11111
availability_car11111
hinc3530407045
psize12132
invc_air59581154960
invc_train3131982632
invc_bus2525532126
invc_car10112358
invt_air1006812568144
invt_train372354892354404
invt_bus417399882399449
invt_car180255720180600
gc_air70681295982
gc_train71841957993
gc_bus70851498194
gc_car30501013299
ttme_air6964696464
ttme_train3444344444
ttme_bus3553355353
\n", 488 | "
" 489 | ], 490 | "text/plain": [ 491 | " 0 1 2 3 4\n", 492 | "individual 1 2 3 4 5\n", 493 | "choice 4 4 4 4 4\n", 494 | "availability_air 1 1 1 1 1\n", 495 | "availability_train 1 1 1 1 1\n", 496 | "availability_bus 1 1 1 1 1\n", 497 | "availability_car 1 1 1 1 1\n", 498 | "hinc 35 30 40 70 45\n", 499 | "psize 1 2 1 3 2\n", 500 | "invc_air 59 58 115 49 60\n", 501 | "invc_train 31 31 98 26 32\n", 502 | "invc_bus 25 25 53 21 26\n", 503 | "invc_car 10 11 23 5 8\n", 504 | "invt_air 100 68 125 68 144\n", 505 | "invt_train 372 354 892 354 404\n", 506 | "invt_bus 417 399 882 399 449\n", 507 | "invt_car 180 255 720 180 600\n", 508 | "gc_air 70 68 129 59 82\n", 509 | "gc_train 71 84 195 79 93\n", 510 | "gc_bus 70 85 149 81 94\n", 511 | "gc_car 30 50 101 32 99\n", 512 | "ttme_air 69 64 69 64 64\n", 513 | "ttme_train 34 44 34 44 44\n", 514 | "ttme_bus 35 53 35 53 53" 515 | ] 516 | }, 517 | "execution_count": 12, 518 | "metadata": {}, 519 | "output_type": "execute_result" 520 | } 521 | ], 522 | "source": [ 523 | "# Finally, we can create the wide format dataframe\n", 524 | "wide_df = pl.convert_long_to_wide(long_df,\n", 525 | " individual_specific_variables,\n", 526 | " alternative_specific_variables,\n", 527 | " subset_specific_variables,\n", 528 | " observation_id_column,\n", 529 | " alternative_id_column,\n", 530 | " choice_column,\n", 531 | " alternative_name_dict)\n", 532 | "\n", 533 | "# Let's look at the created dataframe, transposed for easy viewing\n", 534 | "wide_df.head().T" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": {}, 540 | "source": [ 541 | "As we can see above, PyLogit does a few things automatically. First, using the names provided in alt_name_dict, it will add suffixes to the alternative specific variables and the subset specific variables. These suffixes record what alternative, the given column of data is referring to. Secondly, when dealing with subset specific variables, PyLogit will only create columns of data for alternatives over which the variable actually varies. Lastly, PyLogit automatically creates columns that denote the availability of each alternative for each choice situation. These columns are suffixed to denote the alternatives that they correspond to, and they are inferred automatically from the rows present in the long-format data.\n", 542 | "\n", 543 | "Also, there is a \"null_value\" keyword that one can use in the conversion function. This is useful when one has alternative specific variables, and not all alternatives are available in all choice situations. In this setting, one may want to specify a value for the missing data, such as null, -999, etc. The \"null_value\" keyword argument allows one to do this." 544 | ] 545 | } 546 | ], 547 | "metadata": { 548 | "kernelspec": { 549 | "display_name": "Python 2", 550 | "language": "python", 551 | "name": "python2" 552 | }, 553 | "language_info": { 554 | "codemirror_mode": { 555 | "name": "ipython", 556 | "version": 2 557 | }, 558 | "file_extension": ".py", 559 | "mimetype": "text/x-python", 560 | "name": "python", 561 | "nbconvert_exporter": "python", 562 | "pygments_lexer": "ipython2", 563 | "version": "2.7.12" 564 | } 565 | }, 566 | "nbformat": 4, 567 | "nbformat_minor": 1 568 | } 569 | -------------------------------------------------------------------------------- /images/PyLogit_Final-small-04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/timothyb0912/pylogit/cffc9c523b5368966ef2481c7dc30f0a5d296de8/images/PyLogit_Final-small-04.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [tool.flit.metadata] 6 | module = "pylogit" 7 | author = "Timothy Brathwaite" 8 | author-email = "timothyb0912@gmail.com" 9 | home-page = "https://github.com/timothyb0912/pylogit" 10 | description-file = "README.md" 11 | requires = [ 12 | "pandas >= 0.16.2", 13 | "numpy >= 1.10.2", 14 | "scipy >= 0.16.1", 15 | "future >= 0.16", 16 | "statsmodels >= 0.6.1", 17 | "tqdm >= 4.15.0", 18 | ] 19 | requires-python = ">=3.6" 20 | keywords = "conditional logit,discrete choice,econometrics,choice models" 21 | license = "BSD-3-Clause" 22 | classifiers = [ 23 | "Topic :: Software Development :: Libraries :: Python Modules", 24 | "Topic :: Scientific/Engineering", 25 | "Intended Audience :: Science/Research", 26 | "Intended Audience :: End Users/Desktop", 27 | "Intended Audience :: Developers", 28 | "Programming Language :: Python :: 3", 29 | "Programming Language :: Python :: 3.6", 30 | "Programming Language :: Python :: 3.7", 31 | "Programming Language :: Python :: 3.8", 32 | "Environment :: Console", 33 | "Development Status :: 2 - Pre-Alpha", 34 | "License :: OSI Approved :: BSD License", 35 | ] 36 | 37 | [tool.flit.sdist] 38 | exclude = ["src/pylogit/newsfragments/"] 39 | 40 | [tool.towncrier] 41 | package = "pylogit" 42 | package_dir = "src/" 43 | filename = "CHANGELOG.rst" 44 | title_format = "{name} {version} ({project_date})" 45 | wrap = true # Wrap text to 79 characters 46 | all_bullets = true 47 | 48 | [[tool.towncrier.type]] 49 | directory = "added" 50 | name = "Added new features" 51 | showcontent = true 52 | 53 | [[tool.towncrier.type]] 54 | directory = "changed" 55 | name = "Changed existing functionality" 56 | showcontent = true 57 | 58 | [[tool.towncrier.type]] 59 | directory = "deprecated" 60 | name = "Marked for removal" 61 | showcontent = true 62 | 63 | [[tool.towncrier.type]] 64 | directory = "removed" 65 | name = "Removed from package" 66 | showcontent = true 67 | 68 | [[tool.towncrier.type]] 69 | directory = "fixed" 70 | name = "Bug fixes" 71 | showcontent = true 72 | 73 | [[tool.towncrier.type]] 74 | directory = "security" 75 | name = "Patched vulnerabilities" 76 | showcontent = true 77 | 78 | [[tool.towncrier.type]] 79 | directory = "doc" 80 | name = "Improved Documentation" 81 | showcontent = true 82 | 83 | [[tool.towncrier.type]] 84 | directory = "trivial" 85 | name = "Trivial/Internal Changes" 86 | showcontent = true 87 | 88 | [tool.tox] 89 | legacy_tox_ini = """ 90 | [tox] 91 | isolated_build = True 92 | envlist = py36, py37, py38 93 | requires = tox-conda 94 | 95 | [testenv] 96 | deps = pytest >= 3.3.0 97 | commands = 98 | make install 99 | pytest 100 | """ 101 | -------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | bump2version 2 | flit 3 | future 4 | mock 5 | numpy 6 | pandas 7 | pipreqs 8 | pip-tools 9 | pytest 10 | pytest-cov 11 | scipy 12 | statsmodels 13 | towncrier 14 | tox 15 | tox-conda 16 | tqdm 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile requirements.in 6 | # 7 | appdirs==1.4.4 # via virtualenv 8 | attrs==20.3.0 # via pytest 9 | bump2version==1.0.1 # via -r requirements.in 10 | certifi==2020.12.5 # via requests 11 | chardet==4.0.0 # via requests 12 | click==7.1.2 # via pip-tools, towncrier 13 | coverage==5.3.1 # via pytest-cov 14 | distlib==0.3.1 # via virtualenv 15 | docopt==0.6.2 # via pipreqs 16 | docutils==0.16 # via flit 17 | filelock==3.0.12 # via tox, virtualenv 18 | flit-core==3.0.0 # via flit 19 | flit==3.0.0 # via -r requirements.in 20 | future==0.18.2 # via -r requirements.in 21 | idna==2.10 # via requests 22 | importlib-metadata==2.1.1 # via pluggy, pytest, tox, virtualenv 23 | importlib-resources==4.1.0 # via virtualenv 24 | incremental==17.5.0 # via towncrier 25 | iniconfig==1.1.1 # via pytest 26 | jinja2==2.11.2 # via towncrier 27 | markupsafe==1.1.1 # via jinja2 28 | mock==4.0.3 # via -r requirements.in 29 | numpy==1.19.4 # via -r requirements.in, pandas, patsy, scipy, statsmodels 30 | packaging==20.8 # via pytest, tox 31 | pandas==1.1.5 # via -r requirements.in, statsmodels 32 | patsy==0.5.1 # via statsmodels 33 | pip-tools==5.4.0 # via -r requirements.in 34 | pipreqs==0.4.10 # via -r requirements.in 35 | pluggy==0.13.1 # via pytest, tox 36 | py==1.10.0 # via pytest, tox 37 | pyparsing==2.4.7 # via packaging 38 | pytest-cov==2.10.1 # via -r requirements.in 39 | pytest==6.2.1 # via -r requirements.in, pytest-cov 40 | python-dateutil==2.8.1 # via pandas 41 | pytoml==0.1.21 # via flit, flit-core 42 | pytz==2020.5 # via pandas 43 | requests==2.25.1 # via flit, yarg 44 | scipy==1.5.4 # via -r requirements.in, statsmodels 45 | six==1.15.0 # via patsy, pip-tools, python-dateutil, tox, virtualenv 46 | statsmodels==0.12.1 # via -r requirements.in 47 | toml==0.10.2 # via pytest, towncrier, tox 48 | towncrier==19.2.0 # via -r requirements.in 49 | tox-conda==0.4.1 # via -r requirements.in 50 | tox==3.20.1 # via -r requirements.in, tox-conda 51 | tqdm==4.55.0 # via -r requirements.in 52 | urllib3==1.26.2 # via requests 53 | virtualenv==20.2.2 # via tox 54 | yarg==0.1.9 # via pipreqs 55 | zipp==3.4.0 # via importlib-metadata, importlib-resources 56 | 57 | # The following packages are considered to be unsafe in a requirements file: 58 | # pip 59 | -------------------------------------------------------------------------------- /src/pylogit/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 14 15:33:07 2016 4 | 5 | @author: timothyb0912 6 | @module: pylogit 7 | """ 8 | from __future__ import absolute_import 9 | 10 | from .pylogit import create_choice_model 11 | from .bootstrap import Boot 12 | from .choice_tools import convert_wide_to_long 13 | from .choice_tools import convert_long_to_wide 14 | 15 | __version__ = "1.0.1" 16 | -------------------------------------------------------------------------------- /src/pylogit/bootstrap_calcs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Timothy Brathwaite 4 | @name: Bootstrap Calculations 5 | @summary: This module provides functions to calculate the bootstrap 6 | confidence intervals using the 'percentile' and 7 | 'bias-corrected and accelerated' methods. 8 | """ 9 | from __future__ import absolute_import 10 | 11 | import numpy as np 12 | from scipy.stats import norm 13 | 14 | from .bootstrap_utils import check_conf_percentage_validity 15 | from .bootstrap_utils import ensure_samples_is_ndim_ndarray 16 | from .bootstrap_utils import get_alpha_from_conf_percentage 17 | from .bootstrap_utils import combine_conf_endpoints 18 | 19 | # Create a value to be used to avoid numeric underflow. 20 | MIN_COMP_VALUE = 1e-16 21 | 22 | 23 | def calc_percentile_interval(bootstrap_replicates, conf_percentage): 24 | """ 25 | Calculate bootstrap confidence intervals based on raw percentiles of the 26 | bootstrap distribution of samples. 27 | 28 | Parameters 29 | ---------- 30 | bootstrap_replicates : 2D ndarray. 31 | Each row should correspond to a different bootstrap parameter sample. 32 | Each column should correspond to an element of the parameter vector 33 | being estimated. 34 | conf_percentage : scalar in the interval (0.0, 100.0). 35 | Denotes the confidence-level of the returned confidence interval. For 36 | instance, to calculate a 95% confidence interval, pass `95`. 37 | 38 | Returns 39 | ------- 40 | conf_intervals : 2D ndarray. 41 | The shape of the returned array will be `(2, samples.shape[1])`. The 42 | first row will correspond to the lower value in the confidence 43 | interval. The second row will correspond to the upper value in the 44 | confidence interval. There will be one column for each element of the 45 | parameter vector being estimated. 46 | 47 | References 48 | ---------- 49 | Efron, Bradley, and Robert J. Tibshirani. An Introduction to the Bootstrap. 50 | CRC press, 1994. Section 12.5 and Section 13.3. See Equation 13.3. 51 | 52 | Notes 53 | ----- 54 | This function differs slightly from the actual percentile bootstrap 55 | procedure described in Efron and Tibshirani (1994). To ensure that the 56 | returned endpoints of one's bootstrap confidence intervals are actual 57 | values that were observed in the bootstrap distribution, both the procedure 58 | of Efron and Tibshirani and this function make more conservative confidence 59 | intervals. However, this function uses a simpler (and in some cases less 60 | conservative) correction than that of Efron and Tibshirani. 61 | """ 62 | # Check validity of arguments 63 | check_conf_percentage_validity(conf_percentage) 64 | ensure_samples_is_ndim_ndarray(bootstrap_replicates, ndim=2) 65 | # Get the alpha * 100% value 66 | alpha = get_alpha_from_conf_percentage(conf_percentage) 67 | # Get the lower and upper percentiles that demarcate the desired interval. 68 | lower_percent = alpha / 2.0 69 | upper_percent = 100.0 - lower_percent 70 | # Calculate the lower and upper endpoints of the confidence intervals. 71 | # Note that the particular choices of interpolation methods are made in 72 | # order to produce conservatively wide confidence intervals and ensure that 73 | # all returned endpoints in the confidence intervals are actually observed 74 | # in the bootstrap distribution. This is in accordance with the spirit of 75 | # Efron and Tibshirani (1994). 76 | lower_endpoint = np.percentile(bootstrap_replicates, 77 | lower_percent, 78 | interpolation='lower', 79 | axis=0) 80 | upper_endpoint = np.percentile(bootstrap_replicates, 81 | upper_percent, 82 | interpolation='higher', 83 | axis=0) 84 | # Combine the enpoints into a single ndarray. 85 | conf_intervals = combine_conf_endpoints(lower_endpoint, upper_endpoint) 86 | return conf_intervals 87 | 88 | 89 | def calc_bias_correction_bca(bootstrap_replicates, mle_estimate): 90 | """ 91 | Calculate the bias correction for the Bias Corrected and Accelerated (BCa) 92 | bootstrap confidence intervals. 93 | 94 | Parameters 95 | ---------- 96 | bootstrap_replicates : 2D ndarray. 97 | Each row should correspond to a different bootstrap parameter sample. 98 | Each column should correspond to an element of the parameter vector 99 | being estimated. 100 | mle_estimate : 1D ndarray. 101 | The original dataset's maximum likelihood point estimate. Should have 102 | one elements for each component of the estimated parameter vector. 103 | 104 | Returns 105 | ------- 106 | bias_correction : 1D ndarray. 107 | There will be one element for each element in `mle_estimate`. Elements 108 | denote the bias correction factors for each component of the parameter 109 | vector. 110 | 111 | References 112 | ---------- 113 | Efron, Bradley, and Robert J. Tibshirani. An Introduction to the Bootstrap. 114 | CRC press, 1994. Section 14.3, Equation 14.14. 115 | """ 116 | numerator = (bootstrap_replicates < mle_estimate[None, :]).sum(axis=0) 117 | denominator = float(bootstrap_replicates.shape[0]) 118 | bias_correction = norm.ppf(numerator / denominator) 119 | return bias_correction 120 | 121 | 122 | def calc_acceleration_bca(jackknife_replicates): 123 | """ 124 | Calculate the acceleration constant for the Bias Corrected and Accelerated 125 | (BCa) bootstrap confidence intervals. 126 | 127 | Parameters 128 | ---------- 129 | jackknife_replicates : 2D ndarray. 130 | Each row should correspond to a different jackknife parameter sample, 131 | formed by deleting a particular observation and then re-estimating the 132 | desired model. Each column should correspond to an element of the 133 | parameter vector being estimated. 134 | 135 | Returns 136 | ------- 137 | acceleration : 1D ndarray. 138 | There will be one element for each element in `mle_estimate`. Elements 139 | denote the acceleration factors for each component of the parameter 140 | vector. 141 | 142 | References 143 | ---------- 144 | Efron, Bradley, and Robert J. Tibshirani. An Introduction to the Bootstrap. 145 | CRC press, 1994. Section 14.3, Equation 14.15. 146 | """ 147 | # Get the mean of the bootstrapped statistics. 148 | jackknife_mean = jackknife_replicates.mean(axis=0)[None, :] 149 | # Calculate the differences between the mean of the bootstrapped statistics 150 | differences = jackknife_mean - jackknife_replicates 151 | numerator = (differences**3).sum(axis=0) 152 | denominator = 6 * ((differences**2).sum(axis=0))**1.5 153 | # guard against division by zero. Note that this guard shouldn't distort 154 | # the computational results since the numerator should be zero whenever the 155 | # denominator is zero. 156 | zero_denom = np.where(denominator == 0) 157 | denominator[zero_denom] = MIN_COMP_VALUE 158 | # Compute the acceleration. 159 | acceleration = numerator / denominator 160 | return acceleration 161 | 162 | 163 | def calc_lower_bca_percentile(alpha_percent, bias_correction, acceleration): 164 | """ 165 | Calculate the lower values of the Bias Corrected and Accelerated (BCa) 166 | bootstrap confidence intervals. 167 | 168 | Parameters 169 | ---------- 170 | alpha_percent : float in (0.0, 100.0). 171 | `100 - confidence_percentage`, where `confidence_percentage` is the 172 | confidence level (such as 95%), expressed as a percent. 173 | bias_correction : 1D ndarray. 174 | There will be one element for each element in `mle_estimate`. Elements 175 | denote the bias correction factors for each component of the parameter 176 | vector. 177 | acceleration : 1D ndarray. 178 | There will be one element for each element in `mle_estimate`. Elements 179 | denote the acceleration factors for each component of the parameter 180 | vector. 181 | 182 | Returns 183 | ------- 184 | lower_percentile : 1D ndarray. 185 | There will be one element for each element in `mle_estimate`. Elements 186 | denote the smaller values in the confidence interval for each component 187 | of the parameter vector. 188 | 189 | References 190 | ---------- 191 | Efron, Bradley, and Robert J. Tibshirani. An Introduction to the Bootstrap. 192 | CRC press, 1994. Section 14.3, Equation 14.10. 193 | 194 | Notes 195 | ----- 196 | The `alpha` used in this function is different from the `alpha` used in 197 | Efron and Tibshirani (1994). The `alpha` used in this function must be 198 | converted to a decimal (by dividing by 100) and then divided by 2 (to 199 | account for the equal-tailed nature of the confidence interval) in order to 200 | be made equivalent to the `alpha` in Efron and Tibshirani (1994). 201 | """ 202 | z_lower = norm.ppf(alpha_percent / (100.0 * 2)) 203 | numerator = bias_correction + z_lower 204 | denominator = 1 - acceleration * numerator 205 | lower_percentile =\ 206 | norm.cdf(bias_correction + numerator / denominator) * 100 207 | return lower_percentile 208 | 209 | 210 | def calc_upper_bca_percentile(alpha_percent, bias_correction, acceleration): 211 | """ 212 | Calculate the lower values of the Bias Corrected and Accelerated (BCa) 213 | bootstrap confidence intervals. 214 | 215 | Parameters 216 | ---------- 217 | alpha_percent : float in (0.0, 100.0). 218 | `100 - confidence_percentage`, where `confidence_percentage` is the 219 | confidence level (such as 95%), expressed as a percent. 220 | bias_correction : 1D ndarray. 221 | There will be one element for each element in `mle_estimate`. Elements 222 | denote the bias correction factors for each component of the parameter 223 | vector. 224 | acceleration : 1D ndarray. 225 | There will be one element for each element in `mle_estimate`. Elements 226 | denote the acceleration factors for each component of the parameter 227 | vector. 228 | 229 | Returns 230 | ------- 231 | upper_percentile : 1D ndarray. 232 | There will be one element for each element in `mle_estimate`. Elements 233 | denote the larger values in the confidence interval for each component 234 | of the parameter vector. 235 | 236 | References 237 | ---------- 238 | Efron, Bradley, and Robert J. Tibshirani. An Introduction to the Bootstrap. 239 | CRC press, 1994. Section 14.3, Equation 14.10. 240 | 241 | Notes 242 | ----- 243 | The `alpha` used in this function is different from the `alpha` used in 244 | Efron and Tibshirani (1994). The `alpha` used in this function must be 245 | converted to a decimal (by dividing by 100) and then divided by 2 (to 246 | account for the equal-tailed nature of the confidence interval) in order to 247 | be made equivalent to the `alpha` in Efron and Tibshirani (1994). 248 | """ 249 | z_upper = norm.ppf(1 - alpha_percent / (100.0 * 2)) 250 | numerator = bias_correction + z_upper 251 | denominator = 1 - acceleration * numerator 252 | upper_percentile =\ 253 | norm.cdf(bias_correction + numerator / denominator) * 100 254 | return upper_percentile 255 | 256 | 257 | def calc_bca_interval(bootstrap_replicates, 258 | jackknife_replicates, 259 | mle_params, 260 | conf_percentage): 261 | """ 262 | Calculate 'bias-corrected and accelerated' bootstrap confidence intervals. 263 | 264 | Parameters 265 | ---------- 266 | bootstrap_replicates : 2D ndarray. 267 | Each row should correspond to a different bootstrap parameter sample. 268 | Each column should correspond to an element of the parameter vector 269 | being estimated. 270 | jackknife_replicates : 2D ndarray. 271 | Each row should correspond to a different jackknife parameter sample, 272 | formed by deleting a particular observation and then re-estimating the 273 | desired model. Each column should correspond to an element of the 274 | parameter vector being estimated. 275 | mle_params : 1D ndarray. 276 | The original dataset's maximum likelihood point estimate. Should have 277 | the same number of elements as `samples.shape[1]`. 278 | conf_percentage : scalar in the interval (0.0, 100.0). 279 | Denotes the confidence-level of the returned confidence interval. For 280 | instance, to calculate a 95% confidence interval, pass `95`. 281 | 282 | Returns 283 | ------- 284 | conf_intervals : 2D ndarray. 285 | The shape of the returned array will be `(2, samples.shape[1])`. The 286 | first row will correspond to the lower value in the confidence 287 | interval. The second row will correspond to the upper value in the 288 | confidence interval. There will be one column for each element of the 289 | parameter vector being estimated. 290 | 291 | References 292 | ---------- 293 | Efron, Bradley, and Robert J. Tibshirani. An Introduction to the Bootstrap. 294 | CRC press, 1994. Section 14.3. 295 | DiCiccio, Thomas J., and Bradley Efron. "Bootstrap confidence intervals." 296 | Statistical science (1996): 189-212. 297 | """ 298 | # Check validity of arguments 299 | check_conf_percentage_validity(conf_percentage) 300 | ensure_samples_is_ndim_ndarray(bootstrap_replicates, ndim=2) 301 | ensure_samples_is_ndim_ndarray(jackknife_replicates, 302 | name='jackknife', ndim=2) 303 | # Calculate the alpha * 100% value 304 | alpha_percent = get_alpha_from_conf_percentage(conf_percentage) 305 | # Estimate the bias correction for the bootstrap samples 306 | bias_correction =\ 307 | calc_bias_correction_bca(bootstrap_replicates, mle_params) 308 | # Estimate the acceleration 309 | acceleration = calc_acceleration_bca(jackknife_replicates) 310 | # Get the lower and upper percent value for the raw bootstrap samples. 311 | lower_percents =\ 312 | calc_lower_bca_percentile(alpha_percent, bias_correction, acceleration) 313 | upper_percents =\ 314 | calc_upper_bca_percentile(alpha_percent, bias_correction, acceleration) 315 | # Get the lower and upper endpoints for the desired confidence intervals. 316 | lower_endpoints = np.diag(np.percentile(bootstrap_replicates, 317 | lower_percents, 318 | interpolation='lower', 319 | axis=0)) 320 | upper_endpoints = np.diag(np.percentile(bootstrap_replicates, 321 | upper_percents, 322 | interpolation='higher', 323 | axis=0)) 324 | # Combine the enpoints into a single ndarray. 325 | conf_intervals = combine_conf_endpoints(lower_endpoints, upper_endpoints) 326 | return conf_intervals 327 | -------------------------------------------------------------------------------- /src/pylogit/bootstrap_mle.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Timothy Brathwaite 4 | @name: Bootstrap Estimation Procedures 5 | @summary: This module provides functions that will perform the MLE for each 6 | of the bootstrap samples. 7 | """ 8 | from __future__ import absolute_import 9 | 10 | import numpy as np 11 | import pandas as pd 12 | 13 | from . import pylogit as pl 14 | from .display_names import model_type_to_display_name 15 | 16 | 17 | def extract_default_init_vals(orig_model_obj, mnl_point_series, num_params): 18 | """ 19 | Get the default initial values for the desired model type, based on the 20 | point estimate of the MNL model that is 'closest' to the desired model. 21 | 22 | Parameters 23 | ---------- 24 | orig_model_obj : an instance or sublcass of the MNDC class. 25 | Should correspond to the actual model that we want to bootstrap. 26 | mnl_point_series : pandas Series. 27 | Should denote the point estimate from the MNL model that is 'closest' 28 | to the desired model. 29 | num_params : int. 30 | Should denote the number of parameters being estimated (including any 31 | parameters that are being constrained during estimation). 32 | 33 | Returns 34 | ------- 35 | init_vals : 1D ndarray of initial values for the MLE of the desired model. 36 | """ 37 | # Initialize the initial values 38 | init_vals = np.zeros(num_params, dtype=float) 39 | # Figure out which values in mnl_point_series are the index coefficients 40 | no_outside_intercepts = orig_model_obj.intercept_names is None 41 | if no_outside_intercepts: 42 | init_index_coefs = mnl_point_series.values 43 | init_intercepts = None 44 | else: 45 | init_index_coefs =\ 46 | mnl_point_series.loc[orig_model_obj.ind_var_names].values 47 | init_intercepts =\ 48 | mnl_point_series.loc[orig_model_obj.intercept_names].values 49 | 50 | # Add any mixing variables to the index coefficients. 51 | if orig_model_obj.mixing_vars is not None: 52 | num_mixing_vars = len(orig_model_obj.mixing_vars) 53 | init_index_coefs = np.concatenate([init_index_coefs, 54 | np.zeros(num_mixing_vars)], 55 | axis=0) 56 | 57 | # Account for the special transformation of the index coefficients that is 58 | # needed for the asymmetric logit model. 59 | if orig_model_obj.model_type == model_type_to_display_name["Asym"]: 60 | multiplier = np.log(len(np.unique(orig_model_obj.alt_IDs))) 61 | # Cast the initial index coefficients to a float dtype to ensure 62 | # successful broadcasting 63 | init_index_coefs = init_index_coefs.astype(float) 64 | # Adjust the scale of the index coefficients for the asymmetric logit. 65 | init_index_coefs /= multiplier 66 | 67 | # Combine the initial interept values with the initial index coefficients 68 | if init_intercepts is not None: 69 | init_index_coefs =\ 70 | np.concatenate([init_intercepts, init_index_coefs], axis=0) 71 | 72 | # Add index coefficients (and mixing variables) to the total initial array 73 | num_index = init_index_coefs.shape[0] 74 | init_vals[-1 * num_index:] = init_index_coefs 75 | 76 | # Note that the initial values for the transformed nest coefficients and 77 | # the shape parameters is zero so we don't have to change anything 78 | return init_vals 79 | 80 | 81 | def get_model_abbrev(model_obj): 82 | """ 83 | Extract the string used to specify the model type of this model object in 84 | `pylogit.create_chohice_model`. 85 | 86 | Parameters 87 | ---------- 88 | model_obj : An MNDC_Model instance. 89 | 90 | Returns 91 | ------- 92 | str. The internal abbreviation used for the particular type of MNDC_Model. 93 | """ 94 | # Get the 'display name' for our model. 95 | model_type = model_obj.model_type 96 | # Find the model abbreviation for this model's display name. 97 | for key in model_type_to_display_name: 98 | if model_type_to_display_name[key] == model_type: 99 | return key 100 | # If none of the strings in model_type_to_display_name matches our model 101 | # object, then raise an error. 102 | msg = "Model object has an unknown or incorrect model type." 103 | raise ValueError(msg) 104 | 105 | 106 | def get_model_creation_kwargs(model_obj): 107 | """ 108 | Get a dictionary of the keyword arguments needed to create the passed model 109 | object using `pylogit.create_choice_model`. 110 | 111 | Parameters 112 | ---------- 113 | model_obj : An MNDC_Model instance. 114 | 115 | Returns 116 | ------- 117 | model_kwargs : dict. 118 | Contains the keyword arguments and the required values that are needed 119 | to initialize a replica of `model_obj`. 120 | """ 121 | # Extract the model abbreviation for this model 122 | model_abbrev = get_model_abbrev(model_obj) 123 | 124 | # Create a dictionary to store the keyword arguments needed to Initialize 125 | # the new model object.d 126 | model_kwargs = {"model_type": model_abbrev, 127 | "names": model_obj.name_spec, 128 | "intercept_names": model_obj.intercept_names, 129 | "intercept_ref_pos": model_obj.intercept_ref_position, 130 | "shape_names": model_obj.shape_names, 131 | "shape_ref_pos": model_obj.shape_ref_position, 132 | "nest_spec": model_obj.nest_spec, 133 | "mixing_vars": model_obj.mixing_vars, 134 | "mixing_id_col": model_obj.mixing_id_col} 135 | 136 | return model_kwargs 137 | 138 | 139 | def get_mnl_point_est(orig_model_obj, 140 | new_df, 141 | boot_id_col, 142 | num_params, 143 | mnl_spec, 144 | mnl_names, 145 | mnl_init_vals, 146 | mnl_fit_kwargs): 147 | """ 148 | Calculates the MLE for the desired MNL model. 149 | 150 | Parameters 151 | ---------- 152 | orig_model_obj : An MNDC_Model instance. 153 | The object corresponding to the desired model being bootstrapped. 154 | new_df : pandas DataFrame. 155 | The pandas dataframe containing the data to be used to estimate the 156 | MLE of the MNL model for the current bootstrap sample. 157 | boot_id_col : str. 158 | Denotes the new column that specifies the bootstrap observation ids for 159 | choice model estimation. 160 | num_params : non-negative int. 161 | The number of parameters in the MLE of the `orig_model_obj`. 162 | mnl_spec : OrderedDict or None. 163 | If `orig_model_obj` is not a MNL model, then `mnl_spec` should be an 164 | OrderedDict that contains the specification dictionary used to estimate 165 | the MNL model that will provide starting values for the final estimated 166 | model. If `orig_model_obj` is a MNL model, then `mnl_spec` may be None. 167 | mnl_names : OrderedDict or None. 168 | If `orig_model_obj` is not a MNL model, then `mnl_names` should be an 169 | OrderedDict that contains the name dictionary used to initialize the 170 | MNL model that will provide starting values for the final estimated 171 | model. If `orig_model_obj` is a MNL, then `mnl_names` may be None. 172 | mnl_init_vals : 1D ndarray or None. 173 | If `orig_model_obj` is not a MNL model, then `mnl_init_vals` should be 174 | a 1D ndarray. `mnl_init_vals` should denote the initial values used to 175 | estimate the MNL model that provides starting values for the final 176 | desired model. If `orig_model_obj` is a MNL model, then `mnl_init_vals` 177 | may be None. 178 | mnl_fit_kwargs : dict or None. 179 | If `orig_model_obj` is not a MNL model, then `mnl_fit_kwargs` should be 180 | a dict. `mnl_fit_kwargs` should denote the keyword arguments used when 181 | calling the `fit_mle` function of the MNL model that will provide 182 | starting values to the desired choice model. If `orig_model_obj` is a 183 | MNL model, then `mnl_fit_kwargs` may be None. 184 | 185 | Returns 186 | ------- 187 | mnl_point : dict. 188 | The dictionary returned by `scipy.optimize` after estimating the 189 | desired MNL model. 190 | mnl_obj : An MNL model instance. 191 | The model object used to estimate the desired MNL model. 192 | """ 193 | # Get specification and name dictionaries for the mnl model, for the case 194 | # where the model being bootstrapped is an MNL model. In this case, the 195 | # the mnl_spec and the mnl_names that are passed to the function are 196 | # expected to be None. 197 | if orig_model_obj.model_type == model_type_to_display_name["MNL"]: 198 | mnl_spec = orig_model_obj.specification 199 | mnl_names = orig_model_obj.name_spec 200 | if mnl_init_vals is None: 201 | mnl_init_vals = np.zeros(num_params) 202 | if mnl_fit_kwargs is None: 203 | mnl_fit_kwargs = {} 204 | 205 | # Alter the mnl_fit_kwargs to ensure that we only perform point estimation 206 | mnl_fit_kwargs["just_point"] = True 207 | # Use BFGS by default to estimate the MNL since it works well for the MNL. 208 | if "method" not in mnl_fit_kwargs: 209 | mnl_fit_kwargs["method"] = "BFGS" 210 | 211 | # Initialize the mnl model object for the given bootstrap sample. 212 | mnl_obj = pl.create_choice_model(data=new_df, 213 | alt_id_col=orig_model_obj.alt_id_col, 214 | obs_id_col=boot_id_col, 215 | choice_col=orig_model_obj.choice_col, 216 | specification=mnl_spec, 217 | model_type="MNL", 218 | names=mnl_names) 219 | 220 | # Get the MNL point estimate for the parameters of this bootstrap sample. 221 | mnl_point = mnl_obj.fit_mle(mnl_init_vals, **mnl_fit_kwargs) 222 | return mnl_point, mnl_obj 223 | 224 | 225 | def retrieve_point_est(orig_model_obj, 226 | new_df, 227 | new_id_col, 228 | num_params, 229 | mnl_spec, 230 | mnl_names, 231 | mnl_init_vals, 232 | mnl_fit_kwargs, 233 | extract_init_vals=None, 234 | **fit_kwargs): 235 | """ 236 | Calculates the MLE for the desired MNL model. 237 | 238 | Parameters 239 | ---------- 240 | orig_model_obj : An MNDC_Model instance. 241 | The object corresponding to the desired model being bootstrapped. 242 | new_df : pandas DataFrame. 243 | The pandas dataframe containing the data to be used to estimate the 244 | MLE of the MNL model for the current bootstrap sample. 245 | new_id_col : str. 246 | Denotes the new column that specifies the bootstrap observation ids for 247 | choice model estimation. 248 | num_params : non-negative int. 249 | The number of parameters in the MLE of the `orig_model_obj`. 250 | mnl_spec : OrderedDict or None. 251 | If `orig_model_obj` is not a MNL model, then `mnl_spec` should be an 252 | OrderedDict that contains the specification dictionary used to estimate 253 | the MNL model that will provide starting values for the final estimated 254 | model. If `orig_model_obj` is a MNL model, then `mnl_spec` may be None. 255 | mnl_names : OrderedDict or None. 256 | If `orig_model_obj` is not a MNL model, then `mnl_names` should be an 257 | OrderedDict that contains the name dictionary used to initialize the 258 | MNL model that will provide starting values for the final estimated 259 | model. If `orig_model_obj` is a MNL, then `mnl_names` may be None. 260 | mnl_init_vals : 1D ndarray or None. 261 | If `orig_model_obj` is not a MNL model, then `mnl_init_vals` should be 262 | a 1D ndarray. `mnl_init_vals` should denote the initial values used to 263 | estimate the MNL model that provides starting values for the final 264 | desired model. If `orig_model_obj` is a MNL model, then `mnl_init_vals` 265 | may be None. 266 | mnl_fit_kwargs : dict or None. 267 | If `orig_model_obj` is not a MNL model, then `mnl_fit_kwargs` should be 268 | a dict. `mnl_fit_kwargs` should denote the keyword arguments used when 269 | calling the `fit_mle` function of the MNL model that will provide 270 | starting values to the desired choice model. If `orig_model_obj` is a 271 | MNL model, then `mnl_fit_kwargs` may be None. 272 | extract_init_vals : callable or None, optional. 273 | Should accept 3 arguments, in the following order. First, it should 274 | accept `orig_model_obj`. Second, it should accept a pandas Series of 275 | the estimated parameters from the MNL model. The index of the Series 276 | will be the names of the coefficients from `mnl_names`. Thirdly, it 277 | should accept an int denoting the number of parameters in the desired 278 | choice model. The callable should return a 1D ndarray of starting 279 | values for the desired choice model. Default == None. 280 | fit_kwargs : dict. 281 | Denotes the keyword arguments to be used when estimating the desired 282 | choice model using the current bootstrap sample (`new_df`). All such 283 | kwargs will be directly passed to the `fit_mle` method of the desired 284 | model object. 285 | 286 | Returns 287 | ------- 288 | final_point : dict. 289 | The dictionary returned by `scipy.optimize` after estimating the 290 | desired choice model. 291 | """ 292 | # Get the MNL point estimate for the parameters of this bootstrap sample. 293 | mnl_point, mnl_obj = get_mnl_point_est(orig_model_obj, 294 | new_df, 295 | new_id_col, 296 | num_params, 297 | mnl_spec, 298 | mnl_names, 299 | mnl_init_vals, 300 | mnl_fit_kwargs) 301 | mnl_point_series = pd.Series(mnl_point["x"], index=mnl_obj.ind_var_names) 302 | 303 | # Denote the MNL point estimate as our final point estimate if the final 304 | # model we're interested in is an MNL. 305 | if orig_model_obj.model_type == model_type_to_display_name["MNL"]: 306 | final_point = mnl_point 307 | else: 308 | # Determine the function to be used when extracting the initial values 309 | # for the final model from the MNL MLE point estimate. 310 | if extract_init_vals is None: 311 | extraction_func = extract_default_init_vals 312 | else: 313 | extraction_func = extract_init_vals 314 | 315 | # Extract the initial values 316 | default_init_vals =\ 317 | extraction_func(orig_model_obj, mnl_point_series, num_params) 318 | 319 | # Get the keyword arguments needed to initialize the new model object. 320 | model_kwargs = get_model_creation_kwargs(orig_model_obj) 321 | 322 | # Create a new model object 323 | new_obj =\ 324 | pl.create_choice_model(data=new_df, 325 | alt_id_col=orig_model_obj.alt_id_col, 326 | obs_id_col=new_id_col, 327 | choice_col=orig_model_obj.choice_col, 328 | specification=orig_model_obj.specification, 329 | **model_kwargs) 330 | 331 | # Be sure to add 'just_point' to perform pure point estimation. 332 | if 'just_point' not in fit_kwargs: 333 | fit_kwargs['just_point'] = True 334 | 335 | # Fit the model with new data, and return the point estimate dict. 336 | final_point = new_obj.fit_mle(default_init_vals, **fit_kwargs) 337 | 338 | return final_point 339 | -------------------------------------------------------------------------------- /src/pylogit/bootstrap_sampler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Timothy Brathwaite 4 | @name: Bootstrap Sampler 5 | @summary: This module provides functions that will perform the stratified 6 | resampling needed for the bootstrapping procedure. 7 | """ 8 | from __future__ import absolute_import 9 | 10 | from collections import OrderedDict 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | 16 | def relate_obs_ids_to_chosen_alts(obs_id_array, 17 | alt_id_array, 18 | choice_array): 19 | """ 20 | Creates a dictionary that relates each unique alternative id to the set of 21 | observations ids that chose the given alternative. 22 | 23 | Parameters 24 | ---------- 25 | obs_id_array : 1D ndarray of ints. 26 | Should be a long-format array of observation ids. Each element should 27 | correspond to the unique id of the unit of observation that corresponds 28 | to the given row of the long-format data. Note that each unit of 29 | observation may have more than one associated choice situation. 30 | alt_id_array : 1D ndarray of ints. 31 | Should be a long-format array of alternative ids. Each element should 32 | denote the unique id of the alternative that corresponds to the given 33 | row of the long format data. 34 | choice_array : 1D ndarray of ints. 35 | Each element should be either a one or a zero, indicating whether the 36 | alternative on the given row of the long format data was chosen or not. 37 | 38 | Returns 39 | ------- 40 | chosen_alts_to_obs_ids : dict. 41 | Each key will be a unique value from `alt_id_array`. Each key's value 42 | will be a 1D ndarray that contains the sorted, unique observation ids 43 | of those observational units that chose the given alternative. 44 | """ 45 | # Figure out which units of observation chose each alternative. 46 | chosen_alts_to_obs_ids = {} 47 | 48 | for alt_id in np.sort(np.unique(alt_id_array)): 49 | # Determine which observations chose the current alternative. 50 | selection_condition =\ 51 | np.where((alt_id_array == alt_id) & (choice_array == 1)) 52 | 53 | # Store the sorted, unique ids that chose the current alternative. 54 | chosen_alts_to_obs_ids[alt_id] =\ 55 | np.sort(np.unique(obs_id_array[selection_condition])) 56 | 57 | # Return the desired dictionary. 58 | return chosen_alts_to_obs_ids 59 | 60 | 61 | def get_num_obs_choosing_each_alternative(obs_per_alt_dict): 62 | """ 63 | Will create an ordered dictionary that records the number of units of 64 | observation that have chosen the given alternative (i.e. the associated 65 | dictionary key). Will also determine the total number of unique 66 | observations in the dataset. 67 | 68 | Parameters 69 | ---------- 70 | obs_per_alt_dict : dict. 71 | Each key should be a unique alternave id. Each key's value will be 1D 72 | ndarray that contains the sorted, unique observation ids of those 73 | observational units that chose the given alternative. 74 | 75 | Returns 76 | ------- 77 | num_obs_per_group : OrderedDict. 78 | Keys will be the alternative ids present in `obs_per_alt_dict`. Values 79 | will be the `len(obs_per_alt_dict[alt_id]).` 80 | tot_num_obs : int. 81 | Denotes the total number of unique observation ids in one's dataset. 82 | """ 83 | # Initialize the object that is to be returned. 84 | num_obs_per_group = OrderedDict() 85 | 86 | # Determine the number of unique units of observation per group. 87 | for alt_id in obs_per_alt_dict: 88 | num_obs_per_group[alt_id] = len(obs_per_alt_dict[alt_id]) 89 | 90 | # Determine the total number of units of observation that will be chosen 91 | # for each bootstrap sample. 92 | tot_num_obs = sum([num_obs_per_group[g] for g in num_obs_per_group]) 93 | 94 | # Return the desired objects. 95 | return num_obs_per_group, tot_num_obs 96 | 97 | 98 | def create_cross_sectional_bootstrap_samples(obs_id_array, 99 | alt_id_array, 100 | choice_array, 101 | num_samples, 102 | seed=None): 103 | """ 104 | Determines the unique observations that will be present in each bootstrap 105 | sample. This function DOES NOT create the new design matrices or a new 106 | long-format dataframe for each bootstrap sample. Note that these will be 107 | correct bootstrap samples for cross-sectional datasets. This function will 108 | not work correctly for panel datasets. 109 | 110 | Parameters 111 | ---------- 112 | obs_id_array : 1D ndarray of ints. 113 | Each element should denote a unique observation id for the 114 | corresponding row of the long format array. 115 | alt_id_array : 1D ndarray of ints. 116 | Each element should denote a unique alternative id for the 117 | corresponding row of the long format array. 118 | choice_array : 1D ndarray of ints. 119 | Each element should be a one or a zero. The values should denote a 120 | whether or not the corresponding alternative in `alt_id_array` was 121 | chosen by the observational unit in the corresponding row of 122 | `obs_id_array.` 123 | num_samples : int. 124 | Denotes the number of bootstrap samples that need to be drawn. 125 | seed : non-negative int or None, optional. 126 | Denotes the random seed to be used in order to ensure reproducibility 127 | of the bootstrap sample generation. Default is None. If None, no seed 128 | will be used and the generation of the bootstrap samples will (in 129 | general) not be reproducible. 130 | 131 | 132 | Returns 133 | ------- 134 | ids_per_sample : 2D ndarray. 135 | Each row represents a complete bootstrap sample. Each column denotes a 136 | selected bootstrap observation that comprises the bootstrap sample. The 137 | elements of the array denote the observation ids of the chosen 138 | observational units. 139 | """ 140 | # Determine the units of observation that chose each alternative. 141 | chosen_alts_to_obs_ids =\ 142 | relate_obs_ids_to_chosen_alts(obs_id_array, alt_id_array, choice_array) 143 | 144 | # Determine the number of unique units of observation per group and overall 145 | num_obs_per_group, tot_num_obs =\ 146 | get_num_obs_choosing_each_alternative(chosen_alts_to_obs_ids) 147 | 148 | # Initialize the array that will store the observation ids for each sample 149 | ids_per_sample = np.empty((num_samples, tot_num_obs), dtype=float) 150 | 151 | if seed is not None: 152 | # Check the validity of the seed argument. 153 | if not isinstance(seed, int): 154 | msg = "`boot_seed` MUST be an int." 155 | raise ValueError(msg) 156 | 157 | # If desiring reproducibility, set the random seed within numpy 158 | np.random.seed(seed) 159 | 160 | # Initialize a variable to keep track of what column we're on. 161 | col_idx = 0 162 | for alt_id in num_obs_per_group: 163 | # Get the set of observations that chose the current alternative. 164 | relevant_ids = chosen_alts_to_obs_ids[alt_id] 165 | # Determine the number of needed resampled ids. 166 | resample_size = num_obs_per_group[alt_id] 167 | # Resample, with replacement, observations who chose this alternative. 168 | current_ids = (np.random.choice(relevant_ids, 169 | size=resample_size * num_samples, 170 | replace=True) 171 | .reshape((num_samples, resample_size))) 172 | # Determine the last column index to use when storing the resampled ids 173 | end_col = col_idx + resample_size 174 | # Assign the sampled ids to the correct columns of ids_per_sample 175 | ids_per_sample[:, col_idx:end_col] = current_ids 176 | # Update the column index 177 | col_idx += resample_size 178 | 179 | # Return the resampled observation ids. 180 | return ids_per_sample 181 | 182 | 183 | def create_bootstrap_id_array(obs_id_per_sample): 184 | """ 185 | Creates a 2D ndarray that contains the 'bootstrap ids' for each replication 186 | of each unit of observation that is an the set of bootstrap samples. 187 | 188 | Parameters 189 | ---------- 190 | obs_id_per_sample : 2D ndarray of ints. 191 | Should have one row for each bootsrap sample. Should have one column 192 | for each observational unit that is serving as a new bootstrap 193 | observational unit. 194 | 195 | Returns 196 | ------- 197 | bootstrap_id_array : 2D ndarray of ints. 198 | Will have the same shape as `obs_id_per_sample`. Each element will 199 | denote the fake observational id in the new bootstrap dataset. 200 | """ 201 | # Determine the shape of the object to be returned. 202 | n_rows, n_cols = obs_id_per_sample.shape 203 | # Create the array of bootstrap ids. 204 | bootstrap_id_array =\ 205 | np.tile(np.arange(n_cols) + 1, n_rows).reshape((n_rows, n_cols)) 206 | # Return the desired object 207 | return bootstrap_id_array 208 | 209 | 210 | def create_deepcopied_groupby_dict(orig_df, obs_id_col): 211 | """ 212 | Will create a dictionary where each key corresponds to a unique value in 213 | `orig_df[obs_id_col]` and each value corresponds to all of the rows of 214 | `orig_df` where `orig_df[obs_id_col] == key`. 215 | 216 | Parameters 217 | ---------- 218 | orig_df : pandas DataFrame. 219 | Should be long-format dataframe containing the data used to estimate 220 | the desired choice model. 221 | obs_id_col : str. 222 | Should be a column name within `orig_df`. Should denote the original 223 | observation id column. 224 | 225 | Returns 226 | ------- 227 | groupby_dict : dict. 228 | Each key will be a unique value in `orig_df[obs_id_col]` and each value 229 | will be the rows of `orig_df` where `orig_df[obs_id_col] == key`. 230 | """ 231 | # Get the observation id values 232 | obs_id_vals = orig_df[obs_id_col].values 233 | # Get the unique observation ids 234 | unique_obs_ids = np.unique(obs_id_vals) 235 | # Initialize the dictionary to be returned. 236 | groupby_dict = {} 237 | # Populate the dictionary with dataframes for each individual. 238 | for obs_id in unique_obs_ids: 239 | # Filter out only the rows corresponding to the current observation id. 240 | desired_rows = obs_id_vals == obs_id 241 | # Add the desired dataframe to the dictionary. 242 | groupby_dict[obs_id] = orig_df.loc[desired_rows].copy(deep=True) 243 | 244 | # Return the desired object. 245 | return groupby_dict 246 | 247 | 248 | def check_column_existence(col_name, df, presence=True): 249 | """ 250 | Checks whether or not `col_name` is in `df` and raises a helpful error msg 251 | if the desired condition is not met. 252 | 253 | Parameters 254 | ---------- 255 | col_name : str. 256 | Should represent a column whose presence in `df` is to be checked. 257 | df : pandas DataFrame. 258 | The dataframe that will be checked for the presence of `col_name`. 259 | presence : bool, optional. 260 | If True, then this function checks for the PRESENCE of `col_name` from 261 | `df`. If False, then this function checks for the ABSENCE of 262 | `col_name` in `df`. Default == True. 263 | 264 | Returns 265 | ------- 266 | None. 267 | """ 268 | if presence: 269 | if col_name not in df.columns: 270 | msg = "Ensure that `{}` is in `df.columns`." 271 | raise ValueError(msg.format(col_name)) 272 | else: 273 | if col_name in df.columns: 274 | msg = "Ensure that `{}` is not in `df.columns`." 275 | raise ValueError(msg.format(col_name)) 276 | return None 277 | 278 | 279 | def ensure_resampled_obs_ids_in_df(resampled_obs_ids, orig_obs_id_array): 280 | """ 281 | Checks whether all ids in `resampled_obs_ids` are in `orig_obs_id_array`. 282 | Raises a helpful ValueError if not. 283 | 284 | Parameters 285 | ---------- 286 | resampled_obs_ids : 1D ndarray of ints. 287 | Should contain the observation ids of the observational units that will 288 | be used in the current bootstrap sample. 289 | orig_obs_id_array : 1D ndarray of ints. 290 | Should countain the observation ids of the observational units in the 291 | original dataframe containing the data for this model. 292 | 293 | Returns 294 | ------- 295 | None. 296 | """ 297 | if not np.in1d(resampled_obs_ids, orig_obs_id_array).all(): 298 | msg =\ 299 | "All values in `resampled_obs_ids` MUST be in `orig_obs_id_array`." 300 | raise ValueError(msg) 301 | return None 302 | 303 | 304 | def create_bootstrap_dataframe(orig_df, 305 | obs_id_col, 306 | resampled_obs_ids_1d, 307 | groupby_dict, 308 | boot_id_col="bootstrap_id"): 309 | """ 310 | Will create the altered dataframe of data needed to estimate a choice model 311 | with the particular observations that belong to the current bootstrap 312 | sample. 313 | 314 | Parameters 315 | ---------- 316 | orig_df : pandas DataFrame. 317 | Should be long-format dataframe containing the data used to estimate 318 | the desired choice model. 319 | obs_id_col : str. 320 | Should be a column name within `orig_df`. Should denote the original 321 | observation id column. 322 | resampled_obs_ids_1d : 1D ndarray of ints. 323 | Each value should represent the alternative id of a given bootstrap 324 | replicate. 325 | groupby_dict : dict. 326 | Each key will be a unique value in `orig_df[obs_id_col]` and each value 327 | will be the rows of `orig_df` where `orig_df[obs_id_col] == key`. 328 | boot_id_col : str, optional. 329 | Denotes the new column that will be created to specify the bootstrap 330 | observation ids for choice model estimation. 331 | 332 | Returns 333 | ------- 334 | bootstrap_df : pandas Dataframe. 335 | Will contain all the same columns as `orig_df` as well as the 336 | additional `boot_id_col`. For each value in `resampled_obs_ids_1d`, 337 | `bootstrap_df` will contain the long format rows from `orig_df` that 338 | have the given observation id. 339 | """ 340 | # Check the validity of the passed arguments. 341 | check_column_existence(obs_id_col, orig_df, presence=True) 342 | check_column_existence(boot_id_col, orig_df, presence=False) 343 | # Alias the observation id column 344 | obs_id_values = orig_df[obs_id_col].values 345 | # Check the validity of the resampled observation ids. 346 | ensure_resampled_obs_ids_in_df(resampled_obs_ids_1d, obs_id_values) 347 | 348 | # Initialize a list to store the component dataframes that will be 349 | # concatenated to form the final bootstrap_df 350 | component_dfs = [] 351 | 352 | # Populate component_dfs 353 | for boot_id, obs_id in enumerate(resampled_obs_ids_1d): 354 | # Extract the dataframe that we desire. 355 | extracted_df = groupby_dict[obs_id].copy() 356 | # Add the bootstrap id value. 357 | extracted_df[boot_id_col] = boot_id + 1 358 | # Store the component dataframe 359 | component_dfs.append(extracted_df) 360 | 361 | # Create and return the desired dataframe. 362 | bootstrap_df = pd.concat(component_dfs, axis=0, ignore_index=True) 363 | return bootstrap_df 364 | -------------------------------------------------------------------------------- /src/pylogit/bootstrap_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Timothy Brathwaite 4 | @name: Bootstrap Utilities 5 | @summary: This module provides helpful functions for calculating the 6 | bootstrap confidence intervals. 7 | """ 8 | from __future__ import absolute_import 9 | 10 | from numbers import Number 11 | import numpy as np 12 | 13 | 14 | def check_conf_percentage_validity(conf_percentage): 15 | """ 16 | Ensures that `conf_percentage` is in (0, 100). Raises a helpful ValueError 17 | if otherwise. 18 | """ 19 | msg = "conf_percentage MUST be a number between 0.0 and 100." 20 | condition_1 = isinstance(conf_percentage, Number) 21 | if not condition_1: 22 | raise ValueError(msg) 23 | else: 24 | condition_2 = 0 < conf_percentage < 100 25 | if not condition_2: 26 | raise ValueError(msg) 27 | return None 28 | 29 | 30 | def ensure_samples_is_ndim_ndarray(samples, name='bootstrap', ndim=2): 31 | """ 32 | Ensures that `samples` is an `ndim` numpy array. Raises a helpful 33 | ValueError if otherwise. 34 | """ 35 | assert isinstance(ndim, int) 36 | assert isinstance(name, str) 37 | if not isinstance(samples, np.ndarray) or not (samples.ndim == ndim): 38 | sample_name = name + "_samples" 39 | msg = "`{}` MUST be a {}D ndarray.".format(sample_name, ndim) 40 | raise ValueError(msg) 41 | return None 42 | 43 | 44 | def get_alpha_from_conf_percentage(conf_percentage): 45 | """ 46 | Calculates `100 - conf_percentage`, which is useful for calculating alpha 47 | levels. 48 | """ 49 | return 100.0 - conf_percentage 50 | 51 | 52 | def combine_conf_endpoints(lower_array, upper_array): 53 | """ 54 | Concatenates upper and lower endpoint arrays for a given confidence level. 55 | """ 56 | return np.concatenate([lower_array[None, :], upper_array[None, :]], axis=0) 57 | -------------------------------------------------------------------------------- /src/pylogit/conditional_logit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Feb 25 07:19:49 2016 4 | 5 | @name: MultiNomial Logit 6 | @author: Timothy Brathwaite 7 | @summary: Contains functions necessary for estimating multinomial logit 8 | models (with the help of the "base_multinomial_cm.py" file). 9 | Differs from version one since it works with the shape, intercept, 10 | index coefficient partitioning of estimated parameters as opposed 11 | to the shape, index coefficient partitioning scheme of version 1. 12 | """ 13 | from __future__ import absolute_import 14 | 15 | import warnings 16 | import numpy as np 17 | from scipy.sparse import diags 18 | 19 | from . import choice_calcs as cc 20 | from . import base_multinomial_cm_v2 as base_mcm 21 | from .estimation import LogitTypeEstimator 22 | from .estimation import estimate 23 | from .display_names import model_type_to_display_name 24 | 25 | # Create a variable that will be printed if there is a non-fatal error 26 | # in the MNL class construction 27 | _msg_1 = "The Multinomial Logit Model has no shape parameters. " 28 | _msg_2 = "shape_names and shape_ref_pos will be ignored if passed." 29 | _shape_ignore_msg = _msg_1 + _msg_2 30 | 31 | # Create a warning string that will be issued if ridge regression is performed. 32 | _msg_3 = "NOTE: An L2-penalized regression is being performed. The " 33 | _msg_4 = "reported standard errors and robust standard errors " 34 | _msg_5 = "***WILL BE INCORRECT***." 35 | _ridge_warning_msg = _msg_3 + _msg_4 + _msg_5 36 | 37 | # Alias necessary functions from the base multinomial choice model module 38 | general_log_likelihood = cc.calc_log_likelihood 39 | general_gradient = cc.calc_gradient 40 | general_calc_probabilities = cc.calc_probabilities 41 | general_hessian = cc.calc_hessian 42 | 43 | 44 | def split_param_vec(beta, 45 | rows_to_alts=None, 46 | design=None, 47 | return_all_types=False, 48 | *args, **kwargs): 49 | """ 50 | Parameters 51 | ---------- 52 | beta : 1D ndarray. 53 | All elements should by ints, floats, or longs. Should have 1 element 54 | for each utility coefficient being estimated (i.e. num_features). 55 | rows_to_alts : None, 56 | Not actually used. Included merely for consistency with other models. 57 | design : None. 58 | Not actually used. Included merely for consistency with other models. 59 | return_all_types : bool, optional. 60 | Determines whether or not a tuple of 4 elements will be returned (with 61 | one element for the nest, shape, intercept, and index parameters for 62 | this model). If False, a tuple of 3 elements will be returned, as 63 | described below. 64 | 65 | Returns 66 | ------- 67 | tuple. 68 | `(None, None, beta)`. This function is merely for compatibility with 69 | the other choice model files. 70 | 71 | Note 72 | ---- 73 | If `return_all_types == True` then the function will return a tuple of four 74 | objects. In order, these objects will either be None or the arrays 75 | representing the arrays corresponding to the nest, shape, intercept, and 76 | index parameters. 77 | """ 78 | if return_all_types: 79 | return None, None, None, beta 80 | else: 81 | return None, None, beta 82 | 83 | 84 | def _mnl_utility_transform(systematic_utilities, *args, **kwargs): 85 | """ 86 | Parameters 87 | ---------- 88 | systematic_utilities : 1D ndarray. 89 | Should contain the systematic utilities for each each available 90 | alternative for each observation. 91 | 92 | Returns 93 | ------- 94 | `systematic_utilities[:, None]` 95 | """ 96 | # Be sure to return a 2D array since other functions will be expecting this 97 | if len(systematic_utilities.shape) == 1: 98 | systematic_utilities = systematic_utilities[:, np.newaxis] 99 | 100 | return systematic_utilities 101 | 102 | 103 | def _mnl_transform_deriv_c(*args, **kwargs): 104 | """ 105 | Returns None. 106 | 107 | This is a place holder function since the MNL model has no shape 108 | parameters. 109 | """ 110 | # This is a place holder function since the MNL model has no shape 111 | # parameters. 112 | return None 113 | 114 | 115 | def _mnl_transform_deriv_alpha(*args, **kwargs): 116 | """ 117 | Returns None. 118 | 119 | This is a place holder function since the MNL model has no intercept 120 | parameters outside of the index. 121 | """ 122 | # This is a place holder function since the MNL model has no intercept 123 | # parameters outside the index. 124 | return None 125 | 126 | 127 | class MNLEstimator(LogitTypeEstimator): 128 | """ 129 | Estimation Object used to enforce uniformity in the estimation process 130 | across the various logit-type models. 131 | 132 | Parameters 133 | ---------- 134 | model_obj : a pylogit.base_multinomial_cm_v2.MNDC_Model instance. 135 | Should contain the following attributes: 136 | 137 | - alt_IDs 138 | - choices 139 | - design 140 | - intercept_ref_position 141 | - shape_ref_position 142 | - utility_transform 143 | mapping_res : dict. 144 | Should contain the scipy sparse matrices that map the rows of the long 145 | format dataframe to various other objects such as the available 146 | alternatives, the unique observations, etc. The keys that it must have 147 | are `['rows_to_obs', 'rows_to_alts', 'chosen_row_to_obs']` 148 | ridge : int, float, long, or None. 149 | Determines whether or not ridge regression is performed. If a 150 | scalar is passed, then that scalar determines the ridge penalty for 151 | the optimization. The scalar should be greater than or equal to 152 | zero.. 153 | zero_vector : 1D ndarray. 154 | Determines what is viewed as a "null" set of parameters. It is 155 | explicitly passed because some parameters (e.g. parameters that must be 156 | greater than zero) have their null values at values other than zero. 157 | split_params : callable. 158 | Should take a vector of parameters, `mapping_res['rows_to_alts']`, and 159 | model_obj.design as arguments. Should return a tuple containing 160 | separate arrays for the model's shape, outside intercept, and index 161 | coefficients. For each of these arrays, if this model does not contain 162 | the particular type of parameter, the callable should place a `None` in 163 | its place in the tuple. 164 | constrained_pos : list or None, optional. 165 | Denotes the positions of the array of estimated parameters that are 166 | not to change from their initial values. If a list is passed, the 167 | elements are to be integers where no such integer is greater than 168 | `num_params` Default == None. 169 | weights : 1D ndarray or None, optional. 170 | Allows for the calculation of weighted log-likelihoods. The weights can 171 | represent various things. In stratified samples, the weights may be 172 | the proportion of the observations in a given strata for a sample in 173 | relation to the proportion of observations in that strata in the 174 | population. In latent class models, the weights may be the probability 175 | of being a particular class. 176 | """ 177 | def set_derivatives(self): 178 | # Pre-calculate the derivative of the transformation vector with 179 | # respect to the vector of systematic utilities 180 | dh_dv = diags(np.ones(self.design.shape[0]), 0, format='csr') 181 | 182 | # Create a function to calculate dh_dv which will return the 183 | # pre-calculated result when called 184 | def calc_dh_dv(*args): 185 | return dh_dv 186 | 187 | self.calc_dh_dv = calc_dh_dv 188 | self.calc_dh_d_alpha = _mnl_transform_deriv_alpha 189 | self.calc_dh_d_shape = _mnl_transform_deriv_c 190 | 191 | def check_length_of_initial_values(self, init_values): 192 | """ 193 | Ensures that `init_values` is of the correct length. Raises a helpful 194 | ValueError if otherwise. 195 | 196 | Parameters 197 | ---------- 198 | init_values : 1D ndarray. 199 | The initial values to start the optimization process with. There 200 | should be one value for each index coefficient, outside intercept 201 | parameter, and shape parameter being estimated. 202 | 203 | Returns 204 | ------- 205 | None. 206 | """ 207 | # Calculate the expected number of index parameters 208 | num_index_coefs = self.design.shape[1] 209 | 210 | if init_values.shape[0] != num_index_coefs: 211 | msg_1 = "The initial values are of the wrong dimension." 212 | msg_2 = "It should be of dimension {}" 213 | msg_3 = "But instead it has dimension {}" 214 | raise ValueError(msg_1 + 215 | msg_2.format(num_index_coefs) + 216 | msg_3.format(init_values.shape[0])) 217 | 218 | return None 219 | 220 | 221 | class MNL(base_mcm.MNDC_Model): 222 | """ 223 | Parameters 224 | ---------- 225 | data : string or pandas dataframe. 226 | If string, data should be an absolute or relative path to a CSV file 227 | containing the long format data for this choice model. Note long format 228 | is has one row per available alternative for each observation. If 229 | pandas dataframe, the dataframe should be the long format data for the 230 | choice model. 231 | alt_id_col :str. 232 | Should denote the column in data which contains the alternative 233 | identifiers for each row. 234 | obs_id_col : str. 235 | Should denote the column in data which contains the observation 236 | identifiers for each row. 237 | choice_col : str. 238 | Should denote the column in data which contains the ones and zeros that 239 | denote whether or not the given row corresponds to the chosen 240 | alternative for the given individual. 241 | specification : OrderedDict. 242 | Keys are a proper subset of the columns in `data`. Values are either a 243 | list or a single string, "all_diff" or "all_same". If a list, the 244 | elements should be: 245 | - single objects that are in the alternative ID column of `data` 246 | - lists of objects that are within the alternative ID column of 247 | `data`. For each single object in the list, a unique column will 248 | be created (i.e. there will be a unique coefficient for that 249 | variable in the corresponding utility equation of the 250 | corresponding alternative). For lists within the 251 | `specification` values, a single column will be created for all 252 | the alternatives within the iterable (i.e. there will be one 253 | common coefficient for the variables in the iterable). 254 | names : OrderedDict, optional. 255 | Should have the same keys as `specification`. For each key: 256 | - if the corresponding value in `specification` is "all_same", then 257 | there should be a single string as the value in names. 258 | - if the corresponding value in `specification` is "all_diff", then 259 | there should be a list of strings as the value in names. There 260 | should be one string in the value in names for each possible 261 | alternative. 262 | - if the corresponding value in `specification` is a list, then 263 | there should be a list of strings as the value in names. There 264 | should be one string the value in names per item in the value in 265 | `specification`. 266 | Default == None. 267 | 268 | """ 269 | def __init__(self, 270 | data, 271 | alt_id_col, 272 | obs_id_col, 273 | choice_col, 274 | specification, 275 | names=None, 276 | *args, **kwargs): 277 | ########## 278 | # Print a helpful message for users who have included shape parameters 279 | # or shape names unneccessarily 280 | ########## 281 | for keyword in ["shape_names", "shape_ref_pos"]: 282 | if keyword in kwargs and kwargs[keyword] is not None: 283 | warnings.warn(_shape_ignore_msg) 284 | break 285 | 286 | if "intercept_ref_pos" in kwargs: 287 | if kwargs["intercept_ref_pos"] is not None: 288 | msg = "The MNL model should have all intercepts in the index." 289 | raise ValueError(msg) 290 | 291 | # Carry out the common instantiation process for all choice models 292 | super(MNL, self).__init__(data, 293 | alt_id_col, 294 | obs_id_col, 295 | choice_col, 296 | specification, 297 | names=names, 298 | model_type=model_type_to_display_name["MNL"]) 299 | 300 | # Store the utility transform function 301 | self.utility_transform = _mnl_utility_transform 302 | 303 | return None 304 | 305 | def fit_mle(self, 306 | init_vals, 307 | print_res=True, 308 | method="BFGS", 309 | loss_tol=1e-06, 310 | gradient_tol=1e-06, 311 | maxiter=1000, 312 | ridge=None, 313 | constrained_pos=None, 314 | just_point=False, 315 | **kwargs): 316 | """ 317 | Parameters 318 | ---------- 319 | init_vals : 1D ndarray. 320 | The initial values to start the optimization process with. There 321 | should be one value for each utility coefficient being estimated. 322 | print_res : bool, optional. 323 | Determines whether the timing and initial and final log likelihood 324 | results will be printed as they they are determined. 325 | method : str, optional. 326 | Should be a valid string that can be passed to 327 | scipy.optimize.minimize. Determines the optimization algorithm that 328 | is used for this problem. If 'em' is passed, a custom coded EM 329 | algorithm will be used. Default `== 'newton-cg'`. 330 | loss_tol : float, optional. 331 | Determines the tolerance on the difference in objective function 332 | values from one iteration to the next that is needed to determine 333 | convergence. Default `== 1e-06`. 334 | gradient_tol : float, optional. 335 | Determines the tolerance on the difference in gradient values from 336 | one iteration to the next which is needed to determine convergence. 337 | ridge : int, float, long, or None, optional. 338 | Determines whether or not ridge regression is performed. If a 339 | scalar is passed, then that scalar determines the ridge penalty for 340 | the optimization. Default `== None`. 341 | constrained_pos : list or None, optional. 342 | Denotes the positions of the array of estimated parameters that are 343 | not to change from their initial values. If a list is passed, the 344 | elements are to be integers where no such integer is greater than 345 | `init_vals.size.` Default == None. 346 | just_point : bool, optional. 347 | Determines whether (True) or not (False) calculations that are non- 348 | critical for obtaining the maximum likelihood point estimate will 349 | be performed. If True, this function will return the results 350 | dictionary from scipy.optimize. Default == False. 351 | 352 | Returns 353 | ------- 354 | None or dict. 355 | If `just_point` is False, None is returned and the estimation 356 | results are saved to the model instance. If `just_point` is True, 357 | then the results dictionary from scipy.optimize() is returned. 358 | """ 359 | # Check integrity of passed arguments 360 | kwargs_to_be_ignored = ["init_shapes", "init_intercepts", "init_coefs"] 361 | if any([x in kwargs for x in kwargs_to_be_ignored]): 362 | msg = "MNL model does not use of any of the following kwargs:\n{}" 363 | msg_2 = "Remove such kwargs and pass a single init_vals argument" 364 | raise ValueError(msg.format(kwargs_to_be_ignored) + msg_2) 365 | 366 | if ridge is not None: 367 | warnings.warn(_ridge_warning_msg) 368 | 369 | # Store the optimization method 370 | self.optimization_method = method 371 | 372 | # Store the ridge parameter 373 | self.ridge_param = ridge 374 | 375 | # Construct the mappings from alternatives to observations and from 376 | # chosen alternatives to observations 377 | mapping_res = self.get_mappings_for_fit() 378 | 379 | # Create the estimation object 380 | zero_vector = np.zeros(init_vals.shape) 381 | mnl_estimator = MNLEstimator(self, 382 | mapping_res, 383 | ridge, 384 | zero_vector, 385 | split_param_vec, 386 | constrained_pos=constrained_pos) 387 | # Set the derivative functions for estimation 388 | mnl_estimator.set_derivatives() 389 | 390 | # Perform one final check on the length of the initial values 391 | mnl_estimator.check_length_of_initial_values(init_vals) 392 | 393 | # Get the estimation results 394 | estimation_res = estimate(init_vals, 395 | mnl_estimator, 396 | method, 397 | loss_tol, 398 | gradient_tol, 399 | maxiter, 400 | print_res, 401 | just_point=just_point) 402 | 403 | if not just_point: 404 | # Store the estimation results 405 | self.store_fit_results(estimation_res) 406 | 407 | return None 408 | else: 409 | return estimation_res 410 | -------------------------------------------------------------------------------- /src/pylogit/construct_estimator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @name: Estimator Constructor 4 | @author: Timothy Brathwaite 5 | @summary: Contains functions necessary for constructing the Estimation 6 | Objects used to provide convenience functions when estimating 7 | PyLogit's various choice models. 8 | """ 9 | from __future__ import absolute_import 10 | 11 | import numpy as np 12 | 13 | from .display_names import model_type_to_display_name as display_name_dict 14 | 15 | from .mixed_logit import MixedEstimator 16 | from .mixed_logit import split_param_vec as mixed_split_params 17 | 18 | from .nested_logit import NestedEstimator 19 | from .nested_logit import split_param_vec as nested_split_params 20 | 21 | from .conditional_logit import MNLEstimator 22 | from .conditional_logit import split_param_vec as mnl_split_params 23 | 24 | from .clog_log import ClogEstimator 25 | from .clog_log import split_param_vec as clog_split_params 26 | 27 | from .asym_logit import AsymEstimator 28 | from .asym_logit import split_param_vec as asym_split_params 29 | 30 | from .scobit import ScobitEstimator 31 | from .scobit import split_param_vec as scobit_split_params 32 | 33 | from .uneven_logit import UnevenEstimator 34 | from .uneven_logit import split_param_vec as uneven_split_params 35 | 36 | # Map the displayed model types to the internal model names. 37 | display_name_to_model_type = {v: k for k, v in display_name_dict.items()} 38 | 39 | # Map the internal model types to their appropriate estimator and split params 40 | # functions 41 | model_type_to_resources =\ 42 | {"MNL": {'estimator': MNLEstimator, 'split_func': mnl_split_params}, 43 | "Asym": {'estimator': AsymEstimator, 'split_func': asym_split_params}, 44 | "Cloglog": {'estimator': ClogEstimator, 'split_func': clog_split_params}, 45 | "Scobit": {'estimator': ScobitEstimator, 46 | 'split_func': scobit_split_params}, 47 | "Uneven": {'estimator': UnevenEstimator, 48 | 'split_func': uneven_split_params}, 49 | "Nested Logit": {'estimator': NestedEstimator, 50 | 'split_func': nested_split_params}, 51 | "Mixed Logit": {'estimator': MixedEstimator, 52 | 'split_func': mixed_split_params}} 53 | 54 | 55 | def create_estimation_obj(model_obj, 56 | init_vals, 57 | mappings=None, 58 | ridge=None, 59 | constrained_pos=None, 60 | weights=None): 61 | """ 62 | Should return a model estimation object corresponding to the model type of 63 | the `model_obj`. 64 | 65 | Parameters 66 | ---------- 67 | model_obj : an instance or sublcass of the MNDC class. 68 | init_vals : 1D ndarray. 69 | The initial values to start the estimation process with. In the 70 | following order, there should be one value for each nest coefficient, 71 | shape parameter, outside intercept parameter, or index coefficient that 72 | is being estimated. 73 | mappings : OrderedDict or None, optional. 74 | Keys will be `["rows_to_obs", "rows_to_alts", "chosen_row_to_obs", 75 | "rows_to_nests"]`. The value for `rows_to_obs` will map the rows of 76 | the `long_form` to the unique observations (on the columns) in 77 | their order of appearance. The value for `rows_to_alts` will map 78 | the rows of the `long_form` to the unique alternatives which are 79 | possible in the dataset (on the columns), in sorted order--not 80 | order of appearance. The value for `chosen_row_to_obs`, if not 81 | None, will map the rows of the `long_form` that contain the chosen 82 | alternatives to the specific observations those rows are associated 83 | with (denoted by the columns). The value of `rows_to_nests`, if not 84 | None, will map the rows of the `long_form` to the nest (denoted by 85 | the column) that contains the row's alternative. Default == None. 86 | ridge : int, float, long, or None, optional. 87 | Determines whether or not ridge regression is performed. If a 88 | scalar is passed, then that scalar determines the ridge penalty for 89 | the optimization. The scalar should be greater than or equal to 90 | zero. Default `== None`. 91 | constrained_pos : list or None, optional. 92 | Denotes the positions of the array of estimated parameters that are 93 | not to change from their initial values. If a list is passed, the 94 | elements are to be integers where no such integer is greater than 95 | `init_vals.size.` Default == None. 96 | weights : 1D ndarray. 97 | Should contain the weights for each corresponding observation for each 98 | row of the long format data. 99 | """ 100 | # Get the mapping matrices for each model 101 | mapping_matrices =\ 102 | model_obj.get_mappings_for_fit() if mappings is None else mappings 103 | # Create the zero vector for each model. 104 | zero_vector = np.zeros(init_vals.shape[0]) 105 | # Get the internal model name 106 | internal_model_name = display_name_to_model_type[model_obj.model_type] 107 | # Get the split parameter function and estimator class for this model. 108 | estimator_class, current_split_func =\ 109 | (model_type_to_resources[internal_model_name]['estimator'], 110 | model_type_to_resources[internal_model_name]['split_func']) 111 | # Create the estimator instance that is desired. 112 | estimation_obj = estimator_class(model_obj, 113 | mapping_matrices, 114 | ridge, 115 | zero_vector, 116 | current_split_func, 117 | constrained_pos, 118 | weights=weights) 119 | # Return the created object 120 | return estimation_obj 121 | -------------------------------------------------------------------------------- /src/pylogit/display_names.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | This file declares the strings that will be displayed for each model type based 4 | on the abbriviated model type string that is passed to the choice model 5 | constructor. 6 | """ 7 | from __future__ import absolute_import 8 | 9 | from collections import OrderedDict 10 | model_type_to_display_name = OrderedDict() 11 | model_type_to_display_name["MNL"] = "Multinomial Logit Model" 12 | model_type_to_display_name["Asym"] = "Multinomial Asymmetric Logit Model" 13 | model_type_to_display_name["Cloglog"] = "Multinomial Clog-log Model" 14 | model_type_to_display_name["Scobit"] = "Multinomial Scobit Model" 15 | model_type_to_display_name["Uneven"] = "Multinomial Uneven Logit Model" 16 | model_type_to_display_name["Nested Logit"] = "Nested Logit Model" 17 | model_type_to_display_name["Mixed Logit"] = "Mixed Logit Model" 18 | -------------------------------------------------------------------------------- /src/pylogit/newsfragments/.gitignore: -------------------------------------------------------------------------------- 1 | !.gitignore 2 | -------------------------------------------------------------------------------- /src/pylogit/pylogit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Feb 29 22:07:30 2016 4 | 5 | @module: generalized_choice_model 6 | @name: Python Based Conditional Logit-type Models 7 | @author: Timothy Brathwaite 8 | @summary: Contains functions necessary for estimating multinomial, asymmetric 9 | conditional choice models (and standard conditional logit models). 10 | @notes: "Under the hood", this module indirectly or directly relies upon 11 | the following files: 12 | [base_multinomial_cm_v2.py, 13 | choice_calcs.py, 14 | choice_tools.py, 15 | conditional_logit.py, 16 | asym_logit.py, 17 | uneven_logit.py, 18 | scobit.py, 19 | clog_log.py, 20 | nested_logit.py 21 | mixed_logit.py] 22 | """ 23 | from __future__ import absolute_import 24 | 25 | from . import conditional_logit as mnl 26 | from . import asym_logit 27 | from . import uneven_logit 28 | from . import scobit 29 | from . import clog_log 30 | from . import nested_logit 31 | from . import mixed_logit 32 | 33 | # Create a dictionary relating the model type parameter to the class that 34 | # the general choice model should inherit from 35 | model_type_to_class = {"MNL": mnl.MNL, 36 | "Asym": asym_logit.MNAL, 37 | "Cloglog": clog_log.MNCL, 38 | "Scobit": scobit.MNSL, 39 | "Uneven": uneven_logit.MNUL, 40 | "Nested Logit": nested_logit.NestedLogit, 41 | "Mixed Logit": mixed_logit.MixedLogit} 42 | 43 | # Create a dictionary relating the model type parameter to the name of the 44 | # class that the general choice model should inherit from 45 | model_type_to_class_name = {"MNL": "MNL", 46 | "Asym": "MNAL", 47 | "Cloglog": "MNCL", 48 | "Scobit": "MNSL", 49 | "Uneven": "MNUL", 50 | "Nested Logit": "NestedLogit", 51 | "Mixed Logit": "MixedLogit"} 52 | 53 | # Store the names of the model_type kwargs that are valid. 54 | valid_model_types = model_type_to_class.keys() 55 | 56 | 57 | # Create a function that checks the user's model type and ensures its validity 58 | def ensure_valid_model_type(specified_type, model_type_list): 59 | """ 60 | Checks to make sure that `specified_type` is in `model_type_list` and 61 | raises a helpful error if this is not the case. 62 | 63 | Parameters 64 | ---------- 65 | specified_type : str. 66 | Denotes the user-specified model type that is to be checked. 67 | model_type_list : list of strings. 68 | Contains all of the model types that are acceptable kwarg values. 69 | 70 | Returns 71 | ------- 72 | None. 73 | """ 74 | if specified_type not in model_type_list: 75 | msg_1 = "The specified model_type was not valid." 76 | msg_2 = "Valid model-types are {}".format(model_type_list) 77 | msg_3 = "The passed model-type was: {}".format(specified_type) 78 | total_msg = "\n".join([msg_1, msg_2, msg_3]) 79 | raise ValueError(total_msg) 80 | return None 81 | 82 | 83 | def create_choice_model(data, 84 | alt_id_col, 85 | obs_id_col, 86 | choice_col, 87 | specification, 88 | model_type, 89 | intercept_ref_pos=None, 90 | shape_ref_pos=None, 91 | names=None, 92 | intercept_names=None, 93 | shape_names=None, 94 | nest_spec=None, 95 | mixing_id_col=None, 96 | mixing_vars=None): 97 | """ 98 | Parameters 99 | ---------- 100 | data : string or pandas dataframe. 101 | If `data` is a string, it should be an absolute or relative path to 102 | a CSV file containing the long format data for this choice model. 103 | Note long format has one row per available alternative for each 104 | observation. If `data` is a pandas dataframe, `data` should already 105 | be in long format. 106 | alt_id_col : string. 107 | Should denote the column in data that contains the alternative 108 | identifiers for each row. 109 | obs_id_col : string. 110 | Should denote the column in data that contains the observation 111 | identifiers for each row. 112 | choice_col : string. 113 | Should denote the column in data which contains the ones and zeros 114 | that denote whether or not the given row corresponds to the chosen 115 | alternative for the given individual. 116 | specification : OrderedDict. 117 | Keys are a proper subset of the columns in `long_form_df`. Values are 118 | either a list or a single string, `all_diff` or `all_same`. If a list, 119 | the elements should be: 120 | 1) single objects that are within the alternative ID column of 121 | `long_form_df` 122 | 2) lists of objects that are within the alternative ID column of 123 | `long_form_df`. For each single object in the list, a unique 124 | column will be created (i.e. there will be a unique 125 | coefficient for that variable in the corresponding utility 126 | equation of the corresponding alternative). For lists within 127 | the `specification_dict` values, a single column will be 128 | created for all the alternatives within iterable (i.e. there 129 | will be one common coefficient for the variables in the 130 | iterable). 131 | model_type : string. 132 | Denotes the model type of the choice_model being instantiated. 133 | Should be one of the following values: 134 | 135 | - "MNL" 136 | - "Asym" 137 | - "Cloglog" 138 | - "Scobit" 139 | - "Uneven" 140 | - "Nested Logit" 141 | - "Mixed Logit" 142 | intercept_ref_pos : int, optional. 143 | Valid only when the intercepts being estimated are not part of the 144 | index. Specifies the alternative in the ordered array of unique 145 | alternative ids whose intercept or alternative-specific constant is 146 | not estimated, to ensure model identifiability. Default == None. 147 | shape_ref_pos : int, optional. 148 | Specifies the alternative in the ordered array of unique 149 | alternative ids whose shape parameter is not estimated, to ensure 150 | model identifiability. Default == None. 151 | names : OrderedDict or None, optional. 152 | Should have the same keys as `specification`. For each key: 153 | 154 | - if the corresponding value in `specification` is 155 | "all_same", then there should be a single string as the value 156 | in names. 157 | - if the corresponding value in `specification` is "all_diff", 158 | then there should be a list of strings as the value in names. 159 | There should be one string in the value in names for each 160 | possible alternative. 161 | - if the corresponding value in `specification` is a list, then 162 | there should be a list of strings as the value in names. 163 | There should be one string the value in names per item in the 164 | value in `specification`. 165 | Default == None. 166 | intercept_names : list of strings or None, optional. 167 | If a list is passed, then the list should have the same number of 168 | elements as there are possible alternatives in data, minus 1. Each 169 | element of the list should be the name of the corresponding 170 | alternative's intercept term, in sorted order of the possible 171 | alternative IDs. If None is passed, the resulting names that are 172 | shown in the estimation results will be 173 | ["Outside_ASC_{}".format(x) for x in shape_names]. Default = None. 174 | shape_names : list of strings or None, optional. 175 | If a list is passed, then the list should have the same number of 176 | elements as there are possible alternative IDs in data. Each 177 | element of the list should be a string denoting the name of the 178 | corresponding alternative, in sorted order of the possible 179 | alternative IDs. The resulting names which are shown in the 180 | estimation results will be 181 | ["shape_{}".format(x) for x in shape_names]. Default = None. 182 | nest_spec : OrderedDict or None, optional. 183 | Keys are strings that define the name of the nests. Values are 184 | lists of alternative ids, denoting which alternatives belong to 185 | which nests. Each alternative id only be associated with a single 186 | nest! Default == None. 187 | mixing_id_col : str, or None, optional. 188 | Should be a column heading in `data`. Should denote the column in 189 | `data` which contains the identifiers of the units of observation 190 | over which the coefficients of the model are thought to be randomly 191 | distributed. If `model_type == "Mixed Logit"`, then `mixing_id_col` 192 | must be passed. Default == None. 193 | mixing_vars : list, or None, optional. 194 | All elements of the list should be strings. Each string should be 195 | present in the values of `names.values()` and they're associated 196 | variables should only be index variables (i.e. part of the design 197 | matrix). If `model_type == "Mixed Logit"`, then `mixing_vars` must 198 | be passed. Default == None. 199 | 200 | Returns 201 | ------- 202 | model_obj : instantiation of the Choice Model Class corresponding 203 | to the model type passed as the function argument. The returned 204 | object will have been instantiated with the arguments passed to 205 | this function. 206 | """ 207 | # Make sure the model type is valid 208 | ensure_valid_model_type(model_type, valid_model_types) 209 | 210 | # Carry out the appropriate instantiation process for the chosen 211 | # choice model 212 | model_kwargs = {"intercept_ref_pos": intercept_ref_pos, 213 | "shape_ref_pos": shape_ref_pos, 214 | "names": names, 215 | "intercept_names": intercept_names, 216 | "shape_names": shape_names, 217 | "nest_spec": nest_spec, 218 | "mixing_id_col": mixing_id_col, 219 | "mixing_vars": mixing_vars} 220 | return model_type_to_class[model_type](data, 221 | alt_id_col, 222 | obs_id_col, 223 | choice_col, 224 | specification, 225 | **model_kwargs) 226 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/timothyb0912/pylogit/cffc9c523b5368966ef2481c7dc30f0a5d296de8/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_bootstrap_calcs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the bootstrap_calcs.py file. 3 | """ 4 | import unittest 5 | 6 | import numpy as np 7 | import numpy.testing as npt 8 | import pandas as pd 9 | from scipy.stats import norm, gumbel_r 10 | 11 | import pylogit.bootstrap_calcs as bc 12 | 13 | try: 14 | # Python 3.x does not natively support xrange 15 | from past.builtins import xrange 16 | except ImportError: 17 | pass 18 | 19 | 20 | class ComputationalTests(unittest.TestCase): 21 | def setUp(self): 22 | """ 23 | Note that the spatial test data used in many of these tests comes from 24 | Efron, Bradley, and Robert J. Tibshirani. An Introduction to the 25 | Bootstrap. CRC press, 1994. Chapter 14. 26 | """ 27 | # Determine the number of parameters and number of bootstrap replicates 28 | num_replicates = 100 29 | num_params = 5 30 | # Create a set of fake bootstrap replicates 31 | self.bootstrap_replicates =\ 32 | (np.arange(1, 1 + num_replicates)[:, None] * 33 | np.arange(1, 1 + num_params)[None, :]) 34 | # Create a fake maximum likelihood parameter estimate 35 | self.mle_params = self.bootstrap_replicates[50, :] 36 | # Create a set of fake jackknife replicates 37 | array_container = [] 38 | for est in self.mle_params: 39 | array_container.append(gumbel_r.rvs(loc=est, size=10)) 40 | self.jackknife_replicates =\ 41 | np.concatenate([x[:, None] for x in array_container], axis=1) 42 | # Create a fake confidence percentage. 43 | self.conf_percentage = 94.88 44 | 45 | # Store the spatial test data from Efron and Tibshirani (1994) 46 | self.test_data =\ 47 | np.array([48, 36, 20, 29, 42, 42, 20, 42, 22, 41, 45, 14, 6, 48 | 0, 33, 28, 34, 4, 32, 24, 47, 41, 24, 26, 30, 41]) 49 | 50 | # Note how many test data observations there are. 51 | num_test_obs = self.test_data.size 52 | 53 | # Create the function to calculate the jackknife replicates. 54 | def calc_theta(array): 55 | result = ((array - array.mean())**2).sum() / float(array.size) 56 | return result 57 | self.calc_theta = calc_theta 58 | self.test_theta_hat = np.array([calc_theta(self.test_data)]) 59 | 60 | # Create a pandas series of the data. Allows for easy case deletion. 61 | raw_series = pd.Series(self.test_data) 62 | # Create the array of jackknife replicates 63 | jackknife_replicates = np.empty((num_test_obs, 1), dtype=float) 64 | for obs in xrange(num_test_obs): 65 | current_data = raw_series[raw_series.index != obs].values 66 | jackknife_replicates[obs] = calc_theta(current_data) 67 | self.test_jackknife_replicates = jackknife_replicates 68 | 69 | return None 70 | 71 | def test_calc_percentile_interval(self): 72 | # Get the alpha percentage. Should be 5.12 so alpha / 2 should be 2.56 73 | alpha = bc.get_alpha_from_conf_percentage(self.conf_percentage) 74 | # These next 2 statements work because there are exactly 100 replicates 75 | # We should have the value in BR[lower_row, 0] = 3 so that there are 2 76 | # elements in bootstrap_replicates (BR) that are less than this. I.e. 77 | # we want lower_row = 2. Note 2.56 rounded down is 2. 78 | lower_row = int(np.floor(alpha / 2.0)) 79 | # 100 - 2.56 is 97.44. Rounded up, this is 98. 80 | # We want the row such that the value in the first column of that row 81 | # is 98, i.e. we want the row at index 97. 82 | upper_row = int(np.floor(100 - (alpha / 2.0))) 83 | # Create the expected results 84 | expected_results =\ 85 | bc.combine_conf_endpoints(self.bootstrap_replicates[lower_row], 86 | self.bootstrap_replicates[upper_row]) 87 | # Alias the function being tested 88 | func = bc.calc_percentile_interval 89 | # Get the function results 90 | func_results = func(self.bootstrap_replicates, self.conf_percentage) 91 | # Perform the desired tests 92 | self.assertIsInstance(func_results, np.ndarray) 93 | self.assertEqual(func_results.shape, expected_results.shape) 94 | npt.assert_allclose(func_results, expected_results) 95 | return None 96 | 97 | def test_calc_bias_correction_bca(self): 98 | # There are 100 bootstrap replicates, already in ascending order for 99 | # each column. If we take row 51 to be the mle, then 50% of the 100 | # replicates are less than the mle, and we should have bias = 0. 101 | expected_result = np.zeros(self.mle_params.size) 102 | 103 | # Alias the function to be tested. 104 | func = bc.calc_bias_correction_bca 105 | 106 | # Perform the desired test 107 | func_result = func(self.bootstrap_replicates, self.mle_params) 108 | self.assertIsInstance(func_result, np.ndarray) 109 | self.assertEqual(func_result.shape, expected_result.shape) 110 | npt.assert_allclose(func_result, expected_result) 111 | 112 | # Create a fake mle that should be higher than 95% of the results 113 | fake_mle = self.bootstrap_replicates[95] 114 | expected_result_2 = norm.ppf(0.95) * np.ones(self.mle_params.size) 115 | func_result_2 = func(self.bootstrap_replicates, fake_mle) 116 | 117 | self.assertIsInstance(func_result_2, np.ndarray) 118 | self.assertEqual(func_result_2.shape, expected_result_2.shape) 119 | npt.assert_allclose(func_result_2, expected_result_2) 120 | return None 121 | 122 | def test_calc_acceleration_bca(self): 123 | # Get the expected result. See page 186 of Efron and Tibshirani (1994) 124 | expected_result = np.array([0.061]) 125 | 126 | # Alias the function being tested 127 | func = bc.calc_acceleration_bca 128 | 129 | # Perform the desired test 130 | func_result = func(self.test_jackknife_replicates) 131 | self.assertIsInstance(func_result, np.ndarray) 132 | self.assertEqual(func_result.shape, expected_result.shape) 133 | # Note the absolute tolerance of 5e-4 is used because the results 134 | # should agree when rounded to 3 decimal places. This will be the case 135 | # if the two sets of results agree to within 5e-4 of each other. 136 | npt.assert_allclose(func_result, expected_result, atol=5e-4) 137 | return None 138 | 139 | def test_calc_lower_bca_percentile(self): 140 | # Use the parameter values from 141 | # Efron, Bradley, and Robert J. Tibshirani. An Introduction to the 142 | # Bootstrap. CRC press, 1994. Pages 185-186 143 | # Note that my alpha is Efron's alpha / 2, in percents not decimals 144 | alpha_percent = 10 145 | bias_correction = np.array([0.146]) 146 | acceleration = np.array([0.061]) 147 | 148 | # Note the expected results 149 | expected_result = np.array([0.110]) 150 | 151 | # Alias the function being tested 152 | func = bc.calc_lower_bca_percentile 153 | 154 | # Perform the desired tests 155 | # Note we divide the function results by 100 since our results are in 156 | # terms of percents and Efron's results are in decimals. 157 | func_result = func(alpha_percent, bias_correction, acceleration) / 100 158 | self.assertIsInstance(func_result, np.ndarray) 159 | self.assertEqual(func_result.shape, expected_result.shape) 160 | # Note the absolute tolerance of 5e-4 is used because the results 161 | # should agree when rounded to 3 decimal places. This will be the case 162 | # if the two sets of results agree to within 5e-4 of each other. 163 | npt.assert_allclose(func_result, expected_result, atol=5e-4) 164 | return None 165 | 166 | def test_calc_upper_bca_percentile(self): 167 | # Use the parameter values from 168 | # Efron, Bradley, and Robert J. Tibshirani. An Introduction to the 169 | # Bootstrap. CRC press, 1994. Pages 185-186 170 | # Note that my alpha is Efron's alpha / 2, in percents not decimals 171 | alpha_percent = 10 172 | bias_correction = np.array([0.146]) 173 | acceleration = np.array([0.061]) 174 | 175 | # Note the expected results 176 | expected_result = np.array([0.985]) 177 | 178 | # Alias the function being tested 179 | func = bc.calc_upper_bca_percentile 180 | 181 | # Perform the desired tests 182 | # Note we divide the function results by 100 since our results are in 183 | # terms of percents and Efron's results are in decimals. 184 | func_result = func(alpha_percent, bias_correction, acceleration) / 100 185 | self.assertIsInstance(func_result, np.ndarray) 186 | self.assertEqual(func_result.shape, expected_result.shape) 187 | # Note the absolute tolerance of 1e-3 is used because the results 188 | # should be within 0.001 of each other. 189 | npt.assert_allclose(func_result, expected_result, atol=1e-3) 190 | return None 191 | 192 | def test_calc_bca_interval(self): 193 | # Create the bootstrap replicates for the test data 194 | num_test_reps = 5000 195 | num_test_obs = self.test_data.size 196 | test_indices = np.arange(num_test_obs) 197 | boot_indx_shape = (num_test_reps, num_test_obs) 198 | np.random.seed(8292017) 199 | boot_indices =\ 200 | np.random.choice(test_indices, 201 | replace=True, 202 | size=num_test_obs*num_test_reps) 203 | self.test_bootstrap_replicates =\ 204 | np.fromiter((self.calc_theta(self.test_data[x]) for x in 205 | boot_indices.reshape(boot_indx_shape)), 206 | dtype=float)[:, None] 207 | 208 | # Note the expected result. See page 183 of Efron and Tibshirani (1994) 209 | expected_result = np.array([[115.8], [259.6]]) 210 | 211 | # Bundle the necessary arguments 212 | args = [self.test_bootstrap_replicates, 213 | self.test_jackknife_replicates, 214 | self.test_theta_hat, 215 | 90] 216 | 217 | # Alias the function being tested 218 | func = bc.calc_bca_interval 219 | 220 | # Get the function results 221 | func_result = func(*args) 222 | 223 | # Perform the desired tests 224 | # Note we divide the function results by 100 since our results are in 225 | # terms of percents and Efron's results are in decimals. 226 | self.assertIsInstance(func_result, np.ndarray) 227 | self.assertEqual(func_result.shape, expected_result.shape) 228 | # Note the relative tolerance of 0.01 is used because the function 229 | # results should be within 1% of the expected result. Note that some 230 | # differences are expected due to simulation error on both the part of 231 | # Efron and Tibshirani (1994) when they reported their results, and on 232 | # our part when calculating the results. 233 | npt.assert_allclose(func_result, expected_result, rtol=0.01) 234 | return None 235 | -------------------------------------------------------------------------------- /tests/test_bootstrap_sampler.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the bootstrap_sampler.py file. 3 | """ 4 | import unittest 5 | from collections import OrderedDict 6 | 7 | import numpy as np 8 | import numpy.testing as npt 9 | import pandas as pd 10 | 11 | import pylogit.bootstrap_sampler as bs 12 | 13 | try: 14 | # Python 3.x does not natively support xrange 15 | from past.builtins import xrange 16 | except ImportError: 17 | pass 18 | 19 | 20 | class SamplerTests(unittest.TestCase): 21 | def test_relate_obs_ids_to_chosen_alts(self): 22 | # Create fake data for the observation, alternative, and choice ids. 23 | obs_ids = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]) 24 | alt_ids = np.array([1, 2, 1, 3, 2, 3, 2, 3, 1, 3, 1, 2]) 25 | choices = np.array([0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0]) 26 | 27 | # Create the dictionary that we expect the tested function to return 28 | expected_dict = {1: np.array([2, 6]), 29 | 2: np.array([1, 4]), 30 | 3: np.array([3, 5])} 31 | 32 | # Alias the function being tested. 33 | func = bs.relate_obs_ids_to_chosen_alts 34 | 35 | # Execute the given tests. 36 | func_result = func(obs_ids, alt_ids, choices) 37 | self.assertIsInstance(func_result, dict) 38 | for key in expected_dict: 39 | self.assertIn(key, func_result) 40 | self.assertIsInstance(func_result[key], np.ndarray) 41 | self.assertEqual(func_result[key].ndim, 1) 42 | npt.assert_allclose(func_result[key], expected_dict[key]) 43 | return None 44 | 45 | def test_get_num_obs_choosing_each_alternative(self): 46 | # Alias the function that is to be tested 47 | func = bs.get_num_obs_choosing_each_alternative 48 | 49 | # Create the dictionary of observations per alternative 50 | obs_per_group = {1: np.array([2, 6, 7]), 51 | 2: np.array([1]), 52 | 3: np.array([3, 5])} 53 | 54 | # Get the 'expected results' 55 | expected_dict = OrderedDict() 56 | expected_dict[1] = obs_per_group[1].size 57 | expected_dict[2] = obs_per_group[2].size 58 | expected_dict[3] = obs_per_group[3].size 59 | expected_num_obs = (obs_per_group[1].size + 60 | obs_per_group[2].size + 61 | obs_per_group[3].size) 62 | 63 | # Get the results from the function 64 | func_dict, func_num_obs = func(obs_per_group) 65 | 66 | # Perform the desired tests 67 | self.assertIsInstance(func_dict, OrderedDict) 68 | self.assertIsInstance(func_num_obs, int) 69 | self.assertEqual(func_num_obs, expected_num_obs) 70 | for key in func_dict: 71 | func_num = func_dict[key] 72 | self.assertIsInstance(func_num, int) 73 | self.assertEqual(func_num, expected_dict[key]) 74 | return None 75 | 76 | def test_create_cross_sectional_bootstrap_samples(self): 77 | # Create fake data for the observation, alternative, and choice ids. 78 | obs_ids = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6]) 79 | alt_ids = np.array([1, 2, 1, 3, 2, 3, 2, 3, 1, 3, 1, 2]) 80 | choices = np.array([0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0]) 81 | 82 | # Determine the number of samples to be taken 83 | num_samples = 5 84 | 85 | # Determine the random seed for reproducibility 86 | seed = 55 87 | np.random.seed(seed) 88 | 89 | # Create the dictionary of observations per alternative 90 | obs_per_group = {1: np.array([2, 6]), 91 | 2: np.array([1, 4]), 92 | 3: np.array([3, 5])} 93 | num_obs_per_group = {1: 2, 2: 2, 3: 2} 94 | 95 | # Determine the array that should be created. 96 | expected_ids = np.empty((num_samples, 6)) 97 | 98 | expected_shape_1 = (num_samples, num_obs_per_group[1]) 99 | expected_ids[:, :2] =\ 100 | np.random.choice(obs_per_group[1], 101 | size=num_samples * num_obs_per_group[1], 102 | replace=True).reshape(expected_shape_1) 103 | 104 | expected_shape_2 = (num_samples, num_obs_per_group[2]) 105 | expected_ids[:, 2:4] =\ 106 | np.random.choice(obs_per_group[2], 107 | size=num_samples * len(obs_per_group[2]), 108 | replace=True).reshape(expected_shape_2) 109 | 110 | expected_shape_3 = (num_samples, num_obs_per_group[3]) 111 | expected_ids[:, 4:6] =\ 112 | np.random.choice(obs_per_group[3], 113 | size=num_samples * len(obs_per_group[3]), 114 | replace=True).reshape(expected_shape_3) 115 | 116 | # Alias the function being tested. 117 | func = bs.create_cross_sectional_bootstrap_samples 118 | 119 | # Get the desired results 120 | func_result = func(obs_ids, alt_ids, choices, num_samples, seed=seed) 121 | 122 | # Perform the requisite tests 123 | self.assertIsInstance(func_result, np.ndarray) 124 | self.assertEqual(func_result.shape, expected_ids.shape) 125 | npt.assert_allclose(func_result, expected_ids) 126 | 127 | # Make sure the argument check works 128 | self.assertRaises(ValueError, 129 | func, 130 | obs_ids, 131 | alt_ids, 132 | choices, 133 | num_samples, 134 | "2") 135 | 136 | return None 137 | 138 | def test_create_bootstrap_id_array(self): 139 | # Create an array of fake bootstrapped observation ids 140 | fake_obs_id_per_sample = np.arange(25).reshape((5, 5)) 141 | 142 | # Create the expected result denoting the "bootstrap ids" for each of 143 | # the sampled observation ids. 144 | expected_results = np.array([[1, 2, 3, 4, 5], 145 | [1, 2, 3, 4, 5], 146 | [1, 2, 3, 4, 5], 147 | [1, 2, 3, 4, 5], 148 | [1, 2, 3, 4, 5]]) 149 | # Alias the function being tested 150 | func = bs.create_bootstrap_id_array 151 | # Get the function results 152 | func_result = func(fake_obs_id_per_sample) 153 | 154 | # Perform the desired tests 155 | self.assertIsInstance(func_result, np.ndarray) 156 | npt.assert_allclose(func_result, expected_results) 157 | 158 | return None 159 | 160 | def test_create_deepcopied_groupby_dict(self): 161 | # Create the dataframe of fake data 162 | fake_df = pd.DataFrame({"obs_id": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], 163 | "alt_id": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2], 164 | "choice": [1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1], 165 | "x": [1, 1.2, 1.4, 0.3, 0.9, 1.11, 0.53, 0.82, 166 | 1.31, 1.24, 0.98, 0.76]}) 167 | # Create the result that we expect from the function being tested. 168 | expected_res = {1: fake_df.iloc[0:2], 169 | 2: fake_df.iloc[2:4], 170 | 3: fake_df.iloc[4:6], 171 | 4: fake_df.iloc[6:8], 172 | 5: fake_df.iloc[8:10], 173 | 6: fake_df.iloc[10:]} 174 | # Alias the function being tested 175 | func = bs.create_deepcopied_groupby_dict 176 | 177 | # Get the result of the function 178 | func_result = func(fake_df, "obs_id") 179 | 180 | # Perform the requisite tests 181 | # Ensure the returned value is a dictionary 182 | self.assertIsInstance(func_result, dict) 183 | # Ensure the returned value and the expected value have the same keys. 184 | self.assertEqual(sorted(func_result.keys()), 185 | sorted(expected_res.keys())) 186 | for key in func_result: 187 | # Get the expected and returned dataframes for each observation id 188 | sub_func_result = func_result[key] 189 | sub_expected_res = expected_res[key] 190 | 191 | # Ensure that the dataframes have equal values. 192 | npt.assert_allclose(sub_func_result.values, 193 | sub_expected_res.values) 194 | 195 | # Ensure the dataframes don't share the same location in memory. 196 | self.assertNotEqual(id(sub_func_result), id(sub_expected_res)) 197 | return None 198 | 199 | def test_check_column_existence(self): 200 | # Create the fake dataframe for the test. 201 | fake_df = pd.DataFrame({"obs_id": [1, 1, 2, 2, 3, 3], 202 | "alt_id": [1, 2, 1, 2, 1, 2], 203 | "choice": [1, 0, 0, 1, 1, 0]}) 204 | # Create the sets of arguments and keyword arguments that should not 205 | # lead to raising errors. 206 | good_cols = ["obs_id", "boot_id"] 207 | good_kwargs = [{"presence": True}, {"presence": False}] 208 | 209 | # Alias the function that is being tested 210 | func = bs.check_column_existence 211 | 212 | # Perform the desired tests. 213 | for pos in xrange(len(good_cols)): 214 | col = good_cols[pos] 215 | current_good_kwargs = good_kwargs[pos] 216 | current_bad_kwargs =\ 217 | {"presence": bool(1 - current_good_kwargs["presence"])} 218 | pattern = ("Ensure that `{}` is ".format(col) + 219 | "not " * (1 - current_bad_kwargs["presence"]) + 220 | "in `df.columns`.") 221 | 222 | self.assertIsNone(func(col, fake_df, **current_good_kwargs)) 223 | self.assertRaisesRegexp(ValueError, 224 | pattern, 225 | func, 226 | col, 227 | fake_df, 228 | **current_bad_kwargs) 229 | 230 | return None 231 | 232 | def test_ensure_resampled_obs_ids_in_df(self): 233 | # Create fake data for the test. 234 | good_resampled_obs_ids = np.array([1, 1, 4, 3, 4]) 235 | bad_resampled_obs_ids = np.array([1, 1, 4, 3, 8]) 236 | fake_orig_obs_ids = np.arange(1, 6) 237 | 238 | # Expected error msg pattern 239 | expected_err_msg =\ 240 | "All values in `resampled_obs_ids` MUST be in `orig_obs_id_array`." 241 | 242 | # Alias the function being tested. 243 | func = bs.ensure_resampled_obs_ids_in_df 244 | 245 | # Perform the desired tests 246 | self.assertIsNone(func(good_resampled_obs_ids, fake_orig_obs_ids)) 247 | self.assertRaisesRegexp(ValueError, 248 | expected_err_msg, 249 | func, 250 | bad_resampled_obs_ids, 251 | fake_orig_obs_ids) 252 | return None 253 | 254 | def test_create_bootstrap_dataframe(self): 255 | # Create the dataframe of fake data 256 | fake_df = pd.DataFrame({"obs_id": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6], 257 | "alt_id": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2], 258 | "choice": [1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1], 259 | "x": [1, 1.2, 1.4, 0.3, 0.9, 1.11, 0.53, 0.82, 260 | 1.31, 1.24, 0.98, 0.76]}) 261 | # Note the observation id column 262 | obs_id_col = "obs_id" 263 | 264 | # Get the bootstrapped samples of the observation ids 265 | sampling_args = [fake_df["obs_id"].values, 266 | fake_df["alt_id"].values, 267 | fake_df["choice"].values, 268 | 5] 269 | sampled_obs_ids =\ 270 | bs.create_cross_sectional_bootstrap_samples(*sampling_args) 271 | rel_sampled_ids = sampled_obs_ids[0, :] 272 | 273 | # Get the groupby dictionary for this dataframe. 274 | groupby_dictionary =\ 275 | bs.create_deepcopied_groupby_dict(fake_df, obs_id_col) 276 | 277 | # Alias the function necessary to create the bootstrap dataframe 278 | func = bs.create_bootstrap_dataframe 279 | # Create the bootstrap id column name 280 | boot_id_col = "new_id" 281 | 282 | # Create the expected result. 283 | expected_result =\ 284 | [groupby_dictionary[obs_id].copy() for obs_id in rel_sampled_ids] 285 | for pos in xrange(len(expected_result)): 286 | expected_result[pos][boot_id_col] = pos + 1 287 | expected_result = pd.concat(expected_result, axis=0, ignore_index=True) 288 | 289 | # Get the function result 290 | func_result = func(fake_df, 291 | obs_id_col, 292 | rel_sampled_ids, 293 | groupby_dictionary, 294 | boot_id_col=boot_id_col) 295 | 296 | # Perform the desired tests. 297 | self.assertIsInstance(func_result, pd.DataFrame) 298 | self.assertIn(boot_id_col, func_result.columns.values) 299 | self.assertEqual(expected_result.shape, func_result.shape) 300 | npt.assert_allclose(expected_result.values, func_result.values) 301 | return None 302 | -------------------------------------------------------------------------------- /tests/test_bootstrap_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the bootstrap_utils.py file. 3 | """ 4 | import unittest 5 | 6 | import numpy as np 7 | import numpy.testing as npt 8 | 9 | import pylogit.bootstrap_utils as bu 10 | 11 | 12 | class UtilityTester(unittest.TestCase): 13 | def test_check_conf_percentage_validity(self): 14 | # Create a list of valid and invalid arguments 15 | good_args = [80, 95.0, 30] 16 | bad_args = [-2, '95', None, (90,)] 17 | # Note the message that should be displayed in case of errors. 18 | expected_err_msg =\ 19 | "conf_percentage MUST be a number between 0.0 and 100." 20 | # Alias the function being tested 21 | func = bu.check_conf_percentage_validity 22 | # Perform the desired tests 23 | for arg in good_args: 24 | self.assertIsNone(func(arg)) 25 | for arg in bad_args: 26 | self.assertRaisesRegexp(ValueError, 27 | expected_err_msg, 28 | func, 29 | arg) 30 | return None 31 | 32 | def test_ensure_samples_is_ndim_ndarray(self): 33 | # Create a list of valid and invalid arguments 34 | base_array = np.arange(10) 35 | good_args = [base_array.copy().reshape((2, 5)), 36 | base_array.copy().reshape((5, 2))] 37 | bad_args = [base_array, base_array[None, None, :], 30] 38 | # Create a 'name' argument 39 | fake_name = 'test' 40 | # Note the message that should be displayed in case of errors. 41 | expected_err_msg =\ 42 | "`{}` MUST be a 2D ndarray.".format(fake_name + '_samples') 43 | # Alias the function being tested 44 | func = bu.ensure_samples_is_ndim_ndarray 45 | # Perform the desired tests 46 | for arg in good_args: 47 | self.assertIsNone(func(arg, name=fake_name)) 48 | for arg in bad_args: 49 | self.assertRaisesRegexp(ValueError, 50 | expected_err_msg, 51 | func, 52 | arg, 53 | name=fake_name) 54 | self.assertIsNone(func(base_array, ndim=1)) 55 | return None 56 | 57 | def test_get_alpha_from_conf_percentage(self): 58 | # Create a list of valid confidence percentages 59 | good_args = [80, 95.0, 30] 60 | # Create a list of expected results 61 | expected_results = [20, 5, 70] 62 | # Alias the function being tested 63 | func = bu.get_alpha_from_conf_percentage 64 | # Perform the desired tests 65 | for pos, arg in enumerate(good_args): 66 | self.assertEqual(func(arg), expected_results[pos]) 67 | return None 68 | 69 | def test_combine_conf_endpoints(self): 70 | # Create fake arguments 71 | lower_array = np.arange(5) 72 | upper_array = np.arange(2, 7) 73 | # Create the expected result 74 | expected_result =\ 75 | np.array([lower_array.tolist(), upper_array.tolist()]) 76 | # Alias the function being tested 77 | func = bu.combine_conf_endpoints 78 | # Perform the desired test 79 | npt.assert_allclose(expected_result, func(lower_array, upper_array)) 80 | return None 81 | -------------------------------------------------------------------------------- /tests/test_conditional_logit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the conditional_logit.py file. These tests do not include tests of 3 | the functions that perform the mathematical calculations necessary to estimate 4 | the MNL model. 5 | """ 6 | import warnings 7 | import unittest 8 | from collections import OrderedDict 9 | 10 | import numpy as np 11 | import numpy.testing as npt 12 | import pandas as pd 13 | 14 | import pylogit.conditional_logit as mnl 15 | 16 | 17 | class HelperFuncTests(unittest.TestCase): 18 | """ 19 | Defines the tests for the 'helper' functions for estimating the MNL model. 20 | """ 21 | 22 | def setUp(self): 23 | # Set up the fake arguments 24 | self.fake_beta = np.arange(3) 25 | self.fake_args = ["foo", 1] 26 | self.fake_kwargs = {"fake_arg_1": "bar", 27 | "fake_arg_2": 2, 28 | "fake_arg_3": True} 29 | self.fake_design = np.arange(6).reshape((2, 3)) 30 | self.fake_index = self.fake_design.dot(self.fake_beta) 31 | 32 | def test_split_param_vec(self): 33 | """ 34 | Ensures that split_param_vec returns (None, None, index_coefs) 35 | when called from within conditional_logit.py. 36 | """ 37 | # Store the results of split_param_vec() 38 | split_results = mnl.split_param_vec(self.fake_beta, 39 | return_all_types=False, 40 | *self.fake_args, 41 | **self.fake_kwargs) 42 | # Check for expected results. 43 | self.assertIsNone(split_results[0]) 44 | self.assertIsNone(split_results[1]) 45 | npt.assert_allclose(split_results[2], self.fake_beta) 46 | 47 | # Store the results of split_param_vec() 48 | split_results = mnl.split_param_vec(self.fake_beta, 49 | return_all_types=True, 50 | *self.fake_args, 51 | **self.fake_kwargs) 52 | # Check for expected results. 53 | self.assertIsNone(split_results[0]) 54 | self.assertIsNone(split_results[1]) 55 | self.assertIsNone(split_results[2]) 56 | npt.assert_allclose(split_results[3], self.fake_beta) 57 | 58 | return None 59 | 60 | def test_mnl_utility_transform(self): 61 | """ 62 | Ensures that mnl_utility_transform returns a 2D version of the 1D 63 | 1D index array that is passed to it. 64 | """ 65 | # Get the results of _mnl_utiilty_transform() 66 | transform_results = mnl._mnl_utility_transform(self.fake_index, 67 | *self.fake_args, 68 | **self.fake_kwargs) 69 | 70 | # Check to make sure the results are as expected 71 | self.assertIsInstance(transform_results, np.ndarray) 72 | self.assertEqual(transform_results.shape, (2, 1)) 73 | npt.assert_allclose(transform_results, self.fake_index[:, None]) 74 | 75 | return None 76 | 77 | def test_mnl_transform_deriv_c(self): 78 | """ 79 | Ensures that mnl_transform_deriv_c returns None. 80 | """ 81 | derivative_results = mnl._mnl_transform_deriv_c(self.fake_index, 82 | *self.fake_args, 83 | **self.fake_kwargs) 84 | self.assertIsNone(derivative_results) 85 | 86 | return None 87 | 88 | def test_mnl_transform_deriv_alpha(self): 89 | """ 90 | Ensures that mnl_transform_deriv_alpha returns None. 91 | """ 92 | derivative_results = mnl._mnl_transform_deriv_alpha(self.fake_index, 93 | *self.fake_args, 94 | **self.fake_kwargs) 95 | self.assertIsNone(derivative_results) 96 | 97 | return None 98 | 99 | 100 | class ChoiceObjectTests(unittest.TestCase): 101 | """ 102 | Defines the tests for the MNL model object's `__init__` function and its 103 | other methods. 104 | """ 105 | 106 | def setUp(self): 107 | # Create fake versions of the needed arguments for the MNL constructor 108 | self.fake_df = pd.DataFrame({"obs_id": [1, 1, 2, 2, 3, 3], 109 | "alt_id": [1, 2, 1, 2, 1, 2], 110 | "choice": [0, 1, 0, 1, 1, 0], 111 | "x": range(6)}) 112 | self.fake_specification = OrderedDict() 113 | self.fake_specification["x"] = [[1, 2]] 114 | self.fake_names = OrderedDict() 115 | self.fake_names["x"] = ["x (generic coefficient)"] 116 | self.alt_id_col = "alt_id" 117 | self.obs_id_col = "obs_id" 118 | self.choice_col = "choice" 119 | self.fake_beta = np.array([1]) 120 | 121 | return None 122 | 123 | def test_outside_intercept_error_in_constructor(self): 124 | """ 125 | Ensures that a ValueError is raised when the 'intercept_ref_pos' kwarg 126 | is passed to the MNL model constructor. This prevents people from 127 | expecting the use of outside intercept parameters to work with the MNL 128 | model. 129 | """ 130 | # Create a variable for the standard arguments to this function. 131 | standard_args = [self.fake_df, 132 | self.alt_id_col, 133 | self.obs_id_col, 134 | self.choice_col, 135 | self.fake_specification] 136 | # Create a variable for the kwargs being passed to the constructor 137 | kwarg_map = {"intercept_ref_pos": 2} 138 | 139 | self.assertRaises(ValueError, 140 | mnl.MNL, 141 | *standard_args, 142 | **kwarg_map) 143 | return None 144 | 145 | def test_shape_ignore_msg_in_constructor(self): 146 | """ 147 | Ensures that a UserWarning is raised when the 'shape_ref_pos' or 148 | 'shape_names' keyword arguments are passed to the MNL model 149 | constructor. This warns people against expecting the MNL to work with 150 | shape parameters, and alerts them to the fact they are using an MNL 151 | model when they might have been expecting to instantiate a different 152 | choice model. 153 | """ 154 | # Create a variable for the standard arguments to this function. 155 | standard_args = [self.fake_df, 156 | self.alt_id_col, 157 | self.obs_id_col, 158 | self.choice_col, 159 | self.fake_specification] 160 | 161 | # Create a variable for the kwargs being passed to the constructor 162 | kwarg_map_1 = {"shape_ref_pos": 2} 163 | kwarg_map_2 = {"shape_names": OrderedDict([("x", ["foo"])])} 164 | 165 | # Test to ensure that the shape ignore message is printed when using 166 | # either of these two kwargs 167 | with warnings.catch_warnings(record=True) as context: 168 | # Use this filter to always trigger the UserWarnings 169 | warnings.simplefilter('always', UserWarning) 170 | 171 | for pos, bad_kwargs in enumerate([kwarg_map_1, kwarg_map_2]): 172 | # Create an MNL model object with the irrelevant kwargs. 173 | # This should trigger a UserWarning 174 | mnl_obj = mnl.MNL(*standard_args, **bad_kwargs) 175 | # Check that the warning has been created. 176 | self.assertEqual(len(context), pos + 1) 177 | self.assertIsInstance(context[-1].category, type(UserWarning)) 178 | self.assertIn(mnl._shape_ignore_msg, str(context[-1].message)) 179 | 180 | return None 181 | 182 | def test_outside_intercept_error_in_fit_mle(self): 183 | """ 184 | Ensures that a ValueError is raised when users try to use any other 185 | type of initial value input methods other than the `init_vals` 186 | argument of `fit_mle()`. This prevents people from expecting the use 187 | of outside intercept or shape parameters to work with the MNL model. 188 | """ 189 | # Create a variable for the standard arguments to the MNL constructor. 190 | standard_args = [self.fake_df, 191 | self.alt_id_col, 192 | self.obs_id_col, 193 | self.choice_col, 194 | self.fake_specification] 195 | 196 | # Create the mnl model object whose coefficients will be estimated. 197 | base_mnl = mnl.MNL(*standard_args) 198 | 199 | # Create a variable for the arguments to the fit_mle function. 200 | fit_args = [self.fake_beta] 201 | 202 | # Create variables for the incorrect kwargs. 203 | # The print_res = False arguments are to make sure strings aren't 204 | # printed to the console unnecessarily. 205 | kwarg_map_1 = {"init_shapes": np.array([1, 2]), 206 | "print_res": False} 207 | kwarg_map_2 = {"init_intercepts": np.array([1]), 208 | "print_res": False} 209 | kwarg_map_3 = {"init_coefs": np.array([1]), 210 | "print_res": False} 211 | 212 | # Test to ensure that the kwarg ignore message is printed when using 213 | # any of these three incorrect kwargs 214 | for kwargs in [kwarg_map_1, kwarg_map_2, kwarg_map_3]: 215 | self.assertRaises(ValueError, base_mnl.fit_mle, 216 | *fit_args, **kwargs) 217 | 218 | return None 219 | 220 | def test_ridge_warning_in_fit_mle(self): 221 | """ 222 | Ensure that a UserWarning is raised when one passes the ridge keyword 223 | argument to the `fit_mle` method of an MNL model object. 224 | """ 225 | # Create a variable for the standard arguments to the MNL constructor. 226 | standard_args = [self.fake_df, 227 | self.alt_id_col, 228 | self.obs_id_col, 229 | self.choice_col, 230 | self.fake_specification] 231 | 232 | # Create the mnl model object whose coefficients will be estimated. 233 | base_mnl = mnl.MNL(*standard_args) 234 | 235 | # Create a variable for the fit_mle function's kwargs. 236 | # The print_res = False arguments are to make sure strings aren't 237 | # printed to the console unnecessarily. 238 | kwargs = {"ridge": 0.5, 239 | "print_res": False} 240 | 241 | # Test to make sure that the ridge warning message is printed when 242 | # using the ridge keyword argument 243 | with warnings.catch_warnings(record=True) as w: 244 | # Use this filter to always trigger the UserWarnings 245 | warnings.simplefilter('always', UserWarning) 246 | 247 | base_mnl.fit_mle(self.fake_beta, **kwargs) 248 | self.assertGreaterEqual(len(w), 1) 249 | self.assertIsInstance(w[0].category, type(UserWarning)) 250 | self.assertIn(mnl._ridge_warning_msg, str(w[0].message)) 251 | 252 | return None 253 | 254 | def test_check_length_of_initial_values(self): 255 | """ 256 | Ensure that a ValueError is raised when one passes an init_vals 257 | argument of the wrong length. 258 | """ 259 | # Create a variable for the standard arguments to the MNL constructor. 260 | standard_args = [self.fake_df, 261 | self.alt_id_col, 262 | self.obs_id_col, 263 | self.choice_col, 264 | self.fake_specification] 265 | 266 | # Create the mnl model object whose coefficients will be estimated. 267 | base_mnl = mnl.MNL(*standard_args) 268 | 269 | # Create the EstimationObj 270 | mapping_res = base_mnl.get_mappings_for_fit() 271 | ridge = None 272 | zero_vector = np.zeros(1) 273 | split_params = mnl.split_param_vec 274 | mnl_estimator = mnl.MNLEstimator(base_mnl, 275 | mapping_res, 276 | ridge, 277 | zero_vector, 278 | split_params) 279 | 280 | # Alias the function to be checked 281 | func = mnl_estimator.check_length_of_initial_values 282 | 283 | for i in [2, 3]: 284 | init_vals = np.ones(i) 285 | self.assertRaises(ValueError, func, init_vals) 286 | 287 | self.assertIsNone(func(np.ones(1))) 288 | 289 | return None 290 | 291 | def test_just_point_kwarg(self): 292 | # Create a variable for the standard arguments to the MNL constructor. 293 | standard_args = [self.fake_df, 294 | self.alt_id_col, 295 | self.obs_id_col, 296 | self.choice_col, 297 | self.fake_specification] 298 | 299 | # Create the mnl model object whose coefficients will be estimated. 300 | base_mnl = mnl.MNL(*standard_args) 301 | # Alias the function being tested 302 | func = base_mnl.fit_mle 303 | # Get the necessary kwargs 304 | kwargs = {"just_point": True} 305 | # Get the function results 306 | func_result = func(self.fake_beta, **kwargs) 307 | # Perform the desired tests to make sure we get back a dictionary with 308 | # an "x" key in it and a value that is a ndarray. 309 | self.assertIsInstance(func_result, dict) 310 | self.assertIn("x", func_result) 311 | self.assertIsInstance(func_result["x"], np.ndarray) 312 | return None 313 | -------------------------------------------------------------------------------- /tests/test_estimation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Use this file to test methods and classes in test_estimation.py 3 | """ 4 | import warnings 5 | import unittest 6 | from collections import OrderedDict 7 | from numbers import Number 8 | 9 | import numpy as np 10 | import numpy.testing as npt 11 | import pandas as pd 12 | from scipy.sparse import csr_matrix 13 | 14 | import pylogit.asym_logit as asym 15 | import pylogit.estimation as estimation 16 | 17 | # Use the following to always show the warnings 18 | np.seterr(all='warn') 19 | warnings.simplefilter("always") 20 | 21 | 22 | class GenericTestCase(unittest.TestCase): 23 | """ 24 | Defines the common setUp method used for the different type of tests. 25 | """ 26 | 27 | def setUp(self): 28 | # The set up being used is one where there are two choice situations, 29 | # The first having three alternatives, and the second having only two 30 | # alternatives. There is one generic variable. Two alternative 31 | # specific constants and all three shape parameters are used. 32 | 33 | # Create the betas to be used during the tests 34 | self.fake_betas = np.array([-0.6]) 35 | 36 | # Create the fake outside intercepts to be used during the tests 37 | self.fake_intercepts = np.array([1, 0.5]) 38 | 39 | # Create names for the intercept parameters 40 | self.fake_intercept_names = ["ASC 1", "ASC 2"] 41 | 42 | # Record the position of the intercept that is not being estimated 43 | self.fake_intercept_ref_pos = 2 44 | 45 | # Create the shape parameters to be used during the tests. Note that 46 | # these are the reparameterized shape parameters, thus they will be 47 | # exponentiated in the fit_mle process and various calculations. 48 | self.fake_shapes = np.array([-1, 1]) 49 | 50 | # Create names for the intercept parameters 51 | self.fake_shape_names = ["Shape 1", "Shape 2"] 52 | 53 | # Record the position of the shape parameter that is being constrained 54 | self.fake_shape_ref_pos = 2 55 | 56 | # Calculate the 'natural' shape parameters 57 | self.natural_shapes = asym._convert_eta_to_c(self.fake_shapes, 58 | self.fake_shape_ref_pos) 59 | 60 | # Create an array of all model parameters 61 | self.fake_all_params = np.concatenate((self.fake_shapes, 62 | self.fake_intercepts, 63 | self.fake_betas)) 64 | 65 | # The mapping between rows and alternatives is given below. 66 | self.fake_rows_to_alts = csr_matrix(np.array([[1, 0, 0], 67 | [0, 1, 0], 68 | [0, 0, 1], 69 | [1, 0, 0], 70 | [0, 0, 1]])) 71 | 72 | # Create the fake design matrix with columns denoting X 73 | # The intercepts are not included because they are kept outside the 74 | # index in the scobit model. 75 | self.fake_design = np.array([[1], 76 | [2], 77 | [3], 78 | [1.5], 79 | [3.5]]) 80 | 81 | # Create the index array for this set of choice situations 82 | self.fake_index = self.fake_design.dot(self.fake_betas) 83 | 84 | # Create the needed dataframe for the Asymmetric Logit constructor 85 | self.fake_df = pd.DataFrame({"obs_id": [1, 1, 1, 2, 2], 86 | "alt_id": [1, 2, 3, 1, 3], 87 | "choice": [0, 1, 0, 0, 1], 88 | "x": self.fake_design[:, 0], 89 | "intercept": [1 for i in range(5)]}) 90 | 91 | # Record the various column names 92 | self.alt_id_col = "alt_id" 93 | self.obs_id_col = "obs_id" 94 | self.choice_col = "choice" 95 | 96 | # Create the index specification and name dictionaryfor the model 97 | self.fake_specification = OrderedDict() 98 | self.fake_names = OrderedDict() 99 | self.fake_specification["x"] = [[1, 2, 3]] 100 | self.fake_names["x"] = ["x (generic coefficient)"] 101 | 102 | # Bundle args and kwargs used to construct the Asymmetric Logit model. 103 | self.constructor_args = [self.fake_df, 104 | self.alt_id_col, 105 | self.obs_id_col, 106 | self.choice_col, 107 | self.fake_specification] 108 | 109 | # Create a variable for the kwargs being passed to the constructor 110 | self.constructor_kwargs = {"intercept_ref_pos": 111 | self.fake_intercept_ref_pos, 112 | "shape_ref_pos": self.fake_shape_ref_pos, 113 | "names": self.fake_names, 114 | "intercept_names": 115 | self.fake_intercept_names, 116 | "shape_names": self.fake_shape_names} 117 | 118 | # Initialize a basic Asymmetric Logit model whose coefficients will be 119 | # estimated. 120 | self.model_obj = asym.MNAL(*self.constructor_args, 121 | **self.constructor_kwargs) 122 | 123 | return None 124 | 125 | 126 | class EstimationObjTests(GenericTestCase): 127 | """ 128 | Store the tests for the basic methods in the EstimationObj class. 129 | """ 130 | 131 | def test_constructor(self): 132 | # Create a zero vector 133 | zero_vector = np.zeros(self.fake_all_params.shape[0]) 134 | # Create a ridge parameter 135 | ridge_param = 0.5 136 | # Split parameter function 137 | split_param_func = asym.split_param_vec 138 | # Store the mapping dictionaries 139 | mapping_dict = self.model_obj.get_mappings_for_fit() 140 | # Store the positions of the parameters to be constrained 141 | constrained_pos = [0] 142 | # Create the kewargs for the estimation object 143 | kwargs = {"constrained_pos": constrained_pos} 144 | 145 | # Create the estimation object 146 | estimation_object = estimation.EstimationObj(self.model_obj, 147 | mapping_dict, 148 | ridge_param, 149 | zero_vector, 150 | split_param_func, 151 | **kwargs) 152 | 153 | # Perform the tests to ensure that the desired attributes were 154 | # correctly created 155 | attr_names = ["alt_id_vector", 156 | "choice_vector", 157 | "design", 158 | "intercept_ref_pos", 159 | "shape_ref_pos", 160 | "rows_to_obs", 161 | "rows_to_alts", 162 | "chosen_row_to_obs", 163 | "rows_to_nests", 164 | "rows_to_mixers", 165 | "ridge", 166 | "constrained_pos", 167 | "zero_vector", 168 | "split_params", 169 | "utility_transform", 170 | "calc_dh_dv", 171 | "calc_dh_d_alpha", 172 | "calc_dh_d_shape"] 173 | for attr in attr_names: 174 | self.assertTrue(hasattr(estimation_object, attr)) 175 | 176 | # Make sure that the objects that should be arrays, are arrays 177 | for attr in ["alt_id_vector", 178 | "choice_vector", 179 | "design", 180 | "zero_vector"]: 181 | self.assertIsInstance(getattr(estimation_object, attr), np.ndarray) 182 | # Ensure that the arrays have the correct values 183 | npt.assert_allclose(estimation_object.alt_id_vector, 184 | self.model_obj.alt_IDs) 185 | npt.assert_allclose(estimation_object.choice_vector, 186 | self.model_obj.choices) 187 | npt.assert_allclose(estimation_object.design, self.model_obj.design) 188 | npt.assert_allclose(estimation_object.zero_vector, zero_vector) 189 | 190 | # Ensure that the scalars are scalars with the correct values 191 | for attr in ["intercept_ref_pos", "shape_ref_pos", "ridge"]: 192 | self.assertIsInstance(getattr(estimation_object, attr), Number) 193 | self.assertEqual(estimation_object.intercept_ref_pos, 194 | self.model_obj.intercept_ref_position) 195 | self.assertEqual(estimation_object.shape_ref_pos, 196 | self.model_obj.shape_ref_position) 197 | self.assertEqual(estimation_object.ridge, ridge_param) 198 | 199 | # Ensure that the mapping matrices are correct 200 | for attr in ["rows_to_obs", "rows_to_alts", "chosen_row_to_obs", 201 | "rows_to_nests", "rows_to_mixers"]: 202 | # Get the mapping matrix as stored on the model object. 203 | matrix_on_object = getattr(estimation_object, attr) 204 | if matrix_on_object is not None: 205 | npt.assert_allclose(matrix_on_object.A, mapping_dict[attr].A) 206 | else: 207 | self.assertIsNone(mapping_dict[attr]) 208 | 209 | # Ensure that the function definitions point to the correct locations 210 | self.assertEqual(id(estimation_object.split_params), 211 | id(split_param_func)) 212 | self.assertEqual(id(estimation_object.utility_transform), 213 | id(self.model_obj.utility_transform)) 214 | 215 | # Make sure that the derivative functions return None, for now. 216 | for attr in ["calc_dh_dv", 217 | "calc_dh_d_alpha", 218 | "calc_dh_d_shape"]: 219 | func = getattr(estimation_object, attr) 220 | self.assertIsNone(func("foo")) 221 | 222 | return None 223 | 224 | def test_not_implemented_error_in_example_functions(self): 225 | # Create a zero vector 226 | zero_vector = np.zeros(self.fake_all_params.shape[0]) 227 | # Create a ridge parameter 228 | ridge_param = 0.5 229 | # Split parameter function 230 | split_param_func = asym.split_param_vec 231 | # Store the mapping dictionaries 232 | mapping_dict = self.model_obj.get_mappings_for_fit() 233 | # Store the positions of the parameters to be constrained 234 | constrained_pos = [0] 235 | # Create the kwargs for the estimation object 236 | kwargs = {"constrained_pos": constrained_pos} 237 | 238 | # Create the estimation object 239 | estimation_object = estimation.EstimationObj(self.model_obj, 240 | mapping_dict, 241 | ridge_param, 242 | zero_vector, 243 | split_param_func, 244 | **kwargs) 245 | 246 | # Record the names of the methods that are created as examples 247 | example_methods = ["convenience_calc_probs", 248 | "convenience_calc_log_likelihood", 249 | "convenience_calc_gradient", 250 | "convenience_calc_hessian", 251 | "convenience_calc_fisher_approx"] 252 | for method_name in example_methods: 253 | func = getattr(estimation_object, method_name) 254 | error_msg = "Method should be defined by descendant classes" 255 | self.assertRaisesRegexp(NotImplementedError, 256 | error_msg, 257 | func, 258 | None) 259 | 260 | return None 261 | 262 | def test_ensure_positivity_and_length_of_weights(self): 263 | # Create a set of good and bad arguments 264 | num_rows = self.fake_design.shape[0] 265 | fake_data = pd.DataFrame(self.fake_design, columns=['x']) 266 | good_weights = [None, np.ones(num_rows)] 267 | bad_weights =\ 268 | [1, np.ones((3, 3)), np.ones(num_rows + 1), -1 * np.ones(num_rows)] 269 | # Alias the function being tested 270 | func = estimation.ensure_positivity_and_length_of_weights 271 | # Note the error messages that should be raised. 272 | msg_1 = '`weights` MUST be a 1D ndarray.' 273 | msg_2 = '`weights` must have the same number of rows as `data`.' 274 | msg_3 = '`weights` MUST be >= 0.' 275 | expected_error_msgs = [msg_1, msg_1, msg_2, msg_3] 276 | # Perform the desired tests 277 | for weights in good_weights: 278 | self.assertIsNone(func(weights, fake_data)) 279 | for pos, weights in enumerate(bad_weights): 280 | self.assertRaisesRegexp(ValueError, 281 | expected_error_msgs[pos], 282 | func, 283 | weights, 284 | fake_data) 285 | return None 286 | -------------------------------------------------------------------------------- /tests/test_nested_logit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the nested_logit.py file. These tests do not include tests of 3 | the functions that perform the mathematical calculations necessary to estimate 4 | the Nested Logit model. 5 | """ 6 | import warnings 7 | import unittest 8 | from collections import OrderedDict 9 | 10 | import numpy as np 11 | import numpy.testing as npt 12 | import pandas as pd 13 | from scipy.sparse import csr_matrix 14 | 15 | import pylogit.nested_logit as nl 16 | 17 | 18 | class NestedLogitTests(unittest.TestCase): 19 | """ 20 | Tests of the `split_param_vec` function, the `NestedLogit` model 21 | constructor, and the `fit_mle()` method. 22 | """ 23 | 24 | def setUp(self): 25 | # Create the betas to be used during the tests 26 | self.fake_betas = np.array([0.3, -0.6, 0.2]) 27 | # Create the fake nest coefficients to be used during the tests 28 | self.fake_nest_coefs = np.array([1, 0.5]) 29 | # Create an array of all model parameters 30 | self.fake_all_params = np.concatenate((self.fake_nest_coefs, 31 | self.fake_betas)) 32 | # The set up being used is one where there are two choice situations, 33 | # The first having three alternatives, and the second having only two. 34 | # The nest memberships of these alternatives are given below. 35 | self.fake_rows_to_nests = csr_matrix(np.array([[0, 1], 36 | [0, 1], 37 | [1, 0], 38 | [0, 1], 39 | [1, 0]])) 40 | 41 | # Create a sparse matrix that maps the rows of the design matrix to the 42 | # observatins 43 | self.fake_rows_to_obs = csr_matrix(np.array([[1, 0], 44 | [1, 0], 45 | [1, 0], 46 | [0, 1], 47 | [0, 1]])) 48 | 49 | # Create the fake design matrix with columns denoting ASC_1, ASC_2, X 50 | self.fake_design = np.array([[1, 0, 1], 51 | [0, 1, 2], 52 | [0, 0, 3], 53 | [1, 0, 1.5], 54 | [0, 0, 3.5]]) 55 | 56 | # Create fake versions of the needed arguments for the MNL constructor 57 | self.fake_df = pd.DataFrame({"obs_id": [1, 1, 1, 2, 2], 58 | "alt_id": [1, 2, 3, 1, 3], 59 | "choice": [0, 1, 0, 0, 1], 60 | "x": range(5), 61 | "intercept": [1 for i in range(5)]}) 62 | 63 | # Record the various column names 64 | self.alt_id_col = "alt_id" 65 | self.obs_id_col = "obs_id" 66 | self.choice_col = "choice" 67 | 68 | # Create a sparse matrix that maps the chosen rows of the design 69 | # matrix to the observatins 70 | self.fake_chosen_rows_to_obs = csr_matrix(np.array([[0, 0], 71 | [1, 0], 72 | [0, 0], 73 | [0, 0], 74 | [0, 1]])) 75 | 76 | # Create the index specification and name dictionaryfor the model 77 | self.fake_specification = OrderedDict() 78 | self.fake_specification["intercept"] = [1, 2] 79 | self.fake_specification["x"] = [[1, 2, 3]] 80 | self.fake_names = OrderedDict() 81 | self.fake_names["intercept"] = ["ASC 1", "ASC 2"] 82 | self.fake_names["x"] = ["x (generic coefficient)"] 83 | 84 | # Create the nesting specification 85 | self.fake_nest_spec = OrderedDict() 86 | self.fake_nest_spec["Nest 1"] = [1, 2] 87 | self.fake_nest_spec["Nest 2"] = [3] 88 | 89 | return None 90 | 91 | def test_split_param_vec(self): 92 | """ 93 | Ensures that split_param_vec returns a tuple of nest coefficients and 94 | index coefficients. 95 | """ 96 | split_results = nl.split_param_vec(self.fake_all_params, 97 | self.fake_rows_to_nests) 98 | 99 | # Check that the results of split_param_vec are as expected 100 | self.assertIsInstance(split_results, tuple) 101 | self.assertEqual(len(split_results), 2) 102 | for item in split_results: 103 | self.assertIsInstance(item, np.ndarray) 104 | self.assertEqual(len(item.shape), 1) 105 | npt.assert_allclose(self.fake_nest_coefs, split_results[0]) 106 | npt.assert_allclose(self.fake_betas, split_results[1]) 107 | 108 | return None 109 | 110 | def test_missing_nest_spec_error_in_constructor(self): 111 | """ 112 | Ensure that the Nested Logit model cannot be constructed without the 113 | `nest_spec` keyword argument being passed a value other than `None`. 114 | """ 115 | # Bundle the arguments used to construct the nested logit model 116 | constructor_args = [self.fake_df, 117 | self.alt_id_col, 118 | self.obs_id_col, 119 | self.choice_col, 120 | self.fake_specification, 121 | self.fake_names] 122 | 123 | self.assertRaises(ValueError, nl.NestedLogit, *constructor_args) 124 | 125 | return None 126 | 127 | def test_ridge_warning_in_fit_mle(self): 128 | """ 129 | Ensure that a UserWarning is raised when one passes the ridge keyword 130 | argument to the `fit_mle` method of a Nested Logit model object. 131 | """ 132 | # Bundle the arguments used to construct the nested logit model 133 | constructor_args = [self.fake_df, 134 | self.alt_id_col, 135 | self.obs_id_col, 136 | self.choice_col, 137 | self.fake_specification, 138 | self.fake_names] 139 | # Bundle the kwargs for constructing the nested_logit_model 140 | constructor_kwargs = {"nest_spec": self.fake_nest_spec} 141 | 142 | # Create the mnl model object whose coefficients will be estimated. 143 | base_nl = nl.NestedLogit(*constructor_args, **constructor_kwargs) 144 | 145 | # Create a variable for the fit_mle function's kwargs. 146 | # The print_res = False arguments are to make sure strings aren't 147 | # printed to the console unnecessarily. 148 | fit_kwargs = {"constrained_pos": [1], 149 | "ridge": 0.5, 150 | "print_res": False} 151 | 152 | # Test to make sure that the ridge warning message is printed when 153 | # using the ridge keyword argument 154 | with warnings.catch_warnings(record=True) as w: 155 | # Use this filter to always trigger the UserWarnings 156 | warnings.simplefilter('always', UserWarning) 157 | 158 | base_nl.fit_mle(self.fake_all_params, **fit_kwargs) 159 | self.assertGreaterEqual(len(w), 1) 160 | self.assertIsInstance(w[0].category, type(UserWarning)) 161 | self.assertIn(nl._ridge_warning_msg, str(w[0].message)) 162 | 163 | return None 164 | 165 | def test_invalid_init_kwargs_error_in_fit_mle(self): 166 | """ 167 | Ensures that a ValueError is raised when users try to use any other 168 | type of initial value input methods other than the `init_vals` 169 | argument of `fit_mle()`. This prevents people from expecting the use 170 | of outside intercept or shape parameters to work with the Nested Logit 171 | model. 172 | """ 173 | # Bundle the arguments used to construct the nested logit model 174 | constructor_args = [self.fake_df, 175 | self.alt_id_col, 176 | self.obs_id_col, 177 | self.choice_col, 178 | self.fake_specification] 179 | 180 | # Bundle the kwargs for constructing the nested_logit_model 181 | constructor_kwargs = {"names": self.fake_names, 182 | "nest_spec": self.fake_nest_spec} 183 | 184 | # Create the mnl model object whose coefficients will be estimated. 185 | base_nl = nl.NestedLogit(*constructor_args, **constructor_kwargs) 186 | 187 | # Create a variable for the arguments to the fit_mle function. 188 | # this mimics the arguments passed when trying to use the shape_param 189 | # or outside intercepts kwargs with fit_mle. 190 | fit_args = [None] 191 | 192 | # Create variables for the incorrect kwargs. 193 | # The print_res = False arguments are to make sure strings aren't 194 | # printed to the console unnecessarily. 195 | kwarg_map_1 = {"init_shapes": np.array([1, 2]), 196 | "print_res": False} 197 | kwarg_map_2 = {"init_intercepts": np.array([1]), 198 | "print_res": False} 199 | kwarg_map_3 = {"init_coefs": np.array([1]), 200 | "print_res": False} 201 | 202 | # Test to ensure that the kwarg ignore message is printed when using 203 | # any of these three incorrect kwargs 204 | for kwargs in [kwarg_map_1, kwarg_map_2, kwarg_map_3]: 205 | self.assertRaises(ValueError, base_nl.fit_mle, 206 | *fit_args, **kwargs) 207 | 208 | return None 209 | 210 | def test_just_point_kwarg(self): 211 | """ 212 | Ensure that calling `fit_mle` with `just_point = True` returns a 213 | dictionary with a 'x' key and a corresponding value that is an ndarray. 214 | """ 215 | # Bundle the arguments used to construct the nested logit model 216 | constructor_args = [self.fake_df, 217 | self.alt_id_col, 218 | self.obs_id_col, 219 | self.choice_col, 220 | self.fake_specification] 221 | 222 | # Bundle the kwargs for constructing the nested_logit_model 223 | constructor_kwargs = {"names": self.fake_names, 224 | "nest_spec": self.fake_nest_spec} 225 | 226 | # Create the mnl model object whose coefficients will be estimated. 227 | base_nl = nl.NestedLogit(*constructor_args, **constructor_kwargs) 228 | # Create a variable for the arguments to the fit_mle function. 229 | fit_args = [self.fake_all_params] 230 | # Alias the function being tested 231 | func = base_nl.fit_mle 232 | # Get the necessary kwargs 233 | kwargs = {"just_point": True} 234 | # Get the function results 235 | func_result = func(*fit_args, **kwargs) 236 | # Perform the desired tests to make sure we get back a dictionary with 237 | # an "x" key in it and a value that is a ndarray. 238 | self.assertIsInstance(func_result, dict) 239 | self.assertIn("x", func_result) 240 | self.assertIsInstance(func_result["x"], np.ndarray) 241 | return None 242 | 243 | def test_invalid_init_vals_length_in_estimate(self): 244 | """ 245 | Ensure that when _estimate() is called, with an init_values argument 246 | that is of an incorrect length, a ValueError is raised. 247 | """ 248 | # Bundle the arguments used to construct the nested logit model 249 | constructor_args = [self.fake_df, 250 | self.alt_id_col, 251 | self.obs_id_col, 252 | self.choice_col, 253 | self.fake_specification, 254 | self.fake_names] 255 | # Bundle the kwargs for constructing the nested_logit_model 256 | constructor_kwargs = {"nest_spec": self.fake_nest_spec} 257 | 258 | # Create the mnl model object whose coefficients will be estimated. 259 | base_nl = nl.NestedLogit(*constructor_args, **constructor_kwargs) 260 | 261 | # Create an estimator object. 262 | zero_vector = np.zeros(self.fake_all_params.shape[0]) 263 | estimator_args = [base_nl, 264 | base_nl.get_mappings_for_fit(), 265 | None, 266 | zero_vector, 267 | nl.split_param_vec] 268 | estimator_kwargs = {"constrained_pos": [1]} 269 | nested_estimator = nl.NestedEstimator(*estimator_args, 270 | **estimator_kwargs) 271 | 272 | # Alias the function being tested 273 | func = nested_estimator.check_length_of_initial_values 274 | 275 | # Test that the desired error is raised 276 | for i in [-1, 1]: 277 | init_values = np.arange(self.fake_all_params.shape[0] + i) 278 | 279 | self.assertRaisesRegexp(ValueError, 280 | "values are of the wrong dimension", 281 | func, 282 | init_values) 283 | 284 | return None 285 | 286 | def test_identify_degenerate_nests(self): 287 | """ 288 | Ensure that `identify_degenerate_nests` returns the correct list when 289 | using nest specifications that do and do not contain degenerate nests. 290 | """ 291 | good_spec = OrderedDict() 292 | good_spec["Nest 1"] = [1, 2] 293 | good_spec["Nest 2"] = [3, 4] 294 | 295 | bad_spec = OrderedDict() 296 | bad_spec["Nest 1"] = [1] 297 | bad_spec["Nest 2"] = [2, 3] 298 | bad_spec["Nest 3"] = [4] 299 | 300 | # Alias the function being tested 301 | func = nl.identify_degenerate_nests 302 | 303 | # Test the function 304 | self.assertEqual([], func(good_spec)) 305 | self.assertEqual([0, 2], func(bad_spec)) 306 | 307 | return None 308 | -------------------------------------------------------------------------------- /tests/test_pylogit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for the user-facing choice model constructor. 3 | """ 4 | import unittest 5 | from collections import OrderedDict 6 | 7 | import numpy as np 8 | import numpy.testing as npt 9 | import pandas as pd 10 | 11 | import pylogit 12 | import pylogit.display_names as display_names 13 | 14 | 15 | # Get the dictionary that maps the model type to the names of the model that 16 | # are stored on the model object itself. 17 | model_type_to_display_name = display_names.model_type_to_display_name 18 | 19 | 20 | class ConstructorTests(unittest.TestCase): 21 | """ 22 | Contains the tests of the choice model construction function. 23 | """ 24 | 25 | def setUp(self): 26 | """ 27 | Create the input data needed to test the choice model constructor. 28 | """ 29 | # The set up being used is one where there are two choice situations, 30 | # The first having three alternatives, and the second having only two 31 | # alternatives. There is one generic variable. Two alternative 32 | # specific constants and all three shape parameters are used. 33 | 34 | # Create the betas to be used during the tests 35 | self.fake_betas = np.array([-0.6]) 36 | 37 | # Create the fake outside intercepts to be used during the tests 38 | self.fake_intercepts = np.array([1, 0.5]) 39 | 40 | # Create names for the intercept parameters 41 | self.fake_intercept_names = ["ASC 1", "ASC 2"] 42 | 43 | # Record the position of the intercept that is not being estimated 44 | self.fake_intercept_ref_pos = 2 45 | 46 | # Create the shape parameters to be used during the tests. Note that 47 | # these are the reparameterized shape parameters, thus they will be 48 | # exponentiated in the fit_mle process and various calculations. 49 | self.fake_shapes = np.array([-1, 0, 1]) 50 | 51 | # Create names for the intercept parameters 52 | self.fake_shape_names = ["Shape 1", "Shape 2", "Shape 3"] 53 | 54 | # Create a shape ref position (used in the Asymmetric Logit Model) 55 | self.fake_shape_ref_pos = 2 56 | 57 | # # Create an array of all model parameters 58 | # self.fake_all_params = np.concatenate((self.fake_shapes, 59 | # self.fake_intercepts, 60 | # self.fake_betas)) 61 | 62 | # # The mapping between rows and alternatives is given below. 63 | # self.fake_rows_to_alts = csr_matrix(np.array([[1, 0, 0], 64 | # [0, 1, 0], 65 | # [0, 0, 1], 66 | # [1, 0, 0], 67 | # [0, 0, 1]])) 68 | 69 | # Create the fake design matrix with columns denoting X 70 | # The intercepts are not included because they are kept outside the 71 | # index in the uneven model. 72 | self.fake_design = np.array([[1], 73 | [2], 74 | [3], 75 | [1.5], 76 | [3.5]]) 77 | 78 | # Create the index array for this set of choice situations 79 | self.fake_index = self.fake_design.dot(self.fake_betas) 80 | 81 | # Create the needed dataframe for the choice model constructor 82 | self.fake_df = pd.DataFrame({"obs_id": [1, 1, 1, 2, 2], 83 | "alt_id": [1, 2, 3, 1, 3], 84 | "choice": [0, 1, 0, 0, 1], 85 | "x": self.fake_design[:, 0], 86 | "intercept": [1 for i in range(5)]}) 87 | 88 | # Record the various column names 89 | self.alt_id_col = "alt_id" 90 | self.obs_id_col = "obs_id" 91 | self.choice_col = "choice" 92 | 93 | # Create the index specification and name dictionary for the model 94 | self.fake_specification = OrderedDict() 95 | self.fake_names = OrderedDict() 96 | self.fake_specification["x"] = [[1, 2, 3]] 97 | self.fake_names["x"] = ["x (generic coefficient)"] 98 | 99 | # Create the nesting specification 100 | self.fake_nest_spec = OrderedDict() 101 | self.fake_nest_spec["Nest 1"] = [1, 2] 102 | self.fake_nest_spec["Nest 2"] = [3] 103 | 104 | # Bundle the args and kwargs used to construct the models. 105 | # Note that "MNL" is used as a model_type placeholder, and it will be 106 | # replaced as needed by each model 107 | self.constructor_args = [self.fake_df, 108 | self.alt_id_col, 109 | self.obs_id_col, 110 | self.choice_col, 111 | self.fake_specification, 112 | "MNL"] 113 | 114 | # Create a variable for the kwargs being passed to the constructor 115 | self.constructor_kwargs = {"intercept_ref_pos": 116 | self.fake_intercept_ref_pos, 117 | "names": self.fake_names, 118 | "intercept_names": 119 | self.fake_intercept_names, 120 | "shape_names": self.fake_shape_names} 121 | 122 | def test_constructor(self): 123 | """ 124 | Construct the various choice models and make sure the constructed 125 | object has the necessary attributes. 126 | """ 127 | # Record the model types of all the models to be created 128 | all_model_types = model_type_to_display_name.keys() 129 | 130 | # Record the attribute / value pairs that are common to all models. 131 | common_attr_value_dict = {"data": self.fake_df, 132 | "name_spec": self.fake_names, 133 | "design": self.fake_design, 134 | "ind_var_names": self.fake_names["x"], 135 | "alt_id_col": self.alt_id_col, 136 | "obs_id_col": self.obs_id_col, 137 | "choice_col": self.choice_col, 138 | "specification": self.fake_specification, 139 | "alt_IDs": self.fake_df["alt_id"].values, 140 | "choices": self.fake_df["choice"].values} 141 | 142 | # Create a shape name dictionary to relate the various models to the 143 | # names of their shape parameters. 144 | shape_name_dict = {"MNL": None, 145 | "Asym": self.fake_shape_names[:2], 146 | "Cloglog": None, 147 | "Scobit": self.fake_shape_names, 148 | "Uneven": self.fake_shape_names, 149 | "Nested Logit": None, 150 | "Mixed Logit": None} 151 | 152 | # Create a shape reference position dictionary to relate the various 153 | # models to their shape reference positions. 154 | shape_ref_dict = {} 155 | for key in shape_name_dict: 156 | shape_ref_dict[key] = (None if key != "Asym" else 157 | self.fake_shape_ref_pos) 158 | 159 | # Create an intercept_names and intercept_ref_position dictionary to 160 | # relate the various models to their respective kwargs. 161 | intercept_names_dict = {} 162 | intercept_ref_dict = {} 163 | for key in shape_name_dict: 164 | if key in ["MNL", "Nested Logit", "Mixed Logit"]: 165 | intercept_names_dict[key] = None 166 | intercept_ref_dict[key] = None 167 | else: 168 | intercept_names_dict[key] = self.fake_intercept_names 169 | intercept_ref_dict[key] = self.fake_intercept_ref_pos 170 | 171 | # Create a nest_names dictionary to relate the various models to their 172 | # nest_name attributes 173 | nest_name_dict = {} 174 | nest_spec_dict = {} 175 | for key in shape_name_dict: 176 | if key != "Nested Logit": 177 | nest_name_dict[key] = None 178 | nest_spec_dict[key] = None 179 | else: 180 | nest_name_dict[key] = list(self.fake_nest_spec.keys()) 181 | nest_spec_dict[key] = self.fake_nest_spec 182 | 183 | # Create dictionaries for the mixing_id_col, mixing_vars, and 184 | # mixing_pos attributes 185 | mixing_id_col_dict = {} 186 | mixing_vars_dict = {} 187 | mixing_pos_dict = {} 188 | 189 | for key in shape_name_dict: 190 | if key != "Mixed Logit": 191 | mixing_id_col_dict[key] = None 192 | mixing_vars_dict[key] = None 193 | mixing_pos_dict[key] = None 194 | else: 195 | mixing_id_col_dict[key] = self.obs_id_col 196 | mixing_vars_dict[key] = self.fake_names["x"] 197 | mixing_pos_dict[key] = [0] 198 | 199 | # Record the attribute / value pairs that vary across models 200 | varying_attr_value_dict = {"model_type": model_type_to_display_name, 201 | "intercept_names": intercept_names_dict, 202 | "intercept_ref_position": 203 | intercept_ref_dict, 204 | "shape_names": shape_name_dict, 205 | "shape_ref_position": shape_ref_dict, 206 | "nest_names": nest_name_dict, 207 | "nest_spec": nest_spec_dict, 208 | "mixing_id_col": mixing_id_col_dict, 209 | "mixing_vars": mixing_vars_dict, 210 | "mixing_pos": mixing_pos_dict} 211 | 212 | # Set up the keyword arguments that are needed for each of the model 213 | # types 214 | variable_kwargs = {} 215 | for model_name in all_model_types: 216 | variable_kwargs[model_name] = {} 217 | variable_kwargs[model_name]["intercept_names"] =\ 218 | intercept_names_dict[model_name] 219 | variable_kwargs[model_name]["intercept_ref_pos"] =\ 220 | intercept_ref_dict[model_name] 221 | variable_kwargs[model_name]["shape_ref_pos"] =\ 222 | shape_ref_dict[model_name] 223 | variable_kwargs[model_name]["shape_names"] =\ 224 | shape_name_dict[model_name] 225 | variable_kwargs[model_name]["nest_spec"] =\ 226 | nest_spec_dict[model_name] 227 | variable_kwargs[model_name]["mixing_id_col"] =\ 228 | mixing_id_col_dict[model_name] 229 | variable_kwargs[model_name]["mixing_vars"] =\ 230 | mixing_vars_dict[model_name] 231 | 232 | # Execute the test for each model type 233 | for model_name in all_model_types: 234 | # Update the model type in the list of constructor args 235 | self.constructor_args[-1] = model_name 236 | 237 | # Use this specific model's keyword arguments 238 | self.constructor_kwargs.update(variable_kwargs[model_name]) 239 | 240 | # Construct the model object 241 | model_obj = pylogit.create_choice_model(*self.constructor_args, 242 | **self.constructor_kwargs) 243 | 244 | # Make sure that the constructor has all of the required attributes 245 | for attr in common_attr_value_dict: 246 | value = common_attr_value_dict[attr] 247 | if isinstance(value, pd.DataFrame): 248 | self.assertTrue(value.equals(model_obj.data)) 249 | elif isinstance(value, np.ndarray): 250 | npt.assert_allclose(value, 251 | model_obj.__getattribute__(attr)) 252 | else: 253 | self.assertEqual(value, 254 | model_obj.__getattribute__(attr)) 255 | 256 | for attr in varying_attr_value_dict: 257 | value = varying_attr_value_dict[attr][model_name] 258 | 259 | self.assertEqual(value, 260 | model_obj.__getattribute__(attr)) 261 | 262 | return None 263 | 264 | def test_ensure_valid_model_type(self): 265 | """ 266 | Ensure that the desired message is raised when an invalid type is 267 | passed, and that None is returned otherwise 268 | """ 269 | # Note the "valid" type strings for our test 270 | test_types = ["bar", "foo", "Sreeta", "Feras"] 271 | # Note a set of invalid type strings for the test 272 | bad_types = ["Tim", "Sam"] 273 | 274 | # Alias the function to be tested 275 | func = pylogit.pylogit.ensure_valid_model_type 276 | 277 | # Make note of part of the error message that should be raised 278 | partial_error_msg = "The specified model_type was not valid." 279 | 280 | # Perform the requisite tests 281 | for good_example in test_types: 282 | self.assertIsNone(func(good_example, test_types)) 283 | for bad_example in bad_types: 284 | self.assertRaisesRegexp(ValueError, 285 | partial_error_msg, 286 | func, 287 | bad_example, 288 | test_types) 289 | 290 | return None 291 | --------------------------------------------------------------------------------