├── .github
│   └── workflows
│       └── test_package.yml
├── .gitignore
├── .travis.yml
├── CHANGELOG.rst
├── CONTRIBUTORS.txt
├── LICENSE.txt
├── MANIFEST.in
├── Makefile
├── README.md
├── documents
│   ├── pylogit_computation.pdf
│   └── pylogit_computation.tex
├── examples
│   ├── .ipynb_checkpoints
│   │   ├── Main PyLogit Example-checkpoint.ipynb
│   │   ├── Python Biogeme Benchmark--01Logit-checkpoint.ipynb
│   │   ├── mlogit Benchmark--Train and Fishing-checkpoint.ipynb
│   │   └── mlogit_Benchmark--Heating-checkpoint.ipynb
│   ├── data
│   │   ├── electricity_r_data_long.csv
│   │   ├── fishing_data_r.csv
│   │   ├── heating_data_r.csv
│   │   ├── long_swiss_metro_data.csv
│   │   ├── swissmetro.dat
│   │   └── train_data_r.csv
│   └── notebooks
│       ├── .ipynb_checkpoints
│       │   ├── Asymmetric Choice Models Example-checkpoint.ipynb
│       │   ├── Converting Long-Format to Wide-Format-checkpoint.ipynb
│       │   ├── Main PyLogit Example-checkpoint.ipynb
│       │   ├── Mixed Logit Example--mlogit Benchmark--Electricity-checkpoint.ipynb
│       │   ├── More Mixed Logit--Heteroskedasticity and Nesting-checkpoint.ipynb
│       │   ├── Nested Logit Example--Python Biogeme benchmark--09NestedLogit-checkpoint.ipynb
│       │   ├── Prediction with PyLogit-checkpoint.ipynb
│       │   ├── Python Biogeme Benchmark--01Logit-checkpoint.ipynb
│       │   ├── mlogit Benchmark--Train and Fishing-checkpoint.ipynb
│       │   └── mlogit_Benchmark--Heating-checkpoint.ipynb
│       ├── Asymmetric Choice Models Example.ipynb
│       ├── Converting Long-Format to Wide-Format.ipynb
│       ├── Main PyLogit Example.ipynb
│       ├── Mixed Logit Example--mlogit Benchmark--Electricity.ipynb
│       ├── More Mixed Logit--Heteroskedasticity and Nesting.ipynb
│       ├── Nested Logit Example--Python Biogeme benchmark--09NestedLogit.ipynb
│       ├── Prediction with PyLogit.ipynb
│       ├── Python Biogeme Benchmark--01Logit.ipynb
│       ├── mlogit Benchmark--Train and Fishing.ipynb
│       └── mlogit_Benchmark--Heating.ipynb
├── images
│   └── PyLogit_Final-small-04.png
├── pyproject.toml
├── requirements.in
├── requirements.txt
├── src
│   └── pylogit
│       ├── __init__.py
│       ├── asym_logit.py
│       ├── base_multinomial_cm_v2.py
│       ├── bootstrap.py
│       ├── bootstrap_abc.py
│       ├── bootstrap_calcs.py
│       ├── bootstrap_mle.py
│       ├── bootstrap_sampler.py
│       ├── bootstrap_utils.py
│       ├── choice_calcs.py
│       ├── choice_tools.py
│       ├── clog_log.py
│       ├── conditional_logit.py
│       ├── construct_estimator.py
│       ├── display_names.py
│       ├── estimation.py
│       ├── mixed_logit.py
│       ├── mixed_logit_calcs.py
│       ├── nested_choice_calcs.py
│       ├── nested_logit.py
│       ├── newsfragments
│       │   └── .gitignore
│       ├── pylogit.py
│       ├── scobit.py
│       └── uneven_logit.py
└── tests
    ├── __init__.py
    ├── test_asym_logit.py
    ├── test_base_cm_predict.py
    ├── test_base_multinomial_cm.py
    ├── test_bootstrap_abc.py
    ├── test_bootstrap_calcs.py
    ├── test_bootstrap_controller.py
    ├── test_bootstrap_mle.py
    ├── test_bootstrap_sampler.py
    ├── test_bootstrap_utils.py
    ├── test_choice_calcs.py
    ├── test_choice_tools.py
    ├── test_clog_log.py
    ├── test_conditional_logit.py
    ├── test_construct_estimator.py
    ├── test_estimation.py
    ├── test_mixed_logit.py
    ├── test_nested_choice_calcs.py
    ├── test_nested_logit.py
    ├── test_pylogit.py
    ├── test_scobit.py
    └── test_uneven_logit.py
/.github/workflows/test_package.yml:
--------------------------------------------------------------------------------
1 | # Build and test the package
2 | name: Testing
3 |
4 | # Run this workflow every time a commit is pushed to, or a pull request is
5 | # opened against, the repository's master or develop branches
6 | on:
7 | push:
8 | branches:
9 | - master
10 | - develop
11 | pull_request:
12 | branches:
13 | - master
14 | - develop
15 |
16 | jobs:
17 | # Set the job key. The key is displayed as the job name
18 | # when a job name is not provided
19 | project-workflow:
20 | # Name the Job
21 | name: Build and test the package
22 | # Set the type of machine to run on
23 | runs-on: ubuntu-latest
24 | # Set the python versions to use
25 | strategy:
26 | matrix:
27 | python: [3.6]
28 |
29 | steps:
30 | # Step 1: Check out a copy of your repository on the ubuntu-latest machine
31 | - name: Checkout repository
32 | uses: actions/checkout@v2
33 |
34 | # Step 2: Make sure conda is installed, with mamba for speed
35 | - name: Setup Python with Conda
36 | uses: conda-incubator/setup-miniconda@v2
37 | with:
38 | python-version: ${{ matrix.python }}
39 | mamba-version: "*"
40 | channels: conda-forge,anaconda,defaults
41 | channel-priority: true
42 | activate-environment: pylogit
43 | auto-activate-base: false
44 |
45 | # Step 3: Install the project's development dependencies locally
46 | - name: Install package locally
47 | shell: bash -l {0}
48 | run: |
49 | make install
50 |
51 | # Step 4: Run the project's tests
52 | - name: Run project tests
53 | shell: bash -l {0}
54 | run: |
55 | tox
56 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled python modules.
2 | *.pyc
3 |
4 | # Setuptools distribution folder.
5 | /dist/
6 | ./dist
7 | dist
8 |
9 | # Inscrutable cache folder
10 | .cache
11 |
12 | # Python egg metadata, regenerated from source files by setuptools.
13 | /*.egg-info
14 |
15 | # setup.py working directory
16 | build
17 |
18 | # Mac OS binary file
19 | .DS_Store
20 |
21 | # Files produced when tracking statement coverage of code.
22 | .coverage
23 |
24 | # Files produced when testing code
25 | .tox
26 |
27 | # Folder that stores deprecated code.
28 | deprecated_code
29 | deprecated_code/
30 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "2.7"
4 | - "3.4"
5 | - "3.5"
6 | - "3.6"
7 |
8 | # command to install dependencies
9 | before_install:
10 | - "pip install --upgrade pip setuptools wheel"
11 | - "pip install --only-binary=numpy,scipy numpy scipy"
12 | - "pip install pandas"
13 | - "pip install coveralls"
14 | - "pip install pytest-cov"
15 | - "pip install future"
16 | - "pip install tqdm"
17 | - "pip install mock"
18 |
19 | # command to execute test suite
20 | script: py.test --cov-report= --cov=pylogit/ tests/
21 | branches:
22 | only:
23 | - master
24 | - develop
25 |
26 | # Send results of tests to coveralls
27 | after_success:
28 | - coveralls
29 |
--------------------------------------------------------------------------------
/CHANGELOG.rst:
--------------------------------------------------------------------------------
1 | =========
2 | Changelog
3 | =========
4 |
5 | The format is based on `Keep a Changelog <https://keepachangelog.com/en/1.0.0/>`_,
6 | and this project adheres to `Semantic Versioning <https://semver.org/spec/v2.0.0.html>`_.
7 |
8 | .. towncrier release notes start
9 |
10 | Pylogit 1.0.1 (2020-12-27)
11 | ==========================
12 |
13 | Trivial/Internal Changes
14 | ------------------------
15 |
16 | - Removed setup.py from repository in favor of pyproject.toml. (#68)
17 |
18 |
19 | Pylogit 1.0.0 (2020-12-27)
20 | ==========================
21 |
22 | Removed from package
23 | --------------------
24 |
25 | - Support for python2.7 or any python 3 version below 3.6. (#67)
26 |
27 |
28 | Bug fixes
29 | ---------
30 |
31 | - Resolving import issues with the pylogit.bootstrap submodule. (#27)
32 | - Fixed flaky tests causing continuous integration build errors. (#29)
33 | - Fixed Hessian calculation so only the diagonal is penalized during ridge
34 | regression. (#33)
35 |
36 |
37 | Improved Documentation
38 | ----------------------
39 |
40 | - Made example notebooks py2 and py3 compatible. (#28)
41 |
42 |
43 | Trivial/Internal Changes
44 | ------------------------
45 |
46 | - Included license file in source distribution. (#18)
47 | - Refactored the Hessian calculation to use less memory-intensive operations
48 | based on linear-algebra decompositions. (#30)
49 | - Added journal reference for the accompanying paper in the project README.
50 | (#35)
51 | - Added project logo to the repository. (#46)
52 | - Switched to pip-tools for specifying development dependencies. (#58)
53 | - Added Makefile to standardize development installation. (#59)
54 | - Switched to flit for packaging. (#60)
55 | - Added towncrier to repository. (#61)
56 | - Added tox to the repository for cross-version testing of PyLogit. (#63)
57 | - Added GitHub Actions workflow for Continuous Integration. (#64)
58 | - Converted the README.rst file to README.md. (#65)
59 | - Adding bump2version to development requirements. (#66)
60 |
61 |
62 | Pylogit 0.2.2 (2017-12-11)
63 | ==========================
64 |
65 | Bug fixes
66 | ---------
67 |
68 | - Changed tqdm dependency to allow for anaconda compatibility.
69 |
70 |
71 | Pylogit 0.2.1 (2017-12-11)
72 | ==========================
73 |
74 | Bug fixes
75 | ---------
76 |
77 | - Added statsmodels and tqdm as package dependencies to fix errors with 0.2.0.
78 |
79 |
80 | Pylogit 0.2.0 (2017-12-10)
81 | ==========================
82 |
83 | Added new features
84 | ------------------
85 |
86 | - Added support for Python 3.4 - 3.6.
87 | - Added AIC and BIC to summary tables of all models.
88 | - Added support for bootstrapping and calculation of bootstrap confidence intervals:
89 |
90 | - percentile intervals,
91 | - bias-corrected and accelerated (BCa) bootstrap confidence intervals, and
92 | - approximate bootstrap confidence (ABC) intervals.
93 |
94 | - Changed sparse matrix creation to enable estimation of larger datasets.
95 |
96 |
97 | Trivial/Internal Changes
98 | ------------------------
99 |
100 | - Refactored internal code organization and classes for estimation.
101 |
102 |
103 | Pylogit 0.1.2 (2016-12-04)
104 | ==========================
105 |
106 | Added new features
107 | ------------------
108 |
109 | - Added support to all logit-type models for parameter constraints during model estimation.
110 | All models now support the use of the constrained_pos keyword argument.
111 | - Added new argument checks to provide user-friendly error messages.
112 | - Created more than 175 tests, bringing statement coverage to 99%.
113 | - Updated the underflow and overflow protections to make use of L’Hopital’s rule where appropriate.
114 |
115 |
116 | Bug fixes
117 | ---------
118 |
119 | - Fixed bugs with the nested logit model.
120 | In particular, the predict function, the BHHH approximation to the Fisher Information Matrix, and the ridge regression penalty in the log-likelihood, gradient, and hessian functions have been fixed.
121 |
122 |
123 | Improved Documentation
124 | ----------------------
125 |
126 | - Added new example notebooks demonstrating prediction, mixed logit, and converting long-format datasets to wide-format.
127 | - Edited docstrings for clarity throughout the library.
128 |
129 |
130 | Trivial/Internal Changes
131 | ------------------------
132 |
133 | - Extensively refactored codebase.
134 |
135 |
136 | Pylogit 0.1.1 (2016-08-30)
137 | ==========================
138 |
139 | Improved Documentation
140 | ----------------------
141 | - Added python notebook examples demonstrating how to estimate the asymmetric choice models and the nested logit model.
142 | - Corrected the docstrings in various places.
143 | - Added new datasets to the github repo.
144 |
145 |
146 | Pylogit 0.1.0 (2016-08-29)
147 | ==========================
148 |
149 | Added new features
150 | ------------------
151 |
152 | - Added asymmetric choice models.
153 | - Added nested logit and mixed logit models.
154 | - Added tests for mixed logit models.
155 | - Added an example notebook demonstrating how to estimate the mixed logit model.
156 |
157 |
158 | Improved Documentation
159 | ----------------------
160 |
161 | - Changed documentation to the numpy docstring standard.
162 |
163 |
164 | Trivial/Internal Changes
165 | ------------------------
166 |
167 | - Made print statements compatible with python3.
168 | - Fixed typos in library documentation.
169 | - Internal refactoring.
170 |
171 |
172 | Pylogit 0.0.0 (2016-03-15)
173 | ==========================
174 |
175 | Added new features
176 | ------------------
177 |
178 | - Initial package release with support for the conditional logit (MNL) model.
179 |
--------------------------------------------------------------------------------
/CONTRIBUTORS.txt:
--------------------------------------------------------------------------------
1 | This project has been assisted by numerous individuals and organizations.
2 |
3 | Dr. Akshay Vij contributed code that was the basis for the conditional logit
4 | code, and he provided the motivation and guidance behind the use of "mapping
5 | matrices" that is standard throughout PyLogit.
6 |
7 | Professor John Canny advised Vij on optimizing the code used for the
8 | conditional logit, and as such he also helped the development of this package.
9 |
10 | Professor Paul Waddell has graciously provided the financial support that
11 | helped ensure continued progress on this package.
12 |
13 | Oleksandr Lysenko ported PyLogit to Python >= 3.4.
14 |
15 | Eunice Poon designed the logo.
16 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016, Timothy A. Brathwaite
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 | this list of conditions and the following disclaimer in the documentation
12 | and/or other materials provided with the distribution.
13 |
14 | * Neither the name of PyLogit nor the names of its
15 | contributors may be used to endorse or promote products derived from
16 | this software without specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include MANIFEST.in
2 | include README.md
3 | include LICENSE.txt
4 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | ## install : Install the project's development dependencies locally.
2 | .PHONY : install
3 | install :
4 | pip install pip-tools
5 | pip-compile requirements.in
6 | pip install -r requirements.txt
7 |
8 | ## help : Documentation for make targets.
9 | .PHONY : help
10 | help : Makefile
11 | @sed -n 's/^##//p' $<
12 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | 
4 |
5 | # PyLogit
6 | PyLogit is a Python package for performing maximum likelihood estimation of conditional logit models and similar discrete choice models.
7 |
8 | ## Main Features
9 | - It supports
10 |   - Conditional Logit (Type) Models
11 |     - Multinomial Logit Models
12 |     - Multinomial Asymmetric Models
13 |       - Multinomial Clog-log Model
14 |       - Multinomial Scobit Model
15 |       - Multinomial Uneven Logit Model
16 |       - Multinomial Asymmetric Logit Model
17 |     - Nested Logit Models
18 |     - Mixed Logit Models (with Normal mixing distributions)
19 | - It supports datasets where the choice set differs across observations
20 | - It supports model specifications where the coefficient for a given variable may be
21 |   - completely alternative-specific
22 |     (i.e. one coefficient per alternative, subject to identification of the coefficients),
23 |   - subset-specific
24 |     (i.e. one coefficient per subset of alternatives, where each alternative belongs to only one subset, and there are more than one but fewer than J subsets, where J is the maximum number of available alternatives in the dataset),
25 |   - completely generic
26 |     (i.e. one coefficient across all alternatives).
27 |
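A rough sketch of what such a specification could look like, using `pylogit.create_choice_model` with hypothetical column names (`obs_id`, `alt_id`, `choice`, `travel_time`, `travel_cost`), hypothetical alternative ids 1-3, and an assumed long-format dataframe `long_df`:

```python
from collections import OrderedDict

import numpy as np
import pylogit as pl

# Keys are column names in the long-format dataframe; values describe how the
# corresponding coefficients vary across the (hypothetical) alternatives 1-3.
example_spec = OrderedDict()
example_spec["intercept"] = [1, 2]          # alternative-specific: one intercept each for
                                            # alternatives 1 and 2 (3 is the reference)
example_spec["travel_time"] = [[1, 2], 3]   # subset-specific: alternatives 1 and 2 share a
                                            # coefficient, alternative 3 gets its own
example_spec["travel_cost"] = "all_same"    # completely generic: one coefficient overall

example_model = pl.create_choice_model(data=long_df,  # a long-format pandas DataFrame
                                        alt_id_col="alt_id",
                                        obs_id_col="obs_id",
                                        choice_col="choice",
                                        specification=example_spec,
                                        model_type="MNL")
example_model.fit_mle(np.zeros(5))  # one starting value per estimated coefficient
```
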
28 | ## Installation
29 | Available from [PyPi](https://pypi.python.org/pypi/pylogit):
30 | ```
31 | pip install pylogit
32 | ```
33 |
34 | Available through [Anaconda](https://anaconda.org/conda-forge/pylogit):
35 | ```
36 | conda install -c conda-forge pylogit
37 | ```
38 |
39 | or
40 |
41 | ```
42 | conda install -c timothyb0912 pylogit
43 | ```
44 |
45 | ## Usage
46 | For Jupyter notebooks filled with examples, see [examples](./examples/).
47 |
48 |
49 | ## For More Information
50 | For more information about the asymmetric models that can be estimated with PyLogit, see the following paper:
51 |
52 | > Brathwaite, T., & Walker, J. L. (2018). Asymmetric, closed-form, finite-parameter models of multinomial choice. Journal of Choice Modelling, 29, 78–112. https://doi.org/10.1016/j.jocm.2018.01.002
53 |
54 | A free, better-formatted version is available on [arXiv](http://arxiv.org/abs/1606.05900).
55 |
56 | ## Attribution
57 | If PyLogit (or its constituent models) is useful in your research or work, please cite this package by citing the paper above.
58 |
59 | ## License
60 | Modified BSD (3-clause). See [here](./LICENSE.txt).
61 |
62 | ## Changelog
63 | See [here](./CHANGELOG.rst).
64 |
--------------------------------------------------------------------------------
/documents/pylogit_computation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/timothyb0912/pylogit/cffc9c523b5368966ef2481c7dc30f0a5d296de8/documents/pylogit_computation.pdf
--------------------------------------------------------------------------------
/examples/notebooks/Converting Long-Format to Wide-Format.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Converting long-format dataframes to wide-format\n",
8 | "The purpose of this notebook is to demonstrate the conversion of long-format data into wide-format. Long-format data contains one row per available alternative per choice situation. In contrast, wide-format data contains one row per choice situation. PyLogit and other software packages (e.g. mlogit in R) use data that is in long-format. However, other software packages, such as Statsmodels in Python or Python BIOGEME, use data that is in wide-format.\n",
9 | "\n",
10 | "Because different software packages have different data format requirements, it is useful to be able to convert one's data from one format to another. Other PyLogit example notebooks (such as the \"Main PyLogit Example\") demonstrate how to take data from wide-format and convert it into long-format. This notebook will demonstrate the reverse process: taking data from long-format and converting it into wide-format.\n",
11 | "\n",
12 | "The dataset being used in this example is the \"Travel Mode Choice\" dataset from Greene and Hensher. It is described on the statsmodels website, and their description is reproduced below in full.\n",
13 | "\n",
14 | "<blockquote>\n",
15 | " The data, collected as part of a 1987 intercity mode choice study, are a sub-sample of 210 non-business\n",
16 | " trips between Sydney, Canberra and Melbourne in which the traveler chooses a mode from four alternatives\n",
17 | " (plane, car, bus and train). The sample, 840 observations, is choice based with over-sampling of the\n",
18 | " less popular modes (plane, train and bus) and under-sampling of the more popular mode, car. The level of\n",
19 | " service data was derived from highway and transport networks in Sydney, Melbourne, non-metropolitan N.S.W.\n",
20 | " and Victoria, including the Australian Capital Territory.\n",
21 | " \n",
22 | " Number of observations: 840 Observations On 4 Modes for 210 Individuals.\n",
23 | " Number of variables: 8\n",
24 | " Variable name definitions::\n",
25 | "\n",
26 | " individual = 1 to 210\n",
27 | " mode =\n",
28 | " 1 - air\n",
29 | " 2 - train\n",
30 | " 3 - bus\n",
31 | " 4 - car\n",
32 | " choice =\n",
33 | " 0 - no\n",
34 | " 1 - yes\n",
35 | " ttme = terminal waiting time for plane, train and bus (minutes); 0\n",
36 | " for car.\n",
37 | " invc = in vehicle cost for all stages (dollars).\n",
38 | " invt = travel time (in-vehicle time) for all stages (minutes).\n",
39 | " gc = generalized cost measure:invc+(invt*value of travel time savings)\n",
40 | " (dollars).\n",
41 | " hinc = household income ($1000s).\n",
42 | " psize = traveling group size in mode chosen (number).\n",
43 | " \n",
44 | " \n",
45 | " Source\n",
46 | "\n",
47 | " Greene, W.H. and D. Hensher (1997) Multinomial logit and discrete choice models in Greene, W. H. (1997)\n",
48 | " LIMDEP version 7.0 user’s manual revised, Plainview, New York econometric software, Inc. Download from\n",
49 | " on-line complements to Greene, W.H. (2011) Econometric Analysis, Prentice Hall, 7th Edition (data table\n",
50 | " F18-2) http://people.stern.nyu.edu/wgreene/Text/Edition7/TableF18-2.csv\n",
51 | "\n",
52 | "</blockquote>"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 1,
58 | "metadata": {
59 | "collapsed": true
60 | },
61 | "outputs": [],
62 | "source": [
63 | "# To access the Travel Mode Choice data\n",
64 | "import statsmodels.datasets\n",
65 | "\n",
66 | "# To perform the dataset conversion\n",
67 | "import pylogit as pl"
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "### Load the needed dataset"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 3,
80 | "metadata": {
81 | "collapsed": false
82 | },
83 | "outputs": [
84 | {
85 | "data": {
168 | "text/plain": [
169 | " individual mode choice ttme invc invt gc hinc psize\n",
170 | "0 1 1 0 69 59 100 70 35 1\n",
171 | "1 1 2 0 34 31 372 71 35 1\n",
172 | "2 1 3 0 35 25 417 70 35 1\n",
173 | "3 1 4 1 0 10 180 30 35 1\n",
174 | "4 2 1 0 64 58 68 68 30 2"
175 | ]
176 | },
177 | "execution_count": 3,
178 | "metadata": {},
179 | "output_type": "execute_result"
180 | }
181 | ],
182 | "source": [
183 | "# Access the dataset\n",
184 | "mode_data = statsmodels.datasets.modechoice.load_pandas()\n",
185 | "# Get a pandas dataframe of the mode choice data\n",
186 | "long_df = mode_data[\"data\"]\n",
187 | "# Look at the dataframe to ensure that it loaded correctly\n",
188 | "long_df.head()"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "### Create the needed variables for the conversion function.\n",
196 | "The function in PyLogit that is used to convert long-format data to wide-format data is \"convert_long_to_wide,\" and it can be accessed through \"pl.convert_long_to_wide\". The docstring for the function contains all of the information necessary to perform the conversion, but we will leave it to readers to view the docstring at their own leisure. For now, we will simply create the needed objects/arguments for the function.\n",
197 | "\n",
198 | "In particular, we will need the following 7 objects:\n",
199 | "1. ind_vars\n",
200 | "2. alt_specific_vars\n",
201 | "3. subset_specific_vars\n",
202 | "4. obs_id_col\n",
203 | "5. alt_id_col\n",
204 | "6. choice_col\n",
205 | "7. alt_name_dict\n",
206 | "\n",
207 | "The cells below will show exactly what these objects are."
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": 10,
213 | "metadata": {
214 | "collapsed": true
215 | },
216 | "outputs": [],
217 | "source": [
218 | "# ind_vars is a list of strings denoting the column\n",
219 | "# headings of data that varies across choice situations,\n",
220 | "# but not across alternatives. In our data, this is\n",
221 | "# the household income and party size.\n",
222 | "individual_specific_variables = [\"hinc\", \"psize\"]\n",
223 | "\n",
224 | "# alt_specific_vars is a list of strings denoting the\n",
225 | "# column headings of data that vary not only across\n",
226 | "# choice situations but also across all alternatives.\n",
227 | "# These are columns such as the \"level of service\"\n",
228 | "# variables.\n",
229 | "alternative_specific_variables = [\"invc\", \"invt\", \"gc\"]\n",
230 | "\n",
231 | "# subset_specific_vars is a dictionary. Each key is a\n",
232 | "# string that denotes a variable that is subset specific.\n",
233 | "# Each value is a list of alternative ids, over which the\n",
234 | "# variable actually varies. Note that subset specific\n",
235 | "# variables vary across choice situations and across some\n",
236 | "# (but not all) alternatives. This is most common when\n",
237 | "# using variables that are not meaningfully defined for\n",
238 | "# all alternatives. An example of this in our dataset is\n",
239 | "# terminal time (\"ttme\"). This variable is not meaningfully\n",
240 | "# defined for the \"car\" alternative. Therefore, it is always\n",
241 | "# zero. Note \"4\" is the id for the \"car\" alternative\n",
242 | "subset_specific_variables = {\"ttme\": [1, 2, 3]}\n",
243 | "\n",
244 | "# obs_id_col is the column denoting the id of the choice\n",
245 | "# situation. If one was using a panel dataset, with multiple\n",
246 | "# choice situations per unit of observation, the column\n",
247 | "# denoting the unit of observation would be listed in\n",
248 | "# ind_vars (i.e. with the individual specific variables)\n",
249 | "observation_id_column = \"individual\"\n",
250 | "\n",
251 | "# alt_id_col is the column denoting the id of the alternative\n",
252 | "# corresponding to a given row.\n",
253 | "alternative_id_column = \"mode\"\n",
254 | "\n",
255 | "# choice_col is the column denoting whether the alternative\n",
256 | "# on a given row was chosen in the corresponding choice situation\n",
257 | "choice_column = \"choice\"\n",
258 | "\n",
259 | "# Lastly, alt_name_dict is not necessary. However, it is useful.\n",
260 | "# It records the names corresponding to each alternative, if there\n",
261 | "# are any, and allows for the creation of meaningful column names\n",
262 | "# in the wide-format data (such as when creating the columns\n",
263 | "# denoting the available alternatives in each choice situation).\n",
264 | "# The keys of alt_name_dict are the unique alternative ids, and\n",
265 | "# the values are the names of each alternative.\n",
266 | "alternative_name_dict = {1: \"air\",\n",
267 | " 2: \"train\",\n",
268 | " 3: \"bus\",\n",
269 | " 4: \"car\"}"
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "metadata": {},
275 | "source": [
276 | "### Create the wide-format dataframe"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": 12,
282 | "metadata": {
283 | "collapsed": false
284 | },
285 | "outputs": [
286 | {
287 | "data": {
490 | "text/plain": [
491 | " 0 1 2 3 4\n",
492 | "individual 1 2 3 4 5\n",
493 | "choice 4 4 4 4 4\n",
494 | "availability_air 1 1 1 1 1\n",
495 | "availability_train 1 1 1 1 1\n",
496 | "availability_bus 1 1 1 1 1\n",
497 | "availability_car 1 1 1 1 1\n",
498 | "hinc 35 30 40 70 45\n",
499 | "psize 1 2 1 3 2\n",
500 | "invc_air 59 58 115 49 60\n",
501 | "invc_train 31 31 98 26 32\n",
502 | "invc_bus 25 25 53 21 26\n",
503 | "invc_car 10 11 23 5 8\n",
504 | "invt_air 100 68 125 68 144\n",
505 | "invt_train 372 354 892 354 404\n",
506 | "invt_bus 417 399 882 399 449\n",
507 | "invt_car 180 255 720 180 600\n",
508 | "gc_air 70 68 129 59 82\n",
509 | "gc_train 71 84 195 79 93\n",
510 | "gc_bus 70 85 149 81 94\n",
511 | "gc_car 30 50 101 32 99\n",
512 | "ttme_air 69 64 69 64 64\n",
513 | "ttme_train 34 44 34 44 44\n",
514 | "ttme_bus 35 53 35 53 53"
515 | ]
516 | },
517 | "execution_count": 12,
518 | "metadata": {},
519 | "output_type": "execute_result"
520 | }
521 | ],
522 | "source": [
523 | "# Finally, we can create the wide format dataframe\n",
524 | "wide_df = pl.convert_long_to_wide(long_df,\n",
525 | " individual_specific_variables,\n",
526 | " alternative_specific_variables,\n",
527 | " subset_specific_variables,\n",
528 | " observation_id_column,\n",
529 | " alternative_id_column,\n",
530 | " choice_column,\n",
531 | " alternative_name_dict)\n",
532 | "\n",
533 | "# Let's look at the created dataframe, transposed for easy viewing\n",
534 | "wide_df.head().T"
535 | ]
536 | },
537 | {
538 | "cell_type": "markdown",
539 | "metadata": {},
540 | "source": [
541 | "As we can see above, PyLogit does a few things automatically. First, using the names provided in alt_name_dict, it will add suffixes to the alternative specific variables and the subset specific variables. These suffixes record which alternative the given column of data refers to. Secondly, when dealing with subset specific variables, PyLogit will only create columns of data for alternatives over which the variable actually varies. Lastly, PyLogit automatically creates columns that denote the availability of each alternative for each choice situation. These columns are suffixed to denote the alternatives that they correspond to, and they are inferred automatically from the rows present in the long-format data.\n",
542 | "\n",
543 | "Also, there is a \"null_value\" keyword that one can use in the conversion function. This is useful when one has alternative specific variables, and not all alternatives are available in all choice situations. In this setting, one may want to specify a value for the missing data, such as null, -999, etc. The \"null_value\" keyword argument allows one to do this."
544 | ]
545 | }
546 | ],
547 | "metadata": {
548 | "kernelspec": {
549 | "display_name": "Python 2",
550 | "language": "python",
551 | "name": "python2"
552 | },
553 | "language_info": {
554 | "codemirror_mode": {
555 | "name": "ipython",
556 | "version": 2
557 | },
558 | "file_extension": ".py",
559 | "mimetype": "text/x-python",
560 | "name": "python",
561 | "nbconvert_exporter": "python",
562 | "pygments_lexer": "ipython2",
563 | "version": "2.7.12"
564 | }
565 | },
566 | "nbformat": 4,
567 | "nbformat_minor": 1
568 | }
569 |
--------------------------------------------------------------------------------
/images/PyLogit_Final-small-04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/timothyb0912/pylogit/cffc9c523b5368966ef2481c7dc30f0a5d296de8/images/PyLogit_Final-small-04.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["flit_core >=2,<4"]
3 | build-backend = "flit_core.buildapi"
4 |
5 | [tool.flit.metadata]
6 | module = "pylogit"
7 | author = "Timothy Brathwaite"
8 | author-email = "timothyb0912@gmail.com"
9 | home-page = "https://github.com/timothyb0912/pylogit"
10 | description-file = "README.md"
11 | requires = [
12 | "pandas >= 0.16.2",
13 | "numpy >= 1.10.2",
14 | "scipy >= 0.16.1",
15 | "future >= 0.16",
16 | "statsmodels >= 0.6.1",
17 | "tqdm >= 4.15.0",
18 | ]
19 | requires-python = ">=3.6"
20 | keywords = "conditional logit,discrete choice,econometrics,choice models"
21 | license = "BSD-3-Clause"
22 | classifiers = [
23 | "Topic :: Software Development :: Libraries :: Python Modules",
24 | "Topic :: Scientific/Engineering",
25 | "Intended Audience :: Science/Research",
26 | "Intended Audience :: End Users/Desktop",
27 | "Intended Audience :: Developers",
28 | "Programming Language :: Python :: 3",
29 | "Programming Language :: Python :: 3.6",
30 | "Programming Language :: Python :: 3.7",
31 | "Programming Language :: Python :: 3.8",
32 | "Environment :: Console",
33 | "Development Status :: 2 - Pre-Alpha",
34 | "License :: OSI Approved :: BSD License",
35 | ]
36 |
37 | [tool.flit.sdist]
38 | exclude = ["src/pylogit/newsfragments/"]
39 |
40 | [tool.towncrier]
41 | package = "pylogit"
42 | package_dir = "src/"
43 | filename = "CHANGELOG.rst"
44 | title_format = "{name} {version} ({project_date})"
45 | wrap = true # Wrap text to 79 characters
46 | all_bullets = true
47 |
48 | [[tool.towncrier.type]]
49 | directory = "added"
50 | name = "Added new features"
51 | showcontent = true
52 |
53 | [[tool.towncrier.type]]
54 | directory = "changed"
55 | name = "Changed existing functionality"
56 | showcontent = true
57 |
58 | [[tool.towncrier.type]]
59 | directory = "deprecated"
60 | name = "Marked for removal"
61 | showcontent = true
62 |
63 | [[tool.towncrier.type]]
64 | directory = "removed"
65 | name = "Removed from package"
66 | showcontent = true
67 |
68 | [[tool.towncrier.type]]
69 | directory = "fixed"
70 | name = "Bug fixes"
71 | showcontent = true
72 |
73 | [[tool.towncrier.type]]
74 | directory = "security"
75 | name = "Patched vulnerabilities"
76 | showcontent = true
77 |
78 | [[tool.towncrier.type]]
79 | directory = "doc"
80 | name = "Improved Documentation"
81 | showcontent = true
82 |
83 | [[tool.towncrier.type]]
84 | directory = "trivial"
85 | name = "Trivial/Internal Changes"
86 | showcontent = true
87 |
88 | [tool.tox]
89 | legacy_tox_ini = """
90 | [tox]
91 | isolated_build = True
92 | envlist = py36, py37, py38
93 | requires = tox-conda
94 |
95 | [testenv]
96 | deps = pytest >= 3.3.0
97 | commands =
98 | make install
99 | pytest
100 | """
101 |
--------------------------------------------------------------------------------
/requirements.in:
--------------------------------------------------------------------------------
1 | bump2version
2 | flit
3 | future
4 | mock
5 | numpy
6 | pandas
7 | pipreqs
8 | pip-tools
9 | pytest
10 | pytest-cov
11 | scipy
12 | statsmodels
13 | towncrier
14 | tox
15 | tox-conda
16 | tqdm
17 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile
3 | # To update, run:
4 | #
5 | # pip-compile requirements.in
6 | #
7 | appdirs==1.4.4 # via virtualenv
8 | attrs==20.3.0 # via pytest
9 | bump2version==1.0.1 # via -r requirements.in
10 | certifi==2020.12.5 # via requests
11 | chardet==4.0.0 # via requests
12 | click==7.1.2 # via pip-tools, towncrier
13 | coverage==5.3.1 # via pytest-cov
14 | distlib==0.3.1 # via virtualenv
15 | docopt==0.6.2 # via pipreqs
16 | docutils==0.16 # via flit
17 | filelock==3.0.12 # via tox, virtualenv
18 | flit-core==3.0.0 # via flit
19 | flit==3.0.0 # via -r requirements.in
20 | future==0.18.2 # via -r requirements.in
21 | idna==2.10 # via requests
22 | importlib-metadata==2.1.1 # via pluggy, pytest, tox, virtualenv
23 | importlib-resources==4.1.0 # via virtualenv
24 | incremental==17.5.0 # via towncrier
25 | iniconfig==1.1.1 # via pytest
26 | jinja2==2.11.2 # via towncrier
27 | markupsafe==1.1.1 # via jinja2
28 | mock==4.0.3 # via -r requirements.in
29 | numpy==1.19.4 # via -r requirements.in, pandas, patsy, scipy, statsmodels
30 | packaging==20.8 # via pytest, tox
31 | pandas==1.1.5 # via -r requirements.in, statsmodels
32 | patsy==0.5.1 # via statsmodels
33 | pip-tools==5.4.0 # via -r requirements.in
34 | pipreqs==0.4.10 # via -r requirements.in
35 | pluggy==0.13.1 # via pytest, tox
36 | py==1.10.0 # via pytest, tox
37 | pyparsing==2.4.7 # via packaging
38 | pytest-cov==2.10.1 # via -r requirements.in
39 | pytest==6.2.1 # via -r requirements.in, pytest-cov
40 | python-dateutil==2.8.1 # via pandas
41 | pytoml==0.1.21 # via flit, flit-core
42 | pytz==2020.5 # via pandas
43 | requests==2.25.1 # via flit, yarg
44 | scipy==1.5.4 # via -r requirements.in, statsmodels
45 | six==1.15.0 # via patsy, pip-tools, python-dateutil, tox, virtualenv
46 | statsmodels==0.12.1 # via -r requirements.in
47 | toml==0.10.2 # via pytest, towncrier, tox
48 | towncrier==19.2.0 # via -r requirements.in
49 | tox-conda==0.4.1 # via -r requirements.in
50 | tox==3.20.1 # via -r requirements.in, tox-conda
51 | tqdm==4.55.0 # via -r requirements.in
52 | urllib3==1.26.2 # via requests
53 | virtualenv==20.2.2 # via tox
54 | yarg==0.1.9 # via pipreqs
55 | zipp==3.4.0 # via importlib-metadata, importlib-resources
56 |
57 | # The following packages are considered to be unsafe in a requirements file:
58 | # pip
59 |
--------------------------------------------------------------------------------
/src/pylogit/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Mar 14 15:33:07 2016
4 |
5 | @author: timothyb0912
6 | @module: pylogit
7 | """
8 | from __future__ import absolute_import
9 |
10 | from .pylogit import create_choice_model
11 | from .bootstrap import Boot
12 | from .choice_tools import convert_wide_to_long
13 | from .choice_tools import convert_long_to_wide
14 |
15 | __version__ = "1.0.1"
16 |
--------------------------------------------------------------------------------
/src/pylogit/bootstrap_calcs.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author: Timothy Brathwaite
4 | @name: Bootstrap Calculations
5 | @summary: This module provides functions to calculate the bootstrap
6 | confidence intervals using the 'percentile' and
7 | 'bias-corrected and accelerated' methods.
8 | """
9 | from __future__ import absolute_import
10 |
11 | import numpy as np
12 | from scipy.stats import norm
13 |
14 | from .bootstrap_utils import check_conf_percentage_validity
15 | from .bootstrap_utils import ensure_samples_is_ndim_ndarray
16 | from .bootstrap_utils import get_alpha_from_conf_percentage
17 | from .bootstrap_utils import combine_conf_endpoints
18 |
19 | # Create a value to be used to avoid numeric underflow.
20 | MIN_COMP_VALUE = 1e-16
21 |
22 |
23 | def calc_percentile_interval(bootstrap_replicates, conf_percentage):
24 | """
25 | Calculate bootstrap confidence intervals based on raw percentiles of the
26 | bootstrap distribution of samples.
27 |
28 | Parameters
29 | ----------
30 | bootstrap_replicates : 2D ndarray.
31 | Each row should correspond to a different bootstrap parameter sample.
32 | Each column should correspond to an element of the parameter vector
33 | being estimated.
34 | conf_percentage : scalar in the interval (0.0, 100.0).
35 | Denotes the confidence-level of the returned confidence interval. For
36 | instance, to calculate a 95% confidence interval, pass `95`.
37 |
38 | Returns
39 | -------
40 | conf_intervals : 2D ndarray.
41 | The shape of the returned array will be `(2, samples.shape[1])`. The
42 | first row will correspond to the lower value in the confidence
43 | interval. The second row will correspond to the upper value in the
44 | confidence interval. There will be one column for each element of the
45 | parameter vector being estimated.
46 |
47 | References
48 | ----------
49 | Efron, Bradley, and Robert J. Tibshirani. An Introduction to the Bootstrap.
50 | CRC press, 1994. Section 12.5 and Section 13.3. See Equation 13.3.
51 |
52 | Notes
53 | -----
54 | This function differs slightly from the actual percentile bootstrap
55 | procedure described in Efron and Tibshirani (1994). To ensure that the
56 | returned endpoints of one's bootstrap confidence intervals are actual
57 | values that were observed in the bootstrap distribution, both the procedure
58 | of Efron and Tibshirani and this function make more conservative confidence
59 | intervals. However, this function uses a simpler (and in some cases less
60 | conservative) correction than that of Efron and Tibshirani.
61 | """
62 | # Check validity of arguments
63 | check_conf_percentage_validity(conf_percentage)
64 | ensure_samples_is_ndim_ndarray(bootstrap_replicates, ndim=2)
65 | # Get the alpha * 100% value
66 | alpha = get_alpha_from_conf_percentage(conf_percentage)
67 | # Get the lower and upper percentiles that demarcate the desired interval.
68 | lower_percent = alpha / 2.0
69 | upper_percent = 100.0 - lower_percent
70 | # Calculate the lower and upper endpoints of the confidence intervals.
71 | # Note that the particular choices of interpolation methods are made in
72 | # order to produce conservatively wide confidence intervals and ensure that
73 | # all returned endpoints in the confidence intervals are actually observed
74 | # in the bootstrap distribution. This is in accordance with the spirit of
75 | # Efron and Tibshirani (1994).
76 | lower_endpoint = np.percentile(bootstrap_replicates,
77 | lower_percent,
78 | interpolation='lower',
79 | axis=0)
80 | upper_endpoint = np.percentile(bootstrap_replicates,
81 | upper_percent,
82 | interpolation='higher',
83 | axis=0)
84 | # Combine the endpoints into a single ndarray.
85 | conf_intervals = combine_conf_endpoints(lower_endpoint, upper_endpoint)
86 | return conf_intervals
87 |
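# Illustrative usage sketch (added note, not part of the original module):
# given a (num_bootstrap_samples, num_params) array of replicates, e.g.
#     replicates = np.random.randn(1000, 2)
#     intervals = calc_percentile_interval(replicates, 95)
# `intervals` has shape (2, 2): the first row holds the lower endpoints and
# the second row holds the upper endpoints of the 95% confidence intervals.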
88 |
89 | def calc_bias_correction_bca(bootstrap_replicates, mle_estimate):
90 | """
91 | Calculate the bias correction for the Bias Corrected and Accelerated (BCa)
92 | bootstrap confidence intervals.
93 |
94 | Parameters
95 | ----------
96 | bootstrap_replicates : 2D ndarray.
97 | Each row should correspond to a different bootstrap parameter sample.
98 | Each column should correspond to an element of the parameter vector
99 | being estimated.
100 | mle_estimate : 1D ndarray.
101 | The original dataset's maximum likelihood point estimate. Should have
102 | one element for each component of the estimated parameter vector.
103 |
104 | Returns
105 | -------
106 | bias_correction : 1D ndarray.
107 | There will be one element for each element in `mle_estimate`. Elements
108 | denote the bias correction factors for each component of the parameter
109 | vector.
110 |
111 | References
112 | ----------
113 | Efron, Bradley, and Robert J. Tibshirani. An Introduction to the Bootstrap.
114 | CRC press, 1994. Section 14.3, Equation 14.14.
115 | """
116 | numerator = (bootstrap_replicates < mle_estimate[None, :]).sum(axis=0)
117 | denominator = float(bootstrap_replicates.shape[0])
118 | bias_correction = norm.ppf(numerator / denominator)
119 | return bias_correction
120 |
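# Added note for clarity: the bias correction above is
#     z0 = norm.ppf(p),  where p is the fraction of bootstrap replicates that
# fall below the MLE point estimate. For example, if 5 of 10 replicates of a
# parameter lie below its MLE estimate, then p = 0.5 and
# norm.ppf(0.5) == 0.0, i.e. no bias correction for that parameter.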
121 |
122 | def calc_acceleration_bca(jackknife_replicates):
123 | """
124 | Calculate the acceleration constant for the Bias Corrected and Accelerated
125 | (BCa) bootstrap confidence intervals.
126 |
127 | Parameters
128 | ----------
129 | jackknife_replicates : 2D ndarray.
130 | Each row should correspond to a different jackknife parameter sample,
131 | formed by deleting a particular observation and then re-estimating the
132 | desired model. Each column should correspond to an element of the
133 | parameter vector being estimated.
134 |
135 | Returns
136 | -------
137 | acceleration : 1D ndarray.
138 | There will be one element for each element in `mle_estimate`. Elements
139 | denote the acceleration factors for each component of the parameter
140 | vector.
141 |
142 | References
143 | ----------
144 | Efron, Bradley, and Robert J. Tibshirani. An Introduction to the Bootstrap.
145 | CRC press, 1994. Section 14.3, Equation 14.15.
146 | """
147 | # Get the mean of the jackknife replicates.
148 | jackknife_mean = jackknife_replicates.mean(axis=0)[None, :]
149 | # Calculate the differences between the jackknife mean and each replicate.
150 | differences = jackknife_mean - jackknife_replicates
151 | numerator = (differences**3).sum(axis=0)
152 | denominator = 6 * ((differences**2).sum(axis=0))**1.5
153 | # guard against division by zero. Note that this guard shouldn't distort
154 | # the computational results since the numerator should be zero whenever the
155 | # denominator is zero.
156 | zero_denom = np.where(denominator == 0)
157 | denominator[zero_denom] = MIN_COMP_VALUE
158 | # Compute the acceleration.
159 | acceleration = numerator / denominator
160 | return acceleration
161 |
162 |
163 | def calc_lower_bca_percentile(alpha_percent, bias_correction, acceleration):
164 | """
165 | Calculate the lower values of the Bias Corrected and Accelerated (BCa)
166 | bootstrap confidence intervals.
167 |
168 | Parameters
169 | ----------
170 | alpha_percent : float in (0.0, 100.0).
171 | `100 - confidence_percentage`, where `confidence_percentage` is the
172 | confidence level (such as 95%), expressed as a percent.
173 | bias_correction : 1D ndarray.
174 | There will be one element for each element in `mle_estimate`. Elements
175 | denote the bias correction factors for each component of the parameter
176 | vector.
177 | acceleration : 1D ndarray.
178 | There will be one element for each element in `mle_estimate`. Elements
179 | denote the acceleration factors for each component of the parameter
180 | vector.
181 |
182 | Returns
183 | -------
184 | lower_percentile : 1D ndarray.
185 | There will be one element for each element in `mle_estimate`. Elements
186 | denote the smaller values in the confidence interval for each component
187 | of the parameter vector.
188 |
189 | References
190 | ----------
191 | Efron, Bradley, and Robert J. Tibshirani. An Introduction to the Bootstrap.
192 | CRC press, 1994. Section 14.3, Equation 14.10.
193 |
194 | Notes
195 | -----
196 | The `alpha` used in this function is different from the `alpha` used in
197 | Efron and Tibshirani (1994). The `alpha` used in this function must be
198 | converted to a decimal (by dividing by 100) and then divided by 2 (to
199 | account for the equal-tailed nature of the confidence interval) in order to
200 | be made equivalent to the `alpha` in Efron and Tibshirani (1994).
201 | """
202 | z_lower = norm.ppf(alpha_percent / (100.0 * 2))
203 | numerator = bias_correction + z_lower
204 | denominator = 1 - acceleration * numerator
205 | lower_percentile =\
206 | norm.cdf(bias_correction + numerator / denominator) * 100
207 | return lower_percentile
208 |
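# Added note for clarity: the computation above follows Efron & Tibshirani
# (1994), Equation 14.10,
#     alpha_1 = norm.cdf(z0 + (z0 + z_alpha) / (1 - a * (z0 + z_alpha))),
# where z0 is the bias correction, a is the acceleration, and
# z_alpha = norm.ppf(alpha_percent / 200) for the lower endpoint. The result
# is then multiplied by 100 to express it as a percentile.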
209 |
210 | def calc_upper_bca_percentile(alpha_percent, bias_correction, acceleration):
211 | """
212 | Calculate the upper values of the Bias Corrected and Accelerated (BCa)
213 | bootstrap confidence intervals.
214 |
215 | Parameters
216 | ----------
217 | alpha_percent : float in (0.0, 100.0).
218 | `100 - confidence_percentage`, where `confidence_percentage` is the
219 | confidence level (such as 95%), expressed as a percent.
220 | bias_correction : 1D ndarray.
221 | There will be one element for each element in `mle_estimate`. Elements
222 | denote the bias correction factors for each component of the parameter
223 | vector.
224 | acceleration : 1D ndarray.
225 | There will be one element for each element in `mle_estimate`. Elements
226 | denote the acceleration factors for each component of the parameter
227 | vector.
228 |
229 | Returns
230 | -------
231 | upper_percentile : 1D ndarray.
232 | There will be one element for each element in `mle_estimate`. Elements
233 | denote the larger values in the confidence interval for each component
234 | of the parameter vector.
235 |
236 | References
237 | ----------
238 | Efron, Bradley, and Robert J. Tibshirani. An Introduction to the Bootstrap.
239 | CRC press, 1994. Section 14.3, Equation 14.10.
240 |
241 | Notes
242 | -----
243 | The `alpha` used in this function is different from the `alpha` used in
244 | Efron and Tibshirani (1994). The `alpha` used in this function must be
245 | converted to a decimal (by dividing by 100) and then divided by 2 (to
246 | account for the equal-tailed nature of the confidence interval) in order to
247 | be made equivalent to the `alpha` in Efron and Tibshirani (1994).
248 | """
249 | z_upper = norm.ppf(1 - alpha_percent / (100.0 * 2))
250 | numerator = bias_correction + z_upper
251 | denominator = 1 - acceleration * numerator
252 | upper_percentile =\
253 | norm.cdf(bias_correction + numerator / denominator) * 100
254 | return upper_percentile
255 |
256 |
257 | def calc_bca_interval(bootstrap_replicates,
258 | jackknife_replicates,
259 | mle_params,
260 | conf_percentage):
261 | """
262 | Calculate 'bias-corrected and accelerated' bootstrap confidence intervals.
263 |
264 | Parameters
265 | ----------
266 | bootstrap_replicates : 2D ndarray.
267 | Each row should correspond to a different bootstrap parameter sample.
268 | Each column should correspond to an element of the parameter vector
269 | being estimated.
270 | jackknife_replicates : 2D ndarray.
271 | Each row should correspond to a different jackknife parameter sample,
272 | formed by deleting a particular observation and then re-estimating the
273 | desired model. Each column should correspond to an element of the
274 | parameter vector being estimated.
275 | mle_params : 1D ndarray.
276 | The original dataset's maximum likelihood point estimate. Should have
277 | the same number of elements as `samples.shape[1]`.
278 | conf_percentage : scalar in the interval (0.0, 100.0).
279 | Denotes the confidence-level of the returned confidence interval. For
280 | instance, to calculate a 95% confidence interval, pass `95`.
281 |
282 | Returns
283 | -------
284 | conf_intervals : 2D ndarray.
285 | The shape of the returned array will be `(2, samples.shape[1])`. The
286 | first row will correspond to the lower value in the confidence
287 | interval. The second row will correspond to the upper value in the
288 | confidence interval. There will be one column for each element of the
289 | parameter vector being estimated.
290 |
291 | References
292 | ----------
293 | Efron, Bradley, and Robert J. Tibshirani. An Introduction to the Bootstrap.
294 | CRC press, 1994. Section 14.3.
295 | DiCiccio, Thomas J., and Bradley Efron. "Bootstrap confidence intervals."
296 | Statistical science (1996): 189-212.
297 | """
298 | # Check validity of arguments
299 | check_conf_percentage_validity(conf_percentage)
300 | ensure_samples_is_ndim_ndarray(bootstrap_replicates, ndim=2)
301 | ensure_samples_is_ndim_ndarray(jackknife_replicates,
302 | name='jackknife', ndim=2)
303 | # Calculate the alpha * 100% value
304 | alpha_percent = get_alpha_from_conf_percentage(conf_percentage)
305 | # Estimate the bias correction for the bootstrap samples
306 | bias_correction =\
307 | calc_bias_correction_bca(bootstrap_replicates, mle_params)
308 | # Estimate the acceleration
309 | acceleration = calc_acceleration_bca(jackknife_replicates)
310 | # Get the lower and upper percent value for the raw bootstrap samples.
311 | lower_percents =\
312 | calc_lower_bca_percentile(alpha_percent, bias_correction, acceleration)
313 | upper_percents =\
314 | calc_upper_bca_percentile(alpha_percent, bias_correction, acceleration)
315 | # Get the lower and upper endpoints for the desired confidence intervals.
316 | lower_endpoints = np.diag(np.percentile(bootstrap_replicates,
317 | lower_percents,
318 | interpolation='lower',
319 | axis=0))
320 | upper_endpoints = np.diag(np.percentile(bootstrap_replicates,
321 | upper_percents,
322 | interpolation='higher',
323 | axis=0))
324 | # Combine the endpoints into a single ndarray.
325 | conf_intervals = combine_conf_endpoints(lower_endpoints, upper_endpoints)
326 | return conf_intervals
327 |
--------------------------------------------------------------------------------
/src/pylogit/bootstrap_mle.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author: Timothy Brathwaite
4 | @name: Bootstrap Estimation Procedures
5 | @summary: This module provides functions that will perform the MLE for each
6 | of the bootstrap samples.
7 | """
8 | from __future__ import absolute_import
9 |
10 | import numpy as np
11 | import pandas as pd
12 |
13 | from . import pylogit as pl
14 | from .display_names import model_type_to_display_name
15 |
16 |
17 | def extract_default_init_vals(orig_model_obj, mnl_point_series, num_params):
18 | """
19 | Get the default initial values for the desired model type, based on the
20 | point estimate of the MNL model that is 'closest' to the desired model.
21 |
22 | Parameters
23 | ----------
24 |     orig_model_obj : an instance or subclass of the MNDC class.
25 | Should correspond to the actual model that we want to bootstrap.
26 | mnl_point_series : pandas Series.
27 | Should denote the point estimate from the MNL model that is 'closest'
28 | to the desired model.
29 | num_params : int.
30 | Should denote the number of parameters being estimated (including any
31 | parameters that are being constrained during estimation).
32 |
33 | Returns
34 | -------
35 | init_vals : 1D ndarray of initial values for the MLE of the desired model.
36 | """
37 | # Initialize the initial values
38 | init_vals = np.zeros(num_params, dtype=float)
39 | # Figure out which values in mnl_point_series are the index coefficients
40 | no_outside_intercepts = orig_model_obj.intercept_names is None
41 | if no_outside_intercepts:
42 | init_index_coefs = mnl_point_series.values
43 | init_intercepts = None
44 | else:
45 | init_index_coefs =\
46 | mnl_point_series.loc[orig_model_obj.ind_var_names].values
47 | init_intercepts =\
48 | mnl_point_series.loc[orig_model_obj.intercept_names].values
49 |
50 | # Add any mixing variables to the index coefficients.
51 | if orig_model_obj.mixing_vars is not None:
52 | num_mixing_vars = len(orig_model_obj.mixing_vars)
53 | init_index_coefs = np.concatenate([init_index_coefs,
54 | np.zeros(num_mixing_vars)],
55 | axis=0)
56 |
57 | # Account for the special transformation of the index coefficients that is
58 | # needed for the asymmetric logit model.
59 | if orig_model_obj.model_type == model_type_to_display_name["Asym"]:
60 | multiplier = np.log(len(np.unique(orig_model_obj.alt_IDs)))
61 | # Cast the initial index coefficients to a float dtype to ensure
62 | # successful broadcasting
63 | init_index_coefs = init_index_coefs.astype(float)
64 | # Adjust the scale of the index coefficients for the asymmetric logit.
65 | init_index_coefs /= multiplier
66 |
67 |     # Combine the initial intercept values with the initial index coefficients
68 | if init_intercepts is not None:
69 | init_index_coefs =\
70 | np.concatenate([init_intercepts, init_index_coefs], axis=0)
71 |
72 | # Add index coefficients (and mixing variables) to the total initial array
73 | num_index = init_index_coefs.shape[0]
74 | init_vals[-1 * num_index:] = init_index_coefs
75 |
76 | # Note that the initial values for the transformed nest coefficients and
77 |     # the shape parameters are zero, so we don't have to change anything.
78 | return init_vals
79 |
80 |
81 | def get_model_abbrev(model_obj):
82 | """
83 | Extract the string used to specify the model type of this model object in
84 |     `pylogit.create_choice_model`.
85 |
86 | Parameters
87 | ----------
88 | model_obj : An MNDC_Model instance.
89 |
90 | Returns
91 | -------
92 | str. The internal abbreviation used for the particular type of MNDC_Model.
93 | """
94 | # Get the 'display name' for our model.
95 | model_type = model_obj.model_type
96 | # Find the model abbreviation for this model's display name.
97 | for key in model_type_to_display_name:
98 | if model_type_to_display_name[key] == model_type:
99 | return key
100 | # If none of the strings in model_type_to_display_name matches our model
101 | # object, then raise an error.
102 | msg = "Model object has an unknown or incorrect model type."
103 | raise ValueError(msg)
104 |
105 |
106 | def get_model_creation_kwargs(model_obj):
107 | """
108 | Get a dictionary of the keyword arguments needed to create the passed model
109 | object using `pylogit.create_choice_model`.
110 |
111 | Parameters
112 | ----------
113 | model_obj : An MNDC_Model instance.
114 |
115 | Returns
116 | -------
117 | model_kwargs : dict.
118 | Contains the keyword arguments and the required values that are needed
119 | to initialize a replica of `model_obj`.
120 | """
121 | # Extract the model abbreviation for this model
122 | model_abbrev = get_model_abbrev(model_obj)
123 |
124 |     # Create a dictionary to store the keyword arguments needed to initialize
125 |     # the new model object.
126 | model_kwargs = {"model_type": model_abbrev,
127 | "names": model_obj.name_spec,
128 | "intercept_names": model_obj.intercept_names,
129 | "intercept_ref_pos": model_obj.intercept_ref_position,
130 | "shape_names": model_obj.shape_names,
131 | "shape_ref_pos": model_obj.shape_ref_position,
132 | "nest_spec": model_obj.nest_spec,
133 | "mixing_vars": model_obj.mixing_vars,
134 | "mixing_id_col": model_obj.mixing_id_col}
135 |
136 | return model_kwargs
137 |
138 |
139 | def get_mnl_point_est(orig_model_obj,
140 | new_df,
141 | boot_id_col,
142 | num_params,
143 | mnl_spec,
144 | mnl_names,
145 | mnl_init_vals,
146 | mnl_fit_kwargs):
147 | """
148 | Calculates the MLE for the desired MNL model.
149 |
150 | Parameters
151 | ----------
152 | orig_model_obj : An MNDC_Model instance.
153 | The object corresponding to the desired model being bootstrapped.
154 | new_df : pandas DataFrame.
155 | The pandas dataframe containing the data to be used to estimate the
156 | MLE of the MNL model for the current bootstrap sample.
157 | boot_id_col : str.
158 | Denotes the new column that specifies the bootstrap observation ids for
159 | choice model estimation.
160 | num_params : non-negative int.
161 | The number of parameters in the MLE of the `orig_model_obj`.
162 | mnl_spec : OrderedDict or None.
163 | If `orig_model_obj` is not a MNL model, then `mnl_spec` should be an
164 | OrderedDict that contains the specification dictionary used to estimate
165 | the MNL model that will provide starting values for the final estimated
166 | model. If `orig_model_obj` is a MNL model, then `mnl_spec` may be None.
167 | mnl_names : OrderedDict or None.
168 | If `orig_model_obj` is not a MNL model, then `mnl_names` should be an
169 | OrderedDict that contains the name dictionary used to initialize the
170 | MNL model that will provide starting values for the final estimated
171 | model. If `orig_model_obj` is a MNL, then `mnl_names` may be None.
172 | mnl_init_vals : 1D ndarray or None.
173 | If `orig_model_obj` is not a MNL model, then `mnl_init_vals` should be
174 | a 1D ndarray. `mnl_init_vals` should denote the initial values used to
175 | estimate the MNL model that provides starting values for the final
176 | desired model. If `orig_model_obj` is a MNL model, then `mnl_init_vals`
177 | may be None.
178 | mnl_fit_kwargs : dict or None.
179 | If `orig_model_obj` is not a MNL model, then `mnl_fit_kwargs` should be
180 | a dict. `mnl_fit_kwargs` should denote the keyword arguments used when
181 | calling the `fit_mle` function of the MNL model that will provide
182 | starting values to the desired choice model. If `orig_model_obj` is a
183 | MNL model, then `mnl_fit_kwargs` may be None.
184 |
185 | Returns
186 | -------
187 | mnl_point : dict.
188 | The dictionary returned by `scipy.optimize` after estimating the
189 | desired MNL model.
190 | mnl_obj : An MNL model instance.
191 | The model object used to estimate the desired MNL model.
192 | """
193 | # Get specification and name dictionaries for the mnl model, for the case
194 | # where the model being bootstrapped is an MNL model. In this case, the
195 |     # mnl_spec and the mnl_names that are passed to the function are
196 | # expected to be None.
197 | if orig_model_obj.model_type == model_type_to_display_name["MNL"]:
198 | mnl_spec = orig_model_obj.specification
199 | mnl_names = orig_model_obj.name_spec
200 | if mnl_init_vals is None:
201 | mnl_init_vals = np.zeros(num_params)
202 | if mnl_fit_kwargs is None:
203 | mnl_fit_kwargs = {}
204 |
205 | # Alter the mnl_fit_kwargs to ensure that we only perform point estimation
206 | mnl_fit_kwargs["just_point"] = True
207 | # Use BFGS by default to estimate the MNL since it works well for the MNL.
208 | if "method" not in mnl_fit_kwargs:
209 | mnl_fit_kwargs["method"] = "BFGS"
210 |
211 | # Initialize the mnl model object for the given bootstrap sample.
212 | mnl_obj = pl.create_choice_model(data=new_df,
213 | alt_id_col=orig_model_obj.alt_id_col,
214 | obs_id_col=boot_id_col,
215 | choice_col=orig_model_obj.choice_col,
216 | specification=mnl_spec,
217 | model_type="MNL",
218 | names=mnl_names)
219 |
220 | # Get the MNL point estimate for the parameters of this bootstrap sample.
221 | mnl_point = mnl_obj.fit_mle(mnl_init_vals, **mnl_fit_kwargs)
222 | return mnl_point, mnl_obj
223 |
224 |
225 | def retrieve_point_est(orig_model_obj,
226 | new_df,
227 | new_id_col,
228 | num_params,
229 | mnl_spec,
230 | mnl_names,
231 | mnl_init_vals,
232 | mnl_fit_kwargs,
233 | extract_init_vals=None,
234 | **fit_kwargs):
235 | """
236 |     Calculates the MLE for the desired choice model on the bootstrap sample.
237 |
238 | Parameters
239 | ----------
240 | orig_model_obj : An MNDC_Model instance.
241 | The object corresponding to the desired model being bootstrapped.
242 | new_df : pandas DataFrame.
243 | The pandas dataframe containing the data to be used to estimate the
244 | MLE of the MNL model for the current bootstrap sample.
245 | new_id_col : str.
246 | Denotes the new column that specifies the bootstrap observation ids for
247 | choice model estimation.
248 | num_params : non-negative int.
249 | The number of parameters in the MLE of the `orig_model_obj`.
250 | mnl_spec : OrderedDict or None.
251 | If `orig_model_obj` is not a MNL model, then `mnl_spec` should be an
252 | OrderedDict that contains the specification dictionary used to estimate
253 | the MNL model that will provide starting values for the final estimated
254 | model. If `orig_model_obj` is a MNL model, then `mnl_spec` may be None.
255 | mnl_names : OrderedDict or None.
256 | If `orig_model_obj` is not a MNL model, then `mnl_names` should be an
257 | OrderedDict that contains the name dictionary used to initialize the
258 | MNL model that will provide starting values for the final estimated
259 | model. If `orig_model_obj` is a MNL, then `mnl_names` may be None.
260 | mnl_init_vals : 1D ndarray or None.
261 | If `orig_model_obj` is not a MNL model, then `mnl_init_vals` should be
262 | a 1D ndarray. `mnl_init_vals` should denote the initial values used to
263 | estimate the MNL model that provides starting values for the final
264 | desired model. If `orig_model_obj` is a MNL model, then `mnl_init_vals`
265 | may be None.
266 | mnl_fit_kwargs : dict or None.
267 | If `orig_model_obj` is not a MNL model, then `mnl_fit_kwargs` should be
268 | a dict. `mnl_fit_kwargs` should denote the keyword arguments used when
269 | calling the `fit_mle` function of the MNL model that will provide
270 | starting values to the desired choice model. If `orig_model_obj` is a
271 | MNL model, then `mnl_fit_kwargs` may be None.
272 | extract_init_vals : callable or None, optional.
273 | Should accept 3 arguments, in the following order. First, it should
274 | accept `orig_model_obj`. Second, it should accept a pandas Series of
275 | the estimated parameters from the MNL model. The index of the Series
276 | will be the names of the coefficients from `mnl_names`. Thirdly, it
277 | should accept an int denoting the number of parameters in the desired
278 | choice model. The callable should return a 1D ndarray of starting
279 | values for the desired choice model. Default == None.
280 | fit_kwargs : dict.
281 | Denotes the keyword arguments to be used when estimating the desired
282 | choice model using the current bootstrap sample (`new_df`). All such
283 | kwargs will be directly passed to the `fit_mle` method of the desired
284 | model object.
285 |
286 | Returns
287 | -------
288 | final_point : dict.
289 | The dictionary returned by `scipy.optimize` after estimating the
290 | desired choice model.
291 | """
292 | # Get the MNL point estimate for the parameters of this bootstrap sample.
293 | mnl_point, mnl_obj = get_mnl_point_est(orig_model_obj,
294 | new_df,
295 | new_id_col,
296 | num_params,
297 | mnl_spec,
298 | mnl_names,
299 | mnl_init_vals,
300 | mnl_fit_kwargs)
301 | mnl_point_series = pd.Series(mnl_point["x"], index=mnl_obj.ind_var_names)
302 |
303 | # Denote the MNL point estimate as our final point estimate if the final
304 | # model we're interested in is an MNL.
305 | if orig_model_obj.model_type == model_type_to_display_name["MNL"]:
306 | final_point = mnl_point
307 | else:
308 | # Determine the function to be used when extracting the initial values
309 | # for the final model from the MNL MLE point estimate.
310 | if extract_init_vals is None:
311 | extraction_func = extract_default_init_vals
312 | else:
313 | extraction_func = extract_init_vals
314 |
315 | # Extract the initial values
316 | default_init_vals =\
317 | extraction_func(orig_model_obj, mnl_point_series, num_params)
318 |
319 | # Get the keyword arguments needed to initialize the new model object.
320 | model_kwargs = get_model_creation_kwargs(orig_model_obj)
321 |
322 | # Create a new model object
323 | new_obj =\
324 | pl.create_choice_model(data=new_df,
325 | alt_id_col=orig_model_obj.alt_id_col,
326 | obs_id_col=new_id_col,
327 | choice_col=orig_model_obj.choice_col,
328 | specification=orig_model_obj.specification,
329 | **model_kwargs)
330 |
331 | # Be sure to add 'just_point' to perform pure point estimation.
332 | if 'just_point' not in fit_kwargs:
333 | fit_kwargs['just_point'] = True
334 |
335 | # Fit the model with new data, and return the point estimate dict.
336 | final_point = new_obj.fit_mle(default_init_vals, **fit_kwargs)
337 |
338 | return final_point
339 |
--------------------------------------------------------------------------------
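A minimal sketch of calling `retrieve_point_est` for one bootstrap sample, assuming `mnl_model` is an already-fitted pylogit MNL model (so the `mnl_*` helper arguments may all be None) and `boot_df` is a bootstrap dataframe with a "bootstrap_id" column, e.g. one built by `create_bootstrap_dataframe` from `bootstrap_sampler.py`:

    from pylogit.bootstrap_mle import retrieve_point_est

    # Number of parameters estimated by the original model; assumed here to be
    # available as the length of the fitted model's `params` series.
    num_params = len(mnl_model.params)

    point_dict = retrieve_point_est(mnl_model,
                                    boot_df,
                                    "bootstrap_id",
                                    num_params,
                                    mnl_spec=None,
                                    mnl_names=None,
                                    mnl_init_vals=None,
                                    mnl_fit_kwargs=None)
    boot_replicate = point_dict["x"]  # parameter vector for this bootstrap sample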
/src/pylogit/bootstrap_sampler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author: Timothy Brathwaite
4 | @name: Bootstrap Sampler
5 | @summary: This module provides functions that will perform the stratified
6 | resampling needed for the bootstrapping procedure.
7 | """
8 | from __future__ import absolute_import
9 |
10 | from collections import OrderedDict
11 |
12 | import numpy as np
13 | import pandas as pd
14 |
15 |
16 | def relate_obs_ids_to_chosen_alts(obs_id_array,
17 | alt_id_array,
18 | choice_array):
19 | """
20 | Creates a dictionary that relates each unique alternative id to the set of
21 |     observation ids that chose the given alternative.
22 |
23 | Parameters
24 | ----------
25 | obs_id_array : 1D ndarray of ints.
26 | Should be a long-format array of observation ids. Each element should
27 | correspond to the unique id of the unit of observation that corresponds
28 | to the given row of the long-format data. Note that each unit of
29 | observation may have more than one associated choice situation.
30 | alt_id_array : 1D ndarray of ints.
31 | Should be a long-format array of alternative ids. Each element should
32 | denote the unique id of the alternative that corresponds to the given
33 | row of the long format data.
34 | choice_array : 1D ndarray of ints.
35 | Each element should be either a one or a zero, indicating whether the
36 | alternative on the given row of the long format data was chosen or not.
37 |
38 | Returns
39 | -------
40 | chosen_alts_to_obs_ids : dict.
41 | Each key will be a unique value from `alt_id_array`. Each key's value
42 | will be a 1D ndarray that contains the sorted, unique observation ids
43 | of those observational units that chose the given alternative.
44 | """
45 | # Figure out which units of observation chose each alternative.
46 | chosen_alts_to_obs_ids = {}
47 |
48 | for alt_id in np.sort(np.unique(alt_id_array)):
49 | # Determine which observations chose the current alternative.
50 | selection_condition =\
51 | np.where((alt_id_array == alt_id) & (choice_array == 1))
52 |
53 | # Store the sorted, unique ids that chose the current alternative.
54 | chosen_alts_to_obs_ids[alt_id] =\
55 | np.sort(np.unique(obs_id_array[selection_condition]))
56 |
57 | # Return the desired dictionary.
58 | return chosen_alts_to_obs_ids
59 |
60 |
61 | def get_num_obs_choosing_each_alternative(obs_per_alt_dict):
62 | """
63 | Will create an ordered dictionary that records the number of units of
64 | observation that have chosen the given alternative (i.e. the associated
65 | dictionary key). Will also determine the total number of unique
66 | observations in the dataset.
67 |
68 | Parameters
69 | ----------
70 | obs_per_alt_dict : dict.
71 |         Each key should be a unique alternative id. Each key's value will be a 1D
72 | ndarray that contains the sorted, unique observation ids of those
73 | observational units that chose the given alternative.
74 |
75 | Returns
76 | -------
77 | num_obs_per_group : OrderedDict.
78 | Keys will be the alternative ids present in `obs_per_alt_dict`. Values
79 |         will be `len(obs_per_alt_dict[alt_id])`.
80 | tot_num_obs : int.
81 | Denotes the total number of unique observation ids in one's dataset.
82 | """
83 | # Initialize the object that is to be returned.
84 | num_obs_per_group = OrderedDict()
85 |
86 | # Determine the number of unique units of observation per group.
87 | for alt_id in obs_per_alt_dict:
88 | num_obs_per_group[alt_id] = len(obs_per_alt_dict[alt_id])
89 |
90 | # Determine the total number of units of observation that will be chosen
91 | # for each bootstrap sample.
92 | tot_num_obs = sum([num_obs_per_group[g] for g in num_obs_per_group])
93 |
94 | # Return the desired objects.
95 | return num_obs_per_group, tot_num_obs
96 |
97 |
98 | def create_cross_sectional_bootstrap_samples(obs_id_array,
99 | alt_id_array,
100 | choice_array,
101 | num_samples,
102 | seed=None):
103 | """
104 | Determines the unique observations that will be present in each bootstrap
105 | sample. This function DOES NOT create the new design matrices or a new
106 | long-format dataframe for each bootstrap sample. Note that these will be
107 | correct bootstrap samples for cross-sectional datasets. This function will
108 | not work correctly for panel datasets.
109 |
110 | Parameters
111 | ----------
112 | obs_id_array : 1D ndarray of ints.
113 | Each element should denote a unique observation id for the
114 | corresponding row of the long format array.
115 | alt_id_array : 1D ndarray of ints.
116 | Each element should denote a unique alternative id for the
117 | corresponding row of the long format array.
118 | choice_array : 1D ndarray of ints.
119 |         Each element should be a one or a zero. The values should denote
120 | whether or not the corresponding alternative in `alt_id_array` was
121 | chosen by the observational unit in the corresponding row of
122 | `obs_id_array.`
123 | num_samples : int.
124 | Denotes the number of bootstrap samples that need to be drawn.
125 | seed : non-negative int or None, optional.
126 | Denotes the random seed to be used in order to ensure reproducibility
127 | of the bootstrap sample generation. Default is None. If None, no seed
128 | will be used and the generation of the bootstrap samples will (in
129 | general) not be reproducible.
130 |
131 |
132 | Returns
133 | -------
134 | ids_per_sample : 2D ndarray.
135 | Each row represents a complete bootstrap sample. Each column denotes a
136 | selected bootstrap observation that comprises the bootstrap sample. The
137 | elements of the array denote the observation ids of the chosen
138 | observational units.
139 | """
140 | # Determine the units of observation that chose each alternative.
141 | chosen_alts_to_obs_ids =\
142 | relate_obs_ids_to_chosen_alts(obs_id_array, alt_id_array, choice_array)
143 |
144 | # Determine the number of unique units of observation per group and overall
145 | num_obs_per_group, tot_num_obs =\
146 | get_num_obs_choosing_each_alternative(chosen_alts_to_obs_ids)
147 |
148 | # Initialize the array that will store the observation ids for each sample
149 | ids_per_sample = np.empty((num_samples, tot_num_obs), dtype=float)
150 |
151 | if seed is not None:
152 | # Check the validity of the seed argument.
153 | if not isinstance(seed, int):
154 | msg = "`boot_seed` MUST be an int."
155 | raise ValueError(msg)
156 |
157 | # If desiring reproducibility, set the random seed within numpy
158 | np.random.seed(seed)
159 |
160 | # Initialize a variable to keep track of what column we're on.
161 | col_idx = 0
162 | for alt_id in num_obs_per_group:
163 | # Get the set of observations that chose the current alternative.
164 | relevant_ids = chosen_alts_to_obs_ids[alt_id]
165 | # Determine the number of needed resampled ids.
166 | resample_size = num_obs_per_group[alt_id]
167 | # Resample, with replacement, observations who chose this alternative.
168 | current_ids = (np.random.choice(relevant_ids,
169 | size=resample_size * num_samples,
170 | replace=True)
171 | .reshape((num_samples, resample_size)))
172 | # Determine the last column index to use when storing the resampled ids
173 | end_col = col_idx + resample_size
174 | # Assign the sampled ids to the correct columns of ids_per_sample
175 | ids_per_sample[:, col_idx:end_col] = current_ids
176 | # Update the column index
177 | col_idx += resample_size
178 |
179 | # Return the resampled observation ids.
180 | return ids_per_sample
181 |
182 |
183 | def create_bootstrap_id_array(obs_id_per_sample):
184 | """
185 | Creates a 2D ndarray that contains the 'bootstrap ids' for each replication
186 |     of each unit of observation that is in the set of bootstrap samples.
187 |
188 | Parameters
189 | ----------
190 | obs_id_per_sample : 2D ndarray of ints.
191 |         Should have one row for each bootstrap sample. Should have one column
192 | for each observational unit that is serving as a new bootstrap
193 | observational unit.
194 |
195 | Returns
196 | -------
197 | bootstrap_id_array : 2D ndarray of ints.
198 | Will have the same shape as `obs_id_per_sample`. Each element will
199 | denote the fake observational id in the new bootstrap dataset.
200 | """
201 | # Determine the shape of the object to be returned.
202 | n_rows, n_cols = obs_id_per_sample.shape
203 | # Create the array of bootstrap ids.
204 | bootstrap_id_array =\
205 | np.tile(np.arange(n_cols) + 1, n_rows).reshape((n_rows, n_cols))
206 | # Return the desired object
207 | return bootstrap_id_array
208 |
209 |
210 | def create_deepcopied_groupby_dict(orig_df, obs_id_col):
211 | """
212 | Will create a dictionary where each key corresponds to a unique value in
213 | `orig_df[obs_id_col]` and each value corresponds to all of the rows of
214 | `orig_df` where `orig_df[obs_id_col] == key`.
215 |
216 | Parameters
217 | ----------
218 | orig_df : pandas DataFrame.
219 | Should be long-format dataframe containing the data used to estimate
220 | the desired choice model.
221 | obs_id_col : str.
222 | Should be a column name within `orig_df`. Should denote the original
223 | observation id column.
224 |
225 | Returns
226 | -------
227 | groupby_dict : dict.
228 | Each key will be a unique value in `orig_df[obs_id_col]` and each value
229 | will be the rows of `orig_df` where `orig_df[obs_id_col] == key`.
230 | """
231 | # Get the observation id values
232 | obs_id_vals = orig_df[obs_id_col].values
233 | # Get the unique observation ids
234 | unique_obs_ids = np.unique(obs_id_vals)
235 | # Initialize the dictionary to be returned.
236 | groupby_dict = {}
237 | # Populate the dictionary with dataframes for each individual.
238 | for obs_id in unique_obs_ids:
239 | # Filter out only the rows corresponding to the current observation id.
240 | desired_rows = obs_id_vals == obs_id
241 | # Add the desired dataframe to the dictionary.
242 | groupby_dict[obs_id] = orig_df.loc[desired_rows].copy(deep=True)
243 |
244 | # Return the desired object.
245 | return groupby_dict
246 |
247 |
248 | def check_column_existence(col_name, df, presence=True):
249 | """
250 | Checks whether or not `col_name` is in `df` and raises a helpful error msg
251 | if the desired condition is not met.
252 |
253 | Parameters
254 | ----------
255 | col_name : str.
256 | Should represent a column whose presence in `df` is to be checked.
257 | df : pandas DataFrame.
258 | The dataframe that will be checked for the presence of `col_name`.
259 | presence : bool, optional.
260 |         If True, then this function checks for the PRESENCE of `col_name` in
261 | `df`. If False, then this function checks for the ABSENCE of
262 | `col_name` in `df`. Default == True.
263 |
264 | Returns
265 | -------
266 | None.
267 | """
268 | if presence:
269 | if col_name not in df.columns:
270 | msg = "Ensure that `{}` is in `df.columns`."
271 | raise ValueError(msg.format(col_name))
272 | else:
273 | if col_name in df.columns:
274 | msg = "Ensure that `{}` is not in `df.columns`."
275 | raise ValueError(msg.format(col_name))
276 | return None
277 |
278 |
279 | def ensure_resampled_obs_ids_in_df(resampled_obs_ids, orig_obs_id_array):
280 | """
281 | Checks whether all ids in `resampled_obs_ids` are in `orig_obs_id_array`.
282 | Raises a helpful ValueError if not.
283 |
284 | Parameters
285 | ----------
286 | resampled_obs_ids : 1D ndarray of ints.
287 | Should contain the observation ids of the observational units that will
288 | be used in the current bootstrap sample.
289 | orig_obs_id_array : 1D ndarray of ints.
290 |         Should contain the observation ids of the observational units in the
291 | original dataframe containing the data for this model.
292 |
293 | Returns
294 | -------
295 | None.
296 | """
297 | if not np.in1d(resampled_obs_ids, orig_obs_id_array).all():
298 | msg =\
299 | "All values in `resampled_obs_ids` MUST be in `orig_obs_id_array`."
300 | raise ValueError(msg)
301 | return None
302 |
303 |
304 | def create_bootstrap_dataframe(orig_df,
305 | obs_id_col,
306 | resampled_obs_ids_1d,
307 | groupby_dict,
308 | boot_id_col="bootstrap_id"):
309 | """
310 | Will create the altered dataframe of data needed to estimate a choice model
311 | with the particular observations that belong to the current bootstrap
312 | sample.
313 |
314 | Parameters
315 | ----------
316 | orig_df : pandas DataFrame.
317 | Should be long-format dataframe containing the data used to estimate
318 | the desired choice model.
319 | obs_id_col : str.
320 | Should be a column name within `orig_df`. Should denote the original
321 | observation id column.
322 | resampled_obs_ids_1d : 1D ndarray of ints.
323 |         Each value should represent the observation id of a given bootstrap
324 | replicate.
325 | groupby_dict : dict.
326 | Each key will be a unique value in `orig_df[obs_id_col]` and each value
327 | will be the rows of `orig_df` where `orig_df[obs_id_col] == key`.
328 | boot_id_col : str, optional.
329 | Denotes the new column that will be created to specify the bootstrap
330 | observation ids for choice model estimation.
331 |
332 | Returns
333 | -------
334 | bootstrap_df : pandas Dataframe.
335 | Will contain all the same columns as `orig_df` as well as the
336 | additional `boot_id_col`. For each value in `resampled_obs_ids_1d`,
337 | `bootstrap_df` will contain the long format rows from `orig_df` that
338 | have the given observation id.
339 | """
340 | # Check the validity of the passed arguments.
341 | check_column_existence(obs_id_col, orig_df, presence=True)
342 | check_column_existence(boot_id_col, orig_df, presence=False)
343 | # Alias the observation id column
344 | obs_id_values = orig_df[obs_id_col].values
345 | # Check the validity of the resampled observation ids.
346 | ensure_resampled_obs_ids_in_df(resampled_obs_ids_1d, obs_id_values)
347 |
348 | # Initialize a list to store the component dataframes that will be
349 | # concatenated to form the final bootstrap_df
350 | component_dfs = []
351 |
352 | # Populate component_dfs
353 | for boot_id, obs_id in enumerate(resampled_obs_ids_1d):
354 | # Extract the dataframe that we desire.
355 | extracted_df = groupby_dict[obs_id].copy()
356 | # Add the bootstrap id value.
357 | extracted_df[boot_id_col] = boot_id + 1
358 | # Store the component dataframe
359 | component_dfs.append(extracted_df)
360 |
361 | # Create and return the desired dataframe.
362 | bootstrap_df = pd.concat(component_dfs, axis=0, ignore_index=True)
363 | return bootstrap_df
364 |
--------------------------------------------------------------------------------
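A minimal sketch chaining the sampler functions above, assuming `df` is a long-format dataframe whose "obs_id", "alt_id", and "choice" column names are illustrative:

    from pylogit.bootstrap_sampler import (create_cross_sectional_bootstrap_samples,
                                           create_deepcopied_groupby_dict,
                                           create_bootstrap_dataframe)

    ids_per_sample =\
        create_cross_sectional_bootstrap_samples(df["obs_id"].values,
                                                 df["alt_id"].values,
                                                 df["choice"].values,
                                                 num_samples=10,
                                                 seed=601)
    groupby_dict = create_deepcopied_groupby_dict(df, "obs_id")
    # Build the long-format dataframe for the first bootstrap sample.
    first_boot_df = create_bootstrap_dataframe(df,
                                               "obs_id",
                                               ids_per_sample[0, :],
                                               groupby_dict,
                                               boot_id_col="bootstrap_id")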
/src/pylogit/bootstrap_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author: Timothy Brathwaite
4 | @name: Bootstrap Utilities
5 | @summary: This module provides helpful functions for calculating the
6 | bootstrap confidence intervals.
7 | """
8 | from __future__ import absolute_import
9 |
10 | from numbers import Number
11 | import numpy as np
12 |
13 |
14 | def check_conf_percentage_validity(conf_percentage):
15 | """
16 | Ensures that `conf_percentage` is in (0, 100). Raises a helpful ValueError
17 | if otherwise.
18 | """
19 | msg = "conf_percentage MUST be a number between 0.0 and 100."
20 | condition_1 = isinstance(conf_percentage, Number)
21 | if not condition_1:
22 | raise ValueError(msg)
23 | else:
24 | condition_2 = 0 < conf_percentage < 100
25 | if not condition_2:
26 | raise ValueError(msg)
27 | return None
28 |
29 |
30 | def ensure_samples_is_ndim_ndarray(samples, name='bootstrap', ndim=2):
31 | """
32 | Ensures that `samples` is an `ndim` numpy array. Raises a helpful
33 | ValueError if otherwise.
34 | """
35 | assert isinstance(ndim, int)
36 | assert isinstance(name, str)
37 | if not isinstance(samples, np.ndarray) or not (samples.ndim == ndim):
38 | sample_name = name + "_samples"
39 | msg = "`{}` MUST be a {}D ndarray.".format(sample_name, ndim)
40 | raise ValueError(msg)
41 | return None
42 |
43 |
44 | def get_alpha_from_conf_percentage(conf_percentage):
45 | """
46 | Calculates `100 - conf_percentage`, which is useful for calculating alpha
47 | levels.
48 | """
49 | return 100.0 - conf_percentage
50 |
51 |
52 | def combine_conf_endpoints(lower_array, upper_array):
53 | """
54 |     Concatenates the lower and upper endpoint arrays for a given confidence level.
55 | """
56 | return np.concatenate([lower_array[None, :], upper_array[None, :]], axis=0)
57 |
--------------------------------------------------------------------------------
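A small illustration of the helpers above:

    import numpy as np
    from pylogit.bootstrap_utils import (check_conf_percentage_validity,
                                         get_alpha_from_conf_percentage,
                                         combine_conf_endpoints)

    check_conf_percentage_validity(95)          # returns None; raises otherwise
    alpha = get_alpha_from_conf_percentage(95)  # 5.0
    lower = np.array([0.1, -2.0])
    upper = np.array([0.9, -0.5])
    intervals = combine_conf_endpoints(lower, upper)  # shape (2, 2): lower row, then upper row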
/src/pylogit/conditional_logit.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Feb 25 07:19:49 2016
4 |
5 | @name: MultiNomial Logit
6 | @author: Timothy Brathwaite
7 | @summary: Contains functions necessary for estimating multinomial logit
8 | models (with the help of the "base_multinomial_cm.py" file).
9 | Differs from version one since it works with the shape, intercept,
10 | index coefficient partitioning of estimated parameters as opposed
11 | to the shape, index coefficient partitioning scheme of version 1.
12 | """
13 | from __future__ import absolute_import
14 |
15 | import warnings
16 | import numpy as np
17 | from scipy.sparse import diags
18 |
19 | from . import choice_calcs as cc
20 | from . import base_multinomial_cm_v2 as base_mcm
21 | from .estimation import LogitTypeEstimator
22 | from .estimation import estimate
23 | from .display_names import model_type_to_display_name
24 |
25 | # Create a variable that will be printed if there is a non-fatal error
26 | # in the MNL class construction
27 | _msg_1 = "The Multinomial Logit Model has no shape parameters. "
28 | _msg_2 = "shape_names and shape_ref_pos will be ignored if passed."
29 | _shape_ignore_msg = _msg_1 + _msg_2
30 |
31 | # Create a warning string that will be issued if ridge regression is performed.
32 | _msg_3 = "NOTE: An L2-penalized regression is being performed. The "
33 | _msg_4 = "reported standard errors and robust standard errors "
34 | _msg_5 = "***WILL BE INCORRECT***."
35 | _ridge_warning_msg = _msg_3 + _msg_4 + _msg_5
36 |
37 | # Alias necessary functions from the base multinomial choice model module
38 | general_log_likelihood = cc.calc_log_likelihood
39 | general_gradient = cc.calc_gradient
40 | general_calc_probabilities = cc.calc_probabilities
41 | general_hessian = cc.calc_hessian
42 |
43 |
44 | def split_param_vec(beta,
45 | rows_to_alts=None,
46 | design=None,
47 | return_all_types=False,
48 | *args, **kwargs):
49 | """
50 | Parameters
51 | ----------
52 | beta : 1D ndarray.
53 |         All elements should be ints, floats, or longs. Should have 1 element
54 | for each utility coefficient being estimated (i.e. num_features).
55 |     rows_to_alts : None.
56 | Not actually used. Included merely for consistency with other models.
57 | design : None.
58 | Not actually used. Included merely for consistency with other models.
59 | return_all_types : bool, optional.
60 | Determines whether or not a tuple of 4 elements will be returned (with
61 | one element for the nest, shape, intercept, and index parameters for
62 | this model). If False, a tuple of 3 elements will be returned, as
63 | described below.
64 |
65 | Returns
66 | -------
67 | tuple.
68 | `(None, None, beta)`. This function is merely for compatibility with
69 | the other choice model files.
70 |
71 | Note
72 | ----
73 | If `return_all_types == True` then the function will return a tuple of four
74 | objects. In order, these objects will either be None or the arrays
75 | representing the arrays corresponding to the nest, shape, intercept, and
76 | index parameters.
77 | """
78 | if return_all_types:
79 | return None, None, None, beta
80 | else:
81 | return None, None, beta
82 |
83 |
84 | def _mnl_utility_transform(systematic_utilities, *args, **kwargs):
85 | """
86 | Parameters
87 | ----------
88 | systematic_utilities : 1D ndarray.
89 |         Should contain the systematic utilities for each available
90 | alternative for each observation.
91 |
92 | Returns
93 | -------
94 | `systematic_utilities[:, None]`
95 | """
96 | # Be sure to return a 2D array since other functions will be expecting this
97 | if len(systematic_utilities.shape) == 1:
98 | systematic_utilities = systematic_utilities[:, np.newaxis]
99 |
100 | return systematic_utilities
101 |
102 |
103 | def _mnl_transform_deriv_c(*args, **kwargs):
104 | """
105 | Returns None.
106 |
107 | This is a place holder function since the MNL model has no shape
108 | parameters.
109 | """
110 | # This is a place holder function since the MNL model has no shape
111 | # parameters.
112 | return None
113 |
114 |
115 | def _mnl_transform_deriv_alpha(*args, **kwargs):
116 | """
117 | Returns None.
118 |
119 | This is a place holder function since the MNL model has no intercept
120 | parameters outside of the index.
121 | """
122 | # This is a place holder function since the MNL model has no intercept
123 | # parameters outside the index.
124 | return None
125 |
126 |
127 | class MNLEstimator(LogitTypeEstimator):
128 | """
129 | Estimation Object used to enforce uniformity in the estimation process
130 | across the various logit-type models.
131 |
132 | Parameters
133 | ----------
134 | model_obj : a pylogit.base_multinomial_cm_v2.MNDC_Model instance.
135 | Should contain the following attributes:
136 |
137 | - alt_IDs
138 | - choices
139 | - design
140 | - intercept_ref_position
141 | - shape_ref_position
142 | - utility_transform
143 | mapping_res : dict.
144 | Should contain the scipy sparse matrices that map the rows of the long
145 | format dataframe to various other objects such as the available
146 | alternatives, the unique observations, etc. The keys that it must have
147 | are `['rows_to_obs', 'rows_to_alts', 'chosen_row_to_obs']`
148 | ridge : int, float, long, or None.
149 | Determines whether or not ridge regression is performed. If a
150 | scalar is passed, then that scalar determines the ridge penalty for
151 | the optimization. The scalar should be greater than or equal to
152 |         zero.
153 | zero_vector : 1D ndarray.
154 | Determines what is viewed as a "null" set of parameters. It is
155 | explicitly passed because some parameters (e.g. parameters that must be
156 | greater than zero) have their null values at values other than zero.
157 | split_params : callable.
158 | Should take a vector of parameters, `mapping_res['rows_to_alts']`, and
159 | model_obj.design as arguments. Should return a tuple containing
160 | separate arrays for the model's shape, outside intercept, and index
161 | coefficients. For each of these arrays, if this model does not contain
162 | the particular type of parameter, the callable should place a `None` in
163 | its place in the tuple.
164 | constrained_pos : list or None, optional.
165 | Denotes the positions of the array of estimated parameters that are
166 | not to change from their initial values. If a list is passed, the
167 | elements are to be integers where no such integer is greater than
168 |         `num_params`. Default == None.
169 | weights : 1D ndarray or None, optional.
170 | Allows for the calculation of weighted log-likelihoods. The weights can
171 | represent various things. In stratified samples, the weights may be
172 | the proportion of the observations in a given strata for a sample in
173 | relation to the proportion of observations in that strata in the
174 | population. In latent class models, the weights may be the probability
175 | of being a particular class.
176 | """
177 | def set_derivatives(self):
178 | # Pre-calculate the derivative of the transformation vector with
179 | # respect to the vector of systematic utilities
180 | dh_dv = diags(np.ones(self.design.shape[0]), 0, format='csr')
181 |
182 | # Create a function to calculate dh_dv which will return the
183 | # pre-calculated result when called
184 | def calc_dh_dv(*args):
185 | return dh_dv
186 |
187 | self.calc_dh_dv = calc_dh_dv
188 | self.calc_dh_d_alpha = _mnl_transform_deriv_alpha
189 | self.calc_dh_d_shape = _mnl_transform_deriv_c
190 |
191 | def check_length_of_initial_values(self, init_values):
192 | """
193 | Ensures that `init_values` is of the correct length. Raises a helpful
194 | ValueError if otherwise.
195 |
196 | Parameters
197 | ----------
198 | init_values : 1D ndarray.
199 | The initial values to start the optimization process with. There
200 | should be one value for each index coefficient, outside intercept
201 | parameter, and shape parameter being estimated.
202 |
203 | Returns
204 | -------
205 | None.
206 | """
207 | # Calculate the expected number of index parameters
208 | num_index_coefs = self.design.shape[1]
209 |
210 | if init_values.shape[0] != num_index_coefs:
211 | msg_1 = "The initial values are of the wrong dimension."
212 | msg_2 = "It should be of dimension {}"
213 | msg_3 = "But instead it has dimension {}"
214 | raise ValueError(msg_1 +
215 | msg_2.format(num_index_coefs) +
216 | msg_3.format(init_values.shape[0]))
217 |
218 | return None
219 |
220 |
221 | class MNL(base_mcm.MNDC_Model):
222 | """
223 | Parameters
224 | ----------
225 | data : string or pandas dataframe.
226 | If string, data should be an absolute or relative path to a CSV file
227 | containing the long format data for this choice model. Note long format
228 |         has one row per available alternative for each observation. If
229 | pandas dataframe, the dataframe should be the long format data for the
230 | choice model.
231 |     alt_id_col : str.
232 | Should denote the column in data which contains the alternative
233 | identifiers for each row.
234 | obs_id_col : str.
235 | Should denote the column in data which contains the observation
236 | identifiers for each row.
237 | choice_col : str.
238 | Should denote the column in data which contains the ones and zeros that
239 | denote whether or not the given row corresponds to the chosen
240 | alternative for the given individual.
241 | specification : OrderedDict.
242 | Keys are a proper subset of the columns in `data`. Values are either a
243 | list or a single string, "all_diff" or "all_same". If a list, the
244 | elements should be:
245 | - single objects that are in the alternative ID column of `data`
246 | - lists of objects that are within the alternative ID column of
247 | `data`. For each single object in the list, a unique column will
248 | be created (i.e. there will be a unique coefficient for that
249 | variable in the corresponding utility equation of the
250 | corresponding alternative). For lists within the
251 | `specification` values, a single column will be created for all
252 | the alternatives within the iterable (i.e. there will be one
253 | common coefficient for the variables in the iterable).
254 | names : OrderedDict, optional.
255 | Should have the same keys as `specification`. For each key:
256 | - if the corresponding value in `specification` is "all_same", then
257 | there should be a single string as the value in names.
258 | - if the corresponding value in `specification` is "all_diff", then
259 | there should be a list of strings as the value in names. There
260 | should be one string in the value in names for each possible
261 | alternative.
262 | - if the corresponding value in `specification` is a list, then
263 | there should be a list of strings as the value in names. There
264 |           should be one string in the value in names per item in the value in
265 | `specification`.
266 | Default == None.
267 |
268 | """
269 | def __init__(self,
270 | data,
271 | alt_id_col,
272 | obs_id_col,
273 | choice_col,
274 | specification,
275 | names=None,
276 | *args, **kwargs):
277 | ##########
278 | # Print a helpful message for users who have included shape parameters
279 |         # or shape names unnecessarily
280 | ##########
281 | for keyword in ["shape_names", "shape_ref_pos"]:
282 | if keyword in kwargs and kwargs[keyword] is not None:
283 | warnings.warn(_shape_ignore_msg)
284 | break
285 |
286 | if "intercept_ref_pos" in kwargs:
287 | if kwargs["intercept_ref_pos"] is not None:
288 | msg = "The MNL model should have all intercepts in the index."
289 | raise ValueError(msg)
290 |
291 | # Carry out the common instantiation process for all choice models
292 | super(MNL, self).__init__(data,
293 | alt_id_col,
294 | obs_id_col,
295 | choice_col,
296 | specification,
297 | names=names,
298 | model_type=model_type_to_display_name["MNL"])
299 |
300 | # Store the utility transform function
301 | self.utility_transform = _mnl_utility_transform
302 |
303 | return None
304 |
305 | def fit_mle(self,
306 | init_vals,
307 | print_res=True,
308 | method="BFGS",
309 | loss_tol=1e-06,
310 | gradient_tol=1e-06,
311 | maxiter=1000,
312 | ridge=None,
313 | constrained_pos=None,
314 | just_point=False,
315 | **kwargs):
316 | """
317 | Parameters
318 | ----------
319 | init_vals : 1D ndarray.
320 | The initial values to start the optimization process with. There
321 | should be one value for each utility coefficient being estimated.
322 | print_res : bool, optional.
323 | Determines whether the timing and initial and final log likelihood
324 |             results will be printed as they are determined.
325 | method : str, optional.
326 | Should be a valid string that can be passed to
327 | scipy.optimize.minimize. Determines the optimization algorithm that
328 | is used for this problem. If 'em' is passed, a custom coded EM
329 |             algorithm will be used. Default `== 'BFGS'`.
330 | loss_tol : float, optional.
331 | Determines the tolerance on the difference in objective function
332 | values from one iteration to the next that is needed to determine
333 | convergence. Default `== 1e-06`.
334 | gradient_tol : float, optional.
335 | Determines the tolerance on the difference in gradient values from
336 | one iteration to the next which is needed to determine convergence.
337 | ridge : int, float, long, or None, optional.
338 | Determines whether or not ridge regression is performed. If a
339 | scalar is passed, then that scalar determines the ridge penalty for
340 | the optimization. Default `== None`.
341 | constrained_pos : list or None, optional.
342 | Denotes the positions of the array of estimated parameters that are
343 | not to change from their initial values. If a list is passed, the
344 | elements are to be integers where no such integer is greater than
345 | `init_vals.size.` Default == None.
346 | just_point : bool, optional.
347 | Determines whether (True) or not (False) calculations that are non-
348 | critical for obtaining the maximum likelihood point estimate will
349 | be performed. If True, this function will return the results
350 | dictionary from scipy.optimize. Default == False.
351 |
352 | Returns
353 | -------
354 | None or dict.
355 | If `just_point` is False, None is returned and the estimation
356 | results are saved to the model instance. If `just_point` is True,
357 | then the results dictionary from scipy.optimize() is returned.
358 | """
359 | # Check integrity of passed arguments
360 | kwargs_to_be_ignored = ["init_shapes", "init_intercepts", "init_coefs"]
361 | if any([x in kwargs for x in kwargs_to_be_ignored]):
362 |             msg = "MNL model does not use any of the following kwargs:\n{}"
363 |             msg_2 = " Remove such kwargs and pass a single init_vals argument."
364 | raise ValueError(msg.format(kwargs_to_be_ignored) + msg_2)
365 |
366 | if ridge is not None:
367 | warnings.warn(_ridge_warning_msg)
368 |
369 | # Store the optimization method
370 | self.optimization_method = method
371 |
372 | # Store the ridge parameter
373 | self.ridge_param = ridge
374 |
375 | # Construct the mappings from alternatives to observations and from
376 | # chosen alternatives to observations
377 | mapping_res = self.get_mappings_for_fit()
378 |
379 | # Create the estimation object
380 | zero_vector = np.zeros(init_vals.shape)
381 | mnl_estimator = MNLEstimator(self,
382 | mapping_res,
383 | ridge,
384 | zero_vector,
385 | split_param_vec,
386 | constrained_pos=constrained_pos)
387 | # Set the derivative functions for estimation
388 | mnl_estimator.set_derivatives()
389 |
390 | # Perform one final check on the length of the initial values
391 | mnl_estimator.check_length_of_initial_values(init_vals)
392 |
393 | # Get the estimation results
394 | estimation_res = estimate(init_vals,
395 | mnl_estimator,
396 | method,
397 | loss_tol,
398 | gradient_tol,
399 | maxiter,
400 | print_res,
401 | just_point=just_point)
402 |
403 | if not just_point:
404 | # Store the estimation results
405 | self.store_fit_results(estimation_res)
406 |
407 | return None
408 | else:
409 | return estimation_res
410 |
--------------------------------------------------------------------------------
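A minimal sketch of estimating an MNL with the class above, assuming `long_df` is a long-format dataframe and that the "obs_id", "alt_id", "choice", and "travel_time" column names are illustrative:

    from collections import OrderedDict
    import numpy as np
    import pylogit as pl

    spec = OrderedDict()
    names = OrderedDict()
    spec["travel_time"] = "all_same"     # one common travel-time coefficient
    names["travel_time"] = "Travel time"

    mnl = pl.create_choice_model(data=long_df,
                                 alt_id_col="alt_id",
                                 obs_id_col="obs_id",
                                 choice_col="choice",
                                 specification=spec,
                                 model_type="MNL",
                                 names=names)
    # One initial value per index coefficient being estimated.
    mnl.fit_mle(np.zeros(1))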
/src/pylogit/construct_estimator.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @name: Estimator Constructor
4 | @author: Timothy Brathwaite
5 | @summary: Contains functions necessary for constructing the Estimation
6 | Objects used to provide convenience functions when estimating
7 | PyLogit's various choice models.
8 | """
9 | from __future__ import absolute_import
10 |
11 | import numpy as np
12 |
13 | from .display_names import model_type_to_display_name as display_name_dict
14 |
15 | from .mixed_logit import MixedEstimator
16 | from .mixed_logit import split_param_vec as mixed_split_params
17 |
18 | from .nested_logit import NestedEstimator
19 | from .nested_logit import split_param_vec as nested_split_params
20 |
21 | from .conditional_logit import MNLEstimator
22 | from .conditional_logit import split_param_vec as mnl_split_params
23 |
24 | from .clog_log import ClogEstimator
25 | from .clog_log import split_param_vec as clog_split_params
26 |
27 | from .asym_logit import AsymEstimator
28 | from .asym_logit import split_param_vec as asym_split_params
29 |
30 | from .scobit import ScobitEstimator
31 | from .scobit import split_param_vec as scobit_split_params
32 |
33 | from .uneven_logit import UnevenEstimator
34 | from .uneven_logit import split_param_vec as uneven_split_params
35 |
36 | # Map the displayed model types to the internal model names.
37 | display_name_to_model_type = {v: k for k, v in display_name_dict.items()}
38 |
39 | # Map the internal model types to their appropriate estimator and split params
40 | # functions
41 | model_type_to_resources =\
42 | {"MNL": {'estimator': MNLEstimator, 'split_func': mnl_split_params},
43 | "Asym": {'estimator': AsymEstimator, 'split_func': asym_split_params},
44 | "Cloglog": {'estimator': ClogEstimator, 'split_func': clog_split_params},
45 | "Scobit": {'estimator': ScobitEstimator,
46 | 'split_func': scobit_split_params},
47 | "Uneven": {'estimator': UnevenEstimator,
48 | 'split_func': uneven_split_params},
49 | "Nested Logit": {'estimator': NestedEstimator,
50 | 'split_func': nested_split_params},
51 | "Mixed Logit": {'estimator': MixedEstimator,
52 | 'split_func': mixed_split_params}}
53 |
54 |
55 | def create_estimation_obj(model_obj,
56 | init_vals,
57 | mappings=None,
58 | ridge=None,
59 | constrained_pos=None,
60 | weights=None):
61 | """
62 | Should return a model estimation object corresponding to the model type of
63 | the `model_obj`.
64 |
65 | Parameters
66 | ----------
67 |     model_obj : an instance or subclass of the MNDC class.
68 | init_vals : 1D ndarray.
69 | The initial values to start the estimation process with. In the
70 | following order, there should be one value for each nest coefficient,
71 | shape parameter, outside intercept parameter, or index coefficient that
72 | is being estimated.
73 | mappings : OrderedDict or None, optional.
74 | Keys will be `["rows_to_obs", "rows_to_alts", "chosen_row_to_obs",
75 | "rows_to_nests"]`. The value for `rows_to_obs` will map the rows of
76 | the `long_form` to the unique observations (on the columns) in
77 | their order of appearance. The value for `rows_to_alts` will map
78 | the rows of the `long_form` to the unique alternatives which are
79 | possible in the dataset (on the columns), in sorted order--not
80 | order of appearance. The value for `chosen_row_to_obs`, if not
81 | None, will map the rows of the `long_form` that contain the chosen
82 | alternatives to the specific observations those rows are associated
83 | with (denoted by the columns). The value of `rows_to_nests`, if not
84 | None, will map the rows of the `long_form` to the nest (denoted by
85 | the column) that contains the row's alternative. Default == None.
86 | ridge : int, float, long, or None, optional.
87 | Determines whether or not ridge regression is performed. If a
88 | scalar is passed, then that scalar determines the ridge penalty for
89 | the optimization. The scalar should be greater than or equal to
90 | zero. Default `== None`.
91 | constrained_pos : list or None, optional.
92 | Denotes the positions of the array of estimated parameters that are
93 | not to change from their initial values. If a list is passed, the
94 | elements are to be integers where no such integer is greater than
95 | `init_vals.size.` Default == None.
96 |     weights : 1D ndarray or None, optional.
97 | Should contain the weights for each corresponding observation for each
98 | row of the long format data.
99 | """
100 | # Get the mapping matrices for each model
101 | mapping_matrices =\
102 | model_obj.get_mappings_for_fit() if mappings is None else mappings
103 | # Create the zero vector for each model.
104 | zero_vector = np.zeros(init_vals.shape[0])
105 | # Get the internal model name
106 | internal_model_name = display_name_to_model_type[model_obj.model_type]
107 | # Get the split parameter function and estimator class for this model.
108 | estimator_class, current_split_func =\
109 | (model_type_to_resources[internal_model_name]['estimator'],
110 | model_type_to_resources[internal_model_name]['split_func'])
111 | # Create the estimator instance that is desired.
112 | estimation_obj = estimator_class(model_obj,
113 | mapping_matrices,
114 | ridge,
115 | zero_vector,
116 | current_split_func,
117 | constrained_pos,
118 | weights=weights)
119 | # Return the created object
120 | return estimation_obj
121 |
--------------------------------------------------------------------------------
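A minimal sketch of `create_estimation_obj`, assuming `model_obj` is any already-constructed pylogit choice model and `init_vals` is the 1D array of starting values one would pass to its `fit_mle` method:

    from pylogit.construct_estimator import create_estimation_obj

    estimation_obj = create_estimation_obj(model_obj,
                                           init_vals,
                                           ridge=None,
                                           constrained_pos=None,
                                           weights=None)
    # The returned estimator bundles the mapping matrices, the ridge penalty,
    # and the model-specific split_param_vec function used during estimation.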
/src/pylogit/display_names.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | This file declares the strings that will be displayed for each model type based
4 | on the abbriviated model type string that is passed to the choice model
5 | constructor.
6 | """
7 | from __future__ import absolute_import
8 |
9 | from collections import OrderedDict
10 | model_type_to_display_name = OrderedDict()
11 | model_type_to_display_name["MNL"] = "Multinomial Logit Model"
12 | model_type_to_display_name["Asym"] = "Multinomial Asymmetric Logit Model"
13 | model_type_to_display_name["Cloglog"] = "Multinomial Clog-log Model"
14 | model_type_to_display_name["Scobit"] = "Multinomial Scobit Model"
15 | model_type_to_display_name["Uneven"] = "Multinomial Uneven Logit Model"
16 | model_type_to_display_name["Nested Logit"] = "Nested Logit Model"
17 | model_type_to_display_name["Mixed Logit"] = "Mixed Logit Model"
18 |
--------------------------------------------------------------------------------
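A small illustration of the mapping above, together with the reverse mapping built in construct_estimator.py:

    from pylogit.display_names import model_type_to_display_name

    model_type_to_display_name["MNL"]      # 'Multinomial Logit Model'

    display_name_to_model_type =\
        {v: k for k, v in model_type_to_display_name.items()}
    display_name_to_model_type["Nested Logit Model"]   # 'Nested Logit'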
/src/pylogit/newsfragments/.gitignore:
--------------------------------------------------------------------------------
1 | !.gitignore
2 |
--------------------------------------------------------------------------------
/src/pylogit/pylogit.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Feb 29 22:07:30 2016
4 |
5 | @module: generalized_choice_model
6 | @name: Python Based Conditional Logit-type Models
7 | @author: Timothy Brathwaite
8 | @summary: Contains functions necessary for estimating multinomial, asymmetric
9 | conditional choice models (and standard conditional logit models).
10 | @notes: "Under the hood", this module indirectly or directly relies upon
11 | the following files:
12 | [base_multinomial_cm_v2.py,
13 | choice_calcs.py,
14 | choice_tools.py,
15 | conditional_logit.py,
16 | asym_logit.py,
17 | uneven_logit.py,
18 | scobit.py,
19 | clog_log.py,
20 |                 nested_logit.py,
21 | mixed_logit.py]
22 | """
23 | from __future__ import absolute_import
24 |
25 | from . import conditional_logit as mnl
26 | from . import asym_logit
27 | from . import uneven_logit
28 | from . import scobit
29 | from . import clog_log
30 | from . import nested_logit
31 | from . import mixed_logit
32 |
33 | # Create a dictionary relating the model type parameter to the class that
34 | # the general choice model should inherit from
35 | model_type_to_class = {"MNL": mnl.MNL,
36 | "Asym": asym_logit.MNAL,
37 | "Cloglog": clog_log.MNCL,
38 | "Scobit": scobit.MNSL,
39 | "Uneven": uneven_logit.MNUL,
40 | "Nested Logit": nested_logit.NestedLogit,
41 | "Mixed Logit": mixed_logit.MixedLogit}
42 |
43 | # Create a dictionary relating the model type parameter to the name of the
44 | # class that the general choice model should inherit from
45 | model_type_to_class_name = {"MNL": "MNL",
46 | "Asym": "MNAL",
47 | "Cloglog": "MNCL",
48 | "Scobit": "MNSL",
49 | "Uneven": "MNUL",
50 | "Nested Logit": "NestedLogit",
51 | "Mixed Logit": "MixedLogit"}
52 |
53 | # Store the names of the model_type kwargs that are valid.
54 | valid_model_types = model_type_to_class.keys()
55 |
56 |
57 | # Create a function that checks the user's model type and ensures its validity
58 | def ensure_valid_model_type(specified_type, model_type_list):
59 | """
60 | Checks to make sure that `specified_type` is in `model_type_list` and
61 | raises a helpful error if this is not the case.
62 |
63 | Parameters
64 | ----------
65 | specified_type : str.
66 | Denotes the user-specified model type that is to be checked.
67 | model_type_list : list of strings.
68 | Contains all of the model types that are acceptable kwarg values.
69 |
70 | Returns
71 | -------
72 | None.
73 | """
74 | if specified_type not in model_type_list:
75 | msg_1 = "The specified model_type was not valid."
76 | msg_2 = "Valid model-types are {}".format(model_type_list)
77 | msg_3 = "The passed model-type was: {}".format(specified_type)
78 | total_msg = "\n".join([msg_1, msg_2, msg_3])
79 | raise ValueError(total_msg)
80 | return None
81 |
82 |
83 | def create_choice_model(data,
84 | alt_id_col,
85 | obs_id_col,
86 | choice_col,
87 | specification,
88 | model_type,
89 | intercept_ref_pos=None,
90 | shape_ref_pos=None,
91 | names=None,
92 | intercept_names=None,
93 | shape_names=None,
94 | nest_spec=None,
95 | mixing_id_col=None,
96 | mixing_vars=None):
97 | """
98 | Parameters
99 | ----------
100 | data : string or pandas dataframe.
101 | If `data` is a string, it should be an absolute or relative path to
102 | a CSV file containing the long format data for this choice model.
103 | Note long format has one row per available alternative for each
104 | observation. If `data` is a pandas dataframe, `data` should already
105 | be in long format.
106 | alt_id_col : string.
107 | Should denote the column in data that contains the alternative
108 | identifiers for each row.
109 | obs_id_col : string.
110 | Should denote the column in data that contains the observation
111 | identifiers for each row.
112 | choice_col : string.
113 | Should denote the column in data which contains the ones and zeros
114 | that denote whether or not the given row corresponds to the chosen
115 | alternative for the given individual.
116 |     specification : OrderedDict.
117 |         Keys are a proper subset of the columns in `data`. Values are
118 |         either a list or a single string, `all_diff` or `all_same`. If a list,
119 |         the elements should be:
120 |             1) single objects that are within the alternative ID column of
121 |                `data`
122 |             2) lists of objects that are within the alternative ID column of
123 |                `data`. For each single object in the list, a unique
124 |                column will be created (i.e. there will be a unique
125 |                coefficient for that variable in the corresponding utility
126 |                equation of the corresponding alternative). For lists within
127 |                the `specification` values, a single column will be
128 |                created for all the alternatives within the iterable (i.e.
129 |                there will be one common coefficient for the variables in
130 |                the iterable).
131 | model_type : string.
132 | Denotes the model type of the choice_model being instantiated.
133 | Should be one of the following values:
134 |
135 | - "MNL"
136 | - "Asym"
137 | - "Cloglog"
138 | - "Scobit"
139 | - "Uneven"
140 | - "Nested Logit"
141 | - "Mixed Logit"
142 | intercept_ref_pos : int, optional.
143 | Valid only when the intercepts being estimated are not part of the
144 | index. Specifies the alternative in the ordered array of unique
145 | alternative ids whose intercept or alternative-specific constant is
146 | not estimated, to ensure model identifiability. Default == None.
147 | shape_ref_pos : int, optional.
148 | Specifies the alternative in the ordered array of unique
149 | alternative ids whose shape parameter is not estimated, to ensure
150 | model identifiability. Default == None.
151 | names : OrderedDict or None, optional.
152 | Should have the same keys as `specification`. For each key:
153 |
154 | - if the corresponding value in `specification` is
155 | "all_same", then there should be a single string as the value
156 | in names.
157 | - if the corresponding value in `specification` is "all_diff",
158 | then there should be a list of strings as the value in names.
159 | There should be one string in the value in names for each
160 | possible alternative.
161 | - if the corresponding value in `specification` is a list, then
162 | there should be a list of strings as the value in names.
163 |           There should be one string in the value in names per item in
164 |           the value in `specification`.
165 | Default == None.
166 | intercept_names : list of strings or None, optional.
167 | If a list is passed, then the list should have the same number of
168 | elements as there are possible alternatives in data, minus 1. Each
169 | element of the list should be the name of the corresponding
170 | alternative's intercept term, in sorted order of the possible
171 | alternative IDs. If None is passed, the resulting names that are
172 | shown in the estimation results will be
173 | ["Outside_ASC_{}".format(x) for x in shape_names]. Default = None.
174 | shape_names : list of strings or None, optional.
175 | If a list is passed, then the list should have the same number of
176 | elements as there are possible alternative IDs in data. Each
177 | element of the list should be a string denoting the name of the
178 | corresponding alternative, in sorted order of the possible
179 | alternative IDs. The resulting names which are shown in the
180 | estimation results will be
181 | ["shape_{}".format(x) for x in shape_names]. Default = None.
182 | nest_spec : OrderedDict or None, optional.
183 | Keys are strings that define the name of the nests. Values are
184 | lists of alternative ids, denoting which alternatives belong to
185 |         which nests. Each alternative id should only be associated with a
186 |         single nest. Default == None.
187 | mixing_id_col : str, or None, optional.
188 | Should be a column heading in `data`. Should denote the column in
189 | `data` which contains the identifiers of the units of observation
190 | over which the coefficients of the model are thought to be randomly
191 | distributed. If `model_type == "Mixed Logit"`, then `mixing_id_col`
192 | must be passed. Default == None.
193 | mixing_vars : list, or None, optional.
194 | All elements of the list should be strings. Each string should be
195 |         present in the values of `names.values()` and their associated
196 | variables should only be index variables (i.e. part of the design
197 | matrix). If `model_type == "Mixed Logit"`, then `mixing_vars` must
198 | be passed. Default == None.
199 |
200 | Returns
201 | -------
202 |     model_obj : instance of the choice model class corresponding to the
203 |         model type passed as the function argument. The returned object
204 |         will have been instantiated with the arguments passed to this
205 |         function.
206 | """
207 | # Make sure the model type is valid
208 | ensure_valid_model_type(model_type, valid_model_types)
209 |
210 | # Carry out the appropriate instantiation process for the chosen
211 | # choice model
212 | model_kwargs = {"intercept_ref_pos": intercept_ref_pos,
213 | "shape_ref_pos": shape_ref_pos,
214 | "names": names,
215 | "intercept_names": intercept_names,
216 | "shape_names": shape_names,
217 | "nest_spec": nest_spec,
218 | "mixing_id_col": mixing_id_col,
219 | "mixing_vars": mixing_vars}
220 | return model_type_to_class[model_type](data,
221 | alt_id_col,
222 | obs_id_col,
223 | choice_col,
224 | specification,
225 | **model_kwargs)
226 |
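A minimal usage sketch of `create_choice_model`, assuming a hypothetical long-format dataframe (the column names, data values, and the `travel_time` variable below are illustrative, not taken from this repository):

    from collections import OrderedDict

    import pandas as pd
    import pylogit as pl

    # Hypothetical long-format data: one row per available alternative for
    # each observation, with a 0/1 choice indicator.
    example_df = pd.DataFrame({"obs_id": [1, 1, 2, 2, 3, 3],
                               "alt_id": [1, 2, 1, 2, 1, 2],
                               "choice": [0, 1, 0, 1, 1, 0],
                               "travel_time": [10, 15, 12, 9, 20, 22]})

    # One generic coefficient shared by alternatives 1 and 2.
    example_spec = OrderedDict()
    example_spec["travel_time"] = [[1, 2]]
    example_names = OrderedDict()
    example_names["travel_time"] = ["travel_time (generic coefficient)"]

    example_model = pl.create_choice_model(data=example_df,
                                           alt_id_col="alt_id",
                                           obs_id_col="obs_id",
                                           choice_col="choice",
                                           specification=example_spec,
                                           model_type="MNL",
                                           names=example_names)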
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/timothyb0912/pylogit/cffc9c523b5368966ef2481c7dc30f0a5d296de8/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_bootstrap_calcs.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for the bootstrap_calcs.py file.
3 | """
4 | import unittest
5 |
6 | import numpy as np
7 | import numpy.testing as npt
8 | import pandas as pd
9 | from scipy.stats import norm, gumbel_r
10 |
11 | import pylogit.bootstrap_calcs as bc
12 |
13 | try:
14 | # Python 3.x does not natively support xrange
15 | from past.builtins import xrange
16 | except ImportError:
17 | pass
18 |
19 |
20 | class ComputationalTests(unittest.TestCase):
21 | def setUp(self):
22 | """
23 | Note that the spatial test data used in many of these tests comes from
24 | Efron, Bradley, and Robert J. Tibshirani. An Introduction to the
25 | Bootstrap. CRC press, 1994. Chapter 14.
26 | """
27 | # Determine the number of parameters and number of bootstrap replicates
28 | num_replicates = 100
29 | num_params = 5
30 | # Create a set of fake bootstrap replicates
31 | self.bootstrap_replicates =\
32 | (np.arange(1, 1 + num_replicates)[:, None] *
33 | np.arange(1, 1 + num_params)[None, :])
34 | # Create a fake maximum likelihood parameter estimate
35 | self.mle_params = self.bootstrap_replicates[50, :]
36 | # Create a set of fake jackknife replicates
37 | array_container = []
38 | for est in self.mle_params:
39 | array_container.append(gumbel_r.rvs(loc=est, size=10))
40 | self.jackknife_replicates =\
41 | np.concatenate([x[:, None] for x in array_container], axis=1)
42 | # Create a fake confidence percentage.
43 | self.conf_percentage = 94.88
44 |
45 | # Store the spatial test data from Efron and Tibshirani (1994)
46 | self.test_data =\
47 | np.array([48, 36, 20, 29, 42, 42, 20, 42, 22, 41, 45, 14, 6,
48 | 0, 33, 28, 34, 4, 32, 24, 47, 41, 24, 26, 30, 41])
49 |
50 | # Note how many test data observations there are.
51 | num_test_obs = self.test_data.size
52 |
53 | # Create the function to calculate the jackknife replicates.
54 | def calc_theta(array):
55 | result = ((array - array.mean())**2).sum() / float(array.size)
56 | return result
57 | self.calc_theta = calc_theta
58 | self.test_theta_hat = np.array([calc_theta(self.test_data)])
59 |
60 | # Create a pandas series of the data. Allows for easy case deletion.
61 | raw_series = pd.Series(self.test_data)
62 | # Create the array of jackknife replicates
63 | jackknife_replicates = np.empty((num_test_obs, 1), dtype=float)
64 | for obs in xrange(num_test_obs):
65 | current_data = raw_series[raw_series.index != obs].values
66 | jackknife_replicates[obs] = calc_theta(current_data)
67 | self.test_jackknife_replicates = jackknife_replicates
68 |
69 | return None
70 |
71 | def test_calc_percentile_interval(self):
72 | # Get the alpha percentage. Should be 5.12 so alpha / 2 should be 2.56
73 | alpha = bc.get_alpha_from_conf_percentage(self.conf_percentage)
74 | # These next 2 statements work because there are exactly 100 replicates
75 | # We should have the value in BR[lower_row, 0] = 3 so that there are 2
76 | # elements in bootstrap_replicates (BR) that are less than this. I.e.
77 | # we want lower_row = 2. Note 2.56 rounded down is 2.
78 | lower_row = int(np.floor(alpha / 2.0))
79 | # 100 - 2.56 is 97.44. Rounded up, this is 98.
80 | # We want the row such that the value in the first column of that row
81 | # is 98, i.e. we want the row at index 97.
82 | upper_row = int(np.floor(100 - (alpha / 2.0)))
83 | # Create the expected results
84 | expected_results =\
85 | bc.combine_conf_endpoints(self.bootstrap_replicates[lower_row],
86 | self.bootstrap_replicates[upper_row])
87 | # Alias the function being tested
88 | func = bc.calc_percentile_interval
89 | # Get the function results
90 | func_results = func(self.bootstrap_replicates, self.conf_percentage)
91 | # Perform the desired tests
92 | self.assertIsInstance(func_results, np.ndarray)
93 | self.assertEqual(func_results.shape, expected_results.shape)
94 | npt.assert_allclose(func_results, expected_results)
95 | return None
96 |
97 | def test_calc_bias_correction_bca(self):
98 | # There are 100 bootstrap replicates, already in ascending order for
99 | # each column. If we take row 51 to be the mle, then 50% of the
100 | # replicates are less than the mle, and we should have bias = 0.
101 | expected_result = np.zeros(self.mle_params.size)
102 |
103 | # Alias the function to be tested.
104 | func = bc.calc_bias_correction_bca
105 |
106 | # Perform the desired test
107 | func_result = func(self.bootstrap_replicates, self.mle_params)
108 | self.assertIsInstance(func_result, np.ndarray)
109 | self.assertEqual(func_result.shape, expected_result.shape)
110 | npt.assert_allclose(func_result, expected_result)
111 |
112 | # Create a fake mle that should be higher than 95% of the results
113 | fake_mle = self.bootstrap_replicates[95]
114 | expected_result_2 = norm.ppf(0.95) * np.ones(self.mle_params.size)
115 | func_result_2 = func(self.bootstrap_replicates, fake_mle)
116 |
117 | self.assertIsInstance(func_result_2, np.ndarray)
118 | self.assertEqual(func_result_2.shape, expected_result_2.shape)
119 | npt.assert_allclose(func_result_2, expected_result_2)
120 | return None
121 |
122 | def test_calc_acceleration_bca(self):
123 | # Get the expected result. See page 186 of Efron and Tibshirani (1994)
124 | expected_result = np.array([0.061])
125 |
126 | # Alias the function being tested
127 | func = bc.calc_acceleration_bca
128 |
129 | # Perform the desired test
130 | func_result = func(self.test_jackknife_replicates)
131 | self.assertIsInstance(func_result, np.ndarray)
132 | self.assertEqual(func_result.shape, expected_result.shape)
133 | # Note the absolute tolerance of 5e-4 is used because the results
134 | # should agree when rounded to 3 decimal places. This will be the case
135 | # if the two sets of results agree to within 5e-4 of each other.
136 | npt.assert_allclose(func_result, expected_result, atol=5e-4)
137 | return None
138 |
139 | def test_calc_lower_bca_percentile(self):
140 | # Use the parameter values from
141 | # Efron, Bradley, and Robert J. Tibshirani. An Introduction to the
142 | # Bootstrap. CRC press, 1994. Pages 185-186
143 | # Note that my alpha is Efron's alpha / 2, in percents not decimals
144 | alpha_percent = 10
145 | bias_correction = np.array([0.146])
146 | acceleration = np.array([0.061])
147 |
148 | # Note the expected results
149 | expected_result = np.array([0.110])
150 |
151 | # Alias the function being tested
152 | func = bc.calc_lower_bca_percentile
153 |
154 | # Perform the desired tests
155 | # Note we divide the function results by 100 since our results are in
156 | # terms of percents and Efron's results are in decimals.
157 | func_result = func(alpha_percent, bias_correction, acceleration) / 100
158 | self.assertIsInstance(func_result, np.ndarray)
159 | self.assertEqual(func_result.shape, expected_result.shape)
160 | # Note the absolute tolerance of 5e-4 is used because the results
161 | # should agree when rounded to 3 decimal places. This will be the case
162 | # if the two sets of results agree to within 5e-4 of each other.
163 | npt.assert_allclose(func_result, expected_result, atol=5e-4)
164 | return None
165 |
166 | def test_calc_upper_bca_percentile(self):
167 | # Use the parameter values from
168 | # Efron, Bradley, and Robert J. Tibshirani. An Introduction to the
169 | # Bootstrap. CRC press, 1994. Pages 185-186
170 | # Note that my alpha is Efron's alpha / 2, in percents not decimals
171 | alpha_percent = 10
172 | bias_correction = np.array([0.146])
173 | acceleration = np.array([0.061])
174 |
175 | # Note the expected results
176 | expected_result = np.array([0.985])
177 |
178 | # Alias the function being tested
179 | func = bc.calc_upper_bca_percentile
180 |
181 | # Perform the desired tests
182 | # Note we divide the function results by 100 since our results are in
183 | # terms of percents and Efron's results are in decimals.
184 | func_result = func(alpha_percent, bias_correction, acceleration) / 100
185 | self.assertIsInstance(func_result, np.ndarray)
186 | self.assertEqual(func_result.shape, expected_result.shape)
187 | # Note the absolute tolerance of 1e-3 is used because the results
188 | # should be within 0.001 of each other.
189 | npt.assert_allclose(func_result, expected_result, atol=1e-3)
190 | return None
191 |
192 | def test_calc_bca_interval(self):
193 | # Create the bootstrap replicates for the test data
194 | num_test_reps = 5000
195 | num_test_obs = self.test_data.size
196 | test_indices = np.arange(num_test_obs)
197 | boot_indx_shape = (num_test_reps, num_test_obs)
198 | np.random.seed(8292017)
199 | boot_indices =\
200 | np.random.choice(test_indices,
201 | replace=True,
202 | size=num_test_obs*num_test_reps)
203 | self.test_bootstrap_replicates =\
204 | np.fromiter((self.calc_theta(self.test_data[x]) for x in
205 | boot_indices.reshape(boot_indx_shape)),
206 | dtype=float)[:, None]
207 |
208 | # Note the expected result. See page 183 of Efron and Tibshirani (1994)
209 | expected_result = np.array([[115.8], [259.6]])
210 |
211 | # Bundle the necessary arguments
212 | args = [self.test_bootstrap_replicates,
213 | self.test_jackknife_replicates,
214 | self.test_theta_hat,
215 | 90]
216 |
217 | # Alias the function being tested
218 | func = bc.calc_bca_interval
219 |
220 | # Get the function results
221 | func_result = func(*args)
222 |
223 | # Perform the desired tests
224 |         # Note the interval endpoints are on the scale of theta-hat itself,
225 |         # so no division by 100 is needed here, unlike the tests above.
226 | self.assertIsInstance(func_result, np.ndarray)
227 | self.assertEqual(func_result.shape, expected_result.shape)
228 | # Note the relative tolerance of 0.01 is used because the function
229 | # results should be within 1% of the expected result. Note that some
230 | # differences are expected due to simulation error on both the part of
231 | # Efron and Tibshirani (1994) when they reported their results, and on
232 | # our part when calculating the results.
233 | npt.assert_allclose(func_result, expected_result, rtol=0.01)
234 | return None
235 |
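For reference, the BCa endpoints checked above follow equation 14.10 of Efron and Tibshirani (1994). The helper below is an expository sketch of that formula, not pylogit's implementation; it works with tail areas expressed as decimals, whereas the tested functions take percents:

    from scipy.stats import norm

    def bca_adjusted_percentile(alpha, bias_correction, acceleration):
        """Adjusted percentile (as a decimal in [0, 1]) for a nominal tail
        area `alpha`, given bias-correction z0 and acceleration a."""
        z_alpha = norm.ppf(alpha)
        shifted = bias_correction + z_alpha
        adjusted = bias_correction + shifted / (1.0 - acceleration * shifted)
        return norm.cdf(adjusted)

    # With z0 = 0.146 and a = 0.061 from the tests above, a 90% interval
    # (5% in each tail) reproduces the 0.110 and 0.985 endpoints:
    lower = bca_adjusted_percentile(0.05, 0.146, 0.061)       # approx. 0.110
    upper = bca_adjusted_percentile(1 - 0.05, 0.146, 0.061)   # approx. 0.985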
--------------------------------------------------------------------------------
/tests/test_bootstrap_sampler.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for the bootstrap_sampler.py file.
3 | """
4 | import unittest
5 | from collections import OrderedDict
6 |
7 | import numpy as np
8 | import numpy.testing as npt
9 | import pandas as pd
10 |
11 | import pylogit.bootstrap_sampler as bs
12 |
13 | try:
14 | # Python 3.x does not natively support xrange
15 | from past.builtins import xrange
16 | except ImportError:
17 | pass
18 |
19 |
20 | class SamplerTests(unittest.TestCase):
21 | def test_relate_obs_ids_to_chosen_alts(self):
22 | # Create fake data for the observation, alternative, and choice ids.
23 | obs_ids = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])
24 | alt_ids = np.array([1, 2, 1, 3, 2, 3, 2, 3, 1, 3, 1, 2])
25 | choices = np.array([0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0])
26 |
27 | # Create the dictionary that we expect the tested function to return
28 | expected_dict = {1: np.array([2, 6]),
29 | 2: np.array([1, 4]),
30 | 3: np.array([3, 5])}
31 |
32 | # Alias the function being tested.
33 | func = bs.relate_obs_ids_to_chosen_alts
34 |
35 | # Execute the given tests.
36 | func_result = func(obs_ids, alt_ids, choices)
37 | self.assertIsInstance(func_result, dict)
38 | for key in expected_dict:
39 | self.assertIn(key, func_result)
40 | self.assertIsInstance(func_result[key], np.ndarray)
41 | self.assertEqual(func_result[key].ndim, 1)
42 | npt.assert_allclose(func_result[key], expected_dict[key])
43 | return None
44 |
45 | def test_get_num_obs_choosing_each_alternative(self):
46 | # Alias the function that is to be tested
47 | func = bs.get_num_obs_choosing_each_alternative
48 |
49 | # Create the dictionary of observations per alternative
50 | obs_per_group = {1: np.array([2, 6, 7]),
51 | 2: np.array([1]),
52 | 3: np.array([3, 5])}
53 |
54 | # Get the 'expected results'
55 | expected_dict = OrderedDict()
56 | expected_dict[1] = obs_per_group[1].size
57 | expected_dict[2] = obs_per_group[2].size
58 | expected_dict[3] = obs_per_group[3].size
59 | expected_num_obs = (obs_per_group[1].size +
60 | obs_per_group[2].size +
61 | obs_per_group[3].size)
62 |
63 | # Get the results from the function
64 | func_dict, func_num_obs = func(obs_per_group)
65 |
66 | # Perform the desired tests
67 | self.assertIsInstance(func_dict, OrderedDict)
68 | self.assertIsInstance(func_num_obs, int)
69 | self.assertEqual(func_num_obs, expected_num_obs)
70 | for key in func_dict:
71 | func_num = func_dict[key]
72 | self.assertIsInstance(func_num, int)
73 | self.assertEqual(func_num, expected_dict[key])
74 | return None
75 |
76 | def test_create_cross_sectional_bootstrap_samples(self):
77 | # Create fake data for the observation, alternative, and choice ids.
78 | obs_ids = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6])
79 | alt_ids = np.array([1, 2, 1, 3, 2, 3, 2, 3, 1, 3, 1, 2])
80 | choices = np.array([0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0])
81 |
82 | # Determine the number of samples to be taken
83 | num_samples = 5
84 |
85 | # Determine the random seed for reproducibility
86 | seed = 55
87 | np.random.seed(seed)
88 |
89 | # Create the dictionary of observations per alternative
90 | obs_per_group = {1: np.array([2, 6]),
91 | 2: np.array([1, 4]),
92 | 3: np.array([3, 5])}
93 | num_obs_per_group = {1: 2, 2: 2, 3: 2}
94 |
95 | # Determine the array that should be created.
96 | expected_ids = np.empty((num_samples, 6))
97 |
98 | expected_shape_1 = (num_samples, num_obs_per_group[1])
99 | expected_ids[:, :2] =\
100 | np.random.choice(obs_per_group[1],
101 | size=num_samples * num_obs_per_group[1],
102 | replace=True).reshape(expected_shape_1)
103 |
104 | expected_shape_2 = (num_samples, num_obs_per_group[2])
105 | expected_ids[:, 2:4] =\
106 | np.random.choice(obs_per_group[2],
107 | size=num_samples * len(obs_per_group[2]),
108 | replace=True).reshape(expected_shape_2)
109 |
110 | expected_shape_3 = (num_samples, num_obs_per_group[3])
111 | expected_ids[:, 4:6] =\
112 | np.random.choice(obs_per_group[3],
113 | size=num_samples * len(obs_per_group[3]),
114 | replace=True).reshape(expected_shape_3)
115 |
116 | # Alias the function being tested.
117 | func = bs.create_cross_sectional_bootstrap_samples
118 |
119 | # Get the desired results
120 | func_result = func(obs_ids, alt_ids, choices, num_samples, seed=seed)
121 |
122 | # Perform the requisite tests
123 | self.assertIsInstance(func_result, np.ndarray)
124 | self.assertEqual(func_result.shape, expected_ids.shape)
125 | npt.assert_allclose(func_result, expected_ids)
126 |
127 | # Make sure the argument check works
128 | self.assertRaises(ValueError,
129 | func,
130 | obs_ids,
131 | alt_ids,
132 | choices,
133 | num_samples,
134 | "2")
135 |
136 | return None
137 |
138 | def test_create_bootstrap_id_array(self):
139 | # Create an array of fake bootstrapped observation ids
140 | fake_obs_id_per_sample = np.arange(25).reshape((5, 5))
141 |
142 | # Create the expected result denoting the "bootstrap ids" for each of
143 | # the sampled observation ids.
144 | expected_results = np.array([[1, 2, 3, 4, 5],
145 | [1, 2, 3, 4, 5],
146 | [1, 2, 3, 4, 5],
147 | [1, 2, 3, 4, 5],
148 | [1, 2, 3, 4, 5]])
149 | # Alias the function being tested
150 | func = bs.create_bootstrap_id_array
151 | # Get the function results
152 | func_result = func(fake_obs_id_per_sample)
153 |
154 | # Perform the desired tests
155 | self.assertIsInstance(func_result, np.ndarray)
156 | npt.assert_allclose(func_result, expected_results)
157 |
158 | return None
159 |
160 | def test_create_deepcopied_groupby_dict(self):
161 | # Create the dataframe of fake data
162 | fake_df = pd.DataFrame({"obs_id": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6],
163 | "alt_id": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
164 | "choice": [1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1],
165 | "x": [1, 1.2, 1.4, 0.3, 0.9, 1.11, 0.53, 0.82,
166 | 1.31, 1.24, 0.98, 0.76]})
167 | # Create the result that we expect from the function being tested.
168 | expected_res = {1: fake_df.iloc[0:2],
169 | 2: fake_df.iloc[2:4],
170 | 3: fake_df.iloc[4:6],
171 | 4: fake_df.iloc[6:8],
172 | 5: fake_df.iloc[8:10],
173 | 6: fake_df.iloc[10:]}
174 | # Alias the function being tested
175 | func = bs.create_deepcopied_groupby_dict
176 |
177 | # Get the result of the function
178 | func_result = func(fake_df, "obs_id")
179 |
180 | # Perform the requisite tests
181 | # Ensure the returned value is a dictionary
182 | self.assertIsInstance(func_result, dict)
183 | # Ensure the returned value and the expected value have the same keys.
184 | self.assertEqual(sorted(func_result.keys()),
185 | sorted(expected_res.keys()))
186 | for key in func_result:
187 | # Get the expected and returned dataframes for each observation id
188 | sub_func_result = func_result[key]
189 | sub_expected_res = expected_res[key]
190 |
191 | # Ensure that the dataframes have equal values.
192 | npt.assert_allclose(sub_func_result.values,
193 | sub_expected_res.values)
194 |
195 | # Ensure the dataframes don't share the same location in memory.
196 | self.assertNotEqual(id(sub_func_result), id(sub_expected_res))
197 | return None
198 |
199 | def test_check_column_existence(self):
200 | # Create the fake dataframe for the test.
201 | fake_df = pd.DataFrame({"obs_id": [1, 1, 2, 2, 3, 3],
202 | "alt_id": [1, 2, 1, 2, 1, 2],
203 | "choice": [1, 0, 0, 1, 1, 0]})
204 | # Create the sets of arguments and keyword arguments that should not
205 | # lead to raising errors.
206 | good_cols = ["obs_id", "boot_id"]
207 | good_kwargs = [{"presence": True}, {"presence": False}]
208 |
209 | # Alias the function that is being tested
210 | func = bs.check_column_existence
211 |
212 | # Perform the desired tests.
213 | for pos in xrange(len(good_cols)):
214 | col = good_cols[pos]
215 | current_good_kwargs = good_kwargs[pos]
216 | current_bad_kwargs =\
217 | {"presence": bool(1 - current_good_kwargs["presence"])}
218 | pattern = ("Ensure that `{}` is ".format(col) +
219 | "not " * (1 - current_bad_kwargs["presence"]) +
220 | "in `df.columns`.")
221 |
222 | self.assertIsNone(func(col, fake_df, **current_good_kwargs))
223 | self.assertRaisesRegexp(ValueError,
224 | pattern,
225 | func,
226 | col,
227 | fake_df,
228 | **current_bad_kwargs)
229 |
230 | return None
231 |
232 | def test_ensure_resampled_obs_ids_in_df(self):
233 | # Create fake data for the test.
234 | good_resampled_obs_ids = np.array([1, 1, 4, 3, 4])
235 | bad_resampled_obs_ids = np.array([1, 1, 4, 3, 8])
236 | fake_orig_obs_ids = np.arange(1, 6)
237 |
238 | # Expected error msg pattern
239 | expected_err_msg =\
240 | "All values in `resampled_obs_ids` MUST be in `orig_obs_id_array`."
241 |
242 | # Alias the function being tested.
243 | func = bs.ensure_resampled_obs_ids_in_df
244 |
245 | # Perform the desired tests
246 | self.assertIsNone(func(good_resampled_obs_ids, fake_orig_obs_ids))
247 | self.assertRaisesRegexp(ValueError,
248 | expected_err_msg,
249 | func,
250 | bad_resampled_obs_ids,
251 | fake_orig_obs_ids)
252 | return None
253 |
254 | def test_create_bootstrap_dataframe(self):
255 | # Create the dataframe of fake data
256 | fake_df = pd.DataFrame({"obs_id": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6],
257 | "alt_id": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
258 | "choice": [1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1],
259 | "x": [1, 1.2, 1.4, 0.3, 0.9, 1.11, 0.53, 0.82,
260 | 1.31, 1.24, 0.98, 0.76]})
261 | # Note the observation id column
262 | obs_id_col = "obs_id"
263 |
264 | # Get the bootstrapped samples of the observation ids
265 | sampling_args = [fake_df["obs_id"].values,
266 | fake_df["alt_id"].values,
267 | fake_df["choice"].values,
268 | 5]
269 | sampled_obs_ids =\
270 | bs.create_cross_sectional_bootstrap_samples(*sampling_args)
271 | rel_sampled_ids = sampled_obs_ids[0, :]
272 |
273 | # Get the groupby dictionary for this dataframe.
274 | groupby_dictionary =\
275 | bs.create_deepcopied_groupby_dict(fake_df, obs_id_col)
276 |
277 | # Alias the function necessary to create the bootstrap dataframe
278 | func = bs.create_bootstrap_dataframe
279 | # Create the bootstrap id column name
280 | boot_id_col = "new_id"
281 |
282 | # Create the expected result.
283 | expected_result =\
284 | [groupby_dictionary[obs_id].copy() for obs_id in rel_sampled_ids]
285 | for pos in xrange(len(expected_result)):
286 | expected_result[pos][boot_id_col] = pos + 1
287 | expected_result = pd.concat(expected_result, axis=0, ignore_index=True)
288 |
289 | # Get the function result
290 | func_result = func(fake_df,
291 | obs_id_col,
292 | rel_sampled_ids,
293 | groupby_dictionary,
294 | boot_id_col=boot_id_col)
295 |
296 | # Perform the desired tests.
297 | self.assertIsInstance(func_result, pd.DataFrame)
298 | self.assertIn(boot_id_col, func_result.columns.values)
299 | self.assertEqual(expected_result.shape, func_result.shape)
300 | npt.assert_allclose(expected_result.values, func_result.values)
301 | return None
302 |
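The sampler tests above exercise a stratified, cross-sectional bootstrap: observation ids are grouped by the alternative they chose and then resampled with replacement within each group, so every bootstrap sample preserves the observed choice counts. The snippet below is an expository sketch of that idea only, not pylogit's implementation (it uses its own random number generator, so it will not reproduce pylogit's exact draws):

    import numpy as np

    def sketch_cross_sectional_samples(obs_ids, alt_ids, choices,
                                       num_samples, seed=None):
        rng = np.random.RandomState(seed)
        # Group the observation ids by the alternative each one chose.
        chosen_alts = alt_ids[choices == 1]
        chosen_obs = obs_ids[choices == 1]
        sampled_columns = []
        for alt in np.sort(np.unique(chosen_alts)):
            group = chosen_obs[chosen_alts == alt]
            draws = rng.choice(group,
                               size=(num_samples, group.size),
                               replace=True)
            sampled_columns.append(draws)
        # One row per bootstrap sample, one column per original observation.
        return np.concatenate(sampled_columns, axis=1)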
--------------------------------------------------------------------------------
/tests/test_bootstrap_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for the bootstrap_utils.py file.
3 | """
4 | import unittest
5 |
6 | import numpy as np
7 | import numpy.testing as npt
8 |
9 | import pylogit.bootstrap_utils as bu
10 |
11 |
12 | class UtilityTester(unittest.TestCase):
13 | def test_check_conf_percentage_validity(self):
14 | # Create a list of valid and invalid arguments
15 | good_args = [80, 95.0, 30]
16 | bad_args = [-2, '95', None, (90,)]
17 | # Note the message that should be displayed in case of errors.
18 | expected_err_msg =\
19 | "conf_percentage MUST be a number between 0.0 and 100."
20 | # Alias the function being tested
21 | func = bu.check_conf_percentage_validity
22 | # Perform the desired tests
23 | for arg in good_args:
24 | self.assertIsNone(func(arg))
25 | for arg in bad_args:
26 | self.assertRaisesRegexp(ValueError,
27 | expected_err_msg,
28 | func,
29 | arg)
30 | return None
31 |
32 | def test_ensure_samples_is_ndim_ndarray(self):
33 | # Create a list of valid and invalid arguments
34 | base_array = np.arange(10)
35 | good_args = [base_array.copy().reshape((2, 5)),
36 | base_array.copy().reshape((5, 2))]
37 | bad_args = [base_array, base_array[None, None, :], 30]
38 | # Create a 'name' argument
39 | fake_name = 'test'
40 | # Note the message that should be displayed in case of errors.
41 | expected_err_msg =\
42 | "`{}` MUST be a 2D ndarray.".format(fake_name + '_samples')
43 | # Alias the function being tested
44 | func = bu.ensure_samples_is_ndim_ndarray
45 | # Perform the desired tests
46 | for arg in good_args:
47 | self.assertIsNone(func(arg, name=fake_name))
48 | for arg in bad_args:
49 | self.assertRaisesRegexp(ValueError,
50 | expected_err_msg,
51 | func,
52 | arg,
53 | name=fake_name)
54 | self.assertIsNone(func(base_array, ndim=1))
55 | return None
56 |
57 | def test_get_alpha_from_conf_percentage(self):
58 | # Create a list of valid confidence percentages
59 | good_args = [80, 95.0, 30]
60 | # Create a list of expected results
61 | expected_results = [20, 5, 70]
62 | # Alias the function being tested
63 | func = bu.get_alpha_from_conf_percentage
64 | # Perform the desired tests
65 | for pos, arg in enumerate(good_args):
66 | self.assertEqual(func(arg), expected_results[pos])
67 | return None
68 |
69 | def test_combine_conf_endpoints(self):
70 | # Create fake arguments
71 | lower_array = np.arange(5)
72 | upper_array = np.arange(2, 7)
73 | # Create the expected result
74 | expected_result =\
75 | np.array([lower_array.tolist(), upper_array.tolist()])
76 | # Alias the function being tested
77 | func = bu.combine_conf_endpoints
78 | # Perform the desired test
79 | npt.assert_allclose(expected_result, func(lower_array, upper_array))
80 | return None
81 |
--------------------------------------------------------------------------------
/tests/test_conditional_logit.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for the conditional_logit.py file. These tests do not include tests of
3 | the functions that perform the mathematical calculations necessary to estimate
4 | the MNL model.
5 | """
6 | import warnings
7 | import unittest
8 | from collections import OrderedDict
9 |
10 | import numpy as np
11 | import numpy.testing as npt
12 | import pandas as pd
13 |
14 | import pylogit.conditional_logit as mnl
15 |
16 |
17 | class HelperFuncTests(unittest.TestCase):
18 | """
19 | Defines the tests for the 'helper' functions for estimating the MNL model.
20 | """
21 |
22 | def setUp(self):
23 | # Set up the fake arguments
24 | self.fake_beta = np.arange(3)
25 | self.fake_args = ["foo", 1]
26 | self.fake_kwargs = {"fake_arg_1": "bar",
27 | "fake_arg_2": 2,
28 | "fake_arg_3": True}
29 | self.fake_design = np.arange(6).reshape((2, 3))
30 | self.fake_index = self.fake_design.dot(self.fake_beta)
31 |
32 | def test_split_param_vec(self):
33 | """
34 | Ensures that split_param_vec returns (None, None, index_coefs)
35 | when called from within conditional_logit.py.
36 | """
37 | # Store the results of split_param_vec()
38 | split_results = mnl.split_param_vec(self.fake_beta,
39 | return_all_types=False,
40 | *self.fake_args,
41 | **self.fake_kwargs)
42 | # Check for expected results.
43 | self.assertIsNone(split_results[0])
44 | self.assertIsNone(split_results[1])
45 | npt.assert_allclose(split_results[2], self.fake_beta)
46 |
47 | # Store the results of split_param_vec()
48 | split_results = mnl.split_param_vec(self.fake_beta,
49 | return_all_types=True,
50 | *self.fake_args,
51 | **self.fake_kwargs)
52 | # Check for expected results.
53 | self.assertIsNone(split_results[0])
54 | self.assertIsNone(split_results[1])
55 | self.assertIsNone(split_results[2])
56 | npt.assert_allclose(split_results[3], self.fake_beta)
57 |
58 | return None
59 |
60 | def test_mnl_utility_transform(self):
61 | """
62 | Ensures that mnl_utility_transform returns a 2D version of the 1D
63 |         index array that is passed to it.
64 | """
65 |         # Get the results of _mnl_utility_transform()
66 | transform_results = mnl._mnl_utility_transform(self.fake_index,
67 | *self.fake_args,
68 | **self.fake_kwargs)
69 |
70 | # Check to make sure the results are as expected
71 | self.assertIsInstance(transform_results, np.ndarray)
72 | self.assertEqual(transform_results.shape, (2, 1))
73 | npt.assert_allclose(transform_results, self.fake_index[:, None])
74 |
75 | return None
76 |
77 | def test_mnl_transform_deriv_c(self):
78 | """
79 | Ensures that mnl_transform_deriv_c returns None.
80 | """
81 | derivative_results = mnl._mnl_transform_deriv_c(self.fake_index,
82 | *self.fake_args,
83 | **self.fake_kwargs)
84 | self.assertIsNone(derivative_results)
85 |
86 | return None
87 |
88 | def test_mnl_transform_deriv_alpha(self):
89 | """
90 | Ensures that mnl_transform_deriv_alpha returns None.
91 | """
92 | derivative_results = mnl._mnl_transform_deriv_alpha(self.fake_index,
93 | *self.fake_args,
94 | **self.fake_kwargs)
95 | self.assertIsNone(derivative_results)
96 |
97 | return None
98 |
99 |
100 | class ChoiceObjectTests(unittest.TestCase):
101 | """
102 | Defines the tests for the MNL model object's `__init__` function and its
103 | other methods.
104 | """
105 |
106 | def setUp(self):
107 | # Create fake versions of the needed arguments for the MNL constructor
108 | self.fake_df = pd.DataFrame({"obs_id": [1, 1, 2, 2, 3, 3],
109 | "alt_id": [1, 2, 1, 2, 1, 2],
110 | "choice": [0, 1, 0, 1, 1, 0],
111 | "x": range(6)})
112 | self.fake_specification = OrderedDict()
113 | self.fake_specification["x"] = [[1, 2]]
114 | self.fake_names = OrderedDict()
115 | self.fake_names["x"] = ["x (generic coefficient)"]
116 | self.alt_id_col = "alt_id"
117 | self.obs_id_col = "obs_id"
118 | self.choice_col = "choice"
119 | self.fake_beta = np.array([1])
120 |
121 | return None
122 |
123 | def test_outside_intercept_error_in_constructor(self):
124 | """
125 | Ensures that a ValueError is raised when the 'intercept_ref_pos' kwarg
126 | is passed to the MNL model constructor. This prevents people from
127 | expecting the use of outside intercept parameters to work with the MNL
128 | model.
129 | """
130 | # Create a variable for the standard arguments to this function.
131 | standard_args = [self.fake_df,
132 | self.alt_id_col,
133 | self.obs_id_col,
134 | self.choice_col,
135 | self.fake_specification]
136 | # Create a variable for the kwargs being passed to the constructor
137 | kwarg_map = {"intercept_ref_pos": 2}
138 |
139 | self.assertRaises(ValueError,
140 | mnl.MNL,
141 | *standard_args,
142 | **kwarg_map)
143 | return None
144 |
145 | def test_shape_ignore_msg_in_constructor(self):
146 | """
147 | Ensures that a UserWarning is raised when the 'shape_ref_pos' or
148 | 'shape_names' keyword arguments are passed to the MNL model
149 | constructor. This warns people against expecting the MNL to work with
150 |         shape parameters, and alerts them to the fact that they are using
151 |         an MNL model when they might have been expecting to instantiate a
152 |         different choice model.
153 | """
154 | # Create a variable for the standard arguments to this function.
155 | standard_args = [self.fake_df,
156 | self.alt_id_col,
157 | self.obs_id_col,
158 | self.choice_col,
159 | self.fake_specification]
160 |
161 | # Create a variable for the kwargs being passed to the constructor
162 | kwarg_map_1 = {"shape_ref_pos": 2}
163 | kwarg_map_2 = {"shape_names": OrderedDict([("x", ["foo"])])}
164 |
165 | # Test to ensure that the shape ignore message is printed when using
166 | # either of these two kwargs
167 | with warnings.catch_warnings(record=True) as context:
168 | # Use this filter to always trigger the UserWarnings
169 | warnings.simplefilter('always', UserWarning)
170 |
171 | for pos, bad_kwargs in enumerate([kwarg_map_1, kwarg_map_2]):
172 | # Create an MNL model object with the irrelevant kwargs.
173 | # This should trigger a UserWarning
174 | mnl_obj = mnl.MNL(*standard_args, **bad_kwargs)
175 | # Check that the warning has been created.
176 | self.assertEqual(len(context), pos + 1)
177 | self.assertIsInstance(context[-1].category, type(UserWarning))
178 | self.assertIn(mnl._shape_ignore_msg, str(context[-1].message))
179 |
180 | return None
181 |
182 | def test_outside_intercept_error_in_fit_mle(self):
183 | """
184 | Ensures that a ValueError is raised when users try to use any other
185 | type of initial value input methods other than the `init_vals`
186 | argument of `fit_mle()`. This prevents people from expecting the use
187 | of outside intercept or shape parameters to work with the MNL model.
188 | """
189 | # Create a variable for the standard arguments to the MNL constructor.
190 | standard_args = [self.fake_df,
191 | self.alt_id_col,
192 | self.obs_id_col,
193 | self.choice_col,
194 | self.fake_specification]
195 |
196 | # Create the mnl model object whose coefficients will be estimated.
197 | base_mnl = mnl.MNL(*standard_args)
198 |
199 | # Create a variable for the arguments to the fit_mle function.
200 | fit_args = [self.fake_beta]
201 |
202 | # Create variables for the incorrect kwargs.
203 | # The print_res = False arguments are to make sure strings aren't
204 | # printed to the console unnecessarily.
205 | kwarg_map_1 = {"init_shapes": np.array([1, 2]),
206 | "print_res": False}
207 | kwarg_map_2 = {"init_intercepts": np.array([1]),
208 | "print_res": False}
209 | kwarg_map_3 = {"init_coefs": np.array([1]),
210 | "print_res": False}
211 |
212 | # Test to ensure that the kwarg ignore message is printed when using
213 | # any of these three incorrect kwargs
214 | for kwargs in [kwarg_map_1, kwarg_map_2, kwarg_map_3]:
215 | self.assertRaises(ValueError, base_mnl.fit_mle,
216 | *fit_args, **kwargs)
217 |
218 | return None
219 |
220 | def test_ridge_warning_in_fit_mle(self):
221 | """
222 | Ensure that a UserWarning is raised when one passes the ridge keyword
223 | argument to the `fit_mle` method of an MNL model object.
224 | """
225 | # Create a variable for the standard arguments to the MNL constructor.
226 | standard_args = [self.fake_df,
227 | self.alt_id_col,
228 | self.obs_id_col,
229 | self.choice_col,
230 | self.fake_specification]
231 |
232 | # Create the mnl model object whose coefficients will be estimated.
233 | base_mnl = mnl.MNL(*standard_args)
234 |
235 | # Create a variable for the fit_mle function's kwargs.
236 | # The print_res = False arguments are to make sure strings aren't
237 | # printed to the console unnecessarily.
238 | kwargs = {"ridge": 0.5,
239 | "print_res": False}
240 |
241 | # Test to make sure that the ridge warning message is printed when
242 | # using the ridge keyword argument
243 | with warnings.catch_warnings(record=True) as w:
244 | # Use this filter to always trigger the UserWarnings
245 | warnings.simplefilter('always', UserWarning)
246 |
247 | base_mnl.fit_mle(self.fake_beta, **kwargs)
248 | self.assertGreaterEqual(len(w), 1)
249 | self.assertIsInstance(w[0].category, type(UserWarning))
250 | self.assertIn(mnl._ridge_warning_msg, str(w[0].message))
251 |
252 | return None
253 |
254 | def test_check_length_of_initial_values(self):
255 | """
256 | Ensure that a ValueError is raised when one passes an init_vals
257 | argument of the wrong length.
258 | """
259 | # Create a variable for the standard arguments to the MNL constructor.
260 | standard_args = [self.fake_df,
261 | self.alt_id_col,
262 | self.obs_id_col,
263 | self.choice_col,
264 | self.fake_specification]
265 |
266 | # Create the mnl model object whose coefficients will be estimated.
267 | base_mnl = mnl.MNL(*standard_args)
268 |
269 | # Create the EstimationObj
270 | mapping_res = base_mnl.get_mappings_for_fit()
271 | ridge = None
272 | zero_vector = np.zeros(1)
273 | split_params = mnl.split_param_vec
274 | mnl_estimator = mnl.MNLEstimator(base_mnl,
275 | mapping_res,
276 | ridge,
277 | zero_vector,
278 | split_params)
279 |
280 | # Alias the function to be checked
281 | func = mnl_estimator.check_length_of_initial_values
282 |
283 | for i in [2, 3]:
284 | init_vals = np.ones(i)
285 | self.assertRaises(ValueError, func, init_vals)
286 |
287 | self.assertIsNone(func(np.ones(1)))
288 |
289 | return None
290 |
291 | def test_just_point_kwarg(self):
292 | # Create a variable for the standard arguments to the MNL constructor.
293 | standard_args = [self.fake_df,
294 | self.alt_id_col,
295 | self.obs_id_col,
296 | self.choice_col,
297 | self.fake_specification]
298 |
299 | # Create the mnl model object whose coefficients will be estimated.
300 | base_mnl = mnl.MNL(*standard_args)
301 | # Alias the function being tested
302 | func = base_mnl.fit_mle
303 | # Get the necessary kwargs
304 | kwargs = {"just_point": True}
305 | # Get the function results
306 | func_result = func(self.fake_beta, **kwargs)
307 | # Perform the desired tests to make sure we get back a dictionary with
308 | # an "x" key in it and a value that is a ndarray.
309 | self.assertIsInstance(func_result, dict)
310 | self.assertIn("x", func_result)
311 | self.assertIsInstance(func_result["x"], np.ndarray)
312 | return None
313 |
--------------------------------------------------------------------------------
/tests/test_estimation.py:
--------------------------------------------------------------------------------
1 | """
2 | Use this file to test methods and classes in estimation.py
3 | """
4 | import warnings
5 | import unittest
6 | from collections import OrderedDict
7 | from numbers import Number
8 |
9 | import numpy as np
10 | import numpy.testing as npt
11 | import pandas as pd
12 | from scipy.sparse import csr_matrix
13 |
14 | import pylogit.asym_logit as asym
15 | import pylogit.estimation as estimation
16 |
17 | # Use the following to always show the warnings
18 | np.seterr(all='warn')
19 | warnings.simplefilter("always")
20 |
21 |
22 | class GenericTestCase(unittest.TestCase):
23 | """
24 | Defines the common setUp method used for the different type of tests.
25 | """
26 |
27 | def setUp(self):
28 | # The set up being used is one where there are two choice situations,
29 |         # the first having three alternatives, and the second having only two
30 |         # alternatives. There is one generic variable. Two alternative-specific
31 |         # constants and all three shape parameters are used.
32 |
33 | # Create the betas to be used during the tests
34 | self.fake_betas = np.array([-0.6])
35 |
36 | # Create the fake outside intercepts to be used during the tests
37 | self.fake_intercepts = np.array([1, 0.5])
38 |
39 | # Create names for the intercept parameters
40 | self.fake_intercept_names = ["ASC 1", "ASC 2"]
41 |
42 | # Record the position of the intercept that is not being estimated
43 | self.fake_intercept_ref_pos = 2
44 |
45 | # Create the shape parameters to be used during the tests. Note that
46 | # these are the reparameterized shape parameters, thus they will be
47 | # exponentiated in the fit_mle process and various calculations.
48 | self.fake_shapes = np.array([-1, 1])
49 |
50 |         # Create names for the shape parameters
51 | self.fake_shape_names = ["Shape 1", "Shape 2"]
52 |
53 | # Record the position of the shape parameter that is being constrained
54 | self.fake_shape_ref_pos = 2
55 |
56 | # Calculate the 'natural' shape parameters
57 | self.natural_shapes = asym._convert_eta_to_c(self.fake_shapes,
58 | self.fake_shape_ref_pos)
59 |
60 | # Create an array of all model parameters
61 | self.fake_all_params = np.concatenate((self.fake_shapes,
62 | self.fake_intercepts,
63 | self.fake_betas))
64 |
65 | # The mapping between rows and alternatives is given below.
66 | self.fake_rows_to_alts = csr_matrix(np.array([[1, 0, 0],
67 | [0, 1, 0],
68 | [0, 0, 1],
69 | [1, 0, 0],
70 | [0, 0, 1]]))
71 |
72 | # Create the fake design matrix with columns denoting X
73 | # The intercepts are not included because they are kept outside the
74 |         # index in the asymmetric logit model.
75 | self.fake_design = np.array([[1],
76 | [2],
77 | [3],
78 | [1.5],
79 | [3.5]])
80 |
81 | # Create the index array for this set of choice situations
82 | self.fake_index = self.fake_design.dot(self.fake_betas)
83 |
84 | # Create the needed dataframe for the Asymmetric Logit constructor
85 | self.fake_df = pd.DataFrame({"obs_id": [1, 1, 1, 2, 2],
86 | "alt_id": [1, 2, 3, 1, 3],
87 | "choice": [0, 1, 0, 0, 1],
88 | "x": self.fake_design[:, 0],
89 | "intercept": [1 for i in range(5)]})
90 |
91 | # Record the various column names
92 | self.alt_id_col = "alt_id"
93 | self.obs_id_col = "obs_id"
94 | self.choice_col = "choice"
95 |
96 |         # Create the index specification and name dictionary for the model
97 | self.fake_specification = OrderedDict()
98 | self.fake_names = OrderedDict()
99 | self.fake_specification["x"] = [[1, 2, 3]]
100 | self.fake_names["x"] = ["x (generic coefficient)"]
101 |
102 | # Bundle args and kwargs used to construct the Asymmetric Logit model.
103 | self.constructor_args = [self.fake_df,
104 | self.alt_id_col,
105 | self.obs_id_col,
106 | self.choice_col,
107 | self.fake_specification]
108 |
109 | # Create a variable for the kwargs being passed to the constructor
110 | self.constructor_kwargs = {"intercept_ref_pos":
111 | self.fake_intercept_ref_pos,
112 | "shape_ref_pos": self.fake_shape_ref_pos,
113 | "names": self.fake_names,
114 | "intercept_names":
115 | self.fake_intercept_names,
116 | "shape_names": self.fake_shape_names}
117 |
118 | # Initialize a basic Asymmetric Logit model whose coefficients will be
119 | # estimated.
120 | self.model_obj = asym.MNAL(*self.constructor_args,
121 | **self.constructor_kwargs)
122 |
123 | return None
124 |
125 |
126 | class EstimationObjTests(GenericTestCase):
127 | """
128 | Store the tests for the basic methods in the EstimationObj class.
129 | """
130 |
131 | def test_constructor(self):
132 | # Create a zero vector
133 | zero_vector = np.zeros(self.fake_all_params.shape[0])
134 | # Create a ridge parameter
135 | ridge_param = 0.5
136 | # Split parameter function
137 | split_param_func = asym.split_param_vec
138 | # Store the mapping dictionaries
139 | mapping_dict = self.model_obj.get_mappings_for_fit()
140 | # Store the positions of the parameters to be constrained
141 | constrained_pos = [0]
142 |         # Create the kwargs for the estimation object
143 | kwargs = {"constrained_pos": constrained_pos}
144 |
145 | # Create the estimation object
146 | estimation_object = estimation.EstimationObj(self.model_obj,
147 | mapping_dict,
148 | ridge_param,
149 | zero_vector,
150 | split_param_func,
151 | **kwargs)
152 |
153 | # Perform the tests to ensure that the desired attributes were
154 | # correctly created
155 | attr_names = ["alt_id_vector",
156 | "choice_vector",
157 | "design",
158 | "intercept_ref_pos",
159 | "shape_ref_pos",
160 | "rows_to_obs",
161 | "rows_to_alts",
162 | "chosen_row_to_obs",
163 | "rows_to_nests",
164 | "rows_to_mixers",
165 | "ridge",
166 | "constrained_pos",
167 | "zero_vector",
168 | "split_params",
169 | "utility_transform",
170 | "calc_dh_dv",
171 | "calc_dh_d_alpha",
172 | "calc_dh_d_shape"]
173 | for attr in attr_names:
174 | self.assertTrue(hasattr(estimation_object, attr))
175 |
176 | # Make sure that the objects that should be arrays, are arrays
177 | for attr in ["alt_id_vector",
178 | "choice_vector",
179 | "design",
180 | "zero_vector"]:
181 | self.assertIsInstance(getattr(estimation_object, attr), np.ndarray)
182 | # Ensure that the arrays have the correct values
183 | npt.assert_allclose(estimation_object.alt_id_vector,
184 | self.model_obj.alt_IDs)
185 | npt.assert_allclose(estimation_object.choice_vector,
186 | self.model_obj.choices)
187 | npt.assert_allclose(estimation_object.design, self.model_obj.design)
188 | npt.assert_allclose(estimation_object.zero_vector, zero_vector)
189 |
190 | # Ensure that the scalars are scalars with the correct values
191 | for attr in ["intercept_ref_pos", "shape_ref_pos", "ridge"]:
192 | self.assertIsInstance(getattr(estimation_object, attr), Number)
193 | self.assertEqual(estimation_object.intercept_ref_pos,
194 | self.model_obj.intercept_ref_position)
195 | self.assertEqual(estimation_object.shape_ref_pos,
196 | self.model_obj.shape_ref_position)
197 | self.assertEqual(estimation_object.ridge, ridge_param)
198 |
199 | # Ensure that the mapping matrices are correct
200 | for attr in ["rows_to_obs", "rows_to_alts", "chosen_row_to_obs",
201 | "rows_to_nests", "rows_to_mixers"]:
202 | # Get the mapping matrix as stored on the model object.
203 | matrix_on_object = getattr(estimation_object, attr)
204 | if matrix_on_object is not None:
205 | npt.assert_allclose(matrix_on_object.A, mapping_dict[attr].A)
206 | else:
207 | self.assertIsNone(mapping_dict[attr])
208 |
209 | # Ensure that the function definitions point to the correct locations
210 | self.assertEqual(id(estimation_object.split_params),
211 | id(split_param_func))
212 | self.assertEqual(id(estimation_object.utility_transform),
213 | id(self.model_obj.utility_transform))
214 |
215 | # Make sure that the derivative functions return None, for now.
216 | for attr in ["calc_dh_dv",
217 | "calc_dh_d_alpha",
218 | "calc_dh_d_shape"]:
219 | func = getattr(estimation_object, attr)
220 | self.assertIsNone(func("foo"))
221 |
222 | return None
223 |
224 | def test_not_implemented_error_in_example_functions(self):
225 | # Create a zero vector
226 | zero_vector = np.zeros(self.fake_all_params.shape[0])
227 | # Create a ridge parameter
228 | ridge_param = 0.5
229 | # Split parameter function
230 | split_param_func = asym.split_param_vec
231 | # Store the mapping dictionaries
232 | mapping_dict = self.model_obj.get_mappings_for_fit()
233 | # Store the positions of the parameters to be constrained
234 | constrained_pos = [0]
235 | # Create the kwargs for the estimation object
236 | kwargs = {"constrained_pos": constrained_pos}
237 |
238 | # Create the estimation object
239 | estimation_object = estimation.EstimationObj(self.model_obj,
240 | mapping_dict,
241 | ridge_param,
242 | zero_vector,
243 | split_param_func,
244 | **kwargs)
245 |
246 | # Record the names of the methods that are created as examples
247 | example_methods = ["convenience_calc_probs",
248 | "convenience_calc_log_likelihood",
249 | "convenience_calc_gradient",
250 | "convenience_calc_hessian",
251 | "convenience_calc_fisher_approx"]
252 | for method_name in example_methods:
253 | func = getattr(estimation_object, method_name)
254 | error_msg = "Method should be defined by descendant classes"
255 | self.assertRaisesRegexp(NotImplementedError,
256 | error_msg,
257 | func,
258 | None)
259 |
260 | return None
261 |
262 | def test_ensure_positivity_and_length_of_weights(self):
263 | # Create a set of good and bad arguments
264 | num_rows = self.fake_design.shape[0]
265 | fake_data = pd.DataFrame(self.fake_design, columns=['x'])
266 | good_weights = [None, np.ones(num_rows)]
267 | bad_weights =\
268 | [1, np.ones((3, 3)), np.ones(num_rows + 1), -1 * np.ones(num_rows)]
269 | # Alias the function being tested
270 | func = estimation.ensure_positivity_and_length_of_weights
271 | # Note the error messages that should be raised.
272 | msg_1 = '`weights` MUST be a 1D ndarray.'
273 | msg_2 = '`weights` must have the same number of rows as `data`.'
274 | msg_3 = '`weights` MUST be >= 0.'
275 | expected_error_msgs = [msg_1, msg_1, msg_2, msg_3]
276 | # Perform the desired tests
277 | for weights in good_weights:
278 | self.assertIsNone(func(weights, fake_data))
279 | for pos, weights in enumerate(bad_weights):
280 | self.assertRaisesRegexp(ValueError,
281 | expected_error_msgs[pos],
282 | func,
283 | weights,
284 | fake_data)
285 | return None
286 |
--------------------------------------------------------------------------------
/tests/test_nested_logit.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for the nested_logit.py file. These tests do not include tests of
3 | the functions that perform the mathematical calculations necessary to estimate
4 | the Nested Logit model.
5 | """
6 | import warnings
7 | import unittest
8 | from collections import OrderedDict
9 |
10 | import numpy as np
11 | import numpy.testing as npt
12 | import pandas as pd
13 | from scipy.sparse import csr_matrix
14 |
15 | import pylogit.nested_logit as nl
16 |
17 |
18 | class NestedLogitTests(unittest.TestCase):
19 | """
20 | Tests of the `split_param_vec` function, the `NestedLogit` model
21 | constructor, and the `fit_mle()` method.
22 | """
23 |
24 | def setUp(self):
25 | # Create the betas to be used during the tests
26 | self.fake_betas = np.array([0.3, -0.6, 0.2])
27 | # Create the fake nest coefficients to be used during the tests
28 | self.fake_nest_coefs = np.array([1, 0.5])
29 | # Create an array of all model parameters
30 | self.fake_all_params = np.concatenate((self.fake_nest_coefs,
31 | self.fake_betas))
32 | # The set up being used is one where there are two choice situations,
33 | # The first having three alternatives, and the second having only two.
34 | # The nest memberships of these alternatives are given below.
35 | self.fake_rows_to_nests = csr_matrix(np.array([[0, 1],
36 | [0, 1],
37 | [1, 0],
38 | [0, 1],
39 | [1, 0]]))
40 |
41 | # Create a sparse matrix that maps the rows of the design matrix to the
42 |         # observations
43 | self.fake_rows_to_obs = csr_matrix(np.array([[1, 0],
44 | [1, 0],
45 | [1, 0],
46 | [0, 1],
47 | [0, 1]]))
48 |
49 | # Create the fake design matrix with columns denoting ASC_1, ASC_2, X
50 | self.fake_design = np.array([[1, 0, 1],
51 | [0, 1, 2],
52 | [0, 0, 3],
53 | [1, 0, 1.5],
54 | [0, 0, 3.5]])
55 |
56 |         # Create fake versions of the needed arguments for the model constructor
57 | self.fake_df = pd.DataFrame({"obs_id": [1, 1, 1, 2, 2],
58 | "alt_id": [1, 2, 3, 1, 3],
59 | "choice": [0, 1, 0, 0, 1],
60 | "x": range(5),
61 | "intercept": [1 for i in range(5)]})
62 |
63 | # Record the various column names
64 | self.alt_id_col = "alt_id"
65 | self.obs_id_col = "obs_id"
66 | self.choice_col = "choice"
67 |
68 | # Create a sparse matrix that maps the chosen rows of the design
69 |         # matrix to the observations
70 | self.fake_chosen_rows_to_obs = csr_matrix(np.array([[0, 0],
71 | [1, 0],
72 | [0, 0],
73 | [0, 0],
74 | [0, 1]]))
75 |
76 |         # Create the index specification and name dictionary for the model
77 | self.fake_specification = OrderedDict()
78 | self.fake_specification["intercept"] = [1, 2]
79 | self.fake_specification["x"] = [[1, 2, 3]]
80 | self.fake_names = OrderedDict()
81 | self.fake_names["intercept"] = ["ASC 1", "ASC 2"]
82 | self.fake_names["x"] = ["x (generic coefficient)"]
83 |
84 | # Create the nesting specification
85 | self.fake_nest_spec = OrderedDict()
86 | self.fake_nest_spec["Nest 1"] = [1, 2]
87 | self.fake_nest_spec["Nest 2"] = [3]
88 |
89 | return None
90 |
91 | def test_split_param_vec(self):
92 | """
93 | Ensures that split_param_vec returns a tuple of nest coefficients and
94 | index coefficients.
95 | """
96 | split_results = nl.split_param_vec(self.fake_all_params,
97 | self.fake_rows_to_nests)
98 |
99 | # Check that the results of split_param_vec are as expected
100 | self.assertIsInstance(split_results, tuple)
101 | self.assertEqual(len(split_results), 2)
102 | for item in split_results:
103 | self.assertIsInstance(item, np.ndarray)
104 | self.assertEqual(len(item.shape), 1)
105 | npt.assert_allclose(self.fake_nest_coefs, split_results[0])
106 | npt.assert_allclose(self.fake_betas, split_results[1])
107 |
108 | return None
109 |
110 | def test_missing_nest_spec_error_in_constructor(self):
111 | """
112 | Ensure that the Nested Logit model cannot be constructed without the
113 | `nest_spec` keyword argument being passed a value other than `None`.
114 | """
115 | # Bundle the arguments used to construct the nested logit model
116 | constructor_args = [self.fake_df,
117 | self.alt_id_col,
118 | self.obs_id_col,
119 | self.choice_col,
120 | self.fake_specification,
121 | self.fake_names]
122 |
123 | self.assertRaises(ValueError, nl.NestedLogit, *constructor_args)
124 |
125 | return None
126 |
127 | def test_ridge_warning_in_fit_mle(self):
128 | """
129 | Ensure that a UserWarning is raised when one passes the ridge keyword
130 | argument to the `fit_mle` method of a Nested Logit model object.
131 | """
132 | # Bundle the arguments used to construct the nested logit model
133 | constructor_args = [self.fake_df,
134 | self.alt_id_col,
135 | self.obs_id_col,
136 | self.choice_col,
137 | self.fake_specification,
138 | self.fake_names]
139 | # Bundle the kwargs for constructing the nested_logit_model
140 | constructor_kwargs = {"nest_spec": self.fake_nest_spec}
141 |
142 | # Create the nested logit model whose coefficients will be estimated.
143 | base_nl = nl.NestedLogit(*constructor_args, **constructor_kwargs)
144 |
145 | # Create a variable for the fit_mle function's kwargs.
146 | # The print_res = False arguments are to make sure strings aren't
147 | # printed to the console unnecessarily.
148 | fit_kwargs = {"constrained_pos": [1],
149 | "ridge": 0.5,
150 | "print_res": False}
151 |
152 | # Test to make sure that the ridge warning is raised when the ridge
153 | # keyword argument is used
154 | with warnings.catch_warnings(record=True) as w:
155 | # Use this filter to always trigger the UserWarnings
156 | warnings.simplefilter('always', UserWarning)
157 |
158 | base_nl.fit_mle(self.fake_all_params, **fit_kwargs)
159 | self.assertGreaterEqual(len(w), 1)
160 | self.assertTrue(issubclass(w[0].category, UserWarning))
161 | self.assertIn(nl._ridge_warning_msg, str(w[0].message))
162 |
163 | return None
164 |
165 | def test_invalid_init_kwargs_error_in_fit_mle(self):
166 | """
167 | Ensures that a ValueError is raised when users try to use any initial
168 | value input method other than the `init_vals` argument of
169 | `fit_mle()`. This prevents people from expecting the use of outside
170 | intercept or shape parameters to work with the Nested Logit
171 | model.
172 | """
173 | # Bundle the arguments used to construct the nested logit model
174 | constructor_args = [self.fake_df,
175 | self.alt_id_col,
176 | self.obs_id_col,
177 | self.choice_col,
178 | self.fake_specification]
179 |
180 | # Bundle the kwargs for constructing the nested_logit_model
181 | constructor_kwargs = {"names": self.fake_names,
182 | "nest_spec": self.fake_nest_spec}
183 |
184 | # Create the nested logit model whose coefficients will be estimated.
185 | base_nl = nl.NestedLogit(*constructor_args, **constructor_kwargs)
186 |
187 | # Create a variable for the arguments to the fit_mle function.
188 | # this mimics the arguments passed when trying to use the shape_param
189 | # or outside intercepts kwargs with fit_mle.
190 | fit_args = [None]
191 |
192 | # Create variables for the incorrect kwargs.
193 | # The print_res = False arguments are to make sure strings aren't
194 | # printed to the console unnecessarily.
195 | kwarg_map_1 = {"init_shapes": np.array([1, 2]),
196 | "print_res": False}
197 | kwarg_map_2 = {"init_intercepts": np.array([1]),
198 | "print_res": False}
199 | kwarg_map_3 = {"init_coefs": np.array([1]),
200 | "print_res": False}
201 |
202 | # Test to ensure that a ValueError is raised when using any of these
203 | # three incorrect kwargs
204 | for kwargs in [kwarg_map_1, kwarg_map_2, kwarg_map_3]:
205 | self.assertRaises(ValueError, base_nl.fit_mle,
206 | *fit_args, **kwargs)
207 |
208 | return None
209 |
210 | def test_just_point_kwarg(self):
211 | """
212 | Ensure that calling `fit_mle` with `just_point = True` returns a
213 | dictionary with an 'x' key whose value is an ndarray.
214 | """
215 | # Bundle the arguments used to construct the nested logit model
216 | constructor_args = [self.fake_df,
217 | self.alt_id_col,
218 | self.obs_id_col,
219 | self.choice_col,
220 | self.fake_specification]
221 |
222 | # Bundle the kwargs for constructing the nested_logit_model
223 | constructor_kwargs = {"names": self.fake_names,
224 | "nest_spec": self.fake_nest_spec}
225 |
226 | # Create the nested logit model whose coefficients will be estimated.
227 | base_nl = nl.NestedLogit(*constructor_args, **constructor_kwargs)
228 | # Create a variable for the arguments to the fit_mle function.
229 | fit_args = [self.fake_all_params]
230 | # Alias the function being tested
231 | func = base_nl.fit_mle
232 | # Get the necessary kwargs
233 | kwargs = {"just_point": True}
234 | # Get the function results
235 | func_result = func(*fit_args, **kwargs)
236 | # Perform the desired tests to make sure we get back a dictionary with
237 | # an "x" key in it and a value that is a ndarray.
238 | self.assertIsInstance(func_result, dict)
239 | self.assertIn("x", func_result)
240 | self.assertIsInstance(func_result["x"], np.ndarray)
241 | return None
242 |
243 | def test_invalid_init_vals_length_in_estimate(self):
244 | """
245 | Ensure that a ValueError is raised when _estimate() is called with an
246 | init_values argument of the incorrect length.
247 | """
248 | # Bundle the arguments used to construct the nested logit model
249 | constructor_args = [self.fake_df,
250 | self.alt_id_col,
251 | self.obs_id_col,
252 | self.choice_col,
253 | self.fake_specification,
254 | self.fake_names]
255 | # Bundle the kwargs for constructing the nested_logit_model
256 | constructor_kwargs = {"nest_spec": self.fake_nest_spec}
257 |
258 | # Create the nested logit model whose coefficients will be estimated.
259 | base_nl = nl.NestedLogit(*constructor_args, **constructor_kwargs)
260 |
261 | # Create an estimator object.
262 | zero_vector = np.zeros(self.fake_all_params.shape[0])
263 | estimator_args = [base_nl,
264 | base_nl.get_mappings_for_fit(),
265 | None,
266 | zero_vector,
267 | nl.split_param_vec]
268 | estimator_kwargs = {"constrained_pos": [1]}
269 | nested_estimator = nl.NestedEstimator(*estimator_args,
270 | **estimator_kwargs)
271 |
272 | # Alias the function being tested
273 | func = nested_estimator.check_length_of_initial_values
274 |
275 | # Test that the desired error is raised
276 | for i in [-1, 1]:
277 | init_values = np.arange(self.fake_all_params.shape[0] + i)
278 |
279 | self.assertRaisesRegex(ValueError,
280 | "values are of the wrong dimension",
281 | func,
282 | init_values)
283 |
284 | return None
285 |
286 | def test_identify_degenerate_nests(self):
287 | """
288 | Ensure that `identify_degenerate_nests` returns the correct list when
289 | using nest specifications that do and do not contain degenerate nests.
290 | """
291 | good_spec = OrderedDict()
292 | good_spec["Nest 1"] = [1, 2]
293 | good_spec["Nest 2"] = [3, 4]
294 |
295 | bad_spec = OrderedDict()
296 | bad_spec["Nest 1"] = [1]
297 | bad_spec["Nest 2"] = [2, 3]
298 | bad_spec["Nest 3"] = [4]
299 |
300 | # Alias the function being tested
301 | func = nl.identify_degenerate_nests
302 |
303 | # Test the function
304 | self.assertEqual([], func(good_spec))
305 | self.assertEqual([0, 2], func(bad_spec))
306 |
307 | return None
308 |
--------------------------------------------------------------------------------
/tests/test_pylogit.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for the user-facing choice model constructor.
3 | """
4 | import unittest
5 | from collections import OrderedDict
6 |
7 | import numpy as np
8 | import numpy.testing as npt
9 | import pandas as pd
10 |
11 | import pylogit
12 | import pylogit.display_names as display_names
13 |
14 |
15 | # Get the dictionary that maps each model type to the display name that is
16 | # stored on the model object itself.
17 | model_type_to_display_name = display_names.model_type_to_display_name
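
# A minimal usage sketch of the constructor under test. The positional
# argument order mirrors `constructor_args` in the setUp below, and the
# `names` keyword matches the kwargs used in these tests; `long_df`,
# `specification`, `names`, and `num_params` are placeholders.
#
#     model = pylogit.create_choice_model(
#         long_df, "alt_id", "obs_id", "choice", specification, "MNL",
#         names=names)
#     model.fit_mle(np.zeros(num_params))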
18 |
19 |
20 | class ConstructorTests(unittest.TestCase):
21 | """
22 | Contains the tests of the choice model construction function.
23 | """
24 |
25 | def setUp(self):
26 | """
27 | Create the input data needed to test the choice model constructor.
28 | """
29 | # The setup used here has two choice situations: the first has three
30 | # alternatives, and the second has only two. There is one generic
31 | # variable. Two alternative-specific constants and all three shape
32 | # parameters are used.
33 |
34 | # Create the betas to be used during the tests
35 | self.fake_betas = np.array([-0.6])
36 |
37 | # Create the fake outside intercepts to be used during the tests
38 | self.fake_intercepts = np.array([1, 0.5])
39 |
40 | # Create names for the intercept parameters
41 | self.fake_intercept_names = ["ASC 1", "ASC 2"]
42 |
43 | # Record the position of the intercept that is not being estimated
44 | self.fake_intercept_ref_pos = 2
45 |
46 | # Create the shape parameters to be used during the tests. Note that
47 | # these are the reparameterized shape parameters, thus they will be
48 | # exponentiated in the fit_mle process and various calculations.
49 | self.fake_shapes = np.array([-1, 0, 1])
50 |
51 | # Create names for the shape parameters
52 | self.fake_shape_names = ["Shape 1", "Shape 2", "Shape 3"]
53 |
54 | # Create a shape ref position (used in the Asymmetric Logit Model)
55 | self.fake_shape_ref_pos = 2
56 |
57 | # # Create an array of all model parameters
58 | # self.fake_all_params = np.concatenate((self.fake_shapes,
59 | # self.fake_intercepts,
60 | # self.fake_betas))
61 |
62 | # # The mapping between rows and alternatives is given below.
63 | # self.fake_rows_to_alts = csr_matrix(np.array([[1, 0, 0],
64 | # [0, 1, 0],
65 | # [0, 0, 1],
66 | # [1, 0, 0],
67 | # [0, 0, 1]]))
68 |
69 | # Create the fake design matrix with a single column denoting X.
70 | # The intercepts are not included because they are kept outside the
71 | # index for models that use outside intercepts.
72 | self.fake_design = np.array([[1],
73 | [2],
74 | [3],
75 | [1.5],
76 | [3.5]])
77 |
78 | # Create the index array for this set of choice situations
79 | self.fake_index = self.fake_design.dot(self.fake_betas)
80 |
81 | # Create the needed dataframe for the choice model constructor
82 | self.fake_df = pd.DataFrame({"obs_id": [1, 1, 1, 2, 2],
83 | "alt_id": [1, 2, 3, 1, 3],
84 | "choice": [0, 1, 0, 0, 1],
85 | "x": self.fake_design[:, 0],
86 | "intercept": [1 for i in range(5)]})
87 |
88 | # Record the various column names
89 | self.alt_id_col = "alt_id"
90 | self.obs_id_col = "obs_id"
91 | self.choice_col = "choice"
92 |
93 | # Create the index specification and name dictionary for the model
94 | self.fake_specification = OrderedDict()
95 | self.fake_names = OrderedDict()
96 | self.fake_specification["x"] = [[1, 2, 3]]
97 | self.fake_names["x"] = ["x (generic coefficient)"]
98 |
99 | # Create the nesting specification
100 | self.fake_nest_spec = OrderedDict()
101 | self.fake_nest_spec["Nest 1"] = [1, 2]
102 | self.fake_nest_spec["Nest 2"] = [3]
103 |
104 | # Bundle the args and kwargs used to construct the models.
105 | # Note that "MNL" is used as a model_type placeholder, and it will be
106 | # replaced as needed by each model
107 | self.constructor_args = [self.fake_df,
108 | self.alt_id_col,
109 | self.obs_id_col,
110 | self.choice_col,
111 | self.fake_specification,
112 | "MNL"]
113 |
114 | # Create a variable for the kwargs being passed to the constructor
115 | self.constructor_kwargs = {"intercept_ref_pos":
116 | self.fake_intercept_ref_pos,
117 | "names": self.fake_names,
118 | "intercept_names":
119 | self.fake_intercept_names,
120 | "shape_names": self.fake_shape_names}
121 |
122 | def test_constructor(self):
123 | """
124 | Construct the various choice models and make sure the constructed
125 | object has the necessary attributes.
126 | """
127 | # Record the model types of all the models to be created
128 | all_model_types = model_type_to_display_name.keys()
129 |
130 | # Record the attribute / value pairs that are common to all models.
131 | common_attr_value_dict = {"data": self.fake_df,
132 | "name_spec": self.fake_names,
133 | "design": self.fake_design,
134 | "ind_var_names": self.fake_names["x"],
135 | "alt_id_col": self.alt_id_col,
136 | "obs_id_col": self.obs_id_col,
137 | "choice_col": self.choice_col,
138 | "specification": self.fake_specification,
139 | "alt_IDs": self.fake_df["alt_id"].values,
140 | "choices": self.fake_df["choice"].values}
141 |
142 | # Create a shape name dictionary to relate the various models to the
143 | # names of their shape parameters.
144 | shape_name_dict = {"MNL": None,
145 | "Asym": self.fake_shape_names[:2],
146 | "Cloglog": None,
147 | "Scobit": self.fake_shape_names,
148 | "Uneven": self.fake_shape_names,
149 | "Nested Logit": None,
150 | "Mixed Logit": None}
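# Note: the "Asym" entry lists only two shape names because, given
# `fake_shape_ref_pos`, the third shape parameter is treated as the fixed
# reference and is not separately named.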
151 |
152 | # Create a shape reference position dictionary to relate the various
153 | # models to their shape reference positions.
154 | shape_ref_dict = {}
155 | for key in shape_name_dict:
156 | shape_ref_dict[key] = (None if key != "Asym" else
157 | self.fake_shape_ref_pos)
158 |
159 | # Create an intercept_names and intercept_ref_position dictionary to
160 | # relate the various models to their respective kwargs.
161 | intercept_names_dict = {}
162 | intercept_ref_dict = {}
163 | for key in shape_name_dict:
164 | if key in ["MNL", "Nested Logit", "Mixed Logit"]:
165 | intercept_names_dict[key] = None
166 | intercept_ref_dict[key] = None
167 | else:
168 | intercept_names_dict[key] = self.fake_intercept_names
169 | intercept_ref_dict[key] = self.fake_intercept_ref_pos
170 |
171 | # Create a nest_names dictionary to relate the various models to their
172 | # nest_name attributes
173 | nest_name_dict = {}
174 | nest_spec_dict = {}
175 | for key in shape_name_dict:
176 | if key != "Nested Logit":
177 | nest_name_dict[key] = None
178 | nest_spec_dict[key] = None
179 | else:
180 | nest_name_dict[key] = list(self.fake_nest_spec.keys())
181 | nest_spec_dict[key] = self.fake_nest_spec
182 |
183 | # Create dictionaries for the mixing_id_col, mixing_vars, and
184 | # mixing_pos attributes
185 | mixing_id_col_dict = {}
186 | mixing_vars_dict = {}
187 | mixing_pos_dict = {}
188 |
189 | for key in shape_name_dict:
190 | if key != "Mixed Logit":
191 | mixing_id_col_dict[key] = None
192 | mixing_vars_dict[key] = None
193 | mixing_pos_dict[key] = None
194 | else:
195 | mixing_id_col_dict[key] = self.obs_id_col
196 | mixing_vars_dict[key] = self.fake_names["x"]
197 | mixing_pos_dict[key] = [0]
198 |
199 | # Record the attribute / value pairs that vary across models
200 | varying_attr_value_dict = {"model_type": model_type_to_display_name,
201 | "intercept_names": intercept_names_dict,
202 | "intercept_ref_position":
203 | intercept_ref_dict,
204 | "shape_names": shape_name_dict,
205 | "shape_ref_position": shape_ref_dict,
206 | "nest_names": nest_name_dict,
207 | "nest_spec": nest_spec_dict,
208 | "mixing_id_col": mixing_id_col_dict,
209 | "mixing_vars": mixing_vars_dict,
210 | "mixing_pos": mixing_pos_dict}
211 |
212 | # Set up the keyword arguments that are needed for each of the model
213 | # types
214 | variable_kwargs = {}
215 | for model_name in all_model_types:
216 | variable_kwargs[model_name] = {}
217 | variable_kwargs[model_name]["intercept_names"] =\
218 | intercept_names_dict[model_name]
219 | variable_kwargs[model_name]["intercept_ref_pos"] =\
220 | intercept_ref_dict[model_name]
221 | variable_kwargs[model_name]["shape_ref_pos"] =\
222 | shape_ref_dict[model_name]
223 | variable_kwargs[model_name]["shape_names"] =\
224 | shape_name_dict[model_name]
225 | variable_kwargs[model_name]["nest_spec"] =\
226 | nest_spec_dict[model_name]
227 | variable_kwargs[model_name]["mixing_id_col"] =\
228 | mixing_id_col_dict[model_name]
229 | variable_kwargs[model_name]["mixing_vars"] =\
230 | mixing_vars_dict[model_name]
231 |
232 | # Execute the test for each model type
233 | for model_name in all_model_types:
234 | # Update the model type in the list of constructor args
235 | self.constructor_args[-1] = model_name
236 |
237 | # Use this specific model's keyword arguments
238 | self.constructor_kwargs.update(variable_kwargs[model_name])
239 |
240 | # Construct the model object
241 | model_obj = pylogit.create_choice_model(*self.constructor_args,
242 | **self.constructor_kwargs)
243 |
244 | # Make sure the constructed model has all the required attributes
245 | for attr in common_attr_value_dict:
246 | value = common_attr_value_dict[attr]
247 | if isinstance(value, pd.DataFrame):
248 | self.assertTrue(value.equals(model_obj.data))
249 | elif isinstance(value, np.ndarray):
250 | npt.assert_allclose(value,
251 | model_obj.__getattribute__(attr))
252 | else:
253 | self.assertEqual(value,
254 | model_obj.__getattribute__(attr))
255 |
256 | for attr in varying_attr_value_dict:
257 | value = varying_attr_value_dict[attr][model_name]
258 |
259 | self.assertEqual(value,
260 | model_obj.__getattribute__(attr))
261 |
262 | return None
263 |
264 | def test_ensure_valid_model_type(self):
265 | """
266 | Ensure that the desired error message is raised when an invalid model
267 | type is passed, and that None is returned otherwise.
268 | """
269 | # Note the "valid" type strings for our test
270 | test_types = ["bar", "foo", "Sreeta", "Feras"]
271 | # Note a set of invalid type strings for the test
272 | bad_types = ["Tim", "Sam"]
273 |
274 | # Alias the function to be tested
275 | func = pylogit.pylogit.ensure_valid_model_type
276 |
277 | # Make note of part of the error message that should be raised
278 | partial_error_msg = "The specified model_type was not valid."
279 |
280 | # Perform the requisite tests
281 | for good_example in test_types:
282 | self.assertIsNone(func(good_example, test_types))
283 | for bad_example in bad_types:
284 | self.assertRaisesRegex(ValueError,
285 | partial_error_msg,
286 | func,
287 | bad_example,
288 | test_types)
289 |
290 | return None
291 |
--------------------------------------------------------------------------------