├── .gitattributes
├── .gitignore
├── .readthedocs.yaml
├── Makefile
├── README.md
├── edstan
    ├── __init__.py
    ├── data
    │   ├── 2pl_latent_reg.stan
    │   ├── gpcm_latent_reg.stan
    │   ├── grsm_latent_reg.stan
    │   ├── pcm_latent_reg.stan
    │   ├── rasch_latent_reg.stan
    │   └── rsm_latent_reg.stan
    ├── mcmc.py
    └── model.py
├── environment.yml
├── make.bat
├── pyproject.toml
├── requirements.txt
├── source
    ├── api.rst
    ├── conf.py
    ├── index.rst
    └── tech.rst
└── tests
    ├── conftest.py
    ├── test_gestalt.py
    └── test_internals.py


/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * text=auto
 3 | 
 4 | # Custom for Visual Studio
 5 | *.cs     diff=csharp
 6 | 
 7 | # Standard to msysgit
 8 | *.doc	 diff=astextplain
 9 | *.DOC	 diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot  diff=astextplain
13 | *.DOT  diff=astextplain
14 | *.pdf  diff=astextplain
15 | *.PDF	 diff=astextplain
16 | *.rtf	 diff=astextplain
17 | *.RTF	 diff=astextplain
18 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Windows image file caches
 2 | Thumbs.db
 3 | ehthumbs.db
 4 | 
 5 | # Folder config file
 6 | Desktop.ini
 7 | 
 8 | # Recycle Bin used on file shares
 9 | $RECYCLE.BIN/
10 | 
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 | 
17 | # Windows shortcuts
18 | *.lnk
19 | 
20 | # =========================
21 | # Operating System Files
22 | # =========================
23 | 
24 | # OSX
25 | # =========================
26 | 
27 | .DS_Store
28 | .AppleDouble
29 | .LSOverride
30 | 
31 | # Thumbnails
32 | ._*
33 | 
34 | # Files that might appear in the root of a volume
35 | .DocumentRevisions-V100
36 | .fseventsd
37 | .Spotlight-V100
38 | .TemporaryItems
39 | .Trashes
40 | .VolumeIcon.icns
41 | 
42 | # Directories potentially created on remote AFP share
43 | .AppleDB
44 | .AppleDesktop
45 | Network Trash Folder
46 | Temporary Items
47 | .apdisk
48 | 
49 | test.py
50 | temp.py
51 | .spyproject/*
52 | 
53 | .idea/*
54 | env/*
55 | *__pycache__*
56 | 
57 | edstan/data/*
58 | !edstan/data/*.stan
59 | 
60 | bernoulli*
61 | 
62 | build/*
63 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # Required
 2 | version: 2
 3 | 
 4 | # Set the OS, Python version, and other tools you might need
 5 | build:
 6 |   os: ubuntu-24.04
 7 |   tools:
 8 |     python: "miniconda3-4.7"
 9 | 
10 | # Build documentation in the "source/" directory with Sphinx
11 | sphinx:
12 |    configuration: source/conf.py
13 | 
14 | conda:
15 |   environment: environment.yml
16 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # edstan for Python
 2 | 
 3 | A Python module that simplifies the fitting of common Bayesian item response theory models using Stan. It is compatible
 4 | with and extends the functionality of **pystan**.
 5 | 
 6 | 
 7 | ## Features
 8 | 
 9 | - Streamlined interface to common Bayesian item response models using Stan
10 | - Models include: Rasch, two-parameter logistic, (generalized) rating scale, and (generalized) partial credit
11 | - Posterior summaries tailored to item response models
12 | 
13 | 
14 | ## Installation
15 | 
16 | **edstan** depends on the successful installation of **pystan**, so please see the
17 | [pystan installation guide](https://pystan.readthedocs.io/en/latest/installation.html).
18 | Note that compatibility with Windows OS
19 | [may be limited](https://pystan.readthedocs.io/en/latest/faq.html).
20 | 
21 | **edstan** may subsequently be installed with **pip**:
22 | 
23 | ```bash
24 | pip install edstan
25 | ```
26 | 
27 | ## API Reference
28 | 
29 | The API reference may be found at [Read the Docs](https://edstan-python.readthedocs.io/en/latest/index.html).
30 | 
31 | 
32 | ## Quickstart
33 | 
34 | Here is an example of running a model using data from a response matrix:
35 | 
36 | ```python
37 | from edstan import EdStanModel
38 | import numpy as np
39 | import pandas as pd
40 | 
41 | # Simulate a "wide format" data frame of item responses for
42 | # 5 items and 100 persons. Responses are scored 0 or 1.
43 | rng = np.random.default_rng(seed=42)
44 | data = pd.DataFrame(rng.binomial(1, p=.5, size=(100, 5)))
45 | data.columns = [f"Question {i}" for i in range(5)]
46 | data.index = [f"Respondent {i}" for i in range(100)]
47 | 
48 | # Instantiate the model, selecting the Rasch model
49 | model = EdStanModel("rasch")
50 | 
51 | # Sample from the model by MCMC
52 | fit = model.sample_from_wide(data)
53 | 
54 | # View a posterior summary of the item (and person distribution)
55 | # parameters
56 | print(fit.item_summary())
57 | 
58 | # View a posterior summary of the person parameters
59 | print(fit.person_summary())
60 | ```
61 | 
62 | Alternatively, this is an example of using long format data:
63 | 
64 | ```python
65 | from edstan import EdStanModel
66 | import numpy as np
67 | import pandas as pd
68 | 
69 | # Simulate a "long format" data frame of item responses for
70 | # 5 items and 100 persons. Responses are scored 0, 1, or 2.
71 | rng = np.random.default_rng(seed=42)
72 | data = pd.DataFrame(
73 |   {
74 |      "person": [f"Person {j}" for j in range(100) for i in range(5)],
75 |      "item": [f"Item {i}" for j in range(100) for i in range(5)],
76 |      "response": rng.binomial(2, p=.5, size=5*100),
77 |   }
78 | )
79 | 
80 | # Instantiate the model, choosing the generalized partial
81 | # credit model
82 | model = EdStanModel("gpcm")
83 | 
84 | # Sample from the model by MCMC
85 | fit = model.sample_from_long(
86 |   ii=data['item'],
87 |   jj=data['person'],
88 |   y=data['response'],
89 | )
90 | 
91 | # View a posterior summary of the item (and person distribution)
92 | # parameters
93 | print(fit.item_summary())
94 | 
95 | # View a posterior summary of the person parameters
96 | print(fit.person_summary())
97 | ```
98 | 


--------------------------------------------------------------------------------
/edstan/__init__.py:
--------------------------------------------------------------------------------
1 | from .model import EdStanModel, data_from_long, data_from_wide
2 | from .mcmc import EdStanMCMC
3 | 


--------------------------------------------------------------------------------
/edstan/data/2pl_latent_reg.stan:
--------------------------------------------------------------------------------
// Two-parameter logistic item response model with a latent regression on
// the person parameters.
data {
  int<lower=1> I; // # questions
  int<lower=1> J; // # persons
  int<lower=1> N; // # observations
  array[N] int<lower=1, upper=I> ii; // question for n
  array[N] int<lower=1, upper=J> jj; // person for n
  array[N] int<lower=0, upper=1> y; // correctness for n
  int<lower=1> K; // # person covariates
  matrix[J, K] W; // person covariate matrix
}
parameters {
  vector<lower=0>[I] alpha; // per-item slope multiplying theta, constrained positive
  sum_to_zero_vector[I] beta; // per-item location, constrained to sum to zero
  vector[J] theta; // per-person latent trait
  vector[K] lambda; // regression coefficients applied to W
}
model {
  alpha ~ lognormal(.5, 1);
  beta ~ normal(0, 3);
  lambda ~ student_t(7, 0, 2.5);
  // Person parameters are centered on W * lambda with a fixed standard deviation of 1.
  theta ~ normal(W * lambda, 1);
  // Logit of a correct response for observation n: alpha[ii[n]] * theta[jj[n]] - beta[ii[n]].
  y ~ bernoulli_logit(alpha[ii] .* theta[jj] - beta[ii]);
}
24 | 


--------------------------------------------------------------------------------
/edstan/data/gpcm_latent_reg.stan:
--------------------------------------------------------------------------------
// Generalized partial credit model with a latent regression on the person
// parameters.
functions {
  // Log probability of response y (coded 0 ... rows(beta)) given theta and the
  // item's step parameters beta.
  real pcm(int y, real theta, vector beta) {
    vector[rows(beta) + 1] unsummed;
    vector[rows(beta) + 1] probs;
    // First element is 0 (lowest category); the rest are theta - beta[k].
    unsummed = append_row(rep_vector(0.0, 1), theta - beta);
    // Category probabilities are the softmax of the running sums.
    probs = softmax(cumulative_sum(unsummed));
    // categorical_lpmf is 1-based, so shift the 0-based response by one.
    return categorical_lpmf(y + 1 | probs);
  }
}
data {
  int<lower=1> I; // # items
  int<lower=1> J; // # persons
  int<lower=1> N; // # responses
  array[N] int<lower=1, upper=I> ii; // i for n
  array[N] int<lower=1, upper=J> jj; // j for n
  array[N] int<lower=0> y; // response for n; y = 0, 1 ... m_i
  int<lower=1> K; // # person covariates
  matrix[J, K] W; // person covariate matrix
}
transformed data {
  array[I] int m; // # parameters per item
  array[I] int pos; // first position in beta vector for item
  m = rep_array(0, I);
  // m[i] ends up as the largest response observed for item i.
  for (n in 1 : N) {
    if (y[n] > m[ii[n]]) {
      m[ii[n]] = y[n];
    }
  }
  // pos[i] is a running offset: item i owns beta[pos[i] : pos[i] + m[i] - 1].
  pos[1] = 1;
  for (i in 2 : I) {
    pos[i] = m[i - 1] + pos[i - 1];
  }
}
parameters {
  vector<lower=0>[I] alpha; // per-item slope multiplying theta, constrained positive
  sum_to_zero_vector[sum(m)] beta; // step parameters for all items, stacked item by item
  vector[J] theta; // per-person latent trait
  vector[K] lambda; // regression coefficients applied to W
}
model {
  alpha ~ lognormal(.5, 1);
  beta ~ normal(0, 3);
  // Person parameters are centered on W * lambda with a fixed standard deviation of 1.
  theta ~ normal(W * lambda, 1);
  lambda ~ student_t(7, 0, 2.5);
  for (n in 1 : N) {
    // The person parameter is scaled by the item's alpha before entering pcm();
    // segment() extracts this item's slice of the stacked beta vector.
    target += pcm(y[n], theta[jj[n]] .* alpha[ii[n]],
                  segment(beta, pos[ii[n]], m[ii[n]]));
  }
}
50 | 


--------------------------------------------------------------------------------
/edstan/data/grsm_latent_reg.stan:
--------------------------------------------------------------------------------
// Generalized rating scale model with a latent regression on the person
// parameters.
functions {
  // Log probability of response y (coded 0 ... rows(kappa)) given theta, the
  // item's location beta, and the step parameters kappa shared by all items.
  real rsm(int y, real theta, real beta, vector kappa) {
    vector[rows(kappa) + 1] unsummed;
    vector[rows(kappa) + 1] probs;
    // First element is 0 (lowest category); the rest are theta - beta - kappa[k].
    unsummed = append_row(rep_vector(0, 1), theta - beta - kappa);
    // Category probabilities are the softmax of the running sums.
    probs = softmax(cumulative_sum(unsummed));
    // categorical_lpmf is 1-based, so shift the 0-based response by one.
    return categorical_lpmf(y + 1 | probs);
  }
}
data {
  int<lower=1> I; // # items
  int<lower=1> J; // # persons
  int<lower=1> N; // # responses
  array[N] int<lower=1, upper=I> ii; // i for n
  array[N] int<lower=1, upper=J> jj; // j for n
  array[N] int<lower=0> y; // response for n; y in {0 ... m_i}
  int<lower=1> K; // # person covariates
  matrix[J, K] W; // person covariate matrix
}
transformed data {
  // The largest response observed across all items fixes the length of kappa.
  int m = max(y); // # steps
}
parameters {
  vector<lower=0>[I] alpha; // per-item slope multiplying theta, constrained positive
  sum_to_zero_vector[I] beta; // per-item location, constrained to sum to zero
  sum_to_zero_vector[m] kappa; // step parameters shared by all items
  vector[J] theta; // per-person latent trait
  vector[K] lambda; // regression coefficients applied to W
}
model {
  alpha ~ lognormal(.5, 1);
  beta ~ normal(0, 3);
  kappa ~ normal(0, 3);
  // Person parameters are centered on W * lambda with a fixed standard deviation of 1.
  theta ~ normal(W * lambda, 1);
  lambda ~ student_t(7, 0, 2.5);
  for (n in 1 : N) {
    // The person parameter is scaled by the item's alpha before entering rsm().
    target += rsm(y[n], theta[jj[n]] .* alpha[ii[n]], beta[ii[n]], kappa);
  }
}
40 | 


--------------------------------------------------------------------------------
/edstan/data/pcm_latent_reg.stan:
--------------------------------------------------------------------------------
// Partial credit model with a latent regression on the person parameters.
functions {
  // Log probability of response y (coded 0 ... rows(beta)) given theta and the
  // item's step parameters beta.
  real pcm(int y, real theta, vector beta) {
    vector[rows(beta) + 1] unsummed;
    vector[rows(beta) + 1] probs;
    // First element is 0 (lowest category); the rest are theta - beta[k].
    unsummed = append_row(rep_vector(0.0, 1), theta - beta);
    // Category probabilities are the softmax of the running sums.
    probs = softmax(cumulative_sum(unsummed));
    // categorical_lpmf is 1-based, so shift the 0-based response by one.
    return categorical_lpmf(y + 1 | probs);
  }
}
data {
  int<lower=1> I; // # items
  int<lower=1> J; // # persons
  int<lower=1> N; // # responses
  array[N] int<lower=1, upper=I> ii; // i for n
  array[N] int<lower=1, upper=J> jj; // j for n
  array[N] int<lower=0> y; // response for n; y = 0, 1 ... m_i
  int<lower=1> K; // # person covariates
  matrix[J, K] W; // person covariate matrix
}
transformed data {
  array[I] int m; // # parameters per item
  array[I] int pos; // first position in beta vector for item
  m = rep_array(0, I);
  // m[i] ends up as the largest response observed for item i.
  for (n in 1 : N) {
    if (y[n] > m[ii[n]]) {
      m[ii[n]] = y[n];
    }
  }
  // pos[i] is a running offset: item i owns beta[pos[i] : pos[i] + m[i] - 1].
  pos[1] = 1;
  for (i in 2 : I) {
    pos[i] = m[i - 1] + pos[i - 1];
  }
}
parameters {
  sum_to_zero_vector[sum(m)] beta; // step parameters for all items, stacked item by item
  vector[J] theta; // per-person latent trait
  real<lower=0> sigma; // standard deviation of theta around W * lambda
  vector[K] lambda; // regression coefficients applied to W
}
model {
  beta ~ normal(0, 3);
  // Person parameters are centered on W * lambda with an estimated standard deviation.
  theta ~ normal(W * lambda, sigma);
  lambda ~ student_t(7, 0, 2.5);
  sigma ~ gamma(2, 1);
  for (n in 1 : N) {
    // segment() extracts this item's slice of the stacked beta vector.
    target += pcm(y[n], theta[jj[n]], segment(beta, pos[ii[n]], m[ii[n]]));
  }
}
49 | 


--------------------------------------------------------------------------------
/edstan/data/rasch_latent_reg.stan:
--------------------------------------------------------------------------------
// Rasch model with a latent regression on the person parameters.
data {
  int<lower=1> I; // # questions
  int<lower=1> J; // # persons
  int<lower=1> N; // # observations
  array[N] int<lower=1, upper=I> ii; // question for n
  array[N] int<lower=1, upper=J> jj; // person for n
  array[N] int<lower=0, upper=1> y; // correctness for n
  int<lower=1> K; // # person covariates
  matrix[J, K] W; // person covariate matrix
}
parameters {
  sum_to_zero_vector[I] beta; // per-item location, constrained to sum to zero
  vector[J] theta; // per-person latent trait
  real<lower=0> sigma; // standard deviation of theta around W * lambda
  vector[K] lambda; // regression coefficients applied to W
}
model {
  beta ~ normal(0, 3);
  // Person parameters are centered on W * lambda with an estimated standard deviation.
  theta ~ normal(W * lambda, sigma);
  lambda ~ student_t(7, 0, 2.5);
  sigma ~ gamma(2, 1);
  // Logit of a correct response for observation n: theta[jj[n]] - beta[ii[n]].
  y ~ bernoulli_logit(theta[jj] - beta[ii]);
}
24 | 


--------------------------------------------------------------------------------
/edstan/data/rsm_latent_reg.stan:
--------------------------------------------------------------------------------
// Rating scale model with a latent regression on the person parameters.
functions {
  // Log probability of response y (coded 0 ... rows(kappa)) given theta, the
  // item's location beta, and the step parameters kappa shared by all items.
  real rsm(int y, real theta, real beta, vector kappa) {
    vector[rows(kappa) + 1] unsummed;
    vector[rows(kappa) + 1] probs;
    // First element is 0 (lowest category); the rest are theta - beta - kappa[k].
    unsummed = append_row(rep_vector(0, 1), theta - beta - kappa);
    // Category probabilities are the softmax of the running sums.
    probs = softmax(cumulative_sum(unsummed));
    // categorical_lpmf is 1-based, so shift the 0-based response by one.
    return categorical_lpmf(y + 1 | probs);
  }
}
data {
  int<lower=1> I; // # items
  int<lower=1> J; // # persons
  int<lower=1> N; // # responses
  array[N] int<lower=1, upper=I> ii; // i for n
  array[N] int<lower=1, upper=J> jj; // j for n
  array[N] int<lower=0> y; // response for n; y in {0 ... m_i}
  int<lower=1> K; // # person covariates
  matrix[J, K] W; // person covariate matrix
}
transformed data {
  // The largest response observed across all items fixes the length of kappa.
  int m = max(y); // # steps
}
parameters {
  sum_to_zero_vector[I] beta; // per-item location, constrained to sum to zero
  sum_to_zero_vector[m] kappa; // step parameters shared by all items
  vector[J] theta; // per-person latent trait
  real<lower=0> sigma; // standard deviation of theta around W * lambda
  vector[K] lambda; // regression coefficients applied to W
}
model {
  beta ~ normal(0, 3);
  kappa ~ normal(0, 3);
  // Person parameters are centered on W * lambda with an estimated standard deviation.
  theta ~ normal(W * lambda, sigma);
  lambda ~ student_t(7, 0, 2.5);
  sigma ~ gamma(2, 1);
  for (n in 1 : N) {
    target += rsm(y[n], theta[jj[n]], beta[ii[n]], kappa);
  }
}
40 | 


--------------------------------------------------------------------------------
/edstan/mcmc.py:
--------------------------------------------------------------------------------
  1 | from cmdstanpy import CmdStanMCMC
  2 | from numpy.typing import NDArray
  3 | import numpy as np
  4 | import pandas as pd
  5 | 
  6 | 
  7 | class EdStanMCMC:
  8 |     """
  9 |     A wrapper around :class:`pystan.CmdStanMCMC` that adds additional methods.
 10 | 
 11 |     This class delegates all unspecified attribute access to the underlying :class:`pystan.CmdStanMCMC` instance via
 12 |     :meth:`EdStanMCMC.__getattr__`. This allows it to behave like a :class:`pystan.CmdStanMCMC` object while also
 13 |     providing custom methods.
 14 |     """
 15 | 
 16 |     def __init__(
 17 |         self,
 18 |         mcmc: CmdStanMCMC,
 19 |         ii_labels: NDArray,
 20 |         jj_labels: NDArray,
 21 |         max_per_item: NDArray[np.integer],
 22 |     ):
 23 |         """
 24 |         Initializes an :class:`EdStanMCMC` instance.
 25 | 
 26 |         An instance of this class is generated by :meth:`ModelMCMC.sample_from_long` or
 27 |         :meth:`EdStanModel.sample_from_wide`. Though the class may be initialized directly, this is not the intended
 28 |         usage.
 29 | 
 30 |         :param mcmc: A fitted :mod:`edstan` model using MCMC.
 31 |         :param ii_labels: Labels associated with the items.
 32 |         :param jj_labels: Labels associated with the persons.
 33 |         :param max_per_item: The maximum score per item.
 34 |         """
 35 |         self.mcmc = mcmc
 36 |         self.ii_labels = ii_labels
 37 |         self.jj_labels = jj_labels
 38 |         self.max_per_item = max_per_item
 39 | 
 40 |     def __getattr__(self, name):
 41 |         return getattr(self.mcmc, name)
 42 | 
 43 |     def item_summary(self, **kwargs):
 44 |         """
 45 |         A wrapper around :meth:`pystan.CmdStanMCMC.summary` that provides posterior summaries grouped by item.
 46 | 
 47 |         :param kwargs: Additional optional arguments passed to :meth:`pystan.CmdStanMCMC.summary`, such as 'percentiles'
 48 |             and 'sig_figs'.
 49 |         :return: A summary :class:`pandas.DataFrame` filtered to include item and distribution parameters only, having a
 50 |             multi-index that associates parameters with their respective item labels (or the person distribution).
 51 |         """
 52 |         summary = self.mcmc.summary(**kwargs)
 53 |         summary.index.name = "parameter"
 54 | 
 55 |         expected = _get_expected_parameters_by_group(
 56 |             item_labels=self.ii_labels,
 57 |             max_per_item=self.max_per_item,
 58 |             rasch_family="sigma" in summary.index,
 59 |             ratings_model="kappa[1]" in summary.index,
 60 |         )
 61 | 
 62 |         return expected.merge(summary.reset_index(), on="parameter").set_index(
 63 |             ["parameter group", "parameter"]
 64 |         )
 65 | 
 66 |     def person_summary(self, **kwargs):
 67 |         """
 68 |         A wrapper around :meth:pystan.CmdStanMCMC.summary that provides posterior summaries grouped by person.
 69 | 
 70 |         :param kwargs: Additional optional arguments passed to :meth:`pystan.CmdStanMCMC.summary`, such as 'percentiles'
 71 |             and 'sig_figs'.
 72 |         :return: A summary :class:`pandas.DataFrame` filtered to include person parameters only, having a multi-index
 73 |             that associates parameters with their respective person labels.
 74 |         """
 75 |         summary = self.mcmc.summary(**kwargs)
 76 |         summary = summary.loc[summary.index.str.match("theta")]
 77 |         summary["person"] = self.jj_labels
 78 |         summary.index.name = "parameter"
 79 |         return summary.reset_index().set_index(["person", "parameter"])
 80 | 
 81 | 
 82 | def _get_expected_parameters_by_group(
 83 |     item_labels, max_per_item, rasch_family: bool, ratings_model: bool
 84 | ):
 85 |     """Generate a DataFrame listing the expected item/distribution parameters and their groupings."""
 86 |     holder = []
 87 | 
 88 |     if ratings_model:
 89 |         betas_per_item = np.ones(len(max_per_item), dtype=int)
 90 |     else:
 91 |         betas_per_item = np.array(max_per_item, dtype=int)
 92 | 
 93 |     beta_counter = 0
 94 |     for item, item_max in enumerate(betas_per_item):
 95 |         if not rasch_family:
 96 |             holder.append((item_labels[item], f"alpha[{item + 1}]"))
 97 |         for _ in range(item_max):
 98 |             beta_counter += 1
 99 |             holder.append((item_labels[item], f"beta[{beta_counter}]"))
100 | 
101 |     if ratings_model:
102 |         for j in range(int(max(max_per_item))):
103 |             holder.append(("Rating scale steps", f"kappa[{j + 1}]"))
104 | 
105 |     holder.append(("Ability distribution", "lambda[1]"))
106 | 
107 |     if rasch_family:
108 |         holder.append(("Ability distribution", "sigma"))
109 | 
110 |     df = pd.DataFrame(holder)
111 |     df.columns = ["parameter group", "parameter"]
112 | 
113 |     return df
114 | 


--------------------------------------------------------------------------------
/edstan/model.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from typing import Union, Dict
  3 | from warnings import warn
  4 | import numpy as np
  5 | from cmdstanpy import CmdStanModel
  6 | from numpy.typing import NDArray
  7 | from pandas import DataFrame
  8 | from .mcmc import EdStanMCMC
  9 | 
 10 | 
 11 | class EdStanModel(CmdStanModel):
 12 |     """
 13 |     This class is a child of :class:`pystan.CmdStanModel` that adds functionality to load common item response models
 14 |     and accept data in common formats to perform MCMC sampling. Only the added functionality is documented here.
 15 |     """
 16 | 
 17 |     def __init__(self, model: str, **kwargs):
 18 |         """
 19 |         Initializes an :class:`EdStanModel` instance.
 20 | 
 21 |         Upon instantiating an :class:`EdStanModel` instance, the selected model is prepared for sampling. Afterwards,
 22 |         the :meth:`EdStanModel.sample_from_long` or :meth:`EdStanModel.sample_from_wide` methods may be used to
 23 |         initiate MCMC sampling with Stan.
 24 | 
 25 |         :param model: The (partial) file name of an :mod:`edstan` model, with matching based on the start of the file
 26 |             name. Consider specifying "rasch", "2pl", "rsm", "grsm", "pcm", or "gpcm".
 27 |         :param kwargs: Additional optional arguments passed to the :class:`pystan.CmdStanModel` parent class.
 28 |         """
 29 |         if not isinstance(model, str):
 30 |             raise ValueError("Invalid value for 'model'. Expected a string.")
 31 | 
 32 |         directory = os.path.join(os.path.dirname(__file__), "data")
 33 |         matching_files = []
 34 |         for filename in os.listdir(directory):
 35 |             if filename.endswith(".stan") and filename.startswith(model.lower()):
 36 |                 matching_files.append(os.path.join(directory, filename))
 37 | 
 38 |         if len(matching_files) == 0:
 39 |             raise ValueError(
 40 |                 f"Invalid value for 'model': {model}. No matching edstan model found."
 41 |             )
 42 | 
 43 |         if len(matching_files) > 1:
 44 |             raise ValueError(
 45 |                 f"Invalid value for 'model': {model}. More than one matching edstan model found."
 46 |             )
 47 | 
 48 |         self.model = matching_files[0]
 49 |         super().__init__(stan_file=matching_files[0], **kwargs)
 50 | 
 51 |     def sample_from_dict(self, data: Dict, **kwargs) -> EdStanMCMC:
 52 |         """
 53 |         Sample from the model using a dictionary of data.
 54 | 
 55 |         Generally it will be more convenient to initialize sampling using the :meth:`EdStanModel.sample_from_long`
 56 |         or :meth:`EdStanModel.sample_from_wide` methods, which prepare the required dictionary based on common data
 57 |         formats.
 58 | 
 59 |         :param data: A dictionary of data compatible with the :mod:`edstan` models.
 60 |         :param kwargs: Additional arguments passed to :meth:`pystan.CmdStanModel.sample`, excluding 'data'. Consider
 61 |             arguments such as 'chains', 'iter_warmup', 'iter_sampling', and 'adapt_delta'.
 62 |         :return: A fitted MCMC model.
 63 |         """
 64 |         ii_labels = data.pop("ii_labels")
 65 |         jj_labels = data.pop("jj_labels")
 66 |         max_per_item = data.pop("max_per_item")
 67 |         mcmc = super().sample(data=data, **kwargs)
 68 |         return EdStanMCMC(
 69 |             mcmc, jj_labels=jj_labels, ii_labels=ii_labels, max_per_item=max_per_item
 70 |         )
 71 | 
 72 |     def sample_from_long(
 73 |         self,
 74 |         ii: NDArray,
 75 |         jj: NDArray,
 76 |         y: NDArray[np.integer],
 77 |         integerize: bool = True,
 78 |         **kwargs,
 79 |     ) -> EdStanMCMC:
 80 |         """
 81 |         Sample from the model using response data in the form of several 1D arrays.
 82 | 
 83 |         This method is appropriate for "long format" item response data in which scored responses are stored in a flat
 84 |         array, and additional flat arrays index the person and item associated with each scored response. This format
 85 |         can accommodate missing responses by removing them beforehand.
 86 | 
 87 |         :param ii: A 1D NumPy array representing the item associated with a response. Must be integers
 88 |             if 'integerize' is set to False.
 89 |         :param jj: A 1D NumPy array representing the person associated with a response. Must be integers
 90 |             if 'integerize' is set to False.
 91 |         :param y: A 1D NumPy array representing the scored responses. The lowest value is expected to be
 92 |             zero.
 93 |         :param integerize: Whether to convert 'ii' and 'jj' to index arrays starting at one. This should generally
 94 |             be set to True but need not be if 'ii' and 'jj' are already formatted this way.
 95 |         :param kwargs: Additional arguments passed to :meth:`pystan.CmdStanModel.sample`, excluding 'data'. Consider
 96 |             arguments such as 'chains', 'iter_warmup', 'iter_sampling', and 'adapt_delta'.
 97 |         :return: A fitted MCMC model.
 98 |         """
 99 |         ii = _validate_vector(ii, label="ii")
100 |         jj = _validate_vector(jj, label="jj")
101 |         y = _validate_vector(y, label="y")
102 |         data = data_from_long(ii=ii, jj=jj, y=y, integerize=integerize, extended=True)
103 |         return self.sample_from_dict(data, **kwargs)
104 | 
105 |     def sample_from_wide(
106 |         self, response_matrix: Union[NDArray[np.integer], DataFrame], **kwargs
107 |     ) -> EdStanMCMC:
108 |         """
109 |         Sample from the model using response data in the form of a 2D array or :class:`pandas.DataFrame`.
110 | 
111 |         This method is appropriate for "wide format" item response data in which scored response are arrange in a table.
112 |         Each row represents a person, and each column represents an item.
113 | 
114 |         :param response_matrix: A (#persons, #items) 2D array or :class:`pandas.DataFrame` representing the scored
115 |             responses. The lowest value is expected to be zero.
116 |         :param kwargs: Additional arguments passed to :meth:`pystan.CmdStanModel.sample`, excluding 'data'. Consider
117 |             arguments such as 'chains', 'iter_warmup', 'iter_sampling', and 'adapt_delta'.
118 |         :return: A fitted MCMC model.
119 |         """
120 |         data = data_from_wide(response_matrix=response_matrix, extended=True)
121 |         return self.sample_from_dict(data, **kwargs)
122 | 
123 | 
def data_from_long(
    ii: NDArray,
    jj: NDArray,
    y: NDArray[np.integer],
    integerize: bool = True,
    extended: bool = False,
) -> Dict:
    """
    Create a dictionary compatible with the :mod:`edstan` models from several 1D arrays.

    In general the :meth:`EdStanModel.sample_from_long` method will be sufficient for preparing
    data of this format and performing sampling. This function may be of interest if a copy of the prepared data is
    desired.

    :param ii: A 1D NumPy array representing the item associated with a response. Must be integers
        if 'integerize' is set to False.
    :param jj: A 1D NumPy array representing the person associated with a response. Must be integers
        if 'integerize' is set to False.
    :param y: A 1D NumPy array representing the scored responses. The lowest value is expected to be
        zero.
    :param integerize: Whether to convert 'ii' and 'jj' to index vectors starting at one. This should generally
        be set to True.
    :param extended: Whether to add additional metadata keys to the output dictionary. This should generally be set
        to False if called by the user.
    :return: A dictionary representing item response data.
    :raises ValueError: If 'ii', 'jj', and 'y' do not all have the same length.
    """
    ii = _validate_vector(ii, label="ii")
    jj = _validate_vector(jj, label="jj")
    y = _validate_vector(y, label="y")

    if not len(ii) == len(jj) == len(y):
        raise ValueError("'ii', 'jj', and 'y' must all have the same length.")

    if integerize:
        # Indices are assigned in order of first appearance, so label k describes index k + 1.
        ii_ints, ii_labels = _map_to_unique_ids(ii)
        jj_ints, jj_labels = _map_to_unique_ids(jj)
        n_items = max(ii_ints)
        n_persons = max(jj_ints)
    else:
        # The inputs already are one-based indices. Labels are consumed by position downstream (label k describes
        # index k + 1), so they must be the indices themselves rather than the values in order of first appearance.
        ii_ints, jj_ints = ii, jj
        n_items = max(ii_ints)
        n_persons = max(jj_ints)
        ii_labels = np.arange(1, n_items + 1)
        jj_labels = np.arange(1, n_persons + 1)

    max_per_item = _validate_responses_by_item(y, ii_ints, ii_labels)

    data = {
        "I": n_items,
        "J": n_persons,
        "N": len(y),
        "ii": ii_ints,
        "jj": jj_ints,
        "y": y,
        # A single constant covariate: the latent regression reduces to an intercept.
        "K": 1,
        "W": [[1]] * n_persons,
    }

    if extended:
        data.update(
            {
                "ii_labels": ii_labels,
                "jj_labels": jj_labels,
                "max_per_item": max_per_item,
            }
        )

    return data
187 | 
188 | 
def data_from_wide(
    response_matrix: Union[NDArray[np.integer], DataFrame], extended: bool = False
) -> Dict:
    """
    Build the data dictionary expected by the :mod:`edstan` models from a wide-format response matrix.

    :meth:`EdStanModel.sample_from_wide` normally prepares the data and samples in one step. Call this function
    directly when a copy of the prepared data is wanted.

    :param response_matrix: A (#persons, #items) array or :class:`pandas.DataFrame` of scored responses. The lowest
        value is expected to be zero.
    :param extended: Whether to include the additional metadata keys in the result. Users calling this function
        directly should generally leave this as False.
    :return: A dictionary representing item response data.
    """
    if not isinstance(response_matrix, DataFrame):
        mat = _validate_numpy_matrix(response_matrix)
        # Plain arrays carry no labels, so items and persons are identified by one-based position.
        item_ids = np.arange(mat.shape[1]) + 1
        person_ids = np.arange(mat.shape[0]) + 1
    else:
        mat = _validate_pandas_matrix(response_matrix)
        item_ids = response_matrix.columns
        person_ids = response_matrix.index

    n_persons, n_items = mat.shape[0], mat.shape[1]

    # Flattening walks the matrix row by row, so item ids cycle fastest while each person id repeats per item.
    return data_from_long(
        ii=np.tile(item_ids, n_persons),
        jj=np.repeat(person_ids, n_items),
        y=mat.flatten(),
        integerize=True,
        extended=extended,
    )
217 | 
218 | 
219 | def _unique_unsorted(arr: NDArray):
220 |     """Given a 1D array, return the unique elements in the order of first observance."""
221 |     return np.array([x for i, x in enumerate(arr) if x not in arr[:i]])
222 | 
223 | 
224 | def _map_to_unique_ids(arr: NDArray):
225 |     """Turn a 1D array into a tuple of an index and the unique values."""
226 |     unique_values = _unique_unsorted(arr)
227 |     unique_values_list = unique_values.tolist()
228 |     indices = np.array([unique_values_list.index(x) + 1 for x in arr])
229 |     return indices, unique_values
230 | 
231 | 
232 | def _validate_pandas_matrix(response_matrix: Union[NDArray, DataFrame]) -> NDArray:
233 |     """Apply checks to a response matrix dataframe and convert to a 2D NDArray."""
234 |     if response_matrix.shape[0] != len(np.unique(response_matrix.index)):
235 |         raise ValueError("The pandas dataframe must not have duplicate index values.")
236 | 
237 |     if response_matrix.shape[1] != len(np.unique(response_matrix.columns)):
238 |         raise ValueError("The pandas dataframe must not have duplicate column names.")
239 | 
240 |     if response_matrix.index.nlevels != 1:
241 |         raise ValueError(
242 |             "The pandas dataframe must not have a multi-index along the rows."
243 |         )
244 | 
245 |     if response_matrix.columns.nlevels != 1:
246 |         raise ValueError(
247 |             "The pandas dataframe must not have a multi-index along the columns."
248 |         )
249 | 
250 |     return _validate_numpy_matrix(response_matrix)
251 | 
252 | 
253 | def _validate_numpy_matrix(response_matrix: Union[NDArray, DataFrame]) -> NDArray:
254 |     """Convert a response matrix to an NDArray, apply checks, and return the NDArray."""
255 |     try:
256 |         mat = np.asarray(response_matrix)
257 |     except Exception as exc:
258 |         raise ValueError(
259 |             "'response_matrix' must be a 2-dimensional numpy array or an object convertable to the same."
260 |         ) from exc
261 | 
262 |     if mat.ndim != 2:
263 |         raise ValueError(
264 |             f"'response_matrix' has {mat.ndim} dimensions, but must have two."
265 |         )
266 |     return mat
267 | 
268 | 
269 | def _validate_vector(arr: NDArray, label: str) -> NDArray:
270 |     """Convert argument to an NDArray, check that it is 1D, and return the NDArray."""
271 |     try:
272 |         arr = np.array(arr)
273 |     except Exception as exc:
274 |         raise ValueError(
275 |             f"'{label}' must be a 1-dimensional numpy array or an object convertable to the same."
276 |         ) from exc
277 | 
278 |     if arr.ndim != 1:
279 |         raise ValueError(
280 |             f"'{label}' must be a 1-dimensional numpy array or an object convertable to the same."
281 |         )
282 | 
283 |     return arr
284 | 
285 | 
286 | def _validate_responses_by_item(
287 |     y: NDArray, ii_ints: NDArray, ii_labels: NDArray
288 | ) -> NDArray:
289 |     """Apply checks to 1D NDArray of item responses and return the max score per item."""
290 |     max_per_item = np.zeros(max(ii_ints))
291 | 
292 |     for u in np.unique(ii_ints):
293 |         responses = y[ii_ints == u]
294 |         label = ii_labels[u - 1]
295 | 
296 |         mn = min(responses)
297 |         mx = max(responses)
298 | 
299 |         if mn != 0:
300 |             warn(f"Item {label} does not have a minimum response value of zero.")
301 | 
302 |         if len(np.unique(responses)) == 1:
303 |             warn(f"Item {label} only has response values of {responses[0]}.")
304 | 
305 |         if len(np.unique(responses)) != (mx - mn + 1):
306 |             warn(f"Item {label} has missing response categories.")
307 | 
308 |         max_per_item[u - 1] = mx
309 | 
310 |     return max_per_item
311 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
 1 | channels:
 2 |   - conda-forge
 3 | dependencies:
 4 |   - python = 3.11
 5 |   - pylint
 6 |   - black
 7 |   - pytest
 8 |   - sphinx
 9 |   - build
10 |   - twine
11 |   - cmdstanpy
12 |   - numpy
13 |   - pandas
14 | 


--------------------------------------------------------------------------------
/make.bat:
--------------------------------------------------------------------------------
@ECHO OFF

REM Run from the directory that contains this script so the relative
REM SOURCEDIR/BUILDDIR paths below resolve; the matching popd is at :end.
pushd %~dp0

REM Command file for Sphinx documentation

REM Default to the sphinx-build found on PATH unless the caller set SPHINXBUILD.
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

REM Probe the command with output discarded; errorlevel 9009 is treated as
REM "command not found" and produces the help text below.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

REM With no target argument, show the Sphinx help instead of building.
if "%1" == "" goto help

REM Forward the requested target (first argument) to sphinx-build's make mode.
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
36 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.pylint]
 2 | max-line-length = 120
 3 | 
 4 | [build-system]
 5 | requires = ["setuptools>=61.0", "wheel"]
 6 | build-backend = "setuptools.build_meta"
 7 | 
 8 | [project]
 9 | name = "edstan"
10 | version = "0.2.0"
11 | description = "Streamlines the fitting of common Bayesian item response theory models using Stan"
12 | readme = "README.md"
13 | authors = [
14 |   { name = "Daniel C. Furr", email = "danielcfurr@berkeley.edu" }
15 | ]
16 | license = { text = "MIT" }
17 | requires-python = ">=3.8"
18 | classifiers = [
19 |   "Programming Language :: Python :: 3",
20 |   "License :: OSI Approved :: MIT License",
21 |   "Operating System :: OS Independent"
22 | ]
23 | urls = [
24 |   { "Homepage" = "https://github.com/danielcfurr/edstan-python" },
25 |   { "Reference" = "https://edstan-python.readthedocs.io/en/latest/"}
26 | ]
27 | 
28 | [tool.setuptools.packages.find]
29 | where = ["."]
30 | include = ["edstan"]
31 | 
32 | [tool.setuptools.package-data]
33 | edstan = [
34 |   "data/rasch_latent_reg.stan",
35 |   "data/2pl_latent_reg.stan",
36 |   "data/rsm_latent_reg.stan",
37 |   "data/grsm_latent_reg.stan",
38 |   "data/pcm_latent_reg.stan",
39 |   "data/gpcm_latent_reg.stan"
40 | ]
41 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy~=2.2.4
2 | pandas~=2.2.3
3 | pytest~=8.3.5
4 | cmdstanpy~=1.2.5
5 | setuptools~=75.8.2


--------------------------------------------------------------------------------
/source/api.rst:
--------------------------------------------------------------------------------
 1 | API Reference
 2 | =============
 3 | 
 4 | .. autoclass:: edstan.EdStanModel
 5 |       :members: __init__, sample_from_dict, sample_from_long, sample_from_wide
 6 |       :undoc-members:
 7 | 
 8 | .. autoclass:: edstan.EdStanMCMC
 9 |       :members: __init__, item_summary, person_summary
10 |       :undoc-members:
11 | 
12 | .. autofunction:: edstan.data_from_long
13 | 
14 | .. autofunction:: edstan.data_from_wide


--------------------------------------------------------------------------------
/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # For the full list of built-in configuration values, see the documentation:
 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 5 | 
 6 | # -- Project information -----------------------------------------------------
 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 8 | 
 9 | project = 'edstan'
10 | copyright = '2025, Daniel C. Furr'
11 | author = 'Daniel C. Furr'
12 | release = '0.2.0'
13 | 
14 | # -- General configuration ---------------------------------------------------
15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
16 | 
17 | extensions = [
18 |     'sphinx.ext.autodoc',
19 |     'sphinx.ext.napoleon',  # for Google/NumPy-style docstrings
20 |     'sphinx.ext.viewcode',  # optional, adds source code links
21 | ]
22 | 
23 | templates_path = ['_templates']
24 | exclude_patterns = []
25 | 
26 | import os
27 | import sys
28 | sys.path.insert(0, os.path.abspath('../'))
29 | 
30 | 
31 | # -- Options for HTML output -------------------------------------------------
32 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
33 | 
34 | html_theme = 'alabaster'
35 | html_static_path = ['_static']


--------------------------------------------------------------------------------
/source/index.rst:
--------------------------------------------------------------------------------
  1 | .. edstan documentation master file, created by
  2 |    sphinx-quickstart on Fri Apr  4 11:19:26 2025.
  3 |    You can adapt this file completely to your liking, but it should at least
  4 |    contain the root `toctree` directive.
  5 | 
  6 | edstan
  7 | ======
  8 | 
  9 | A Python module that simplifies the fitting of common Bayesian item response theory models using Stan. It is compatible
 10 | with and extends the functionality of :mod:`cmdstanpy`.
 11 | 
 12 | 
 13 | Features
 14 | --------
 15 | 
 16 | - Streamlined interface to common Bayesian item response models using Stan
 17 | - Models include: Rasch, two-parameter logistic, (generalized) rating scale, and (generalized) partial credit
 18 | - Posterior summaries tailored to item response models
 19 | 
 20 | 
 21 | Installation
 22 | ------------
 23 | 
 24 | :mod:`edstan` depends on the successful installation of :mod:`cmdstanpy`, so please see the
 25 | `CmdStanPy installation guide <https://mc-stan.org/cmdstanpy/installation.html>`_.
 26 | Note that CmdStanPy in turn requires a working installation of CmdStan, which is
 27 | covered in the same guide.
 28 | 
 29 | :mod:`edstan` may subsequently be installed with :mod:`pip`:
 30 | 
 31 | .. code-block:: bash
 32 | 
 33 |    pip install edstan
 34 | 
 35 | 
 36 | Quickstart
 37 | ----------
 38 | 
 39 | Here is an example of running a model using data from a response matrix:
 40 | 
 41 | .. code-block:: python
 42 | 
 43 |    from edstan import EdStanModel
 44 |    import numpy as np
 45 |    import pandas as pd
 46 | 
 47 |    # Simulate a "wide format" data frame of item responses for
 48 |    # 5 items and 100 persons. Responses are scored 0 or 1.
 49 |    rng = np.random.default_rng(seed=42)
 50 |    data = pd.DataFrame(rng.binomial(1, p=.5, size=(100, 5)))
 51 |    data.columns = [f"Question {i}" for i in range(5)]
 52 |    data.index = [f"Respondent {i}" for i in range(100)]
 53 | 
 54 |    # Instantiate the model, selecting the Rasch model
 55 |    model = EdStanModel("rasch")
 56 | 
 57 |    # Sample from the model by MCMC
 58 |    fit = model.sample_from_wide(data)
 59 | 
 60 |    # View a posterior summary of the item (and person distribution)
 61 |    # parameters
 62 |    print(fit.item_summary())
 63 | 
 64 |    # View a posterior summary of the person parameters
 65 |    print(fit.person_summary())
 66 | 
 67 | 
 68 | Alternatively, this is an example of using long format data:
 69 | 
 70 | .. code-block:: python
 71 | 
 72 |    from edstan import EdStanModel
 73 |    import numpy as np
 74 |    import pandas as pd
 75 | 
 76 |    # Simulate a "long format" data frame of item responses for
 77 |    # 5 items and 100 persons. Responses are scored 0, 1, or 2.
 78 |    rng = np.random.default_rng(seed=42)
 79 |    data = pd.DataFrame(
 80 |       {
 81 |          "person": [f"Person {j}" for j in range(100) for i in range(5)],
 82 |          "item": [f"Item {i}" for j in range(100) for i in range(5)],
 83 |          "response": rng.binomial(2, p=.5, size=5*100)
 84 |       }
 85 |    )
 86 | 
 87 |    # Instantiate the model, choosing the generalized partial
 88 |    # credit model
 89 |    model = EdStanModel("gpcm")
 90 | 
 91 |    # Sample from the model by MCMC
 92 |    fit = model.sample_from_long(
 93 |       ii=data['item'],
 94 |       jj=data['person'],
 95 |       y=data['response']
 96 |    )
 97 | 
 98 |    # View a posterior summary of the item (and person distribution)
 99 |    # parameters
100 |    print(fit.item_summary())
101 | 
102 |    # View a posterior summary of the person parameters
103 |    print(fit.person_summary())
104 | 
105 | 
106 | Contents
107 | --------
108 | 
109 | .. toctree::
110 |    :maxdepth: 2
111 |    :caption: Contents:
112 | 
113 |    api
114 |    tech
115 | 


--------------------------------------------------------------------------------
/source/tech.rst:
--------------------------------------------------------------------------------
  1 | Technical Notes
  2 | ===============
  3 | 
  4 | Users will be able to fit the :mod:`edstan` models without full knowledge of
  5 | the technical details, though these are provided in this section. All
  6 | that is really needed for interpreting results is to know the meanings
  7 | assigned to the Greek letters.
  8 | 
  9 | Notation
 10 | --------
 11 | 
 12 | Variables and parameters are similar across :mod:`edstan` models. The variables
 13 | used are:
 14 | 
 15 | - :math:`i = 1 \ldots I` indexes items.
 16 | - :math:`j = 1 \ldots J` indexes persons.
 17 | - :math:`m_i` is simultaneously the maximum score and the number of step
 18 |   difficulty parameters for item :math:`i` for partial credit models.
 19 |   Alternatively, :math:`m` is the same across all items for rating scale
 20 |   models.
 21 | - :math:`s = 1 \ldots m_i` or :math:`s = 1 \ldots m` indexes steps within items.
 22 | - :math:`y_{ij}` is the scored response of person :math:`j` to item :math:`i`. The lowest
 23 |   score for items must be zero (except for rating scale models).
 24 | 
 25 | The parameters used are:
 26 | 
 27 | - For the Rasch and 2PL models, :math:`\beta_i` is the difficulty for item
 28 |   :math:`i`. For the rating scale models, :math:`\beta_i` is the mean difficulty for
 29 |   item :math:`i`. For partial credit models, :math:`\beta_{is}` is the difficulty
 30 |   for step :math:`s` of item :math:`i`.
 31 | - :math:`\kappa_s` is a step difficulty for the (generalized) rating scale
 32 |   model.
 33 | - :math:`\alpha_i` is the discrimination parameter for item :math:`i` (when
 34 |   applicable).
 35 | - :math:`\theta_j` is the ability for person :math:`j`.
 36 | - :math:`\lambda` is the mean of the ability distribution.
 37 | - :math:`\sigma` is the standard deviation of the ability distribution.
 38 | 
 39 | The *.stan* files and the notation for the models below closely adhere
 40 | to these conventions.
 41 | 
 42 | Rasch family models
 43 | -------------------
 44 | 
 45 | Rasch model
 46 | ^^^^^^^^^^^
 47 | 
 48 | *rasch_latent_reg.stan*
 49 | 
 50 | .. math::
 51 | 
 52 |     \mathrm{logit} [ \Pr(y_{ij} = 1 | \theta_j, \beta_i) ] =
 53 |       \theta_j - \beta_i
 54 | 
 55 | Partial credit model
 56 | ^^^^^^^^^^^^^^^^^^^^
 57 | 
 58 | *pcm_latent_reg.stan*
 59 | 
 60 | .. math::
 61 |     \Pr(Y_{ij} = y,~y > 0 | \theta_j, \beta_i) =
 62 |     \frac{\exp \sum_{s=1}^y (\theta_j - \beta_{is})}
 63 |          {1 + \sum_{k=1}^{m_i} \exp \sum_{s=1}^k (\theta_j - \beta_{is})}
 64 | 
 65 | .. math::
 66 |     \Pr(Y_{ij} = y,~y = 0 | \theta_j, \beta_i) =
 67 |     \frac{1}
 68 |          {1 + \sum_{k=1}^{m_i} \exp \sum_{s=1}^k (\theta_j - \beta_{is})}
 69 | 
 70 | Rating scale model
 71 | ^^^^^^^^^^^^^^^^^^
 72 | 
 73 | *rsm_latent_reg.stan*
 74 | 
 75 | .. math::
 76 |     \Pr(Y_{ij} = y,~y > 0 | \theta_j, \beta_i, \kappa_s) =
 77 |     \frac{\exp \sum_{s=1}^y (\theta_j - \beta_i - \kappa_s)}
 78 |          {1 + \sum_{k=1}^{m} \exp \sum_{s=1}^k (\theta_j - \beta_i - \kappa_s)}
 79 | 
 80 | .. math::
 81 |     \Pr(Y_{ij} = y,~y = 0 | \theta_j, \beta_i, \kappa_s) =
 82 |     \frac{1}
 83 |          {1 + \sum_{k=1}^{m} \exp \sum_{s=1}^k (\theta_j - \beta_i - \kappa_s)}
 84 | 
 85 | Models featuring discrimination parameters
 86 | ------------------------------------------
 87 | 
 88 | Two-parameter logistic model
 89 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 90 | 
 91 | *2pl_latent_reg.stan*
 92 | 
 93 | .. math::
 94 |   \mathrm{logit} [ \Pr(y_{ij} = 1 | \alpha_i, \beta_i, \theta_j) ] =
 95 |   \alpha_i \theta_j - \beta_i
 96 | 
 97 | Generalized partial credit model
 98 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 99 | 
100 | *gpcm_latent_reg.stan*
101 | 
102 | .. math::
103 |     \Pr(Y_{ij} = y,~y > 0 | \theta_j, \alpha_i, \beta_i) =
104 |     \frac{\exp \sum_{s=1}^y (\alpha_i  \theta_j - \beta_{is})}
105 |          {1 + \sum_{k=1}^{m_i} \exp \sum_{s=1}^k
106 |            (\alpha_i \theta_j - \beta_{is})}
107 | 
108 | .. math::
109 |     \Pr(Y_{ij} = y,~y = 0 | \theta_j, \alpha_i, \beta_i) =
110 |     \frac{1}
111 |          {1 + \sum_{k=1}^{m_i} \exp \sum_{s=1}^k
112 |            (\alpha_i \theta_j - \beta_{is})}
113 | 
114 | Generalized rating scale model
115 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
116 | 
117 | *grsm_latent_reg.stan*
118 | 
119 | .. math::
120 |     \Pr(Y_{ij} = y,~y > 0 | \theta_j, \lambda, \alpha_i, \beta_i, \kappa_s) =
121 |     \frac{\exp \sum_{s=1}^y
122 |            (\alpha_i \theta_j - \beta_i - \kappa_s)}
123 |          {1 + \sum_{k=1}^{m} \exp \sum_{s=1}^k
124 |            (\alpha_i \theta_j - \beta_i - \kappa_s)}
125 | 
126 | .. math::
127 |     \Pr(Y_{ij} = y,~y = 0 | \theta_j, \lambda, \alpha_i, \beta_i, \kappa_s) =
128 |     \frac{1}
129 |          {1 + \sum_{k=1}^{m} \exp \sum_{s=1}^k
130 |            (\alpha_i \theta_j - \beta_i - \kappa_s)}
131 | 
132 | 
133 | Prior distributions
134 | -------------------
135 | 
136 | For Rasch family models, the prior distributions for the person-related
137 | parameters are
138 | 
139 | - :math:`\theta_j \sim \mathrm{N}(\lambda, \sigma^2)`
140 | - :math:`\lambda \sim t_7(0, 2.5)`
141 | - :math:`\sigma \sim \mathrm{gamma}(2, 1)`
142 | 
143 | For models with discrimination parameters, the priors are
144 | 
145 | - :math:`\theta_j \sim \mathrm{N}(\lambda, 1)`
146 | - :math:`\lambda \sim t_7(0, 2.5)`
147 | 
148 | The priors for the item parameters are
149 | 
150 | - :math:`\alpha \sim \mathrm{lognormal}(.5, 1)`
151 | - :math:`\beta \sim \mathrm{N}(0, 9)`
152 | - :math:`\kappa \sim \mathrm{N}(0, 9)`
153 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
# Make the repository root (the parent of this tests directory) importable so
# the test modules can import the package without it being installed.
_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, _REPO_ROOT)
4 | 
5 | 


--------------------------------------------------------------------------------
/tests/test_gestalt.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | from edstan import EdStanModel, EdStanMCMC
  4 | 
  5 | rng = np.random.default_rng(42)
  6 | 
  7 | # Preset number of items and persons for data generation
  8 | items = 5
  9 | persons = 100
 10 | 
 11 | # Dichotomous response matrix as array
 12 | dich_arr = rng.binomial(1, p=.5, size=(persons, items))
 13 | 
 14 | # Dichotomous response matrix as data frame
 15 | dich_df = pd.DataFrame(dich_arr)
 16 | dich_df.columns = [f'Item {i}' for i in range(items)]
 17 | dich_df.index = [f'Person {i}' for i in range(persons)]
 18 | 
 19 | # Polytomous response matrix as array
 20 | poly_arr = rng.binomial(2, p=.5, size=(persons, items))
 21 | 
 22 | # Polytomous response matrix as data frame
 23 | poly_df = pd.DataFrame(poly_arr)
 24 | poly_df.columns = dich_df.columns
 25 | poly_df.index = dich_df.index
 26 | 
 27 | # Dich/polytomous responses as long-format arrays
 28 | dich_y = dich_arr.flatten()
 29 | poly_y = poly_arr.flatten()
 30 | ii = np.tile(dich_df.columns, reps=persons)
 31 | jj = np.repeat(dich_df.index, repeats=items)
 32 | 
 33 | 
 34 | def test_long():
 35 |     result = EdStanModel('rasch').sample_from_long(
 36 |         ii=ii, jj=jj, y=dich_y,
 37 |         iter_warmup=100, iter_sampling=100, chains=1
 38 |     )
 39 |     assert isinstance(result, EdStanMCMC)
 40 | 
 41 | 
 42 | def test_wide_numpy():
 43 |     response_matrix = rng.binomial(n=1, p=.5, size=(100, 5))
 44 |     result = EdStanModel('rasch').sample_from_wide(
 45 |         dich_arr,
 46 |         iter_warmup=100, iter_sampling=100, chains=1
 47 |     )
 48 |     assert isinstance(result, EdStanMCMC)
 49 | 
 50 | 
 51 | def test_wide_pandas():
 52 |     result = EdStanModel('rasch').sample_from_wide(
 53 |         dich_df,
 54 |         iter_warmup=100, iter_sampling=100, chains=1
 55 |     )
 56 |     assert isinstance(result, EdStanMCMC)
 57 | 
 58 | 
 59 | def test_wide_polytomous():
 60 |     result = EdStanModel('rsm').sample_from_wide(
 61 |         poly_df,
 62 |         iter_warmup=100, iter_sampling=100, chains=1
 63 |     )
 64 |     assert isinstance(result, EdStanMCMC)
 65 | 
 66 | 
 67 | def test_summary_rasch():
 68 |     fit = EdStanModel('rasch').sample_from_wide(
 69 |         dich_df,
 70 |         iter_warmup=100, iter_sampling=100, chains=1
 71 |     )
 72 |     assert fit.item_summary().shape[0] == items * 1 + 2
 73 |     assert fit.person_summary().shape[0] == persons
 74 | 
 75 | 
 76 | def test_summary_2pl():
 77 |     fit = EdStanModel('2pl').sample_from_wide(
 78 |         dich_df,
 79 |         iter_warmup=100, iter_sampling=100, chains=1
 80 |     )
 81 |     assert fit.item_summary().shape[0] == items * 2 + 1
 82 |     assert fit.person_summary().shape[0] == persons
 83 | 
 84 | 
 85 | def test_summary_rsm():
 86 |     fit = EdStanModel('rsm').sample_from_wide(
 87 |         poly_df,
 88 |         iter_warmup=100, iter_sampling=100, chains=1
 89 |     )
 90 |     assert fit.item_summary().shape[0] == items * 1 + 4
 91 |     assert fit.person_summary().shape[0] == persons
 92 | 
 93 | 
 94 | def test_summary_grsm():
 95 |     fit = EdStanModel('grsm').sample_from_wide(
 96 |         poly_df,
 97 |         iter_warmup=100, iter_sampling=100, chains=1
 98 |     )
 99 |     assert fit.item_summary().shape[0] == items * 2 + 3
100 |     assert fit.person_summary().shape[0] == persons
101 | 
102 | 
def test_summary_pcm():
    """PCM summaries have two item rows per item plus two further rows, and one row per person."""
    sampling = {"iter_warmup": 100, "iter_sampling": 100, "chains": 1}
    fit = EdStanModel('pcm').sample_from_wide(poly_df, **sampling)

    item_rows = fit.item_summary().shape[0]
    assert item_rows == items * 2 + 2

    person_rows = fit.person_summary().shape[0]
    assert person_rows == persons
110 | 
111 | 
def test_summary_gpcm():
    """GPCM summaries have three item rows per item plus one further row, and one row per person."""
    sampling = {"iter_warmup": 100, "iter_sampling": 100, "chains": 1}
    fit = EdStanModel('gpcm').sample_from_wide(poly_df, **sampling)

    item_rows = fit.item_summary().shape[0]
    assert item_rows == items * 3 + 1

    person_rows = fit.person_summary().shape[0]
    assert person_rows == persons
119 | 


--------------------------------------------------------------------------------
/tests/test_internals.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import pytest
 4 | 
 5 | from edstan import model
 6 | 
 7 | 
 8 | def test_unique_unsorted():
 9 |     x = np.array(["dog", "cat", "dog", "pony", "owl"])
10 |     result = model._unique_unsorted(x)
11 |     expected = np.array(["dog", "cat", "pony", "owl"])
12 |     np.testing.assert_array_equal(result, expected)
13 | 
14 | 
def test_map_to_unique_ids():
    """Values map to 1-based ids in first-seen order, alongside the distinct labels."""
    animals = np.array(["dog", "cat", "dog", "pony", "owl"])
    ids, labels = model._map_to_unique_ids(animals)
    np.testing.assert_array_equal(ids, np.array([1, 2, 1, 3, 4]))
    np.testing.assert_array_equal(labels, np.array(["dog", "cat", "pony", "owl"]))
22 | 
23 | 
def test_validate_numpy_matrix():
    """A valid 2D array comes back with its values unchanged."""
    matrix = np.zeros((2, 2))
    np.testing.assert_array_equal(model._validate_numpy_matrix(matrix), matrix)
28 | 
29 | 
def test_validate_numpy_matrix__errors():
    """Arrays that are not two-dimensional are rejected with a ValueError."""
    one_dim = np.zeros(3)
    three_dim = np.zeros((3, 3, 3))
    for wrong_shape in (one_dim, three_dim):
        with pytest.raises(ValueError):
            model._validate_numpy_matrix(wrong_shape)
35 | 
36 | 
def test_validate_pandas_matrix():
    """A valid data frame is converted to an array holding the same values."""
    values = np.zeros((2, 2))
    converted = model._validate_pandas_matrix(pd.DataFrame(values))
    np.testing.assert_array_equal(converted, values)
42 | 
43 | 
def test_validate_pandas_matrix__errors():
    """Duplicate labels or a multi-index on either axis are rejected with a ValueError."""
    base = pd.DataFrame(np.zeros((2, 2)))
    nested = pd.MultiIndex.from_product([["a"], ["b", "c"]])

    # Duplicate column names
    duplicate_columns = base.copy()
    duplicate_columns.columns = ["a", "a"]
    with pytest.raises(ValueError):
        model._validate_pandas_matrix(duplicate_columns)

    # Duplicate index values
    duplicate_rows = base.copy()
    duplicate_rows.index = ["a", "a"]
    with pytest.raises(ValueError):
        model._validate_pandas_matrix(duplicate_rows)

    # Multi-index along the rows
    nested_rows = base.copy()
    nested_rows.index = nested
    with pytest.raises(ValueError):
        model._validate_pandas_matrix(nested_rows)

    # Multi-index along the columns
    nested_columns = base.copy()
    nested_columns.columns = nested
    with pytest.raises(ValueError):
        model._validate_pandas_matrix(nested_columns)
62 | 
63 | 
def test_validate_responses_by_item__warnings():
    """Each problematic response pattern for the second item triggers a UserWarning."""
    item_ids = np.array([1, 1, 2, 2])
    problem_cases = [
        [0, 1, 1, 1],  # No variation in second item
        [0, 1, 0, 0],  # No variation in second item
        [0, 1, 1, 2],  # Missing zero in second item
        [0, 1, 0, 2],  # Missing category in second item
    ]
    for responses in problem_cases:
        with pytest.warns(UserWarning):
            # The item ids double as the item labels here
            model._validate_responses_by_item(np.array(responses), item_ids, item_ids)
78 | 


--------------------------------------------------------------------------------