├── .gitattributes ├── .gitignore ├── .readthedocs.yaml ├── Makefile ├── README.md ├── edstan ├── __init__.py ├── data │ ├── 2pl_latent_reg.stan │ ├── gpcm_latent_reg.stan │ ├── grsm_latent_reg.stan │ ├── pcm_latent_reg.stan │ ├── rasch_latent_reg.stan │ └── rsm_latent_reg.stan ├── mcmc.py └── model.py ├── environment.yml ├── make.bat ├── pyproject.toml ├── requirements.txt ├── source ├── api.rst ├── conf.py ├── index.rst └── tech.rst └── tests ├── conftest.py ├── test_gestalt.py └── test_internals.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | $RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | 49 | test.py 50 | temp.py 51 | .spyproject/* 52 | 53 | .idea/* 54 | env/* 55 | *__pycache__* 56 | 57 | edstan/data/* 58 | !edstan/data/*.stan 59 | 60 | bernoulli* 61 | 62 | build/* 63 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Required 2 | version: 2 3 | 4 | # Set the OS, Python version, and other tools you might need 5 | build: 6 | os: ubuntu-24.04 7 | tools: 8 | python: "miniconda3-4.7" 9 | 10 | # Build documentation in the "docs/" directory with Sphinx 11 | sphinx: 12 | configuration: source/conf.py 13 | 14 | conda: 15 | environment: environment.yml 16 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # edstan for Python 2 | 3 | A python module that simplifies the fitting of common Bayesian item response theory models using Stan. It is compatible 4 | with and extends the functionality of **pystan**. 5 | 6 | 7 | Features 8 | 9 | - Streamlined interface to common Bayesian item response models using Stan 10 | - Models include: Rasch, two-parameter logistic, (generalized) rating scale, and (generalized) partial credit 11 | - Posterior summaries tailored to item response models 12 | 13 | 14 | ## Installation 15 | 16 | **edstan** depends on the successful installation of **pystan**, so please see the 17 | [pystan installation guide](https://pystan.readthedocs.io/en/latest/installation.html). 18 | Note that compatibility with Windows OS 19 | [may be limited](https://pystan.readthedocs.io/en/latest/faq.html). 20 | 21 | **edstan** may subsequently be installed with **pip**: 22 | 23 | ```bash 24 | pip install edstan 25 | ``` 26 | 27 | ## API Reference 28 | 29 | The API reference may be found at [Read the Docs](https://edstan-python.readthedocs.io/en/latest/index.html). 30 | 31 | 32 | ## Quickstart 33 | 34 | Here is an example of running a model using data from a response matrix: 35 | 36 | ```python 37 | from edstan import EdStanModel 38 | import numpy as np 39 | import pandas as pd 40 | 41 | # Simulate a "wide format" data frame of item responses for 42 | # 5 items and 100 persons. Responses are scored 0 or 1. 43 | rng = np.random.default_rng(seed=42) 44 | data = pd.DataFrame(rng.binomial(1, p=.5, size=(100, 5))) 45 | data.columns = [f"Question {i}" for i in range(5)] 46 | data.index = [f"Respondent {i}" for i in range(100)] 47 | 48 | # Instantiate the model, selecting the Rasch model 49 | model = EdStanModel("rasch") 50 | 51 | # Sample from the model by MCMC 52 | fit = model.sample_from_wide(data) 53 | 54 | # View a posterior summary of the item (and person distribution) 55 | # parameters 56 | print(fit.item_summary()) 57 | 58 | # View a posterior summary of the person parameters 59 | print(fit.person_summary()) 60 | ``` 61 | 62 | Alternatively, this is an example of using long format data: 63 | 64 | ```python 65 | from edstan import EdStanModel 66 | import numpy as np 67 | import pandas as pd 68 | 69 | # Simulate a "long format" data frame of item responses for 70 | # 5 items and 100 persons. Responses are scored 0, 1, or 2. 71 | rng = np.random.default_rng(seed=42) 72 | data = pd.DataFrame( 73 | { 74 | "person": [f"Person {j}" for j in range(100) for i in range(5)], 75 | "item": [f"Item {i}" for j in range(100) for i in range(5)], 76 | "response": rng.binomial(2, p=.5, size=5*100), 77 | } 78 | ) 79 | 80 | # Instantiate the model, choosing the generalized partial 81 | # credit model 82 | model = EdStanModel("gpcm") 83 | 84 | # Sample from the model by MCMC 85 | fit = model.sample_from_long( 86 | ii=data['item'], 87 | jj=data['person'], 88 | y=data['response'], 89 | ) 90 | 91 | # View a posterior summary of the item (and person distribution) 92 | # parameters 93 | print(fit.item_summary()) 94 | 95 | # View a posterior summary of the person parameters 96 | print(fit.person_summary()) 97 | ``` 98 | -------------------------------------------------------------------------------- /edstan/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import EdStanModel, data_from_long, data_from_wide 2 | from .mcmc import EdStanMCMC 3 | -------------------------------------------------------------------------------- /edstan/data/2pl_latent_reg.stan: -------------------------------------------------------------------------------- 1 | data { 2 | int I; // # questions 3 | int J; // # persons 4 | int N; // # observations 5 | array[N] int ii; // question for n 6 | array[N] int jj; // person for n 7 | array[N] int y; // correctness for n 8 | int K; // # person covariates 9 | matrix[J, K] W; // person covariate matrix 10 | } 11 | parameters { 12 | vector[I] alpha; 13 | sum_to_zero_vector[I] beta; 14 | vector[J] theta; 15 | vector[K] lambda; 16 | } 17 | model { 18 | alpha ~ lognormal(.5, 1); 19 | beta ~ normal(0, 3); 20 | lambda ~ student_t(7, 0, 2.5); 21 | theta ~ normal(W * lambda, 1); 22 | y ~ bernoulli_logit(alpha[ii] .* theta[jj] - beta[ii]); 23 | } 24 | -------------------------------------------------------------------------------- /edstan/data/gpcm_latent_reg.stan: -------------------------------------------------------------------------------- 1 | functions { 2 | real pcm(int y, real theta, vector beta) { 3 | vector[rows(beta) + 1] unsummed; 4 | vector[rows(beta) + 1] probs; 5 | unsummed = append_row(rep_vector(0.0, 1), theta - beta); 6 | probs = softmax(cumulative_sum(unsummed)); 7 | return categorical_lpmf(y + 1 | probs); 8 | } 9 | } 10 | data { 11 | int I; // # items 12 | int J; // # persons 13 | int N; // # responses 14 | array[N] int ii; // i for n 15 | array[N] int jj; // j for n 16 | array[N] int y; // response for n; y = 0, 1 ... m_i 17 | int K; // # person covariates 18 | matrix[J, K] W; // person covariate matrix 19 | } 20 | transformed data { 21 | array[I] int m; // # parameters per item 22 | array[I] int pos; // first position in beta vector for item 23 | m = rep_array(0, I); 24 | for (n in 1 : N) { 25 | if (y[n] > m[ii[n]]) { 26 | m[ii[n]] = y[n]; 27 | } 28 | } 29 | pos[1] = 1; 30 | for (i in 2 : I) { 31 | pos[i] = m[i - 1] + pos[i - 1]; 32 | } 33 | } 34 | parameters { 35 | vector[I] alpha; 36 | sum_to_zero_vector[sum(m)] beta; 37 | vector[J] theta; 38 | vector[K] lambda; 39 | } 40 | model { 41 | alpha ~ lognormal(.5, 1); 42 | beta ~ normal(0, 3); 43 | theta ~ normal(W * lambda, 1); 44 | lambda ~ student_t(7, 0, 2.5); 45 | for (n in 1 : N) { 46 | target += pcm(y[n], theta[jj[n]] .* alpha[ii[n]], 47 | segment(beta, pos[ii[n]], m[ii[n]])); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /edstan/data/grsm_latent_reg.stan: -------------------------------------------------------------------------------- 1 | functions { 2 | real rsm(int y, real theta, real beta, vector kappa) { 3 | vector[rows(kappa) + 1] unsummed; 4 | vector[rows(kappa) + 1] probs; 5 | unsummed = append_row(rep_vector(0, 1), theta - beta - kappa); 6 | probs = softmax(cumulative_sum(unsummed)); 7 | return categorical_lpmf(y + 1 | probs); 8 | } 9 | } 10 | data { 11 | int I; // # items 12 | int J; // # persons 13 | int N; // # responses 14 | array[N] int ii; // i for n 15 | array[N] int jj; // j for n 16 | array[N] int y; // response for n; y in {0 ... m_i} 17 | int K; // # person covariates 18 | matrix[J, K] W; // person covariate matrix 19 | } 20 | transformed data { 21 | int m = max(y); // # steps 22 | } 23 | parameters { 24 | vector[I] alpha; 25 | sum_to_zero_vector[I] beta; 26 | sum_to_zero_vector[m] kappa; 27 | vector[J] theta; 28 | vector[K] lambda; 29 | } 30 | model { 31 | alpha ~ lognormal(.5, 1); 32 | beta ~ normal(0, 3); 33 | kappa ~ normal(0, 3); 34 | theta ~ normal(W * lambda, 1); 35 | lambda ~ student_t(7, 0, 2.5); 36 | for (n in 1 : N) { 37 | target += rsm(y[n], theta[jj[n]] .* alpha[ii[n]], beta[ii[n]], kappa); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /edstan/data/pcm_latent_reg.stan: -------------------------------------------------------------------------------- 1 | functions { 2 | real pcm(int y, real theta, vector beta) { 3 | vector[rows(beta) + 1] unsummed; 4 | vector[rows(beta) + 1] probs; 5 | unsummed = append_row(rep_vector(0.0, 1), theta - beta); 6 | probs = softmax(cumulative_sum(unsummed)); 7 | return categorical_lpmf(y + 1 | probs); 8 | } 9 | } 10 | data { 11 | int I; // # items 12 | int J; // # persons 13 | int N; // # responses 14 | array[N] int ii; // i for n 15 | array[N] int jj; // j for n 16 | array[N] int y; // response for n; y = 0, 1 ... m_i 17 | int K; // # person covariates 18 | matrix[J, K] W; // person covariate matrix 19 | } 20 | transformed data { 21 | array[I] int m; // # parameters per item 22 | array[I] int pos; // first position in beta vector for item 23 | m = rep_array(0, I); 24 | for (n in 1 : N) { 25 | if (y[n] > m[ii[n]]) { 26 | m[ii[n]] = y[n]; 27 | } 28 | } 29 | pos[1] = 1; 30 | for (i in 2 : I) { 31 | pos[i] = m[i - 1] + pos[i - 1]; 32 | } 33 | } 34 | parameters { 35 | sum_to_zero_vector[sum(m)] beta; 36 | vector[J] theta; 37 | real sigma; 38 | vector[K] lambda; 39 | } 40 | model { 41 | beta ~ normal(0, 3); 42 | theta ~ normal(W * lambda, sigma); 43 | lambda ~ student_t(7, 0, 2.5); 44 | sigma ~ gamma(2, 1); 45 | for (n in 1 : N) { 46 | target += pcm(y[n], theta[jj[n]], segment(beta, pos[ii[n]], m[ii[n]])); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /edstan/data/rasch_latent_reg.stan: -------------------------------------------------------------------------------- 1 | data { 2 | int I; // # questions 3 | int J; // # persons 4 | int N; // # observations 5 | array[N] int ii; // question for n 6 | array[N] int jj; // person for n 7 | array[N] int y; // correctness for n 8 | int K; // # person covariates 9 | matrix[J, K] W; // person covariate matrix 10 | } 11 | parameters { 12 | sum_to_zero_vector[I] beta; 13 | vector[J] theta; 14 | real sigma; 15 | vector[K] lambda; 16 | } 17 | model { 18 | beta ~ normal(0, 3); 19 | theta ~ normal(W * lambda, sigma); 20 | lambda ~ student_t(7, 0, 2.5); 21 | sigma ~ gamma(2, 1); 22 | y ~ bernoulli_logit(theta[jj] - beta[ii]); 23 | } 24 | -------------------------------------------------------------------------------- /edstan/data/rsm_latent_reg.stan: -------------------------------------------------------------------------------- 1 | functions { 2 | real rsm(int y, real theta, real beta, vector kappa) { 3 | vector[rows(kappa) + 1] unsummed; 4 | vector[rows(kappa) + 1] probs; 5 | unsummed = append_row(rep_vector(0, 1), theta - beta - kappa); 6 | probs = softmax(cumulative_sum(unsummed)); 7 | return categorical_lpmf(y + 1 | probs); 8 | } 9 | } 10 | data { 11 | int I; // # items 12 | int J; // # persons 13 | int N; // # responses 14 | array[N] int ii; // i for n 15 | array[N] int jj; // j for n 16 | array[N] int y; // response for n; y in {0 ... m_i} 17 | int K; // # person covariates 18 | matrix[J, K] W; // person covariate matrix 19 | } 20 | transformed data { 21 | int m = max(y); // # steps 22 | } 23 | parameters { 24 | sum_to_zero_vector[I] beta; 25 | sum_to_zero_vector[m] kappa; 26 | vector[J] theta; 27 | real sigma; 28 | vector[K] lambda; 29 | } 30 | model { 31 | beta ~ normal(0, 3); 32 | kappa ~ normal(0, 3); 33 | theta ~ normal(W * lambda, sigma); 34 | lambda ~ student_t(7, 0, 2.5); 35 | sigma ~ gamma(2, 1); 36 | for (n in 1 : N) { 37 | target += rsm(y[n], theta[jj[n]], beta[ii[n]], kappa); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /edstan/mcmc.py: -------------------------------------------------------------------------------- 1 | from cmdstanpy import CmdStanMCMC 2 | from numpy.typing import NDArray 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | class EdStanMCMC: 8 | """ 9 | A wrapper around :class:`pystan.CmdStanMCMC` that adds additional methods. 10 | 11 | This class delegates all unspecified attribute access to the underlying :class:`pystan.CmdStanMCMC` instance via 12 | :meth:`EdStanMCMC.__getattr__`. This allows it to behave like a :class:`pystan.CmdStanMCMC` object while also 13 | providing custom methods. 14 | """ 15 | 16 | def __init__( 17 | self, 18 | mcmc: CmdStanMCMC, 19 | ii_labels: NDArray, 20 | jj_labels: NDArray, 21 | max_per_item: NDArray[np.integer], 22 | ): 23 | """ 24 | Initializes an :class:`EdStanMCMC` instance. 25 | 26 | An instance of this class is generated by :meth:`ModelMCMC.sample_from_long` or 27 | :meth:`EdStanModel.sample_from_wide`. Though the class may be initialized directly, this is not the intended 28 | usage. 29 | 30 | :param mcmc: A fitted :mod:`edstan` model using MCMC. 31 | :param ii_labels: Labels associated with the items. 32 | :param jj_labels: Labels associated with the persons. 33 | :param max_per_item: The maximum score per item. 34 | """ 35 | self.mcmc = mcmc 36 | self.ii_labels = ii_labels 37 | self.jj_labels = jj_labels 38 | self.max_per_item = max_per_item 39 | 40 | def __getattr__(self, name): 41 | return getattr(self.mcmc, name) 42 | 43 | def item_summary(self, **kwargs): 44 | """ 45 | A wrapper around :meth:`pystan.CmdStanMCMC.summary` that provides posterior summaries grouped by item. 46 | 47 | :param kwargs: Additional optional arguments passed to :meth:`pystan.CmdStanMCMC.summary`, such as 'percentiles' 48 | and 'sig_figs'. 49 | :return: A summary :class:`pandas.DataFrame` filtered to include item and distribution parameters only, having a 50 | multi-index that associates parameters with their respective item labels (or the person distribution). 51 | """ 52 | summary = self.mcmc.summary(**kwargs) 53 | summary.index.name = "parameter" 54 | 55 | expected = _get_expected_parameters_by_group( 56 | item_labels=self.ii_labels, 57 | max_per_item=self.max_per_item, 58 | rasch_family="sigma" in summary.index, 59 | ratings_model="kappa[1]" in summary.index, 60 | ) 61 | 62 | return expected.merge(summary.reset_index(), on="parameter").set_index( 63 | ["parameter group", "parameter"] 64 | ) 65 | 66 | def person_summary(self, **kwargs): 67 | """ 68 | A wrapper around :meth:pystan.CmdStanMCMC.summary that provides posterior summaries grouped by person. 69 | 70 | :param kwargs: Additional optional arguments passed to :meth:`pystan.CmdStanMCMC.summary`, such as 'percentiles' 71 | and 'sig_figs'. 72 | :return: A summary :class:`pandas.DataFrame` filtered to include person parameters only, having a multi-index 73 | that associates parameters with their respective person labels. 74 | """ 75 | summary = self.mcmc.summary(**kwargs) 76 | summary = summary.loc[summary.index.str.match("theta")] 77 | summary["person"] = self.jj_labels 78 | summary.index.name = "parameter" 79 | return summary.reset_index().set_index(["person", "parameter"]) 80 | 81 | 82 | def _get_expected_parameters_by_group( 83 | item_labels, max_per_item, rasch_family: bool, ratings_model: bool 84 | ): 85 | """Generate a DataFrame listing the expected item/distribution parameters and their groupings.""" 86 | holder = [] 87 | 88 | if ratings_model: 89 | betas_per_item = np.ones(len(max_per_item), dtype=int) 90 | else: 91 | betas_per_item = np.array(max_per_item, dtype=int) 92 | 93 | beta_counter = 0 94 | for item, item_max in enumerate(betas_per_item): 95 | if not rasch_family: 96 | holder.append((item_labels[item], f"alpha[{item + 1}]")) 97 | for _ in range(item_max): 98 | beta_counter += 1 99 | holder.append((item_labels[item], f"beta[{beta_counter}]")) 100 | 101 | if ratings_model: 102 | for j in range(int(max(max_per_item))): 103 | holder.append(("Rating scale steps", f"kappa[{j + 1}]")) 104 | 105 | holder.append(("Ability distribution", "lambda[1]")) 106 | 107 | if rasch_family: 108 | holder.append(("Ability distribution", "sigma")) 109 | 110 | df = pd.DataFrame(holder) 111 | df.columns = ["parameter group", "parameter"] 112 | 113 | return df 114 | -------------------------------------------------------------------------------- /edstan/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Union, Dict 3 | from warnings import warn 4 | import numpy as np 5 | from cmdstanpy import CmdStanModel 6 | from numpy.typing import NDArray 7 | from pandas import DataFrame 8 | from .mcmc import EdStanMCMC 9 | 10 | 11 | class EdStanModel(CmdStanModel): 12 | """ 13 | This class is a child of :class:`pystan.CmdStanModel` that adds functionality to load common item response models 14 | and accept data in common formats to perform MCMC sampling. Only the added functionality is documented here. 15 | """ 16 | 17 | def __init__(self, model: str, **kwargs): 18 | """ 19 | Initializes an :class:`EdStanModel` instance. 20 | 21 | Upon instantiating an :class:`EdStanModel` instance, the selected model is prepared for sampling. Afterwards, 22 | the :meth:`EdStanModel.sample_from_long` or :meth:`EdStanModel.sample_from_wide` methods may be used to 23 | initiate MCMC sampling with Stan. 24 | 25 | :param model: The (partial) file name of an :mod:`edstan` model, with matching based on the start of the file 26 | name. Consider specifying "rasch", "2pl", "rsm", "grsm", "pcm", or "gpcm". 27 | :param kwargs: Additional optional arguments passed to the :class:`pystan.CmdStanModel` parent class. 28 | """ 29 | if not isinstance(model, str): 30 | raise ValueError("Invalid value for 'model'. Expected a string.") 31 | 32 | directory = os.path.join(os.path.dirname(__file__), "data") 33 | matching_files = [] 34 | for filename in os.listdir(directory): 35 | if filename.endswith(".stan") and filename.startswith(model.lower()): 36 | matching_files.append(os.path.join(directory, filename)) 37 | 38 | if len(matching_files) == 0: 39 | raise ValueError( 40 | f"Invalid value for 'model': {model}. No matching edstan model found." 41 | ) 42 | 43 | if len(matching_files) > 1: 44 | raise ValueError( 45 | f"Invalid value for 'model': {model}. More than one matching edstan model found." 46 | ) 47 | 48 | self.model = matching_files[0] 49 | super().__init__(stan_file=matching_files[0], **kwargs) 50 | 51 | def sample_from_dict(self, data: Dict, **kwargs) -> EdStanMCMC: 52 | """ 53 | Sample from the model using a dictionary of data. 54 | 55 | Generally it will be more convenient to initialize sampling using the :meth:`EdStanModel.sample_from_long` 56 | or :meth:`EdStanModel.sample_from_wide` methods, which prepare the required dictionary based on common data 57 | formats. 58 | 59 | :param data: A dictionary of data compatible with the :mod:`edstan` models. 60 | :param kwargs: Additional arguments passed to :meth:`pystan.CmdStanModel.sample`, excluding 'data'. Consider 61 | arguments such as 'chains', 'iter_warmup', 'iter_sampling', and 'adapt_delta'. 62 | :return: A fitted MCMC model. 63 | """ 64 | ii_labels = data.pop("ii_labels") 65 | jj_labels = data.pop("jj_labels") 66 | max_per_item = data.pop("max_per_item") 67 | mcmc = super().sample(data=data, **kwargs) 68 | return EdStanMCMC( 69 | mcmc, jj_labels=jj_labels, ii_labels=ii_labels, max_per_item=max_per_item 70 | ) 71 | 72 | def sample_from_long( 73 | self, 74 | ii: NDArray, 75 | jj: NDArray, 76 | y: NDArray[np.integer], 77 | integerize: bool = True, 78 | **kwargs, 79 | ) -> EdStanMCMC: 80 | """ 81 | Sample from the model using response data in the form of several 1D arrays. 82 | 83 | This method is appropriate for "long format" item response data in which scored responses are stored in a flat 84 | array, and additional flat arrays index the person and item associated with each scored response. This format 85 | can accommodate missing responses by removing them beforehand. 86 | 87 | :param ii: A 1D NumPy array representing the item associated with a response. Must be integers 88 | if 'integerize' is set to False. 89 | :param jj: A 1D NumPy array representing the person associated with a response. Must be integers 90 | if 'integerize' is set to False. 91 | :param y: A 1D NumPy array representing the scored responses. The lowest value is expected to be 92 | zero. 93 | :param integerize: Whether to convert 'ii' and 'jj' to index arrays starting at one. This should generally 94 | be set to True but need not be if 'ii' and 'jj' are already formatted this way. 95 | :param kwargs: Additional arguments passed to :meth:`pystan.CmdStanModel.sample`, excluding 'data'. Consider 96 | arguments such as 'chains', 'iter_warmup', 'iter_sampling', and 'adapt_delta'. 97 | :return: A fitted MCMC model. 98 | """ 99 | ii = _validate_vector(ii, label="ii") 100 | jj = _validate_vector(jj, label="jj") 101 | y = _validate_vector(y, label="y") 102 | data = data_from_long(ii=ii, jj=jj, y=y, integerize=integerize, extended=True) 103 | return self.sample_from_dict(data, **kwargs) 104 | 105 | def sample_from_wide( 106 | self, response_matrix: Union[NDArray[np.integer], DataFrame], **kwargs 107 | ) -> EdStanMCMC: 108 | """ 109 | Sample from the model using response data in the form of a 2D array or :class:`pandas.DataFrame`. 110 | 111 | This method is appropriate for "wide format" item response data in which scored response are arrange in a table. 112 | Each row represents a person, and each column represents an item. 113 | 114 | :param response_matrix: A (#persons, #items) 2D array or :class:`pandas.DataFrame` representing the scored 115 | responses. The lowest value is expected to be zero. 116 | :param kwargs: Additional arguments passed to :meth:`pystan.CmdStanModel.sample`, excluding 'data'. Consider 117 | arguments such as 'chains', 'iter_warmup', 'iter_sampling', and 'adapt_delta'. 118 | :return: A fitted MCMC model. 119 | """ 120 | data = data_from_wide(response_matrix=response_matrix, extended=True) 121 | return self.sample_from_dict(data, **kwargs) 122 | 123 | 124 | def data_from_long( 125 | ii: NDArray, 126 | jj: NDArray, 127 | y: NDArray[np.integer], 128 | integerize: bool = True, 129 | extended: bool = False, 130 | ) -> Dict: 131 | """ 132 | Create a dictionary compatible with the :mod:`edstan` models from several 1D arrays. 133 | 134 | In general the :meth:`EdStanModel.sample_from_long` method will be sufficient for preparing 135 | data of this format and performing sampling. This function may be of interest if a copy of the prepared data is 136 | desired. 137 | 138 | :param ii: A 1D NumPy representing the item associated with a response. Must be integers 139 | if 'integerize' is set to False. 140 | :param jj: A 1D NumPy array representing the person associated with a response. Must be integers 141 | if 'integerize' is set to False. 142 | :param y: A 1D NumPy array representing the scored responses. The lowest value is expected to be 143 | zero. 144 | :param integerize: Whether to convert 'ii' and 'jj' to index vectors starting at one. This should generally 145 | be set to True. 146 | :param extended: Whether to add additional metadata keys to the output dictionary. This should generally be set 147 | to False if called by the user. 148 | :return: A dictionary representing item response data. 149 | """ 150 | ii = _validate_vector(ii, label="ii") 151 | jj = _validate_vector(jj, label="jj") 152 | y = _validate_vector(y, label="y") 153 | 154 | if not len(ii) == len(jj) == len(y): 155 | raise ValueError("'ii', 'jj', and 'y' must all have the same length.") 156 | 157 | if integerize: 158 | ii_ints, ii_labels = _map_to_unique_ids(ii) 159 | jj_ints, jj_labels = _map_to_unique_ids(jj) 160 | else: 161 | ii_ints, ii_labels = ii, _unique_unsorted(ii) 162 | jj_ints, jj_labels = jj, _unique_unsorted(jj) 163 | 164 | max_per_item = _validate_responses_by_item(y, ii_ints, ii_labels) 165 | 166 | data = { 167 | "I": max(ii_ints), 168 | "J": max(jj_ints), 169 | "N": len(y), 170 | "ii": ii_ints, 171 | "jj": jj_ints, 172 | "y": y, 173 | "K": 1, 174 | "W": [[1]] * max(jj_ints), 175 | } 176 | 177 | if extended: 178 | data.update( 179 | { 180 | "ii_labels": ii_labels, 181 | "jj_labels": jj_labels, 182 | "max_per_item": max_per_item, 183 | } 184 | ) 185 | 186 | return data 187 | 188 | 189 | def data_from_wide( 190 | response_matrix: Union[NDArray[np.integer], DataFrame], extended: bool = False 191 | ) -> Dict: 192 | """ 193 | Create a dictionary compatible with the :mod:`edstan` models from a response matrix. 194 | 195 | In general the :meth:`EdStanModel.sample_from_wide` method will be sufficient for preparing 196 | data of this format and performing sampling. This function may be of interest if a copy of the prepared data is 197 | desired. 198 | 199 | :param response_matrix: A (#persons, #items) array or :class:`pandas.DataFrame` representing the scored responses. 200 | The lowest value is expected to be zero. 201 | :param extended: Whether to add additional metadata keys to the output dictionary. This should generally be set 202 | to False if called by the user. 203 | :return: A dictionary representing item response data. 204 | """ 205 | if isinstance(response_matrix, DataFrame): 206 | mat = _validate_pandas_matrix(response_matrix) 207 | ii = np.tile(response_matrix.columns, mat.shape[0]) 208 | jj = np.repeat(response_matrix.index, mat.shape[1]) 209 | else: 210 | mat = _validate_numpy_matrix(response_matrix) 211 | ii = np.tile(np.arange(mat.shape[1]) + 1, mat.shape[0]) 212 | jj = np.repeat(np.arange(mat.shape[0]) + 1, mat.shape[1]) 213 | 214 | y = mat.flatten() 215 | 216 | return data_from_long(ii=ii, jj=jj, y=y, extended=extended, integerize=True) 217 | 218 | 219 | def _unique_unsorted(arr: NDArray): 220 | """Given a 1D array, return the unique elements in the order of first observance.""" 221 | return np.array([x for i, x in enumerate(arr) if x not in arr[:i]]) 222 | 223 | 224 | def _map_to_unique_ids(arr: NDArray): 225 | """Turn a 1D array into a tuple of an index and the unique values.""" 226 | unique_values = _unique_unsorted(arr) 227 | unique_values_list = unique_values.tolist() 228 | indices = np.array([unique_values_list.index(x) + 1 for x in arr]) 229 | return indices, unique_values 230 | 231 | 232 | def _validate_pandas_matrix(response_matrix: Union[NDArray, DataFrame]) -> NDArray: 233 | """Apply checks to a response matrix dataframe and convert to a 2D NDArray.""" 234 | if response_matrix.shape[0] != len(np.unique(response_matrix.index)): 235 | raise ValueError("The pandas dataframe must not have duplicate index values.") 236 | 237 | if response_matrix.shape[1] != len(np.unique(response_matrix.columns)): 238 | raise ValueError("The pandas dataframe must not have duplicate column names.") 239 | 240 | if response_matrix.index.nlevels != 1: 241 | raise ValueError( 242 | "The pandas dataframe must not have a multi-index along the rows." 243 | ) 244 | 245 | if response_matrix.columns.nlevels != 1: 246 | raise ValueError( 247 | "The pandas dataframe must not have a multi-index along the columns." 248 | ) 249 | 250 | return _validate_numpy_matrix(response_matrix) 251 | 252 | 253 | def _validate_numpy_matrix(response_matrix: Union[NDArray, DataFrame]) -> NDArray: 254 | """Convert a response matrix to an NDArray, apply checks, and return the NDArray.""" 255 | try: 256 | mat = np.asarray(response_matrix) 257 | except Exception as exc: 258 | raise ValueError( 259 | "'response_matrix' must be a 2-dimensional numpy array or an object convertable to the same." 260 | ) from exc 261 | 262 | if mat.ndim != 2: 263 | raise ValueError( 264 | f"'response_matrix' has {mat.ndim} dimensions, but must have two." 265 | ) 266 | return mat 267 | 268 | 269 | def _validate_vector(arr: NDArray, label: str) -> NDArray: 270 | """Convert argument to an NDArray, check that it is 1D, and return the NDArray.""" 271 | try: 272 | arr = np.array(arr) 273 | except Exception as exc: 274 | raise ValueError( 275 | f"'{label}' must be a 1-dimensional numpy array or an object convertable to the same." 276 | ) from exc 277 | 278 | if arr.ndim != 1: 279 | raise ValueError( 280 | f"'{label}' must be a 1-dimensional numpy array or an object convertable to the same." 281 | ) 282 | 283 | return arr 284 | 285 | 286 | def _validate_responses_by_item( 287 | y: NDArray, ii_ints: NDArray, ii_labels: NDArray 288 | ) -> NDArray: 289 | """Apply checks to 1D NDArray of item responses and return the max score per item.""" 290 | max_per_item = np.zeros(max(ii_ints)) 291 | 292 | for u in np.unique(ii_ints): 293 | responses = y[ii_ints == u] 294 | label = ii_labels[u - 1] 295 | 296 | mn = min(responses) 297 | mx = max(responses) 298 | 299 | if mn != 0: 300 | warn(f"Item {label} does not have a minimum response value of zero.") 301 | 302 | if len(np.unique(responses)) == 1: 303 | warn(f"Item {label} only has response values of {responses[0]}.") 304 | 305 | if len(np.unique(responses)) != (mx - mn + 1): 306 | warn(f"Item {label} has missing response categories.") 307 | 308 | max_per_item[u - 1] = mx 309 | 310 | return max_per_item 311 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - python = 3.11 5 | - pylint 6 | - black 7 | - pytest 8 | - sphinx 9 | - build 10 | - twine 11 | - cmdstanpy 12 | - numpy 13 | - pandas 14 | -------------------------------------------------------------------------------- /make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pylint] 2 | max-line-length = 120 3 | 4 | [build-system] 5 | requires = ["setuptools>=61.0", "wheel"] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [project] 9 | name = "edstan" 10 | version = "0.2.0" 11 | description = "Streamlines the fitting of common Bayesian item response theory models using Stan" 12 | readme = "README.md" 13 | authors = [ 14 | { name = "Daniel C. Furr", email = "danielcfurr@berkeley.edu" } 15 | ] 16 | license = { text = "MIT" } 17 | requires-python = ">=3.8" 18 | classifiers = [ 19 | "Programming Language :: Python :: 3", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent" 22 | ] 23 | urls = [ 24 | { "Homepage" = "https://github.com/danielcfurr/edstan-python" }, 25 | { "Reference" = "https://edstan-python.readthedocs.io/en/latest/"} 26 | ] 27 | 28 | [tool.setuptools.packages.find] 29 | where = ["."] 30 | include = ["edstan"] 31 | 32 | [tool.setuptools.package-data] 33 | edstan = [ 34 | "data/rasch_latent_reg.stan", 35 | "data/2pl_latent_reg.stan", 36 | "data/rsm_latent_reg.stan", 37 | "data/grsm_latent_reg.stan", 38 | "data/pcm_latent_reg.stan", 39 | "data/gpcm_latent_reg.stan" 40 | ] 41 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy~=2.2.4 2 | pandas~=2.2.3 3 | pytest~=8.3.5 4 | cmdstanpy~=1.2.5 5 | setuptools~=75.8.2 -------------------------------------------------------------------------------- /source/api.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | .. autoclass:: edstan.EdStanModel 5 | :members: __init__, sample_from_dict, sample_from_long, sample_from_wide 6 | :undoc-members: 7 | 8 | .. autoclass:: edstan.EdStanMCMC 9 | :members: __init__, item_summary, person_summary 10 | :undoc-members: 11 | 12 | .. autofunction:: edstan.data_from_long 13 | 14 | .. autofunction:: edstan.data_from_wide -------------------------------------------------------------------------------- /source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = 'edstan' 10 | copyright = '2025, Daniel C. Furr' 11 | author = 'Daniel C. Furr' 12 | release = '0.2.0' 13 | 14 | # -- General configuration --------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 16 | 17 | extensions = [ 18 | 'sphinx.ext.autodoc', 19 | 'sphinx.ext.napoleon', # for Google/NumPy-style docstrings 20 | 'sphinx.ext.viewcode', # optional, adds source code links 21 | ] 22 | 23 | templates_path = ['_templates'] 24 | exclude_patterns = [] 25 | 26 | import os 27 | import sys 28 | sys.path.insert(0, os.path.abspath('../')) 29 | 30 | 31 | # -- Options for HTML output ------------------------------------------------- 32 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 33 | 34 | html_theme = 'alabaster' 35 | html_static_path = ['_static'] -------------------------------------------------------------------------------- /source/index.rst: -------------------------------------------------------------------------------- 1 | .. edstan documentation master file, created by 2 | sphinx-quickstart on Fri Apr 4 11:19:26 2025. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | edstan 7 | ====== 8 | 9 | A python module that simplifies the fitting of common Bayesian item response theory models using Stan. It is compatible 10 | with and extends the functionality of :mod:`pystan`. 11 | 12 | 13 | Features 14 | -------- 15 | 16 | - Streamlined interface to common Bayesian item response models using Stan 17 | - Models include: Rasch, two-parameter logistic, (generalized) rating scale, and (generalized) partial credit 18 | - Posterior summaries tailored to item response models 19 | 20 | 21 | Installation 22 | ------------ 23 | 24 | :mod:`edstan` depends on the successful installation of :mod:`pystan`, so please see the 25 | `pystan installation guide `_. 26 | Note that compatibility with Windows OS 27 | `may be limited `_. 28 | 29 | :mod:`edstan` may subsequently be installed with :mod:`pip`: 30 | 31 | .. code-block:: bash 32 | 33 | pip install edstan 34 | 35 | 36 | Quickstart 37 | ---------- 38 | 39 | Here is an example of running a model using data from a response matrix: 40 | 41 | .. code-block:: python 42 | 43 | from edstan import EdStanModel 44 | import numpy as np 45 | import pandas pd 46 | 47 | # Simulate a "wide format" data frame of item responses for 48 | # 5 items and 100 persons. Responses are scored 0 or 1. 49 | rng = np.random.default_generator(seed=42) 50 | data = pd.DataFrame(rng.binomial(1, p=.5, size=(100, 5))) 51 | data.columns = [f"Question {i}" for i in range(5)] 52 | data.index = [f"Respondent {i}" for i in range(100)] 53 | 54 | # Instantiate the model, selecting the Rasch model 55 | model = EdStanModel("rasch") 56 | 57 | # Sample from the model by MCMC 58 | fit = model.sample_from_long(data) 59 | 60 | # View a posterior summary of the item (and person distribution) 61 | # parameters 62 | print(fit.item_summary()) 63 | 64 | # View a posterior summary of the person parameters 65 | print(fit.person_summary()) 66 | 67 | 68 | Alternatively, this is an example of using long format data: 69 | 70 | .. code-block:: python 71 | 72 | from edstan import EdStanModel 73 | import numpy as np 74 | import pandas as pd 75 | 76 | # Simulate a "long format" data frame of item responses for 77 | # 5 items and 100 persons. Responses are scored 0, 1, or 2. 78 | rng = np.random.default_generator(seed=42) 79 | data = pd.DataFrame( 80 | { 81 | "person": [f"Person {j}" for j in range(100) for i in range(5)] 82 | "item": [f"Item {i}" for j in range(100) for i in range(5)] 83 | "response": rng.binomial(2, p=.5, size=5*100) 84 | } 85 | ) 86 | 87 | # Instantiate the model, choosing the generalized partial 88 | # credit model 89 | model = EdStanModel("gpcm") 90 | 91 | # Sample from the model by MCMC 92 | fit = model.sample_from_long( 93 | ii=data['item'], 94 | jj=data['person'], 95 | y=data['response'] 96 | ) 97 | 98 | # View a posterior summary of the item (and person distribution) 99 | # parameters 100 | print(fit.item_summary()) 101 | 102 | # View a posterior summary of the person parameters 103 | print(fit.person_summary()) 104 | 105 | 106 | Contents 107 | -------- 108 | 109 | .. toctree:: 110 | :maxdepth: 2 111 | :caption: Contents: 112 | 113 | api 114 | tech 115 | -------------------------------------------------------------------------------- /source/tech.rst: -------------------------------------------------------------------------------- 1 | Technical Notes 2 | =============== 3 | 4 | Users will be able to fit the :mod:`edstan` models without full knowledge of 5 | the technical details, though these are provided in this section. All 6 | that is really needed for interpreting results is to know the meanings 7 | assigned to the Greek letters. 8 | 9 | Notation 10 | -------- 11 | 12 | Variables and parameters are similar across :mod:`edstan` models. The variables 13 | used are: 14 | 15 | - :math:`i = 1 \ldots I` indexes items. 16 | - :math:`j = 1 \ldots J` indexes persons. 17 | - :math:`m_i` is simultaneously the maximum score and the number of step 18 | difficulty parameters for item $i$ for partial credit models. 19 | Alternatively, :math:`m` is the same across all items for rating scale 20 | models. 21 | - :math:`s = 1 \ldots m_i` or :math:`s = 1 \ldots m` indexes steps within items. 22 | - :math:`y_{ij}` is the scored response of person :math:`j` to item :math:`i`. The lowest 23 | score for items must be zero (except for rating scale models). 24 | 25 | The parameters used are: 26 | 27 | - For the Rasch and 2PL models, :math:`\beta_i` is the difficulty for item 28 | :math:`i`. For the rating scale models, :math:`\beta_i` is the mean difficulty for 29 | item :math:`i`. For partial credit models, :math:`\beta_{is}` is the difficulty 30 | for step :math:`s` of item :math:`i`. 31 | - :math:`\kappa_s` is a step difficulty for the (generalized) rating scale 32 | model. 33 | - :math:`\alpha_i` is the discrimination parameter for item :math:`i` (when 34 | applicable). 35 | - :math:`\theta_j` is the ability for person :math:`j`. 36 | - :math:`\lambda` is mean of the ability distribution. 37 | - :math:`\sigma` is standard deviation for the ability distribution.. 38 | 39 | The *.stan* files and the notation for the models below closely adhere 40 | to these conventions. 41 | 42 | Rasch family models 43 | ------------------- 44 | 45 | Rasch model 46 | ^^^^^^^^^^^ 47 | 48 | *rasch_latent_reg.stan* 49 | 50 | .. math:: 51 | 52 | \mathrm{logit} [ \Pr(y_{ij} = 1 | \theta_j, \beta_i) ] = 53 | \theta_j - \beta_i 54 | 55 | Partial credit model 56 | ^^^^^^^^^^^^^^^^^^^^ 57 | 58 | *pcm_latent_reg.stan* 59 | 60 | .. math:: 61 | \Pr(Y_{ij} = y,~y > 0 | \theta_j, \beta_i) = 62 | \frac{\exp \sum_{s=1}^y (\theta_j - \beta_{is})} 63 | {1 + \sum_{k=1}^{m_i} \exp \sum_{s=1}^k (\theta_j - \beta_{is})} 64 | 65 | .. math:: 66 | \Pr(Y_{ij} = y,~y = 0 | \theta_j, \beta_i) = 67 | \frac{1} 68 | {1 + \sum_{k=1}^{m_i} \exp \sum_{s=1}^k (\theta_j - \beta_{is})} 69 | 70 | Rating scale model 71 | ^^^^^^^^^^^^^^^^^^ 72 | 73 | *rsm_latent_reg.stan* 74 | 75 | .. math:: 76 | \Pr(Y_{ij} = y,~y > 0 | \theta_j, \beta_i, \kappa_s) = 77 | \frac{\exp \sum_{s=1}^y (\theta_j - \beta_i - \kappa_s)} 78 | {1 + \sum_{k=1}^{m} \exp \sum_{s=1}^k (\theta_j - \beta_i - \kappa_s)} 79 | 80 | .. math:: 81 | \Pr(Y_{ij} = y,~y = 0 | \theta_j, \beta_i, \kappa_s) = 82 | \frac{1} 83 | {1 + \sum_{k=1}^{m} \exp \sum_{s=1}^k (\theta_j - \beta_i - \kappa_s)} 84 | 85 | Models featuring discrimination parameters 86 | ------------------------------------------ 87 | 88 | Two-parameter logistic model 89 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 90 | 91 | *2pl_latent_reg.stan* 92 | 93 | .. math:: 94 | \mathrm{logit} [ \Pr(y_{ij} = 1 | \alpha_i, \beta_i, \theta_j) ] = 95 | \alpha_i \theta_j - \beta_i 96 | 97 | Generalized partial credit model 98 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 99 | 100 | *gpcm_latent_reg.stan* 101 | 102 | .. math:: 103 | \Pr(Y_{ij} = y,~y > 0 | \theta_j, \alpha_i, \beta_i) = 104 | \frac{\exp \sum_{s=1}^y (\alpha_i \theta_j - \beta_{is})} 105 | {1 + \sum_{k=1}^{m_i} \exp \sum_{s=1}^k 106 | (\alpha_i \theta_j - \beta_{is})} 107 | 108 | .. math:: 109 | \Pr(Y_{ij} = y,~y = 0 | \theta_j, \alpha_i, \beta_i) = 110 | \frac{1} 111 | {1 + \sum_{k=1}^{m_i} \exp \sum_{s=1}^k 112 | (\alpha_i \theta_j + w_{j}' \lambda - \beta_{is})} 113 | 114 | Generalized rating scale model 115 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 116 | 117 | *grsm_latent_reg.stan* 118 | 119 | .. math:: 120 | \Pr(Y_{ij} = y,~y > 0 | \theta_j, \lambda, \alpha_i, \beta_i, \kappa_s) = 121 | \frac{\exp \sum_{s=1}^y 122 | (\alpha_i \theta_j - \beta_i - \kappa_s)} 123 | {1 + \sum_{k=1}^{m} \exp \sum_{s=1}^k 124 | (\alpha_i \theta_j - \beta_i - \kappa_s)} 125 | 126 | .. math:: 127 | \Pr(Y_{ij} = y,~y = 0 | \theta_j, \lambda, \alpha_i, \beta_i, \kappa_s) = 128 | \frac{1} 129 | {1 + \sum_{k=1}^{m} \exp \sum_{s=1}^k 130 | (\alpha_i \theta_j - \beta_i - \kappa_s)} 131 | 132 | 133 | Prior distributions 134 | ------------------- 135 | 136 | For Rasch family models, the prior distributions for the person-related 137 | parameters are 138 | 139 | - :math:`\theta_j \sim \mathrm{N}(\lambda, \sigma^2)` 140 | - :math:`\lambda \sim t_7(0, 2.5)` 141 | - :math:`\sigma \sim \mathrm{gamma}(2, 1)` 142 | 143 | For models with discrimination parameters, the priors are 144 | 145 | - :math:`\theta_j \sim \mathrm{N}(\lambda, 1)` 146 | - :math:`\lambda \sim t_7(0, 2.5)` 147 | 148 | The priors for the item parameters are 149 | 150 | - :math:`\alpha \sim \mathrm{lognormal}(.5, 1)` 151 | - :math:`\beta \sim \mathrm{N}(0, 9)` 152 | - :math:`\kappa \sim \mathrm{N}(0, 9)` 153 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 4 | 5 | -------------------------------------------------------------------------------- /tests/test_gestalt.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from edstan import EdStanModel, EdStanMCMC 4 | 5 | rng = np.random.default_rng(42) 6 | 7 | # Preset number of items and persons for data generation 8 | items = 5 9 | persons = 100 10 | 11 | # Dichotomous response matrix as array 12 | dich_arr = rng.binomial(1, p=.5, size=(persons, items)) 13 | 14 | # Dichotomous response matrix as data frame 15 | dich_df = pd.DataFrame(dich_arr) 16 | dich_df.columns = [f'Item {i}' for i in range(items)] 17 | dich_df.index = [f'Person {i}' for i in range(persons)] 18 | 19 | # Polytomous response matrix as array 20 | poly_arr = rng.binomial(2, p=.5, size=(persons, items)) 21 | 22 | # Polytomous response matrix as data frame 23 | poly_df = pd.DataFrame(poly_arr) 24 | poly_df.columns = dich_df.columns 25 | poly_df.index = dich_df.index 26 | 27 | # Dich/polytomous responses as long-format arrays 28 | dich_y = dich_arr.flatten() 29 | poly_y = poly_arr.flatten() 30 | ii = np.tile(dich_df.columns, reps=persons) 31 | jj = np.repeat(dich_df.index, repeats=items) 32 | 33 | 34 | def test_long(): 35 | result = EdStanModel('rasch').sample_from_long( 36 | ii=ii, jj=jj, y=dich_y, 37 | iter_warmup=100, iter_sampling=100, chains=1 38 | ) 39 | assert isinstance(result, EdStanMCMC) 40 | 41 | 42 | def test_wide_numpy(): 43 | response_matrix = rng.binomial(n=1, p=.5, size=(100, 5)) 44 | result = EdStanModel('rasch').sample_from_wide( 45 | dich_arr, 46 | iter_warmup=100, iter_sampling=100, chains=1 47 | ) 48 | assert isinstance(result, EdStanMCMC) 49 | 50 | 51 | def test_wide_pandas(): 52 | result = EdStanModel('rasch').sample_from_wide( 53 | dich_df, 54 | iter_warmup=100, iter_sampling=100, chains=1 55 | ) 56 | assert isinstance(result, EdStanMCMC) 57 | 58 | 59 | def test_wide_polytomous(): 60 | result = EdStanModel('rsm').sample_from_wide( 61 | poly_df, 62 | iter_warmup=100, iter_sampling=100, chains=1 63 | ) 64 | assert isinstance(result, EdStanMCMC) 65 | 66 | 67 | def test_summary_rasch(): 68 | fit = EdStanModel('rasch').sample_from_wide( 69 | dich_df, 70 | iter_warmup=100, iter_sampling=100, chains=1 71 | ) 72 | assert fit.item_summary().shape[0] == items * 1 + 2 73 | assert fit.person_summary().shape[0] == persons 74 | 75 | 76 | def test_summary_2pl(): 77 | fit = EdStanModel('2pl').sample_from_wide( 78 | dich_df, 79 | iter_warmup=100, iter_sampling=100, chains=1 80 | ) 81 | assert fit.item_summary().shape[0] == items * 2 + 1 82 | assert fit.person_summary().shape[0] == persons 83 | 84 | 85 | def test_summary_rsm(): 86 | fit = EdStanModel('rsm').sample_from_wide( 87 | poly_df, 88 | iter_warmup=100, iter_sampling=100, chains=1 89 | ) 90 | assert fit.item_summary().shape[0] == items * 1 + 4 91 | assert fit.person_summary().shape[0] == persons 92 | 93 | 94 | def test_summary_grsm(): 95 | fit = EdStanModel('grsm').sample_from_wide( 96 | poly_df, 97 | iter_warmup=100, iter_sampling=100, chains=1 98 | ) 99 | assert fit.item_summary().shape[0] == items * 2 + 3 100 | assert fit.person_summary().shape[0] == persons 101 | 102 | 103 | def test_summary_pcm(): 104 | fit = EdStanModel('pcm').sample_from_wide( 105 | poly_df, 106 | iter_warmup=100, iter_sampling=100, chains=1 107 | ) 108 | assert fit.item_summary().shape[0] == items * 2 + 2 109 | assert fit.person_summary().shape[0] == persons 110 | 111 | 112 | def test_summary_gpcm(): 113 | fit = EdStanModel('gpcm').sample_from_wide( 114 | poly_df, 115 | iter_warmup=100, iter_sampling=100, chains=1 116 | ) 117 | assert fit.item_summary().shape[0] == items * 3 + 1 118 | assert fit.person_summary().shape[0] == persons 119 | -------------------------------------------------------------------------------- /tests/test_internals.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | from edstan import model 6 | 7 | 8 | def test_unique_unsorted(): 9 | x = np.array(["dog", "cat", "dog", "pony", "owl"]) 10 | result = model._unique_unsorted(x) 11 | expected = np.array(["dog", "cat", "pony", "owl"]) 12 | np.testing.assert_array_equal(result, expected) 13 | 14 | 15 | def test_map_to_unique_ids(): 16 | x = np.array(["dog", "cat", "dog", "pony", "owl"]) 17 | result_ints, result_labels = model._map_to_unique_ids(x) 18 | expected_ints = np.array([1, 2, 1, 3, 4]) 19 | expected_labels = np.array(["dog", "cat", "pony", "owl"]) 20 | np.testing.assert_array_equal(result_ints, expected_ints) 21 | np.testing.assert_array_equal(result_labels, expected_labels) 22 | 23 | 24 | def test_validate_numpy_matrix(): 25 | x = np.zeros((2, 2)) 26 | result = model._validate_numpy_matrix(x) 27 | np.testing.assert_array_equal(result, x) 28 | 29 | 30 | def test_validate_numpy_matrix__errors(): 31 | with pytest.raises(ValueError): 32 | model._validate_numpy_matrix(np.zeros(3)) 33 | with pytest.raises(ValueError): 34 | model._validate_numpy_matrix(np.zeros((3, 3, 3))) 35 | 36 | 37 | def test_validate_pandas_matrix(): 38 | x = np.zeros((2, 2)) 39 | df = pd.DataFrame(x) 40 | result = model._validate_pandas_matrix(df) 41 | np.testing.assert_array_equal(result, x) 42 | 43 | 44 | def test_validate_pandas_matrix__errors(): 45 | df = pd.DataFrame(np.zeros((2, 2))) 46 | with pytest.raises(ValueError): 47 | x = df.copy() 48 | x.columns = ["a", "a"] 49 | model._validate_pandas_matrix(x) 50 | with pytest.raises(ValueError): 51 | x = df.copy() 52 | x.index = ["a", "a"] 53 | model._validate_pandas_matrix(x) 54 | with pytest.raises(ValueError): 55 | x = df.copy() 56 | x.index = pd.MultiIndex.from_product([["a"], ["b", "c"]]) 57 | model._validate_pandas_matrix(x) 58 | with pytest.raises(ValueError): 59 | x = df.copy() 60 | x.columns = pd.MultiIndex.from_product([["a"], ["b", "c"]]) 61 | model._validate_pandas_matrix(x) 62 | 63 | 64 | def test_validate_responses_by_item__warnings(): 65 | ii_ints = np.array([1, 1, 2, 2]) 66 | with pytest.warns(UserWarning): 67 | # No variation in second item 68 | model._validate_responses_by_item(np.array([0, 1, 1, 1]), ii_ints, ii_ints) 69 | with pytest.warns(UserWarning): 70 | # No variation in second item 71 | model._validate_responses_by_item(np.array([0, 1, 0, 0]), ii_ints, ii_ints) 72 | with pytest.warns(UserWarning): 73 | # Missing zero in second item 74 | model._validate_responses_by_item(np.array([0, 1, 1, 2]), ii_ints, ii_ints) 75 | with pytest.warns(UserWarning): 76 | # Missing category in second item 77 | model._validate_responses_by_item(np.array([0, 1, 0, 2]), ii_ints, ii_ints) 78 | --------------------------------------------------------------------------------