├── Plot.png ├── data ├── votes.Rda └── votes_data.Rda ├── saved_runs └── 1d_model_stan.Rda ├── models └── Stan │ ├── 1d_model.stan │ ├── 1d_hierarchical.stan │ ├── dynamic_model.stan │ └── 2d_model.stan ├── scripts ├── 1d_hierarchical_model_Stan.R ├── 1d_model_Stan.R ├── Dynamic_model.R ├── 2Dmodel_Stan.R └── create_data.R ├── plots └── plot_stan.R └── README.md /Plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertMyles/IRT/HEAD/Plot.png -------------------------------------------------------------------------------- /data/votes.Rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertMyles/IRT/HEAD/data/votes.Rda -------------------------------------------------------------------------------- /data/votes_data.Rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertMyles/IRT/HEAD/data/votes_data.Rda -------------------------------------------------------------------------------- /saved_runs/1d_model_stan.Rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertMyles/IRT/HEAD/saved_runs/1d_model_stan.Rda -------------------------------------------------------------------------------- /models/Stan/1d_model.stan: -------------------------------------------------------------------------------- 1 | data { 2 | int J; //Legislators 3 | int K; //Proposals/bills 4 | int N; //no. of observations 5 | int j[N]; //Legislator for observation n 6 | int k[N]; //proposal for observation n 7 | int y[N]; //vote of observation n 8 | } 9 | parameters { 10 | vector[K] alpha; 11 | vector[K] beta; 12 | vector[J] theta; 13 | } 14 | model { 15 | alpha ~ normal(0,10); 16 | beta ~ normal(0,10); 17 | theta ~ normal(0,1); 18 | theta[1] ~ normal(1, .01); //constraints 19 | theta[2] ~ normal(-1, .01); 20 | for (n in 1:N) 21 | y[n] ~ bernoulli_logit(theta[j[n]] * beta[k[n]] - alpha[k[n]]); 22 | } -------------------------------------------------------------------------------- /scripts/1d_hierarchical_model_Stan.R: -------------------------------------------------------------------------------- 1 | ## One dimensional Hierarchical IRT ideal point model 2 | 3 | library(rstan) 4 | 5 | 6 | load("data/votes.Rda") 7 | 8 | # take out NA for Stan: 9 | nas <- which(is.na(m_votes)) 10 | votes <- m_votes[-nas] 11 | N <- length(votes) 12 | j <- rep(1:50, times = 150) 13 | j <- j[-nas] 14 | k <- rep(1:150, each = 50) 15 | k <- k[-nas] 16 | J <- max(j) 17 | K <- max(k) 18 | 19 | senate_data <- list(N = N, K = K, J = J, j = j, k = k, y = votes) 20 | 21 | 22 | stan.fit <- stan(file = "models/Stan/1d_hierarchical.stan", 23 | data = senate_data, iter = 5000, warmup = 2500, chains = 4, 24 | thin = 5, init = "random", verbose = TRUE, cores = 4, seed = 1234) 25 | -------------------------------------------------------------------------------- /scripts/1d_model_Stan.R: -------------------------------------------------------------------------------- 1 | ## One dimensional IRT ideal point model, using Stan in R 2 | 3 | library(rstan) 4 | 5 | load("data/votes.Rda") 6 | 7 | # take out NA for Stan: 8 | nas <- which(is.na(m_votes)) 9 | votes <- m_votes[-nas] 10 | N <- length(votes) 11 | j <- rep(1:50, times = 150) 12 | j <- j[-nas] 13 | k <- rep(1:150, each = 50) 14 | k <- k[-nas] 15 | J <- max(j) 16 | K <- max(k) 17 | 18 | senate_data <- list(N = N, K = K, J = J, j = j, k = k, y = votes) 19 | 20 | 21 | stan.fit <- stan(file = "models/Stan/1d_hierarchical.stan", 22 | data = senate_data, iter = 5000, warmup = 2500, chains = 4, 23 | thin = 5, init = "random", verbose = TRUE, cores = 4, seed = 1234) 24 | 25 | save(stan.fit, file = "saved_runs/1d_model_stan.Rda") 26 | -------------------------------------------------------------------------------- /models/Stan/1d_hierarchical.stan: -------------------------------------------------------------------------------- 1 | data { 2 | int J; //Legislators 3 | int K; //Proposals/bills 4 | int N; //no. of observations 5 | int j[N]; //Legislator for observation n 6 | int k[N]; //proposal for observation n 7 | int y[N]; //vote of observation n 8 | } 9 | parameters { 10 | vector[K] alpha; 11 | vector[K] beta; 12 | vector[J] theta; 13 | vector[J] mu_theta; 14 | } 15 | model { 16 | alpha ~ normal(0,10); 17 | beta ~ normal(0,10); 18 | theta ~ normal(mu_theta,1); 19 | mu_theta ~ normal(0, 10); 20 | theta[1] ~ normal(1, .01); //constraints 21 | theta[2] ~ normal(-1, .01); 22 | for (n in 1:N) 23 | y[n] ~ bernoulli_logit(theta[j[n]] * beta[k[n]] - alpha[k[n]]); 24 | } -------------------------------------------------------------------------------- /scripts/Dynamic_model.R: -------------------------------------------------------------------------------- 1 | ## Dynamic one dimensional IRT ideal point model 2 | 3 | library(rstan) 4 | 5 | 6 | load("data/votes.Rda") 7 | load("data/votes_data.Rda") 8 | 9 | votes_data <- mutate(votes_data, year_index = ifelse(year == "2017", 1, 2)) 10 | # take out NA: 11 | nas <- which(is.na(m_votes)) 12 | votes <- m_votes[-nas] 13 | N <- length(votes) 14 | j <- rep(1:50, times = 150) 15 | j <- j[-nas] 16 | k <- rep(1:150, each = 50) 17 | k <- k[-nas] 18 | J <- max(j) 19 | K <- max(k) 20 | t <- rep(1:2, each = N/2) 21 | T <- 2 22 | 23 | stan_data <- list(N = N, K = K, J = J, j = j, k = k, t = t, T = T, y = votes) 24 | 25 | 26 | dyn_fit <- stan("models/Stan/dynamic_model.stan", 27 | data = stan_data, iter = 5000, warmup = 2500, 28 | thin = 5, chains = 4, seed = 1234, cores = 4) 29 | 30 | -------------------------------------------------------------------------------- /models/Stan/dynamic_model.stan: -------------------------------------------------------------------------------- 1 | data { 2 | int J; //legislators 3 | int K; //Proposals 4 | int N; //no. of observations 5 | int T; // number of different time periods 6 | int y[N]; // response for n; y = 0, 1 7 | int j[N]; //Legislator for observation n 8 | int k[N]; //Proposal for observation n 9 | int t[N]; // time period for obs n 10 | } 11 | parameters { 12 | vector[K] alpha; 13 | vector[K] beta; 14 | vector[J] theta[T]; 15 | } 16 | model { 17 | theta[1] ~ normal(0, 1); 18 | for (i in 2:T){ 19 | theta[i] ~ normal(theta[i - 1], 1); 20 | } 21 | alpha ~ normal(0, 1); 22 | beta ~ normal(0, 5); 23 | for (n in 1:N) 24 | y[n] ~ bernoulli_logit(theta[t[n], j[n]] * beta[k[n]] - alpha[k[n]]); 25 | } -------------------------------------------------------------------------------- /scripts/2Dmodel_Stan.R: -------------------------------------------------------------------------------- 1 | # Two-Dimensional IRT ideal point model, with vote parameters (Beta) used as 2 | # identifying constraints as per Simon Jackman ("Multidimensional analysis of 3 | # roll call data via Bayesian simulation: identification, estimation, inference, 4 | # and model checking", Political Analysis, Vol 9, Issue 3, 2001). 5 | library(rstan) 6 | 7 | # data: 8 | load("data/votes.Rda") 9 | 10 | 11 | # take out NA: 12 | nas <- which(is.na(m_votes)) 13 | votes <- m_votes[-nas] 14 | N <- length(votes) 15 | j <- rep(1:50, times = 150) 16 | j <- j[-nas] 17 | k <- rep(1:150, each = 50) 18 | k <- k[-nas] 19 | J <- max(j) 20 | K <- max(k) 21 | 22 | 23 | stan.data <- list(J = J, K = K, N = N, j = j, k = k, y = votes, D = 2) 24 | 25 | # Stan run: 26 | stan.fit <- stan("models/Stan/2d_model.stan", 27 | data = stan.data, iter = 5000, warmup = 2500, 28 | chains = 4, verbose = TRUE, cores = 4, seed = 1234) 29 | -------------------------------------------------------------------------------- /models/Stan/2d_model.stan: -------------------------------------------------------------------------------- 1 | data { 2 | int J; //Legislators 3 | int K; //Proposals 4 | int N; //no. of observations 5 | int j[N]; //Legislator for observation n 6 | int k[N]; //Proposal for observation n 7 | int y[N]; //Vote of observation n 8 | int D; //No. of dimensions (here 2) 9 | } 10 | parameters { 11 | real alpha[K]; //difficulty (intercept) 12 | matrix[K,D] beta; //discrimination (slope) 13 | matrix[J,D] theta; //latent trait (ideal points) 14 | } 15 | model { 16 | alpha ~ normal(0,10); 17 | to_vector(beta) ~ normal(0,10); 18 | to_vector(theta) ~ normal(0,1); 19 | theta[1,1] ~ normal(1, .01); //constraints, ideal points, dimension 1 20 | theta[2,1] ~ normal(-1, .01); 21 | beta[1,2] ~ normal(-4, 2); // beta constraints 22 | beta[2,2] ~ normal(4, 2); 23 | beta[1,1] ~ normal(0, .1); 24 | beta[2,1] ~ normal(0, .1); 25 | for (n in 1:N) 26 | y[n] ~ bernoulli_logit(theta[j[n],1] * beta[k[n],1] + theta[j[n],2] * beta[k[n],2] - alpha[k[n]]); 27 | } -------------------------------------------------------------------------------- /plots/plot_stan.R: -------------------------------------------------------------------------------- 1 | library(rstan) # loads ggplot2 by default 2 | library(dplyr) 3 | library(plyr) 4 | 5 | load("data/votes.Rda") 6 | load("data/votes_data.Rda") 7 | load("saved_runs/1d_model_stan.Rda") 8 | 9 | votes_data <- votes_data %>% 10 | select(legislator_id, legislator_party, government) %>% 11 | distinct(legislator_id, .keep_all = TRUE) 12 | 13 | ips <- summary(stan.fit, pars = "theta", probs = c(0.025, 0.975)) 14 | ips <- as_data_frame(ips$summary) %>% 15 | mutate(legislator_id = row.names(m_votes), 16 | index = gsub("[A-Za-z_]*", "", legislator_id), 17 | index = as.numeric(index)) %>% 18 | left_join(votes_data) %>% 19 | rename(lower = `2.5%`, upper = `97.5%`) %>% 20 | arrange(index) 21 | 22 | y <- 1:nrow(ips) 23 | ggplot(ips, aes(x = mean, y = y, colour = government)) + 24 | geom_point() + 25 | geom_errorbarh(aes(xmin = lower, xmax = upper)) + 26 | theme_minimal() + xlab("Ideal Point") + 27 | theme(axis.title.y = element_blank(), 28 | axis.text.y = element_blank(), 29 | axis.ticks.y = element_blank(), 30 | legend.position = "bottom") + 31 | scale_x_continuous(breaks = c(-1, 0, 1), limits = c(-1.25, 1.25)) + 32 | scale_colour_manual(values = c("#2F3061", "#D62839")) 33 | 34 | ## for a 2-dimensional plot, we use a hull to highlight things. 35 | # Assuming you ran the code in 2Dmodel_Stan.R, 36 | # 1-50 is dimension 1, 51-100 dimension 2: 37 | ips <- summary(stan.fit, pars = "theta", probs = c(0.025, 0.975)) 38 | ips <- as_data_frame(ips$summary) 39 | ips_d1 <- ips[1:50, ] 40 | ips_d1 <- rename(ips_d1, meanD1 = mean) 41 | ips_d2 <- ips[51:100, "mean"] 42 | ips_d2 <- rename(ips_d2, meanD2 = mean) 43 | ips <- bind_cols(ips_d1, ips_d2) 44 | ips <- ips %>% 45 | mutate(legislator_id = row.names(m_votes), 46 | index = gsub("[A-Za-z_]*", "", legislator_id), 47 | index = as.numeric(index)) %>% 48 | left_join(votes_data) %>% 49 | arrange(index) 50 | 51 | find_hull <- function(x) x[chull(x$meanD1, x$meanD2), ] 52 | hulls <- ddply(ips, "government", find_hull) 53 | 54 | ggplot(ips, aes(x = meanD1, y = meanD2, colour = government, 55 | fill = government)) + 56 | geom_point(shape = 19, size = 2.5) + 57 | geom_polygon(data = hulls, alpha = .18) + 58 | scale_colour_manual(values = c("#2F3061", "#D62839")) + 59 | scale_fill_manual(values = c("#2F3061", "#D62839")) + 60 | theme_minimal() + 61 | theme(axis.title=element_blank(), legend.position="none") + 62 | geom_hline(yintercept = 0, size = .3) + geom_vline(xintercept = 0, size = .35) + 63 | scale_x_continuous(limits = c(-1.5, 1.25)) #if necessary to adjust scale 64 | -------------------------------------------------------------------------------- /scripts/create_data.R: -------------------------------------------------------------------------------- 1 | # create vote matrix of votes. Row names are voters, column names are 2 | # votes ('bills' or 'rollcalls'). 50 legislators, 150 votes. 3 | 4 | library(dplyr) 5 | 6 | # simulate data: 100 legislators, 150 votes 7 | N <- 50 8 | M <- 150 9 | m_votes <- matrix(NA, nrow = N, ncol = M) 10 | # Liberals (majority Gov. party) 11 | for(n in 1:20){ 12 | m_votes[n, ] <- rbinom(M, size = 1, prob = 0.9) 13 | } 14 | # Conservatives (uneasy coalition) 15 | for(n in 21:32){ 16 | m_votes[n, ] <- rbinom(M, size = 1, prob = 0.7) 17 | } 18 | # Socialists (opposition) 19 | for(n in 33:40){ 20 | m_votes[n, ] <- rbinom(M, size = 1, prob = 0.3) 21 | } 22 | # Greens (opposition) 23 | for(n in 41:45){ 24 | m_votes[n, ] <- rbinom(M, size = 1, prob = 0.25) 25 | } 26 | # Religious (opposition) 27 | for(n in 46:48){ 28 | m_votes[n, ] <- rbinom(M, size = 1, prob = 0.1) 29 | } 30 | # Independents (random) 31 | for(n in 49:50){ 32 | m_votes[n, ] <- rbinom(M, size = 1, prob = 0.5) 33 | } 34 | rm(n) 35 | 36 | votes_data <- data_frame( 37 | vote_id = rep(paste0("Vote_", 1:M), each = N), 38 | legislator_id = rep(1:N, times = M), 39 | vote = as.vector(m_votes), 40 | legislator_party = "" 41 | ) %>% 42 | mutate(legislator_party = case_when( 43 | legislator_id <= 20 ~ "The Classic Liberal Party", 44 | legislator_id > 20 & legislator_id <= 32 ~ "The Conservative Party", 45 | legislator_id > 32 & legislator_id <= 40 ~ "The Socialist Party", 46 | legislator_id > 40 & legislator_id <= 45 ~ "The Green Party", 47 | legislator_id > 45 & legislator_id <= 48 ~ "The Religious Party", 48 | TRUE ~ "Independent"), 49 | legislator_id = paste0("Legislator_", legislator_id), 50 | government = ifelse(legislator_party %in% c("The Classic Liberal Party", 51 | "The Conservative Party"), 52 | "Government", "Opposition"), 53 | index = gsub("[A-Za-z_]*", "", vote_id), 54 | index = as.numeric(index), 55 | year = ifelse(index <= 75, "2017", "2018")) %>% 56 | select(-index) 57 | 58 | dimnames(m_votes)[[1]] <- unique(votes_data$legislator_id) 59 | dimnames(m_votes)[[2]] <- unique(votes_data$vote_id) 60 | 61 | # make the first two voters roughly opposite: 62 | # put voter from religious party in 2nd row 63 | religious <- m_votes[46, ] 64 | liberal <- m_votes[2, ] 65 | m_votes[2, ] <- religious 66 | m_votes[46, ] <- liberal 67 | dimnames(m_votes)[[1]][2] <- "Legislator_46" 68 | dimnames(m_votes)[[1]][46] <- "Legislator_2" 69 | 70 | # and make a random subset NA (missed votes, common in real datasets): 71 | m_votes[sample(seq(m_votes), 50)] <- NA 72 | # save: 73 | save(votes_data, file = "data/votes_data.Rda") 74 | save(m_votes, file = "data/votes.Rda") 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bayesian Ideal Point IRT Models in R using Stan 2 | 3 | This repo includes data, scripts and [Stan](http://mc-stan.org/) code to run Item Response Theory (IRT) models in R. These models are geared towards analysis of legislative voting data (the scripts can be used for other types of voting data too, for example court decisions). The Stan code can also be easily used in Python with pyStan. The models run for 5000 iterations by default, but this can be changed in the R scripts in the call to `stan()`. 4 cores are used to process 4 chains, which again can be changed in the call to `stan()`. For the dynamic and 2-dimensional models, I recommend up to about 8000 iterations to get good estimates, depending on the data. 4 | The fake data were created in `scripts/create_data.R`. This code and toy dataset are for demonstrating how to build these models only, they are not meant to be high-performing models (you will find convergence issues if you run them with the data provided, for example). 5 | 6 | ## Repo structure 7 | 8 | Example data can be found in `data/`. All R scripts, found in `scripts/`, use this dataset. The IRT model code can be found in `models/`, being `models/Stan/` for the Stan model code. R code for making plots of the results of the modelling stage can be found in `plots/`, with an example run in `saved_runs/`. What it makes (from the one dimensional model): 9 | 10 | ![](Plot.png) 11 | 12 | There is also code for a plot of the two dimensional model in the same script. 13 | 14 | ## Models 15 | - One dimensional model (`1d_model_Stan.R`) 16 | - Two dimensional model (`2Dmodel_Stan.R`) 17 | - Dynamic model (`Dynamic_model.R`) 18 | - Hierarchical model (`1d_hierarchical.stan`) (this can also be adapted to have a higher level regression) 19 | 20 | ## IRT modelling 21 | *Ideal point* IRT models differ from regular IRT models in that the discrimination parameter (the *beta* in y = beta*theta - alpha) in a 2-parameter IRT model cannot be negative, whereas in an ideal point model it can. This is to capture the fact that variation along the latent trait can move in both directions in the political context (for example, along a left-right scale). In regular IRT, this does not make much sense as movement upwards along the scale of the latent trait (usually some type of ability) should be associated with positive movement in levels of the actual unobserved trait, i.e. the more of an ability you have, the more chance of getting a correct response. 22 | 23 | BUGS or JAGS users can find a host of regular IRT models in Ian Curtis' [paper](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&uact=8&ved=0ahUKEwjEk-b0_oLOAhUGDpAKHd4CCjMQFggeMAA&url=https%3A%2F%2Fwww.jstatsoft.org%2Farticle%2Fview%2Fv036c01%2Fv36c01.pdf&usg=AFQjCNEs9TOxtdwHK3wdInSin01WCa-Iyw&sig2=Pg9jjBeFewZIzYaAIE_gTg). 24 | JAGS is commonly used in the field for this type of model, but I would recommend using Stan, as JAGS can take a *very* long time for even medium-sized datasets (in the non big-data sense, for example 100 voters voting on 200 votes). The reason for this is that JAGS is unable to build a Directed Acyclic Graph from the unobserved regressor in the basic ideal point IRT equation (see [here](https://sourceforge.net/p/mcmc-jags/discussion/610037/thread/5c9e9026/ )). 25 | 26 | ## Packages 27 | This repo makes use of the [`rstan`](http://mc-stan.org/users/interfaces/rstan) package for R, and also some packages from the [tidyverse](https://www.tidyverse.org/) set of packages for data organisation and plotting. 28 | 29 | ## Issues 30 | If you have any problems running the code, or any questions/criticisms/suggestions, please leave an [issue](https://github.com/RobertMyles/IRT/issues) here on the repo. 31 | --------------------------------------------------------------------------------