├── Data ├── Meta_all.csv ├── Meta_smg_sc.csv └── Meta_smg_sn.csv ├── Explained Variability ├── Barplot.LFLMM.R ├── LFLMM.LMMBF.R ├── convert_h5ad.py ├── convert_h5ad_smg.py ├── run.R ├── run_smg_sc.R └── run_smg_sn.R ├── Plots ├── All_sc.png ├── SMG_sc_bin.png ├── SMG_sc_lin.png ├── SMG_sn_bin.png └── SMG_sn_lin.png ├── README.md └── TCR-clonotypes.ipynb /Explained Variability/Barplot.LFLMM.R: -------------------------------------------------------------------------------- 1 | Barplot <- 2 | function(x, plotfile, title="", col=1){ 3 | 4 | getUpper95=function(psi, V){ 5 | denom = sum(psi^2)+1 6 | psivar=NULL 7 | for(k in 1:length(psi)){ 8 | grad = - (2*psi)*psi[k]^2/denom^2 9 | grad[k] = grad[k] + (2*psi[k])/denom 10 | psivar = c(psivar, sum(t(V*grad)*grad)) 11 | } 12 | 13 | psi^2/denom + 1.96 * sqrt(psivar) 14 | } 15 | 16 | png(plotfile) 17 | 18 | remids=c(1) 19 | V = solve(x$hessian)[-remids,-remids] 20 | labs = names(x$nh)[-remids] 21 | psi = x$psi[-remids] 22 | ord = rev(order(psi^2)) 23 | par(mar=c(8, 4.1, 4.1, 2.1), mgp=c(2,0.5,0),family="Liberation Sans") 24 | 25 | varexp = psi[ord]^2/(1+sum(psi^2))*100 26 | varse = getUpper95(psi,V)[ord]*100 27 | labs_plot <- gsub(".", " ", labs, fixed=T) 28 | labs_plot <- gsub("X ", "%", labs_plot, fixed=T) 29 | 30 | xcoords = barplot(varexp, name=labs_plot[ord],las=2, ylim=c(0,max(varse)+2), col=col,border=col, main=title) 31 | mtext("Variance Explained (%)",2,line=2) 32 | 33 | segments(xcoords, varexp, xcoords, varse) 34 | segments(xcoords-0.3, varse, xcoords+0.3, varse) 35 | 36 | dev.off() 37 | 38 | res = cbind(varexp, varse) 39 | rownames(res)=labs_plot[ord] 40 | res 41 | } 42 | -------------------------------------------------------------------------------- /Explained Variability/LFLMM.LMMBF.R: -------------------------------------------------------------------------------- 1 | emn <- 2 | function(X){ 3 | N=ncol(X) 4 | geta = apply(X,1,max)-10 5 | X = X-geta 6 | print(range(-geta)) 7 | #x[x==Inf&!is.na(x)]=max(x[xlkhd0 && abs(lkhd-lkhd0)<1e-7){cat("Converged.\n");return(z[,c(2:(N+1),1)])}else{print(c(lkhd0,lkhd,p)); lkhd0=lkhd} 18 | } 19 | } 20 | LMMBF <- 21 | function(Y, res.nlm, colname, ncont=2, Resid=NULL, geta=100, AllClus=F){ 22 | 23 | logDet = function(x){eval = eigen(x,T)[[1]]; sum(log(eval))} 24 | 25 | H = res.nlm$H 26 | K=ncol(H) 27 | M=nrow(H) 28 | 29 | k = seq(K)[names(res.nlm$nh)==colname] 30 | Mk= sum(H[,k]) 31 | 32 | Z = res.nlm$Z 33 | Zk= Z[,H[,k]==1] 34 | psi = res.nlm$psi 35 | psi[k] = psi[k]*geta 36 | u = c(H%*%psi) 37 | s2 = res.nlm$sigma2 38 | yhat = c(res.nlm$X%*%res.nlm$beta) 39 | # Ytilde = t(t(Y)-yhat) - (res.nlm$a)%*%t(res.nlm$delta) 40 | 41 | ZtZ = t(Z)%*%Z 42 | A = ZtZ + diag(u^(-2)) 43 | YZ = as.matrix(Y%*%Z) - rep(1,nrow(Y))%*%(t(yhat)%*%Z) 44 | 45 | CON = NULL 46 | DB=NULL 47 | DBSE=NULL 48 | if(Mk==1){ 49 | BF = array(0,c(nrow(Y),1)) 50 | DB = array(0,c(nrow(Y),1)) 51 | DBSE = array(0,c(nrow(Y),1)) 52 | }else{ 53 | for(i in 2:ncont){ CON=c(CON, getContrasts2(Mk,i,NULL,NULL)) }; #CON=CON[c(15,2^c(3:0))] 54 | BF = array(0,c(nrow(Y),length(CON)+1)) 55 | DB = array(0,c(nrow(Y),length(CON))) 56 | DBSE = array(0,c(nrow(Y),length(CON))) 57 | } 58 | Z0 = cbind(Z[,H[,k]==0]) 59 | H0 = H[H[,k]==0,-k] 60 | ZtZ0 = t(Z0)%*%Z0 61 | u0 = c(H0%*%psi[-k]) 62 | A0 = ZtZ0 + diag(u0^(-2)) 63 | print(dim(A0)) 64 | YZ0 = as.matrix(Y%*%Z0) - rep(1,nrow(Y))%*%(t(yhat)%*%Z0) # Ytilde%*%Z0 65 | if(!is.null(Resid)){ 66 | resicoef = t(solve(A,t(YZ)))[match(Resid,rownames(Y)),] 67 | resicoef[,H[,k]==1]=0 68 | resicoef[,H[,1]==1] = 0 69 | Re = as.matrix(t(Y[match(Resid,rownames(Y)),])-yhat) - Z%*%t(resicoef) 70 | } 71 | M0 = M-Mk 72 | if(Mk==1){ 73 | BF[,1] = (apply((YZ%*%solve(A))*YZ,1,sum) - apply((YZ0%*%solve(A0))*YZ0,1,sum))/2/s2 - logDet(diag(M)+t(t(ZtZ)*u)*u)/2 + logDet(diag(M0)+(t(t(ZtZ0)*u0)*u0))/2 74 | DB[,1] = (YZ%*%solve(A))[,H[,k]==1] 75 | DBSE[,1] = sqrt(solve(A)[H[,k]==1,H[,k]==1]*s2) 76 | }else{ 77 | BF[,length(CON)+1] = (apply((YZ%*%solve(A))*YZ,1,sum) - apply((YZ0%*%solve(A0))*YZ0,1,sum))/2/s2 - logDet(diag(M)+t(t(ZtZ)*u)*u)/2 + logDet(diag(M0)+(t(t(ZtZ0)*u0)*u0))/2 78 | for(l in 1:length(CON)){ 79 | print(l) 80 | H1 = rbind(H[H[,k]==0,], H[H[,k]==1,][seq(ncol(CON[[l]])),]) 81 | u1 = c(H1%*%psi) 82 | Z1 = cbind(Z[,H[,k]==0],Zk%*%CON[[l]]) 83 | ZtZ1 = t(Z1)%*%Z1 84 | A1 = ZtZ1 + diag(u1^(-2)) 85 | YZ1 = as.matrix(Y%*%Z1) - rep(1,nrow(Y))%*%(t(yhat)%*%Z1) # Ytilde%*%Z1 86 | M1 = M-(Mk-ncol(CON[[l]])) 87 | BF[,l] = (apply((YZ1%*%solve(A1))*YZ1,1,sum) - apply((YZ0%*%solve(A0))*YZ0,1,sum))/2/s2 - logDet(diag(M1)+(t(t(ZtZ1)*u1)*u1))/2 + logDet(diag(M0)+t(t(ZtZ0)*u0)*u0)/2 88 | B1 = (YZ1%*%solve(A1))[,H1[,k]==1] 89 | Vg1 = solve(A1)[H1[,k]==1,H1[,k]==1] 90 | if(ncol(B1)==2){ 91 | DBSE[,l] = sqrt(sum(c(Vg1)*c(1,-1,-1,1))*s2) 92 | DB[,l]=B1[,2]-B1[,1] 93 | } 94 | } 95 | } 96 | cat("Posterior mean calculation") 97 | u = c(H%*%res.nlm$psi) 98 | A = diag(1/u/u) + ZtZ 99 | B = t(solve(A,t(YZ)))[,H[,k]==1,drop=F] 100 | Vb = solve(A)[H[,k]==1,H[,k]==1] 101 | SE = sqrt(s2)%*%t(sqrt(diag(Vb))) 102 | 103 | ltsr = pnorm(0, DB, DBSE) 104 | ltsr[ltsr<0.5] = 1-ltsr[ltsr<0.5] 105 | if(Mk>2){if(AllClus){z = emn(BF)}else{z = emn(BF[,1:length(CON)])}}else{z=emn(BF[,1,drop=F])} 106 | ltsr = ltsr*z[,1:ncol(ltsr)] 107 | 108 | if(ncol(ltsr)>1){ 109 | ltsrname = unlist(lapply(CON,function(x1,lev){paste(paste(lev[x1[,1]==1],collapse=","),paste(lev[x1[,2]==1],collapse=","),sep="_vs_")},lev=colnames(B))) 110 | if(ncol(ltsr)>length(CON)){ 111 | ltsrname=c(ltsrname, "All") 112 | } 113 | colnames(ltsr) = ltsrname 114 | } 115 | list(bf=BF, beta=B, se=SE, deltabeta=DB, deltabetase=DBSE, contrasts=CON, ltsr=ltsr, z=z, residual=Re) 116 | } 117 | getContrasts2 <- 118 | function(n,k,v,res){ 119 | if(length(v)==n){ 120 | if(length(unique(v))==k){if(sum(unique(v)==seq(k))==k){ 121 | tmp = matrix(as.numeric(model.matrix(~0+factor(v))),n) 122 | res=c(res,list(tmp)) 123 | }} 124 | }else{ 125 | for(i in 1:k){ 126 | res = getContrasts2(n,k,c(v,i),res) 127 | } 128 | } 129 | return(res) 130 | } 131 | Gost <- 132 | function(res,id=NULL,contid,pos=T,gid=F,th=0.5,thb=1){ 133 | if(is.null(res$contrasts)){ 134 | if(is.na(pos)){ 135 | lab="both" 136 | flag=res$ltsr[,1]>th 137 | }else{ 138 | if(pos){ 139 | lab="pos" 140 | flag=res$ltsr[,1]>th&res$deltabeta[,1]>log(thb) 141 | }else if(!pos){ 142 | lab="neg" 143 | flag=res$ltsr[,1]>th&res$deltabeta[,1]th 150 | lab=paste(paste(res$levels[res$contrasts[[id]][,2]>0],collapse="-"),"(both)") 151 | }else{ 152 | if(pos){ 153 | flag=res$ltsr[,id]>th&res$deltabeta[,id]>log(thb) 154 | print(res$levels[res$contrasts[[id]][,2]>0]) 155 | lab=paste(res$levels[res$contrasts[[id]][,2]>0],collapse="-") 156 | }else{ 157 | flag=res$ltsr[,id]>th&res$deltabeta[,id]0]) 159 | lab=paste(res$levels[res$contrasts[[id]][,1]>0],collapse="-") 160 | } 161 | } 162 | } 163 | print(table(is.na(flag))); 164 | if(gid){ 165 | rownames(res$beta)[flag] 166 | }else{ 167 | query=list(rownames(res$beta)[flag]) 168 | names(query)=lab 169 | gost(query) 170 | #gost(query,exclude_iea=T) 171 | #gost(query,custom_bg=rownames(res$beta),exclude_iea=T) 172 | } 173 | } 174 | LFLMM <- 175 | function(Y, X, a=rep(0,nrow(Y)), beta=NULL, psi=NULL, delta=NULL, Psi=NULL, theta=NULL, omega=NULL, subset=NULL, nLatent=0, forced=F, ITRMAX=200, PLOT=F, nexpcells=NULL, xgamma, s0=NULL){ 176 | 177 | library(Matrix) 178 | 179 | if(nrow(X)!=ncol(Y)){warning("Input TPM matrix Y is not compatible with covariate matrix X."); return()} 180 | if(!is.data.frame(X)){warning("Covariate matrix X is not a data frame."); return()} 181 | 182 | if(is.null(subset)){subset=rep(T, nrow(X))} 183 | nh = 1 184 | Z = rep(1,sum(subset)) 185 | isnum = 0 186 | for(i in seq(ncol(X))){ 187 | if(is.character(X[,i]) || is.factor(X[,i])){ 188 | isnum = c(isnum, 0) 189 | Z1 = array(0,c(sum(subset),length(table(X[subset,i])))) 190 | Z1[!is.na(X[subset,i]),] = model.matrix(~0+X[subset,i]) 191 | colnames(Z1) = gsub("X\\[subset, i\\]","",colnames(model.matrix(~0+X[subset,i]))) 192 | }else{ 193 | isnum = c(isnum, 1) 194 | Z1 = matrix(scale(as.numeric(X[subset,i])),sum(subset)) 195 | Z1[is.na(Z1)]=0 196 | colnames(Z1) = colnames(X)[i] 197 | } 198 | Z = cbind(Z, Z1) 199 | nh = c(nh, ncol(Z1)) 200 | } 201 | names(nh)=c("Intercept", names(X)) 202 | 203 | isnum = cumsum(1-isnum)*isnum 204 | Collapse=function(x,flag){ 205 | x1=x[cumsum(flag)==0] 206 | x2=x[rev(cumsum(rev(flag)))==0] 207 | res = sum(x[flag]) 208 | names(res) = paste(names(x[flag]),collapse="/") 209 | return(c(x1, res, x2)) 210 | } 211 | if(sum(isnum)>0){for(i in seq(max(isnum))){ 212 | if(sum(isnum==i)>2){ 213 | if(!forced){ 214 | mflag = "n" #readline(paste("Do you want to merge ", paste(names(nh)[isnum==i],collapse=","), " factors? [N/y]: ",sep="")) 215 | }else{ 216 | mflag="n" 217 | } 218 | if(sum(mflag%in%c("y","Y"))){ 219 | nh = Collapse(nh, isnum==i) 220 | isnum = Collapse(isnum, isnum==i) 221 | } 222 | } 223 | }} 224 | if(nLatent>0){nh=c(nh, LatentFactor=nLatent);} 225 | print(nh) 226 | K = length(nh) 227 | H = diag(K)[rep(seq(K),nh),] 228 | X = Z 229 | 230 | # Latent factors 231 | #Z = cbind(Z, array(0,c(nrow(Z),nLatent))) 232 | if(!is.null(Psi)){ 233 | Z = cbind(Z, Psi) 234 | }else{ 235 | Z = cbind(Z, array(0,c(nrow(Z),nLatent))) 236 | } 237 | 238 | # Likelihood 239 | logDet = function(x){eval = eigen(x,T)[[1]]; sum(log(eval))} 240 | Kurt=function(V){diag(V)%*%t(diag(V))+2*V^2} 241 | Solve = function(X){if(length(X)==1){c(1/X)}else{X=eigen(X,T); r=X[[1]]; r[r<1e-7]=1; X[[2]]%*%diag(1/r)%*%t(X[[2]])}} 242 | lkhd = function(beta, psi, delta, omega, theta, s0, Y, X, Z, H, Yt, YYt, YX, YZ, y2){ 243 | J = nrow(Y) 244 | N = ncol(Y) 245 | M = ncol(Z) 246 | z = c(apply(Z, 2, sum)) 247 | ZtZ = t(Z)%*%Z # for W 248 | 249 | # A = (ZtZ+U^-2) MAT 250 | u = c(H%*%psi) 251 | A = ZtZ+diag(1/u^2) 252 | 253 | #print("sigma2") 254 | # signa^2 | beta delta Psi(Z) psi(A) 255 | if(nLatent==0){ 256 | ZtRYt = t(YZ) # t(as.matrix(Y%*%(Z*R))) 257 | }else{ 258 | ZtRYt = t(as.matrix(Y%*%Z)) 259 | } 260 | ZtRX = t(Z)%*%X 261 | ZtRYtmb = (ZtRYt - c(ZtRX%*%beta)) 262 | AinvZtRYtmb = solve(A, ZtRYtmb) 263 | yR2y = y2 # colSums((Yt*R)^2) 264 | Xb = c(X%*%beta) 265 | YR2Xb = YX%*%beta # as.matrix(Y%*%(Xb*R^2)) 266 | if(is.null(s0)){ 267 | s2 = c( ( yR2y - 2*YR2Xb + sum((Xb)^2) ) - colSums(AinvZtRYtmb*ZtRYtmb) ) 268 | if(!is.null(nexpcells)){ 269 | tau = (N/2+theta) / (s2/2+theta*c(nexpcells%*%omega)) 270 | logtau = log(tau/(N/2+theta)) + digamma((N/2+theta)) 271 | resglm = GammaGlm(tau[xgamma>0.05],logtau[xgamma>0.05],nexpcells[xgamma>0.05,,drop=F],PLOT=1,x=xgamma[xgamma>0.05]); omega=resglm$omega; theta=resglm$theta 272 | cat("omega theta=");print(c(omega,theta)) 273 | s2 = (s2/2+theta*c(nexpcells%*%omega)) / (N/2+theta) 274 | }else{ 275 | s2 = (s2+1)/(N+1) 276 | } 277 | }else{s2=s0} 278 | 279 | #print("beta") 280 | # beta | delta sigma2 Psi(Z) psi(A) 281 | XtR2X = t(X)%*%(X) 282 | y = colSums(Y/s2) / sum(1/s2) 283 | beta0=0.01 284 | beta = c(Solve(diag(ncol(X))*beta0 + XtR2X-t(ZtRX)%*%solve(A)%*%ZtRX) %*% (t(X)%*%y-t(ZtRX)%*%solve(A,t(Z)%*%y))) 285 | Xb = c(X%*%beta) 286 | 287 | #print("Psi") 288 | # Psi | beta delta psi(A) sigma2 289 | PL=NULL 290 | if(nLatent>0){ 291 | cat("Latent factor") 292 | M2 = M-nLatent 293 | A2 = t(Z[,1:M2])%*%Z[,1:M2] + diag(1/u[1:M2]^2) 294 | RXb = c(X%*%beta) 295 | YR2Xb = c(YX%*%beta) 296 | YRZ = YZ[,1:M2] 297 | ZtRXb = c(t(Z[,1:M2])%*%RXb) 298 | CAind = c(YRZ%*%solve(A2, ZtRXb)) 299 | YVYt = YYt 300 | 301 | YVYt = t(YVYt-YR2Xb)-YR2Xb + sum(RXb^2) - YRZ%*%solve(A2)%*%t(YRZ) 302 | YVYt = t(YVYt+CAind)+CAind - sum(solve(A2,ZtRXb)*ZtRXb) 303 | YVYt = t(YVYt/sqrt(s2))/sqrt(s2)/J 304 | cat("Eigen decomp") 305 | PL = Eigen(YVYt,nLatent+5) 306 | 307 | flage=(apply(PL$evec^2,2,max)/apply(PL$evec^2,2,sum))<0.3 308 | cat("good pcs=");print(sum(flage[1:nLatent])) 309 | lambda = PL$eval[1:nLatent]; lambda[lambda<1]=1 310 | Z[,(M2+1):M] = ( as.matrix((Yt)%*%(PL$evec[,1:nLatent]/sqrt(s2))) - RXb%*%t(colSums(PL$evec[,1:nLatent]/sqrt(s2))) ) %*%diag(sqrt(1-1/lambda)) 311 | ZtZ = t(Z)%*%Z 312 | A = ZtZ+diag(1/u^2) 313 | ZtRX = t(Z)%*%(X) # for W 314 | ZtRYt = t(as.matrix(Y%*%(Z))) # for W 315 | } 316 | 317 | # Var(xi) 318 | Vx = ZtZ - ZtZ %*% solve(A, ZtZ) 319 | hess = diag(psi)%*%t(H)%*%Kurt(Vx)%*%H%*%diag(psi)*2/N + t(H)%*%(Vx * t(solve(A)/u)/u)%*%H*2 - diag(c(t(H)%*%diag(Vx)))*2 320 | 321 | # likelihood 322 | res = sum(log(s2))*N/2 + J * logDet(diag(M) + t(ZtZ*u)*u)/2 323 | 324 | print("psi") 325 | # psi | beta delta Psi(Z) sigma2 326 | ZtRYtmb = (ZtRYt - c(ZtRX%*%beta)) 327 | AinvZtRYtmb = solve(A, ZtRYtmb) 328 | attr(res, "gradient") = - (c( t(H)%*%apply(t(t(AinvZtRYtmb^2)/s2)/u^3,1,sum) ) - J * c( t(H)%*%(diag(solve(A,ZtZ))/u) )) 329 | attr(res, "hessian") = - (hess+t(hess))/2 * J 330 | attr(res, "beta") = beta 331 | attr(res, "omega") = omega 332 | attr(res, "theta") = theta 333 | attr(res, "psi") = psi 334 | attr(res, "sigma2") = s2 335 | attr(res, "Z") = Z 336 | attr(res, "X") = X 337 | attr(res, "H") = H 338 | attr(res, "delta") = delta 339 | attr(res, "PL") = PL 340 | res 341 | } 342 | 343 | # Matrix prep 344 | lkhd.all=NULL 345 | res.min=NULL 346 | if(is.null(psi)){psi=rep(0.01,length(nh)); if(nLatent>0){psi[length(psi)]=1}} 347 | if(is.null(beta)){beta=rep(0.1,ncol(X))} 348 | if(is.null(delta)){delta=rep(0,nrow(Z))} 349 | if(is.null(omega) && !is.null(nexpcells)){ 350 | s2 = apply(Y,1,var) 351 | omega = coef(lm(s2~0+nexpcells)) 352 | lambda = 1/c(nexpcells%*%omega) 353 | theta = mean(lambda^2/(1/s2-lambda)^2) 354 | print(c(omega,theta)) 355 | } 356 | 357 | Y = Y[,subset] 358 | Yt = NULL; YYt=NULL; YX=NULL; YZ=NULL; 359 | if(nLatent>0){print("matrix prep 1") 360 | Yt = t(Y) 361 | YYt=as.matrix(Y%*%Yt) 362 | YX=as.matrix(Y%*%X) 363 | YZ=as.matrix(Y%*%Z) 364 | y2 = rowSums(Y^2) 365 | }else{print("matrix prep 2") 366 | print("Y") 367 | print(dim(Y)) 368 | print(Y[1:5, 1:5]) 369 | 370 | print("Z") 371 | print(dim(Z)) 372 | print(Z[1:5, 1:5]) 373 | 374 | Y = as.matrix(Y) 375 | YZ=as.matrix(Y%*%Z) 376 | YX=as.matrix(Y%*%X) 377 | y2 = rowSums(Y^2) 378 | } 379 | print("matrix prep end") 380 | 381 | # Iteration start 382 | convstat=F 383 | for(i in 1:ITRMAX){ 384 | cat(paste("[",i,"]")) 385 | tmp = lkhd(beta=beta, psi=psi, delta=delta, omega=omega, theta=theta, s0=s0, Y=Y, Z=Z, X=X, H=H, Yt=Yt, YYt=YYt, YZ=YZ, YX=YX, y2=y2) 386 | #return(tmp) 387 | if(is.null(lkhd.all) || tmp0){ 391 | K2 = ncol(H)-1 392 | psi[1:K2] = psi[1:K2] - solve(attr(tmp,"hessian")[1:K2,1:K2]+2*diag(0.00001/psi[1:K2]^2-1),attr(tmp,"gradient")[1:K2]+2*(-0.00001/psi[1:K2]+psi[1:K2])) 393 | }else{ 394 | psi = psi - solve(attr(tmp,"hessian")+2*diag(0.00001/psi^2-1),attr(tmp,"gradient")+2*(-0.00001/psi+psi)) 395 | } 396 | #cat("psi="); 397 | print(psi) 398 | beta =attr(tmp, "beta") 399 | delta =attr(tmp, "delta") 400 | omega =attr(tmp, "omega") 401 | theta =attr(tmp, "theta") 402 | #cat("omega=");print(c(omega,theta)) 403 | Z = attr(tmp, "Z") # Psi 404 | if(length(lkhd.all)>50){lkhd.all=rev(rev(lkhd.all)[1:50])} 405 | if(i>1 & abs(diff(rev(lkhd.all)[1:2])/rev(lkhd.all)[1])<1e-7){convstat=T; cat("Converged\n");break} 406 | } 407 | c(list(lkhd=-as.numeric(res.min)), attributes(res.min), list(nh=nh), Converged=convstat) 408 | } 409 | -------------------------------------------------------------------------------- /Explained Variability/convert_h5ad.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scanpy as sc 4 | import scipy.sparse as de 5 | 6 | adata = sc.read_h5ad("../Data/lung_5loc_sc_sn_cellxgene_030222.h5ad") 7 | cells = adata[adata.obs.scsn=="cells"].copy() 8 | 9 | to_write = pd.DataFrame(de.csr_matrix.todense(cells.X), index=cells.obs.index, columns=cells.var.index).T 10 | 11 | 12 | print(to_write.head()) 13 | 14 | 15 | to_write.to_csv("../Data/cells.csv") 16 | 17 | -------------------------------------------------------------------------------- /Explained Variability/convert_h5ad_smg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scanpy as sc 4 | import scipy.sparse as de 5 | 6 | adata = sc.read_h5ad("../Data/lung_5loc_sc_sn_cellxgene_030222.h5ad") 7 | adata = adata[adata.obs.Celltypes.isin(["SMG_Duct", "SMG_Mucous", "SMG_Serous"])].copy() 8 | adata = adata[adata.obs.Loc_true.isin(['a_Trachea', 'b_Bronchi.2.3', 'c_Bronchi.4'])].copy() 9 | 10 | cells = adata[adata.obs.scsn=="cells"].copy() 11 | nuclei = adata[adata.obs.scsn=="nuclei"].copy() 12 | nuclei = nuclei[nuclei.obs.Donor!="A42"].copy() 13 | 14 | to_write_sc = pd.DataFrame(de.csr_matrix.todense(cells.X), index=cells.obs.index, columns=cells.var.index).T 15 | to_write_sn = pd.DataFrame(de.csr_matrix.todense(nuclei.X), index=nuclei.obs.index, columns=nuclei.var.index).T 16 | 17 | 18 | print(to_write_sc.head()) 19 | print(to_write_sn.head()) 20 | 21 | 22 | to_write_sc.to_csv("../Data/smg_sc.csv") 23 | to_write_sn.to_csv("../Data/smg_sn.csv") 24 | 25 | -------------------------------------------------------------------------------- /Explained Variability/run.R: -------------------------------------------------------------------------------- 1 | # Log CPM matrix (gene x cell) 2 | # Execute only the first time 3 | cpm=read.table("../Data/cells.csv", header=T, sep=",") 4 | cpm <- data.frame(cpm[,-1], row.names = cpm[,1]) 5 | 6 | # select genes expressed in at least 5% of all cells 7 | percent <- dim(cpm)[2]*0.95 8 | cpm <- cpm[rowSums(cpm == 0) <= percent, ] 9 | 10 | # meta data file 11 | mdata=read.table("../Data/Meta_all.csv",header=T, sep=",", row.names=1) 12 | 13 | # Logaritmize Total counts and Number of Genes 14 | mdata$Total.counts=log(mdata$Total.counts) 15 | mdata$Number.of.genes=log(mdata$Number.of.genes) 16 | 17 | # Linear mixed model 18 | source("LFLMM.LMMBF.R") 19 | 20 | # barplot for the lmm output 21 | source("Barplot.LFLMM.R") 22 | 23 | # fitting lmm 24 | res = LFLMM(cpm, mdata) 25 | 26 | # visualising variance explained (%) 27 | Barplot(res, "../Plots/All_sc.png", "Cells") 28 | 29 | -------------------------------------------------------------------------------- /Explained Variability/run_smg_sc.R: -------------------------------------------------------------------------------- 1 | # Linear mixed mode 2 | source("LFLMM.LMMBF.R") 3 | 4 | # barplot for the lmm output 5 | source("Barplot.LFLMM.R") 6 | 7 | 8 | # Log CPM matrix (gene x cell) 9 | cpm=read.table("../Data/smg_sc.csv", header=T, sep=",") 10 | cpm <- data.frame(cpm[,-1], row.names = cpm[,1]) 11 | 12 | ######select genes expressed in at least 5% of all cells 13 | check_genes = c("CCL28", "CCL20", "CCL2", "TNFSF13", "IL6", "HLA-DRA", "HLA-DRB1", "HLA-DRB5", "HLA-DPB1", "HLA-DQB1", "HLA-DPA1", "HLA-DMA", "HLA-DMB", "HLA-DQA1", "HLA-DPB1", "CD40", "PIGR") 14 | 15 | percent <- dim(cpm)[2]*0.95 16 | cpm_filt <- cpm[(rowSums(cpm == 0) <= percent) | (rownames(cpm) %in% check_genes), ] 17 | 18 | for (i in check_genes){ 19 | print(paste0(i, " ", i %in% rownames(cpm_filt))) 20 | } 21 | 22 | ###### Select predetermined genes 23 | cpm_genes <-cpm[rownames(cpm) %in% check_genes, ] 24 | 25 | 26 | ###### meta data file 27 | mdata=read.table("../Data/Meta_smg_sc.csv",header=T, sep=",", row.names=1) 28 | 29 | # Logaritmize Total counts and Number of Genes 30 | mdata$Total.counts=log(mdata$Total.counts) 31 | mdata$Number.of.genes=log(mdata$Number.of.genes) 32 | 33 | ####### LOCATION AS LINEAR ######### 34 | mdata$Location[mdata$Location == "a_Trachea"] <- 1 35 | mdata$Location[mdata$Location == "b_Bronchi.2.3"] <- 2 36 | mdata$Location[mdata$Location == "c_Bronchi.4"] <- 3 37 | 38 | # fitting lmm 39 | res = LFLMM(cpm_filt, mdata) 40 | Barplot(res, "../Plots/SMG_sc_lin.png", "SMG Cells") 41 | 42 | 43 | ###### LOCATION AS BINARY ####### 44 | mdata$Location[mdata$Location == 2] <- 0 45 | mdata$Location[mdata$Location == 3] <- 0 46 | 47 | # fitting lmm 48 | res = LFLMM(cpm_filt, mdata) 49 | Barplot(res, "../Plots/SMG_sc_bin.png", "SMG Cells") 50 | -------------------------------------------------------------------------------- /Explained Variability/run_smg_sn.R: -------------------------------------------------------------------------------- 1 | # Linear mixed mode 2 | source("LFLMM.LMMBF.R") 3 | 4 | # barplot for the lmm output 5 | source("Barplot.LFLMM.R") 6 | 7 | 8 | # Log CPM matrix (gene x cell) 9 | cpm=read.table("../Data/smg_sn.csv", header=T, sep=",") 10 | cpm <- data.frame(cpm[,-1], row.names = cpm[,1]) 11 | 12 | ######select genes expressed in at least 5% of all cells 13 | check_genes = c("CCL28", "CCL20", "CCL2", "TNFSF13", "IL6", "HLA-DRA", "HLA-DRB1", "HLA-DRB5", "HLA-DPB1", "HLA-DQB1", "HLA-DPA1", "HLA-DMA", "HLA-DMB", "HLA-DQA1", "HLA-DPB1", "CD40", "PIGR") 14 | 15 | percent <- dim(cpm)[2]*0.95 16 | cpm_filt <- cpm[(rowSums(cpm == 0) <= percent) | (rownames(cpm) %in% check_genes), ] 17 | 18 | for (i in check_genes){ 19 | print(paste0(i, " ", i %in% rownames(cpm_filt))) 20 | } 21 | 22 | ###### Select predetermined genes 23 | cpm_genes <-cpm[rownames(cpm) %in% check_genes, ] 24 | 25 | ###### meta data file 26 | mdata=read.table("../Data/Meta_smg_sn.csv",header=T, sep=",", row.names=1) 27 | 28 | # Logaritmize Total counts and Number of Genes 29 | mdata$Total.counts=log(mdata$Total.counts) 30 | mdata$Number.of.genes=log(mdata$Number.of.genes) 31 | 32 | ####### LOCATION AS LINEAR ######### 33 | mdata$Location[mdata$Location == "a_Trachea"] <- 1 34 | mdata$Location[mdata$Location == "b_Bronchi.2.3"] <- 2 35 | mdata$Location[mdata$Location == "c_Bronchi.4"] <- 3 36 | 37 | # fitting lmm 38 | res = LFLMM(cpm_filt, mdata) 39 | Barplot(res, "../Plots/SMG_sn_lin.png", "SMG Nuclei") 40 | 41 | ###### LOCATION AS BINARY ####### 42 | mdata$Location[mdata$Location == 2] <- 0 43 | mdata$Location[mdata$Location == 3] <- 0 44 | 45 | # fitting lmm 46 | res = LFLMM(cpm_filt, mdata) 47 | Barplot(res, "../Plots/SMG_sn_bin.png", "SMG Nuclei") 48 | -------------------------------------------------------------------------------- /Plots/All_sc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elo073/5loclung/9b52cd7fd7142b42a25efff0bba1bc36c25ea4a3/Plots/All_sc.png -------------------------------------------------------------------------------- /Plots/SMG_sc_bin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elo073/5loclung/9b52cd7fd7142b42a25efff0bba1bc36c25ea4a3/Plots/SMG_sc_bin.png -------------------------------------------------------------------------------- /Plots/SMG_sc_lin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elo073/5loclung/9b52cd7fd7142b42a25efff0bba1bc36c25ea4a3/Plots/SMG_sc_lin.png -------------------------------------------------------------------------------- /Plots/SMG_sn_bin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elo073/5loclung/9b52cd7fd7142b42a25efff0bba1bc36c25ea4a3/Plots/SMG_sn_bin.png -------------------------------------------------------------------------------- /Plots/SMG_sn_lin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elo073/5loclung/9b52cd7fd7142b42a25efff0bba1bc36c25ea4a3/Plots/SMG_sn_lin.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 5loclung (5 location lung study) 2 | ## A spatially resolved atlas of the human lung characterizes a gland-associated immune niche 3 | 4 |

5 | 6 | 7 | 8 |

9 | 10 | This repository contains custom codes used in the analysis of single cell, nuclei and spatial transcriptomics data from the healthy human lung, now published in [Nature Genetics](https://www.nature.com/articles/s41588-022-01243-4). 11 | 12 | Visit our CellxGene browser!: https://www.lungcellatlas.org/ 13 | 14 | ### Code availability 15 | 16 | Most of the codes used in manuscript are publicly available packages with specifications written in the methods of the study. 17 | 18 | - Code for fGWAS plots and for cell type proportion analysis is available here: https://github.com/natsuhiko/PHM 19 | 20 | - Code for marker gene dot plots with mean group expressions and expression of TCR regions were previously published [(Park, J et al. Science 2020)](https://www.science.org/doi/10.1126/science.aay3224) and the code available [here](https://zenodo.org/record/3711134#.ZBnPc-zP2qA) (10.5281/zenodo.3711134) 21 | 22 | - Code and data from cell2location analysis of Visium data is available [here](https://github.com/vitkl/adult_lung_mapping/) 23 | 24 | - Code for shared TCR clonotype analysis across donors and locations is in the tile TCR-clonotypes.ipynb. 25 | 26 | - Code for Cell type composition analysis using linear mixed model (eg. Figure 1e) is available [here](https://github.com/Teichlab/sctkr/blob/2a024cafef1aae192bf9656349449c5a84d1c6ed/R/CellTypeCompositionAnalysis.R#L4) 27 | 28 | - Code for explained variability code is explained below and in folders Data, Explained Variability and Plots. 29 | 30 | ### Data Availability 31 | 32 | The processed scRNA-seq, snRNA-seq and Visium ST data are available for browsing and download via our website www.lungcellatlas.org. The dataset (raw data and metadata) is available on the [Human Cell Atlas Data Portal](https://data.humancellatlas.org/explore/projects/957261f7-2bd6-4358-a6ed-24ee080d5cfc) and on the European Nucleotide Archive (ENA) under accession number [PRJEB52292](https://www.ebi.ac.uk/ena/browser/view/PRJEB52292) and BioStudies accession [S-SUBS17](https://www.ebi.ac.uk/biostudies/dsp/studies/S-SUBS17). The Visium data are publicly available on ArrayExpress with the accession number [E-MTAB-11640](https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-11640). Imaging data can be downloaded from European Bioinformatics Institute (EBI) BioImage Archive under accession number [S-BIAD570](https://www.ebi.ac.uk/biostudies/bioimages/studies/S-BIAD570). Additional data were accessed to support analysis and conclusions, which can be accessed through National Centre for Biotechnology Information Gene Expression Omnibus [GSE136831](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE136831), and [GSE134174](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE134174) and the HLCA integration, which can be accessed at https://github.com/LungCellAtlas/HLCA. 33 | 34 | ### Instructions on the analysis for calculating explained varibility by a metadata factor 35 | 36 | ### How to Execute 37 | 38 | First, clone the repository 39 | ```bash 40 | $ git clone https://github.com/elo073/5loclung.git 41 | ``` 42 | 43 | Next, access the data portal () and download the H5AD object under "All data". Save it in 5loclung/Data 44 | 45 | Finaly, run the following commands: 46 | 47 | ```bash 48 | # Access the script's folder 49 | $ cd 5loclung/Explained\ Variability/ 50 | 51 | # Write count matrices 52 | $ python convert_h5ad.py 53 | $ python convert_h5ad_smg.py 54 | 55 | # Execute scripts for explained variability 56 | $ Rscript run.R 57 | $ Rscript run_smg_sc.R 58 | $ Rscript tun_smg_sn.R 59 | 60 | ``` 61 | 62 | The plots will be saved in the 'Plots' folder 63 | --------------------------------------------------------------------------------