├── HiC_spector.jl ├── README.md ├── data └── readme_data ├── hic_spector_tutorial.jl ├── run_reproducibility.jl └── run_reproducibility_v2.py /HiC_spector.jl: -------------------------------------------------------------------------------- 1 | #using HDF5; 2 | #using JLD; 3 | #using MAT; 4 | #the packages above not required by the code here, but will be useful for reading/writing common files for analysis purpose 5 | using DataFrames; 6 | using CurveFit; 7 | using Interpolations; 8 | 9 | function get_reproducibility(M1,M2,num_evec); 10 | 11 | if ~isequal(M1,M1') 12 | tmp1=M1-spdiagm(diag(M1)); 13 | M1=tmp1+tmp1'+spdiagm(diag(M1)); 14 | end 15 | if ~isequal(M2,M2') 16 | tmp2=M2-spdiagm(diag(M2)); 17 | M2=tmp2+tmp2'+spdiagm(diag(M2)); 18 | end 19 | 20 | N=size(M1)[1]; 21 | #get rid of isolated nodes 22 | k1=sum(spones(M1),2); 23 | d1=diag(M1); 24 | kd1=!((k1.==1).*(d1.>0)) 25 | k2=sum(spones(M2),2); 26 | d2=diag(M2); 27 | kd2=!((k2.==1).*(d2.>0)) 28 | iz=find((k1+k2.>0).*(kd1.>0).*(kd2.>0)); 29 | 30 | M1b=M1[iz,iz]; 31 | M2b=M2[iz,iz]; 32 | 33 | i_nz1=find(sum(M1b,2).>0); 34 | i_nz2=find(sum(M2b,2).>0); 35 | 36 | i_z1=find(sum(M1b,2).==0); 37 | i_z2=find(sum(M2b,2).==0); 38 | 39 | Ln1_nz1=get_Laplacian(M1b); 40 | Ln2_nz2=get_Laplacian(M2b); 41 | 42 | (a1,b1)=eigs(speye(length(i_nz1))-Ln1_nz1,nev=num_evec,which=:LM); 43 | #(a1,b1)=eigs(speye(length(i_nz1))-Ln1_nz1,nev=num_evec+1,which=:LM); 44 | a1=1-a1; 45 | (a2,b2)=eigs(speye(length(i_nz2))-Ln2_nz2,nev=num_evec,which=:LM); 46 | #(a2,b2)=eigs(speye(length(i_nz2))-Ln2_nz2,nev=num_evec+1,which=:LM); 47 | a2=1-a2; 48 | 49 | ipr_cut=5; 50 | 51 | b1_extend=zeros(size(M1b,1),num_evec); 52 | for i=1:num_evec 53 | #b1_extend[i_nz1,i]=b1[:,i+1]; 54 | b1_extend[i_nz1,i]=b1[:,i]; 55 | end 56 | 57 | ipr1=zeros(num_evec); 58 | for i=1:num_evec; 59 | ipr1[i]=get_ipr(b1_extend[:,i]); 60 | end 61 | 62 | b1_extend_eff=b1_extend[:,ipr1.>ipr_cut]; 63 | 64 | b2_extend=zeros(size(M2b,1),num_evec); 65 | for i=1:num_evec 66 | 
#b2_extend[i_nz2,i]=b2[:,i+1]; 67 | b2_extend[i_nz2,i]=b2[:,i]; 68 | end 69 | 70 | ipr2=zeros(num_evec); 71 | for i=1:num_evec; 72 | ipr2[i]=get_ipr(b2_extend[:,i]); 73 | end 74 | 75 | b2_extend_eff=b2_extend[:,ipr2.>ipr_cut]; 76 | 77 | num_evec_eff=minimum([size(b1_extend_eff,2);size(b2_extend_eff,2)]); 78 | 79 | 80 | evd=zeros(num_evec_eff); 81 | for i=1:num_evec_eff; 82 | evd[i]=evec_distance(b1_extend_eff[:,i],b2_extend_eff[:,i]); 83 | end 84 | 85 | Sd=sum(evd); 86 | 87 | evs=abs(sqrt(2)-Sd/num_evec_eff)/sqrt(2); 88 | 89 | if (sum(ipr1.>N/100)<=1)|(sum(ipr2.>N/100)<=1) 90 | print("at least one of the maps does not look like typical Hi-C maps") 91 | evs=NaN; 92 | end 93 | 94 | return evs,a1,a2,evd; 95 | 96 | end 97 | 98 | function get_Laplacian(M); 99 | 100 | K=vec(sum(M,1)); 101 | i_nz=find(K.>0); 102 | D_nz=spdiagm(K[i_nz]); 103 | D_isq=spdiagm(1./sqrt(K[i_nz])); 104 | 105 | Ln_nz=M[i_nz,i_nz]*D_isq; 106 | Ln_nz=speye(length(i_nz))-D_isq*Ln_nz; 107 | n=size(M,1); 108 | 109 | Ln_nz=(Ln_nz+Ln_nz')/2; 110 | return Ln_nz; 111 | end 112 | 113 | function get_ipr(evec); 114 | #evec should be a unit vector 115 | ipr=1./sum(evec.^4,1)[1]; 116 | 117 | end 118 | 119 | function evec_distance(x,y); 120 | 121 | d1=sum((x-y).^2); 122 | d2=sum((x+y).^2); 123 | if d10); 139 | A=M[iz,iz]; 140 | n=size(A,1); 141 | e = ones(n,1); 142 | res=[]; 143 | delta = 0.1; 144 | x0 = e; 145 | tol = 1e-6; 146 | g=0.9; etamax = 0.1; # Parameters used in inner stopping criterion. 147 | 148 | eta = etamax; 149 | x = x0; rt = tol^2; v = x.*(A*x); rk = 1 - v; 150 | rho_km1=sum(rk.^2); 151 | rout = rho_km1; rold = rout; 152 | MVP = 0; # count matrix vector products. 153 | i = 0; # Outer iteration count. 
154 | 155 | while rout > rt # Outer iteration 156 | i = i + 1; k = 0; y = e; 157 | innertol = maximum([eta^2*rout;rt]); 158 | while rho_km1 > innertol #Inner iteration by CG 159 | k = k + 1; 160 | if k == 1 161 | Z = rk./v; p=Z; rho_km1 = sum(rk.*Z); 162 | else 163 | beta=rho_km1/rho_km2; 164 | p=Z + beta*p; 165 | end 166 | # Update search direction efficiently. 167 | w = x.*(A*(x.*p)) + v.*p; 168 | #w=squeeze(w,2); 169 | alpha = rho_km1/sum(p.*w); 170 | ap =squeeze(alpha*p,2); 171 | # Test distance to boundary of cone. 172 | ynew = y + ap; 173 | if minimum(ynew) <= delta 174 | if delta == 0 175 | break 176 | end 177 | ind = find(ap .< 0); 178 | gamma = minimum((delta - y[ind])./ap[ind]); 179 | y = y + gamma*ap; 180 | break 181 | end 182 | y = ynew; 183 | rk = rk - alpha*w; rho_km2 = rho_km1; rho_km2=rho_km2[1]; 184 | Z = rk./v; rho_km1 = sum(rk.*Z); 185 | end 186 | x = x.*y; v = x.*(A*x); 187 | rk = 1 - v; rho_km1 = sum(rk.*rk); rout = rho_km1; 188 | MVP = MVP + k + 1; 189 | # Update inner iteration stopping criterion. 
190 | rat = rout/rold; rold = rout; r_norm = sqrt(rout); 191 | eta_o = eta; eta = g*rat; 192 | if g*eta_o^2 > 0.1 193 | eta = maximum([eta;g*eta_o^2]); 194 | end 195 | eta = maximum([minimum([eta;etamax]);0.5*tol/r_norm]); 196 | #@sprintf("%3d %6d %.3e %.3e %.3e \n", i,k,r_norm,minimum(y),minimum(x)); 197 | display(rout); 198 | #res=[res; r_norm]; 199 | end 200 | #@printf("Matrix-vector products = %6d\n", MVP); 201 | x=squeeze(x,2); 202 | A2=A*diagm(x); 203 | A2=diagm(x)*A2; 204 | A_balance=extend_mat(A2,iz,L); 205 | A_balance=(A_balance+A_balance')/2; 206 | x_final=zeros(L); 207 | x_final[iz]=x; 208 | 209 | return x_final,A_balance; 210 | 211 | end 212 | 213 | function extend_mat(Z,iz,L); 214 | (u,v)=ind2sub(size(Z),find(Z.!=0)); 215 | w=Z[find(Z)]; 216 | #w=nonzeros(Z); 217 | u=iz[u]; 218 | v=iz[v]; 219 | Z_extend=sparse(u,v,w,L,L); 220 | Z_extend=full(Z_extend); 221 | return Z_extend; 222 | end 223 | 224 | ######################################################################################################################### 225 | 226 | function local_smoothing(x,y); 227 | 228 | span=0.01; 229 | v=sortperm(x); 230 | x=x[v]; 231 | y=y[v]; 232 | ux=unique(x); 233 | uy_smooth=zeros(size(ux)); 234 | n=Int(floor(length(x)*span/2)); 235 | 236 | mm=zeros(size(x)); 237 | L=2*n+1; 238 | i=n+1; 239 | st=1; 240 | ed=i+n; 241 | mm[i]=mean(y[st:ed]); 242 | for i=n+2:length(y)-n; 243 | #display(i); 244 | ed=ed+1; 245 | mm[i]=mm[i-1]+y[ed]/L-y[st]/L; 246 | st=st+1; 247 | end 248 | for i=1:n 249 | mm[i]=mean(y[1:n+i]); 250 | end 251 | for i=1:n; 252 | mm[end-n+i]=mean(y[end-n+1-n+i:end]); 253 | end 254 | 255 | for i=1:length(ux); 256 | iz=find(x.==ux[i]); 257 | uy_smooth[i]=mean(mm[iz]); 258 | end 259 | 260 | return ux,uy_smooth; 261 | 262 | end 263 | 264 | function get_expect_vs_d_single_chr_v0(W,chr2bins,bin_size); 265 | 266 | W=full(W); 267 | W[isnan(W)]=0; 268 | 269 | N=size(W,1); 270 | 271 | 272 | (u,v,w)=findnz(triu(W)); 273 | d=float(v-u); 274 | d2=float(d); 275 | 
d2[d2.==0]=1/3;#this is the average distance for 2 points drawn from an uniform distribution between [0.1]; 276 | d3=d2*bin_size; 277 | 278 | x=log10(d3); 279 | y=log10(w); 280 | 281 | xs,ys_smooth=local_smoothing(x,y); 282 | 283 | xs_all=collect(0:1.0:size(W,1)-1);xs_all[1]=1/3; 284 | xs_all=xs_all*bin_size; 285 | xs_all_aux=log10(xs_all); 286 | 287 | ys_all=zeros(size(xs_all)); 288 | for k=1:length(xs_all_aux); 289 | ik=find(xs.==xs_all_aux[k]); 290 | if ~isempty(ik) 291 | ys_all[k]=ys_smooth[ik][1]; 292 | end 293 | end 294 | 295 | A_x=find(ys_all.>0); 296 | knots=(A_x,); 297 | itp=interpolate(knots,ys_smooth, Gridded(Linear())); 298 | 299 | A_nz=find(ys_all.==0); 300 | for i=1:length(A_nz); 301 | ys_all[A_nz[i]]=itp[A_nz[i]]; 302 | end 303 | 304 | expect=10.^ys_all; 305 | 306 | return xs_all, expect; 307 | 308 | end 309 | 310 | function get_expect_vs_d_WG_v0(contact,chr2bins,bin_size); 311 | 312 | all_d2=Float64[]; 313 | all_w=Float64[]; 314 | Ltmp=zeros(23); 315 | for chr_num=1:23 316 | 317 | #display(chr_num); 318 | W=extract_chr(contact,chr2bins,chr_num); 319 | W=full(W); 320 | W[isnan(W)]=0; 321 | 322 | N=size(W,1); 323 | 324 | (u,v,w)=findnz(triu(W)); 325 | 326 | d=float(v-u); 327 | d2=float(d); 328 | d2[d2.==0]=1/3; 329 | 330 | all_d2=[all_d2;d2]; 331 | all_w=[all_w;w]; 332 | Ltmp[chr_num]=size(W,1); 333 | 334 | end 335 | 336 | all_d3=all_d2*bin_size; 337 | 338 | x=log10(all_d3); 339 | y=log10(all_w); 340 | 341 | xs,ys_smooth=local_smoothing(x,y); 342 | 343 | xs_all=collect(0:1.0:maximum(Ltmp)-1);xs_all[1]=1/3; 344 | xs_all=xs_all*bin_size; 345 | xs_all_aux=log10(xs_all); 346 | 347 | ys_all=zeros(size(xs_all)); 348 | for k=1:length(xs_all_aux); 349 | ik=find(xs.==xs_all_aux[k]); 350 | if ~isempty(ik) 351 | ys_all[k]=ys_smooth[ik][1]; 352 | end 353 | end 354 | 355 | A_x=find(ys_all.>0); 356 | knots=(A_x,); 357 | itp=interpolate(knots,ys_smooth, Gridded(Linear())); 358 | 359 | A_nz=find(ys_all.==0); 360 | for i=1:length(A_nz); 361 | 
ys_all[A_nz[i]]=itp[A_nz[i]]; 362 | end 363 | 364 | expect=10.^ys_all; 365 | 366 | return xs_all, expect; 367 | 368 | end 369 | 370 | 371 | function extract_chr(A,chr2bins,chr_num); 372 | st=1+chr2bins[1,chr_num]; 373 | ed=1+chr2bins[2,chr_num]; 374 | A_chr=A[st:ed,st:ed]; 375 | return A_chr; 376 | end 377 | 378 | function get_f_W(W,ys); 379 | 380 | N=size(W,1); 381 | W[isnan(W)]=0; 382 | dark_bins=find(sum(W,1).==0); 383 | num_dark=length(dark_bins); 384 | N_eff=N-num_dark; 385 | f_W=zeros(size(W)); 386 | 387 | x=collect(1:N); 388 | 389 | for d=0:N-1 390 | f_W[1+d:N+1:end-d*N]=ys[d+1]; 391 | end 392 | tmp=f_W-diagm(diag(f_W)); 393 | f_W=f_W+tmp'; 394 | #sum(f_W[1,:])=1 here.. 395 | 396 | f_W[dark_bins,:]=0; 397 | f_W[:,dark_bins]=0; 398 | f_W=f_W/sum(f_W)*N_eff.^2; 399 | 400 | return f_W; 401 | 402 | end 403 | 404 | function get_compartment_A_B(W,f_W); 405 | 406 | iz=find(sum(W,2).>0); 407 | izz=find(sum(W,2).==0); 408 | Wn=W[iz,iz]./f_W[iz,iz]; 409 | C=cor(Wn); 410 | (U,V)=eigs(C); 411 | i_max=indmax(U); 412 | ev=V[:,i_max:i_max+5]; 413 | ev_whole=zeros(size(W,1),6); 414 | ev_whole[iz,:]=ev; 415 | ev_whole[izz,:]=NaN; 416 | 417 | (loc,span)=get_chunks_v2(sign(ev_whole[:,1]),1);# 418 | loc=round(loc); 419 | span=round(span); 420 | cpt=sign(ev_whole[loc,1]); 421 | 422 | return loc,span,ev_whole,cpt; 423 | 424 | end 425 | 426 | #id is the starting loc of a chunk, and d is the length it spans.. 
427 | function get_chunks_v2(a,singleton=0); 428 | # adopt from a matlab code by Jiro Doke; 429 | a = [NaN; a; NaN]; 430 | b = diff(a); 431 | b1 = b; # to be used in fullList (below) 432 | ii = trues(size(b)); 433 | ii[b.==0] = false; 434 | b[ii] = 1; 435 | c = diff(b); 436 | id = find(c.==-1); 437 | 438 | #Get single-element chunks also 439 | if singleton.==1 440 | b1[id] = 0; 441 | ii2 = find(b1[1:end-1]); 442 | d = vcat(find(c.==1) - id + 1, ones(length(ii2))); 443 | id = [id;ii2]; 444 | v=sortperm(id); 445 | id=sort(id); 446 | #(id,tmp) = sort(id); 447 | d = d[v]; 448 | else 449 | d = find(c.==1) - id + 1; 450 | end 451 | 452 | return id,d; 453 | end 454 | 455 | function report_compartments(hg19_info,bin2loc,loc,span,ev1,chr_num); 456 | 457 | iz=find(bin2loc[1,:].==chr_num-1); 458 | a=round(bin2loc[2:3,iz]'); 459 | st=a[loc]; 460 | ed=a[loc+span]-1; 461 | output=DataFrame(); 462 | chr=cell(size(st)); 463 | for i=1:length(chr); 464 | chr[i]=change_chr(hg19_info,chr_num); 465 | end 466 | output[:chr]=chr; 467 | output[:start]=st; 468 | output[:end]=ed; 469 | x=ev1[loc]; 470 | x[isnan(x)]=0; 471 | output[:compartment]=sign(x); 472 | 473 | return output; 474 | end 475 | 476 | ######################################################################################################################### 477 | 478 | function generate_arbitrary_mapping_files(hg19_info,bin_size); 479 | 480 | num_of_chromosomes=size(hg19_info,1); 481 | chr2bins=zeros(2,num_of_chromosomes); 482 | chr_length=hg19_info[:length]; 483 | chr_num_bins=round(Int64,floor(chr_length/bin_size))+1 484 | #chr_num_bins=int(floor(chr_length/bin_size))+1; 485 | chr2bins[2,:]=cumsum(chr_num_bins)'-1; 486 | chr2bins[1,1]=0; 487 | chr2bins[1,2:end]=chr2bins[2,1:end-1]+1; 488 | X=round(Int,chr2bins+1); 489 | bin2loc=zeros(3,X[2,end]); 490 | for c=1:size(hg19_info,1); 491 | bin2loc[1,X[1,c]:X[2,c]]=c-1; 492 | bin2loc[2,X[1,c]:X[2,c]]=round(Int,collect(1:bin_size:chr_length[c]))'; 493 | 
bin2loc[3,X[1,c]:X[2,c]]=[round(Int,collect(bin_size:bin_size:chr_length[c]))' chr_length[c]]; 494 | end 495 | return round(Int64,chr2bins),round(Int64,bin2loc); 496 | 497 | end 498 | 499 | function define_hg19_genome(); 500 | 501 | hg19_info=DataFrame(); 502 | hg19_info[:id]=1:25; 503 | hg19_info[:chr]=["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10", 504 | "chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21", 505 | "chr22","chrX","chrY","chrM"]; 506 | hg19_info[:length]=[249250621,243199373,198022430,191154276,180915260,171115067,159138663, 507 | 146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753, 508 | 81195210,78077248,59128983,63025520,48129895,51304566,155270560,59373566,16571]; 509 | 510 | return hg19_info; 511 | 512 | end 513 | 514 | function change_chr(hg19_info,chr) 515 | 516 | if typeof(chr)==Float64||typeof(chr)==Int64; 517 | chr2=hg19_info[:chr][hg19_info[:id].==chr][1]; 518 | elseif typeof(chr)==ASCIIString||typeof(chr)==SubString{ASCIIString}||typeof(chr)==UTF8String 519 | chr2=hg19_info[:id][hg19_info[:chr].==chr][1]; 520 | end 521 | 522 | return chr2; 523 | 524 | end 525 | 526 | 527 | #input file is generated by HiC-Pro, with 3 columns: row_index, column_index, entry 528 | function read_simple_contact_map(input_file,hg19_info,bin_size); 529 | 530 | chr_length=hg19_info[:length]; 531 | X=readtable(input_file,separator='\t',header=false); 532 | chr2bins,bin2loc=generate_arbitrary_mapping_files(hg19_info,bin_size); 533 | N=size(bin2loc,1); 534 | M=sparse(X[:,1],X[:,2],X[:,3],N,N); 535 | if ~isequal(M,M'); 536 | tmp=M-spdiagm(diag(M)); 537 | M=M+tmp'; 538 | end 539 | 540 | return M; 541 | end 542 | 543 | function define_ce10_genome(); 544 | 545 | ce10_info=DataFrame(); 546 | ce10_info[:id]=1:7; 547 | ce10_info[:chr]=["chrV","chrX","chrIV","chrII","chrI","chrIII","chrM"]; 548 | ce10_info[:length]=[20924149,17718866,17493793,15279345,15072423,13783700,13794]; 
549 | return ce10_info; 550 | 551 | end 552 | 553 | function define_dm3_genome(); 554 | 555 | dm3_info=DataFrame(); 556 | dm3_info[:id]=1:15; 557 | dm3_info[:chr]=["2L","2LHet","2R","2RHet","3L","3LHet","3R","3RHet","4","U","Uextra","X","Xhet","YHet","dmel_mitochondrion_genome"]; 558 | dm3_info[:length]=[23011544,368872,21146708,3288761,24543557,2555491,27905053,2517507,1351857,10049037,29004656,22422827,204112,347038,19517]; 559 | 560 | return dm3_info; 561 | 562 | end 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HiC-spector 2 | 3 | A matrix library for spectral and reproducibility analysis of Hi-C contact maps. Several useful functions: 4 | 5 | get_reproducibility 6 | - to calculate the reproducibility metric between 2 HiC contact maps 7 | 8 | knight_ruiz 9 | - Knight Ruiz algorithm for matrix balancing 10 | 11 | get_expect_vs_d_single_chr_v0 12 | - to find the average contact frequency as a function of genomic distance 13 | 14 | get_compartment_A_B 15 | - to find A, B compartment, using method described in Liberman et al. Science 2009 16 | 17 | and a few functions for binning a genome, and reading HiC maps 18 | 19 |

Installation

20 | HiC-spector is mostly written in Julia. It has been tested in Julia 0.4 and 0.5. Both the Julia language (http://julialang.org/) and the required packages have to be installed. Please refer to the beginning of the file HiC_spector.jl for the necessary packages. To get some contact maps for testing the code, please follow the instructions shown in the file data/readme_data. 21 | 22 | There is a Python script available for quantifying reproducibility. The Python version can read files in genomic coordinates as well as the .hic format (https://github.com/theaidenlab/juicebox/wiki/Data). To do so, please download the Python version of the tool straw (straw.py) developed by the Aiden lab (https://github.com/theaidenlab/straw). 23 | 24 |

Usage

25 | The script run_reproducibility.jl is used to get the reproducibility score from a command-line interface. Usage: 26 | > julia run_reproducibility.jl matrix_file1 matrix_file2 27 | 28 | The input file is a simple delimited text file with no header. 29 | 30 | 1 1 20 31 | 32 | 1 2 18 33 | 34 | ... 35 | 36 | The first and second columns represent the row and column indices of a contact map, whereas the third column is the count. To represent a full matrix, only the upper-triangular component is required. Note that the index should begin with 1. 37 | 38 | Please use the files stored in the folder A549 mentioned in ./data/readme_data to test the script run_reproducibility.jl. 39 | 40 | Julia users can include the file HiC_spector.jl for their own analysis by simply using 41 | > include("./HiC_spector.jl"); 42 | 43 | Please refer to the file hic_spector_tutorial.jl for how to use some of the functions and how to read files in other formats. 44 | 45 | Non-Julia users can use the Python script run_reproducibility_v2.py to obtain the reproducibility score. Usage: 46 | > python run_reproducibility_v2.py -F matrix_file1 matrix_file2 47 | 48 | If the matrix files are labeled in genomic coordinates of bins, usage: 49 | > python run_reproducibility_v2.py t matrix_file1 matrix_file2 40000 50 | 51 | where 40000 is the bin size used in the two files. 52 | 53 | In addition to the delimited text input files, the Python script can calculate the reproducibility score for contact maps stored in .hic format. Usage: 54 | > python run_reproducibility_v2.py -f hic_file1 hic_file2 chrid resolution 55 | 56 | A script is provided in the tool straw (https://github.com/theaidenlab/straw/tree/master/python) for reading the headers (including chr id and the available resolutions) of a .hic file. 57 | 58 | Regarding memory, given two contact maps of human chr1 binned at a bin size of 10kb, the code works fine on a laptop (16GB memory) in our experience. 59 | 60 |

Author/Support

61 | Koon-Kiu Yan, koonkiu.yan@gmail.com; Mark Gerstein, mark@gersteinlab.org 62 | 63 |

Reference

64 | Yan KK, Yardımcı GG, Noble WS and Gerstein M. HiC-Spector: a matrix library for spectral and reproducibility analysis of Hi-C contact maps. Bioinformatics 22 March 2017. https://doi.org/10.1093/bioinformatics/btx152 65 | -------------------------------------------------------------------------------- /data/readme_data: -------------------------------------------------------------------------------- 1 | Because of their sizes, please download the data from the following links to the data folder, and unzip the files. 2 | 3 | http://homes.gersteinlab.org/people/kkyan/sharebox/A549.tar.gz 4 | 5 | http://homes.gersteinlab.org/people/kkyan/sharebox/MCF7-WT.tar.gz 6 | 7 | http://homes.gersteinlab.org/people/kkyan/sharebox/examples_hicpro.tar.gz 8 | 9 | The A549 Hi-C experiments were performed by the ENCODE consortium. The raw reads can be downloaded from the ENCODE portal (https://www.encodeproject.org/) with libraries ENCLB571HTP and ENCLB222WYT. The reads were mapped to generate contact maps using the tool HiC-Pro, with bin size 40kb. Please note that the files here contain only intra-chromosomal interactions. The contact maps were not normalized. 10 | 11 | The MCF7 data were obtained from Barutcu et al. Genome Biology 2015 (GSE66733). The contact maps have bin size 250kb, and have already been normalized by the ICE algorithm. 12 | 13 | The hicpro files were generated using Hi-C data of hESC and IMR90 from Dixon et al. Nature 2012, with bin size 500kb. They include both intra- and inter-chromosomal interactions. 14 | 15 | Sample .hic files can be obtained from the Aiden lab. 16 | For code demonstration, please download the files HIC073.hic and HIC074.hic from 17 | https://bcm.app.box.com/v/aidenlab/1/11406189541 18 | in the folder Rao & Huntley et al. 
| Cell 2014 /K562 19 | 20 | 21 | -------------------------------------------------------------------------------- /hic_spector_tutorial.jl: -------------------------------------------------------------------------------- 1 | include("./HiC_spector.jl"); 2 | 3 | #####Calculating reproducibility scores using A549 data 4 | 5 | #The contact maps are obtained by binning the human genome in 40kb. 6 | 7 | hg19_info=define_hg19_genome(); 8 | bin_size=40000; 9 | chr2bins,bin2loc=generate_arbitrary_mapping_files(hg19_info,bin_size); 10 | 11 | #The number of eigenvectors (suggested value=20) 12 | 13 | r=20; 14 | 15 | #the reproducibility score Q for 23 chromosomes, 1 to 22, and X 16 | Q=zeros(23); 17 | 18 | ###these arrays are used for benchmark only 19 | elasped_time=zeros(23); 20 | mem=zeros(23); 21 | 22 | for chr_num=2:23; 23 | 24 | display(chr_num); 25 | 26 | chr_string=change_chr(hg19_info,chr_num); 27 | 28 | map_file1="./data/A549/A549C-HindIII-R1_"*chr_string*".inter"; 29 | map_file2="./data/A549/A549D-HindIII-R2_"*chr_string*".inter"; 30 | 31 | X1=readdlm(map_file1,Int64); 32 | X2=readdlm(map_file2,Int64); 33 | 34 | ib=find(bin2loc[1,:].==chr_num-1); 35 | N=length(ib); 36 | 37 | M1=sparse(X1[:,1],X1[:,2],X1[:,3],N,N); 38 | M2=sparse(X2[:,1],X2[:,2],X2[:,3],N,N); 39 | 40 | M1_tmp=M1-spdiagm(diag(M1)); 41 | M2_tmp=M2-spdiagm(diag(M2)); 42 | M1=M1+M1_tmp'; 43 | M2=M2+M2_tmp'; 44 | 45 | #Note that each interaction has shown once in these files. M1, M2 are therefore asymmetric. 46 | #we therefore do M=M+M'. 
47 | #neveetheless, it's not necessary because the code get_reproducibility(M1,M2,num_evec) does it for you 48 | 49 | evs,a1,a2=get_reproducibility(M1,M2,r); 50 | Q[chr_num]=evs; 51 | 52 | #######the next few lines can be used for benchmark############## 53 | #info=@timed get_reproducibility(M1,M2,r); 54 | #evs=info[1][1]; 55 | #Q[chr_num]=mean(evs); 56 | #elasped_time[chr_num]=info[2]; 57 | #mem[chr_num]=info[3]; 58 | 59 | end 60 | 61 | ###how to read the matrices generated by HiC-Pro############### 62 | 63 | input_file="./data/examples_hicpro/reads_500000_2.matrix"; 64 | hg19_info=define_hg19_genome(); 65 | bin_size=500000; 66 | chr2bins,bin2loc=generate_arbitrary_mapping_files(hg19_info,bin_size); 67 | #the file reads_500000_abs.bed contains the bin size info generated by HiC-Pro 68 | #The information has already been captured by the above 2 arrays.. 69 | 70 | #this is the WG-WG contact map, NB that HiC-Pro outputs only upper-triangular entries, we have performed symmetrization implicitly 71 | W=read_simple_contact_map(input_file,hg19_info,bin_size); 72 | 73 | #to obtain maps for individual chromosomes: 74 | chr_num=10; 75 | W_chr=extract_chr(W,chr2bins,chr_num); 76 | 77 | 78 | #####Other analysis using MCF7 data 79 | 80 | data_loc="./data/MCF7-WT/"; 81 | 82 | #The contact maps are obtained by binning the human genome in 250kb. 83 | 84 | hg19_info=define_hg19_genome(); 85 | bin_size=250000; 86 | chr2bins,bin2loc=generate_arbitrary_mapping_files(hg19_info,bin_size); 87 | 88 | ##Find the distance dependency of intro-chromosomal interaction frequency. 
89 | 90 | chr_num=10; 91 | input_file=data_loc*"HiCStein-MCF7-WT__hg19__genome__C-250000-iced__"*change_chr(hg19_info,chr_num)*"__"*change_chr(hg19_info,chr_num)*"__cis.matrix"; 92 | X=readtable(input_file,header=true,separator='\t'); 93 | W=X[:,2:end]; 94 | W=array(W); 95 | W[isnan(W)]=0; 96 | 97 | xs_all, expect=get_expect_vs_d_single_chr_v0(W,chr2bins,bin_size); 98 | 99 | using PyPlot; 100 | PyPlot.plot(log10(xs_all),log10(expect)); 101 | 102 | ##Matrix Balancing: turn W to W_balance ##### 103 | #the row sums and columns sum of W_balance are all 1, except the empty rows/columns 104 | x,W_balance=knight_ruiz(W); 105 | 106 | ##Find A/B compartments 107 | 108 | f_W=get_f_W(W,expect); 109 | loc,span,ev_whole,cpt=get_compartment_A_B(W,f_W); 110 | 111 | ev1=ev_whole[:,1]; 112 | #ev_whole records the leading eigenvector of the covariance matrix, ev1 reported the compartment. 113 | #bins with +ve and -ve values in ev1 correspond to different compartments. As by convention compartment A refers to the expressed 114 | #part whereas compartment B refers to the lowly expressed part, extra information is required to determine which sign corresponds to which compartment. 115 | #the value of ev1 is zero if the bin have no read mapped. 
116 | 117 | #to obtain the genomic coordinates, use 118 | 119 | output=report_compartment(hg19_info,bin2loc,loc,span,ev1,chr_num); 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /run_reproducibility.jl: -------------------------------------------------------------------------------- 1 | include("./HiC_spector.jl"); 2 | 3 | r=20; 4 | 5 | map_file1=ARGS[1]; 6 | map_file2=ARGS[2]; 7 | 8 | X1=readdlm(map_file1,Int64); 9 | X2=readdlm(map_file2,Int64); 10 | 11 | N=maximum([maximum(X1[:,1:2]),maximum(X2[:,1:2])]); 12 | 13 | e=minimum([minimum(X1[:,1:2]),minimum(X2[:,1:2])]); 14 | 15 | if e.>0 16 | M1=sparse(X1[:,1],X1[:,2],X1[:,3],N,N); 17 | M2=sparse(X2[:,1],X2[:,2],X2[:,3],N,N); 18 | elseif e.==0; 19 | N=N+1; 20 | M1=sparse(X1[:,1]+1,X1[:,2]+1,X1[:,3],N,N); 21 | M2=sparse(X2[:,1]+1,X2[:,2]+1,X2[:,3],N,N); 22 | end 23 | 24 | if ~isequal(M1,M1'); 25 | M1_tmp=M1-spdiagm(diag(M1)); 26 | M1=M1+M1_tmp'; 27 | end 28 | 29 | if ~isequal(M2,M2'); 30 | M2_tmp=M2-spdiagm(diag(M2)); 31 | M2=M2+M2_tmp'; 32 | end 33 | 34 | Q,a1,b1=get_reproducibility(M1,M2,r); 35 | 36 | println("size of maps:",size(M1,1),"\t","reproducibility score=",Q); 37 | 38 | -------------------------------------------------------------------------------- /run_reproducibility_v2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import scipy 3 | import numpy 4 | import straw 5 | from scipy.sparse import lil_matrix 6 | from scipy.sparse.linalg import eigsh 7 | 8 | def Parse_matrix(file1,file2): 9 | max_index=0 10 | max_index_temp=0 11 | with open(file1) as input_file: 12 | for line in input_file: 13 | x,y,z=map(int, line.split()) 14 | max_index_temp=max(x,y) 15 | if max_index_temp>max_index: 16 | max_index=max_index_temp 17 | 18 | with open(file2) as input_file: 19 | for line in input_file: 20 | x,y,z=map(int, line.split()) 21 | max_index_temp=max(x,y) 22 | if max_index_temp>max_index: 23 | 
max_index=max_index_temp 24 | 25 | 26 | M1=lil_matrix((max_index,max_index)) 27 | M2=lil_matrix((max_index,max_index)) 28 | with open(file1) as input_file: 29 | for line in input_file: 30 | x,y,z=map(int, line.split()) 31 | M1[x-1,y-1]=z 32 | M1[y-1,x-1]=z 33 | with open(file2) as input_file: 34 | for line in input_file: 35 | x,y,z=map(int, line.split()) 36 | M2[x-1,y-1]=z 37 | M2[y-1,x-1]=z 38 | return M1, M2 39 | 40 | 41 | def Parse_matrix_lieberman(file1,file2, resolution): 42 | max_index=0 43 | max_index_temp=0 44 | with open(file1) as input_file: 45 | for line in input_file: 46 | if line[0]!="#": 47 | x,y,z=map(int, line.split()) 48 | max_index_temp=max(x,y) 49 | if max_index_temp>max_index: 50 | max_index=max_index_temp 51 | 52 | with open(file2) as input_file: 53 | for line in input_file: 54 | 55 | if line[0]!="#": 56 | x,y,z=map(int, line.split()) 57 | max_index_temp=max(x,y) 58 | if max_index_temp>max_index: 59 | max_index=max_index_temp 60 | 61 | max_index=max_index/resolution+1 62 | print(max_index) 63 | M1=lil_matrix((max_index,max_index)) 64 | M2=lil_matrix((max_index,max_index)) 65 | with open(file1) as input_file: 66 | for line in input_file: 67 | if line[0]!="#": 68 | x,y,z=map(int, line.split()) 69 | M1[x/resolution,y/resolution]=z 70 | M1[y/resolution,x/resolution]=z 71 | with open(file2) as input_file: 72 | for line in input_file: 73 | if line[0]!="#": 74 | x,y,z=map(int, line.split()) 75 | M2[x/resolution,y/resolution]=z 76 | M2[y/resolution,x/resolution]=z 77 | return M1, M2 78 | 79 | 80 | def Parse_matrix_hic(file1, file2, chrn, resolution): 81 | 82 | Table1=straw.straw("NONE",file1, chrn, chrn,"BP",resolution) 83 | Table2=straw.straw("NONE",file2, chrn, chrn,"BP",resolution) 84 | max_index=max(max(Table1[0]),max(Table1[1]),max(Table2[0]),max(Table2[1])) 85 | max_index=max_index/resolution 86 | M1=lil_matrix((max_index+1,max_index+1)) 87 | M2=lil_matrix((max_index+1,max_index+1)) 88 | 89 | for i in range(len(Table1[0])): 90 | 
M1[Table1[0][i]/resolution,Table1[1][i]/resolution]=Table1[2][i] 91 | M1[Table1[1][i]/resolution,Table1[0][i]/resolution]=Table1[2][i] 92 | 93 | for i in range(len(Table2[0])): 94 | M2[Table2[0][i]/resolution,Table2[1][i]/resolution]=Table2[2][i] 95 | M2[Table2[1][i]/resolution,Table2[0][i]/resolution]=Table2[2][i] 96 | return M1, M2 97 | 98 | 99 | def get_Laplacian(M): 100 | S=M.sum(1) 101 | i_nz=numpy.where(S>0)[0] 102 | S=S[i_nz] 103 | M=(M[i_nz].T)[i_nz].T 104 | S=1/numpy.sqrt(S) 105 | M=S*M 106 | M=(S*M.T).T 107 | n=numpy.size(S) 108 | M=numpy.identity(n)-M 109 | M=(M+M.T)/2 110 | return M 111 | 112 | def evec_distance(v1,v2): 113 | d1=numpy.dot(v1-v2,v1-v2) 114 | d2=numpy.dot(v1+v2,v1+v2) 115 | if d10)) 130 | k2=numpy.sign(M2.A).sum(1) 131 | d2=numpy.diag(M2.A) 132 | kd2=~((k2==1)*(d2>0)) 133 | iz=numpy.nonzero((k1+k2>0)*(kd1>0)*(kd2>0))[0] 134 | M1b=(M1[iz].A.T)[iz].T 135 | M2b=(M2[iz].A.T)[iz].T 136 | 137 | i_nz1=numpy.where(M1b.sum(1)>0)[0] 138 | i_nz2=numpy.where(M2b.sum(1)>0)[0] 139 | i_z1=numpy.where(M1b.sum(1)==0)[0] 140 | i_z2=numpy.where(M2b.sum(1)==0)[0] 141 | 142 | M1b_L=get_Laplacian(M1b) 143 | M2b_L=get_Laplacian(M2b) 144 | 145 | a1, b1=eigsh(M1b_L,k=num_evec,which="SM") 146 | a2, b2=eigsh(M2b_L,k=num_evec,which="SM") 147 | 148 | b1_extend=numpy.zeros((numpy.size(M1b,0),num_evec)) 149 | b2_extend=numpy.zeros((numpy.size(M2b,0),num_evec)) 150 | for i in range(num_evec): 151 | b1_extend[i_nz1,i]=b1[:,i] 152 | b2_extend[i_nz2,i]=b2[:,i] 153 | 154 | ipr_cut=5 155 | ipr1=numpy.zeros(num_evec) 156 | ipr2=numpy.zeros(num_evec) 157 | for i in range(num_evec): 158 | ipr1[i]=get_ipr(b1_extend[:,i]) 159 | ipr2[i]=get_ipr(b2_extend[:,i]) 160 | 161 | b1_extend_eff=b1_extend[:,ipr1>ipr_cut] 162 | b2_extend_eff=b2_extend[:,ipr2>ipr_cut] 163 | num_evec_eff=min(numpy.size(b1_extend_eff,1),numpy.size(b2_extend_eff,1)) 164 | 165 | evd=numpy.zeros(num_evec_eff) 166 | for i in range(num_evec_eff): 167 | evd[i]=evec_distance(b1_extend_eff[:,i],b2_extend_eff[:,i]) 168 
| 169 | Sd=evd.sum() 170 | l=numpy.sqrt(2) 171 | evs=abs(l-Sd/num_evec_eff)/l 172 | 173 | N=float(M1.shape[1]); 174 | if (numpy.sum(ipr1>N/100)<=1)|(numpy.sum(ipr2>N/100)<=1): 175 | print("at least one of the maps does not look like typical Hi-C maps") 176 | else: 177 | print("size of maps: %d" %(numpy.size(M1,0))) 178 | print("reproducibility score: %6.3f " %(evs)) 179 | print("num_evec_eff: %d" %(num_evec_eff)) 180 | return evs 181 | 182 | def main(): 183 | num_evec=20; 184 | if len(sys.argv)==4 and sys.argv[1]=="-F": 185 | M1, M2=Parse_matrix(sys.argv[2],sys.argv[3]) 186 | get_reproducibility(M1,M2,num_evec) 187 | elif len(sys.argv)==6 and sys.argv[1]=="-f": 188 | M1, M2=Parse_matrix_hic(sys.argv[2],sys.argv[3],sys.argv[4],int(sys.argv[5])) 189 | get_reproducibility(M1,M2,num_evec) 190 | elif len(sys.argv)==5 and sys.argv[1]=="t": 191 | M1, M2=Parse_matrix_lieberman(sys.argv[2],sys.argv[3],int(sys.argv[4])) 192 | get_reproducibility(M1,M2,num_evec) 193 | else: 194 | print('3, 4 or 5 arguments required') 195 | print('To use matrix table files as the input:') 196 | print('python run_reproducibility.py -F matrix_file1 matrix_file2') 197 | print('To use .hic files as the input:') 198 | print('python run_reproducibility.py -f hic_file1 hic_file2 chrid resolution[int]') 199 | print('To use lieberman matrix files as the input:') 200 | print('python run_reproducibility.py t matrix_file1 matrix_file2 resolution[int]') 201 | 202 | 203 | if __name__ == '__main__': 204 | main() 205 | --------------------------------------------------------------------------------