├── HiC_spector.jl
├── README.md
├── data
    └── readme_data
├── hic_spector_tutorial.jl
├── run_reproducibility.jl
└── run_reproducibility_v2.py


/HiC_spector.jl:
--------------------------------------------------------------------------------
  1 | #using HDF5;
  2 | #using JLD;
  3 | #using MAT;
  4 | #the packages above not required by the code here, but will be useful for reading/writing common files for analysis purpose
  5 | using DataFrames;
  6 | using CurveFit;
  7 | using Interpolations;
  8 | 
  9 | function get_reproducibility(M1,M2,num_evec);
 10 | 
 11 | 	if ~isequal(M1,M1')
 12 | 		tmp1=M1-spdiagm(diag(M1));
 13 | 		M1=tmp1+tmp1'+spdiagm(diag(M1));
 14 | 	end
 15 | 	if ~isequal(M2,M2')
 16 | 		tmp2=M2-spdiagm(diag(M2));
 17 | 		M2=tmp2+tmp2'+spdiagm(diag(M2));
 18 | 	end
 19 | 	
 20 | 	N=size(M1)[1];
 21 | 	#get rid of isolated nodes
 22 | 	k1=sum(spones(M1),2);
 23 | 	d1=diag(M1);
 24 | 	kd1=!((k1.==1).*(d1.>0))
 25 | 	k2=sum(spones(M2),2);
 26 | 	d2=diag(M2);
 27 | 	kd2=!((k2.==1).*(d2.>0))
 28 | 	iz=find((k1+k2.>0).*(kd1.>0).*(kd2.>0));
 29 | 
 30 | 	M1b=M1[iz,iz];
 31 | 	M2b=M2[iz,iz];
 32 | 
 33 | 	i_nz1=find(sum(M1b,2).>0);
 34 | 	i_nz2=find(sum(M2b,2).>0);
 35 | 
 36 | 	i_z1=find(sum(M1b,2).==0);
 37 | 	i_z2=find(sum(M2b,2).==0);
 38 | 
 39 | 	Ln1_nz1=get_Laplacian(M1b);
 40 | 	Ln2_nz2=get_Laplacian(M2b);
 41 | 
 42 | 	(a1,b1)=eigs(speye(length(i_nz1))-Ln1_nz1,nev=num_evec,which=:LM);
 43 | 	#(a1,b1)=eigs(speye(length(i_nz1))-Ln1_nz1,nev=num_evec+1,which=:LM);
 44 | 	a1=1-a1;
 45 | 	(a2,b2)=eigs(speye(length(i_nz2))-Ln2_nz2,nev=num_evec,which=:LM);
 46 | 	#(a2,b2)=eigs(speye(length(i_nz2))-Ln2_nz2,nev=num_evec+1,which=:LM);	
 47 | 	a2=1-a2;
 48 | 
 49 | 	ipr_cut=5;
 50 | 
 51 | 	b1_extend=zeros(size(M1b,1),num_evec);
 52 | 	for i=1:num_evec
 53 | 		#b1_extend[i_nz1,i]=b1[:,i+1];
 54 | 		b1_extend[i_nz1,i]=b1[:,i];
 55 | 	end
 56 | 
 57 | 	ipr1=zeros(num_evec);
 58 | 	for i=1:num_evec;
 59 | 		ipr1[i]=get_ipr(b1_extend[:,i]);
 60 | 	end
 61 | 
 62 | 	b1_extend_eff=b1_extend[:,ipr1.>ipr_cut];
 63 | 
 64 | 	b2_extend=zeros(size(M2b,1),num_evec);
 65 | 	for i=1:num_evec
 66 | 		#b2_extend[i_nz2,i]=b2[:,i+1];
 67 | 		b2_extend[i_nz2,i]=b2[:,i];
 68 | 	end
 69 | 
 70 | 	ipr2=zeros(num_evec);
 71 | 	for i=1:num_evec;
 72 | 		ipr2[i]=get_ipr(b2_extend[:,i]);
 73 | 	end
 74 | 
 75 | 	b2_extend_eff=b2_extend[:,ipr2.>ipr_cut];
 76 | 
 77 | 	num_evec_eff=minimum([size(b1_extend_eff,2);size(b2_extend_eff,2)]);
 78 | 
 79 | 
 80 | 	evd=zeros(num_evec_eff);
 81 | 	for i=1:num_evec_eff;
 82 | 		evd[i]=evec_distance(b1_extend_eff[:,i],b2_extend_eff[:,i]);
 83 | 	end
 84 | 
 85 | 	Sd=sum(evd);
 86 | 
 87 | 	evs=abs(sqrt(2)-Sd/num_evec_eff)/sqrt(2);
 88 | 	
 89 | 	if (sum(ipr1.>N/100)<=1)|(sum(ipr2.>N/100)<=1)
 90 | 		print("at least one of the maps does not look like typical Hi-C maps")
 91 | 		evs=NaN;
 92 | 	end
 93 | 
 94 | 	return evs,a1,a2,evd;
 95 | 
 96 | end
 97 | 
 98 | function get_Laplacian(M);
 99 | 
100 | 	K=vec(sum(M,1));
101 | 	i_nz=find(K.>0);
102 | 	D_nz=spdiagm(K[i_nz]);
103 | 	D_isq=spdiagm(1./sqrt(K[i_nz]));
104 | 
105 | 	Ln_nz=M[i_nz,i_nz]*D_isq;
106 | 	Ln_nz=speye(length(i_nz))-D_isq*Ln_nz;
107 | 	n=size(M,1);
108 | 
109 | 	Ln_nz=(Ln_nz+Ln_nz')/2;
110 | 	return Ln_nz;
111 | end
112 | 
function get_ipr(evec);
	# Inverse participation ratio of evec: one over the sum of the fourth
	# powers of its (first column's) entries.
	#evec should be a unit vector
	fourth_power_sums=sum(evec.^4,1);
	return 1/fourth_power_sums[1];
end
118 | 
function evec_distance(x,y);
	# Euclidean distance between x and y, taking the smaller of |x-y| and
	# |x+y| so that the arbitrary sign of an eigenvector does not matter.

	same_sign=sum((x-y).^2);
	flipped=sum((x+y).^2);

	return sqrt(same_sign<flipped ? same_sign : flipped);
end
131 | 
132 | #########################################################################################################################
133 | 
134 | #This code was adapted from the MATLAB code implemented in Knight and Ruiz, IMA Journal of Numerical Analysis (2012)
function knight_ruiz(M);
	# Matrix balancing: find a positive vector x such that the rows of
	# diag(x)*A*diag(x) sum to 1, where A is M restricted to the rows with a
	# positive row sum. Iterates until the squared residual sum((1-x.*(A*x)).^2)
	# is no larger than tol^2.
	# Returns (x_final,A_balance): x_final has length size(M,1) with the scaling
	# factors at the kept rows and 0 elsewhere; A_balance is the balanced matrix
	# placed back into a dense size(M,1)-by-size(M,1) array and symmetrized.
	# Side effects: NaN entries of M are set to 0 in place, and the squared
	# residual is shown with display() after every outer iteration.
	# NOTE(review): rho_km2, Z and p are assigned in one pass of the inner while
	# loop and read in the next; this relies on while-loop variables persisting
	# across iterations - confirm before running on a newer Julia.
	M[isnan(M)]=0;
	L=size(M,1);
	iz=find(sum(M,2).>0);
	A=M[iz,iz];
	n=size(A,1);
	e = ones(n,1);
	res=[];
	delta = 0.1;
	x0 = e;
	tol = 1e-6;
	g=0.9; etamax = 0.1; # Parameters used in inner stopping criterion.

	eta = etamax;
	# v holds the current row sums of diag(x)*A*diag(x); rk is the residual 1-v
	x = x0; rt = tol^2; v = x.*(A*x); rk = 1 - v;
	rho_km1=sum(rk.^2);
	rout = rho_km1; rold = rout;
	MVP = 0; # count matrix vector products.
	i = 0; # Outer iteration count.

	while rout > rt # Outer iteration
    	i = i + 1; k = 0; y = e;
    	innertol = maximum([eta^2*rout;rt]);
    	while rho_km1 > innertol #Inner iteration by CG
        	k = k + 1;
        	if k == 1
            	Z = rk./v; p=Z; rho_km1 = sum(rk.*Z);
        	else
            	beta=rho_km1/rho_km2;
            	p=Z + beta*p;
        	end
        	# Update search direction efficiently.
        	w = x.*(A*(x.*p)) + v.*p;
        	#w=squeeze(w,2);
        	alpha = rho_km1/sum(p.*w);
        	ap =squeeze(alpha*p,2);
        	# Test distance to boundary of cone.
        	ynew = y + ap;
        	if minimum(ynew) <= delta
            	if delta == 0
            		break
            	end
            	# shorten the step so that the smallest entry of y lands on delta
            	ind = find(ap .< 0);
            	gamma = minimum((delta - y[ind])./ap[ind]);
            	y = y + gamma*ap;
            	break
        	end
        	y = ynew;
        	rk = rk - alpha*w; rho_km2 = rho_km1; rho_km2=rho_km2[1];
        	Z = rk./v; rho_km1 = sum(rk.*Z);
    	end
    	# apply the multiplicative update y and recompute the residual
    	x = x.*y; v = x.*(A*x);
    	rk = 1 - v; rho_km1 = sum(rk.*rk); rout = rho_km1;
    	MVP = MVP + k + 1;
    	# Update inner iteration stopping criterion.
    	rat = rout/rold; rold = rout; r_norm = sqrt(rout);
    	eta_o = eta; eta = g*rat;
    	if g*eta_o^2 > 0.1
        	eta = maximum([eta;g*eta_o^2]);
    	end
    	eta = maximum([minimum([eta;etamax]);0.5*tol/r_norm]);
    	#@sprintf("%3d %6d %.3e %.3e %.3e \n", i,k,r_norm,minimum(y),minimum(x));
        display(rout);
        #res=[res; r_norm];
	end
	#@printf("Matrix-vector products = %6d\n", MVP);
	x=squeeze(x,2);
	# scale columns, then rows, of A by x
	A2=A*diagm(x);
	A2=diagm(x)*A2;
	A_balance=extend_mat(A2,iz,L);
	A_balance=(A_balance+A_balance')/2;
	x_final=zeros(L);
	x_final[iz]=x;

	return x_final,A_balance;

end
212 | 
function extend_mat(Z,iz,L);
    # Place the non-zero entries of Z into an L-by-L dense matrix of zeros:
    # entry Z[i,j] ends up at position (iz[i],iz[j]).
    nonzero_at=find(Z.!=0);
    (rows,cols)=ind2sub(size(Z),nonzero_at);
    vals=Z[find(Z)];
    return full(sparse(iz[rows],iz[cols],vals,L,L));
end
223 | 
224 | #########################################################################################################################
225 | 
function local_smoothing(x,y);
	# Moving-average smoothing of y against x.
	# The points are sorted by x, y is averaged over a centred window of 2n+1
	# points (n = 0.5% of the number of points, windows are truncated at both
	# ends), and the smoothed values are then averaged over points sharing the
	# same x.
	# Returns (ux,uy_smooth): the sorted distinct x values and one smoothed y
	# per distinct x. Empty input returns two empty arrays.

	span=0.01;
	order=sortperm(x);
	x=x[order];
	y=y[order];
	ux=unique(x);
	uy_smooth=zeros(size(ux));

	# nothing to smooth; the window set-up below would index y[1]
	if isempty(x)
		return ux,uy_smooth;
	end

	n=Int(floor(length(x)*span/2));

	# running mean: slide the window one point at a time instead of re-summing
	mm=zeros(size(x));
	L=2*n+1;
	i=n+1;
	st=1;
	ed=i+n;
	mm[i]=mean(y[st:ed]);
	for i=n+2:length(y)-n;
		ed=ed+1;
		mm[i]=mm[i-1]+y[ed]/L-y[st]/L;
		st=st+1;
	end
	# truncated windows for the first and last n points
	for i=1:n
		mm[i]=mean(y[1:n+i]);
	end
	for i=1:n;
		mm[end-n+i]=mean(y[end-n+1-n+i:end]);
	end

	# x is sorted, so the points sharing ux[i] form one contiguous run; walk the
	# runs once instead of scanning all of x for every distinct value
	j=1;
	for i=1:length(ux);
		run_start=j;
		while j<length(x) && x[j+1]==ux[i]
			j=j+1;
		end
		uy_smooth[i]=mean(mm[run_start:j]);
		j=j+1;
	end

	return ux,uy_smooth;

end
263 | 
function get_expect_vs_d_single_chr_v0(W,chr2bins,bin_size);
	# Expected contact frequency as a function of genomic distance for a single
	# chromosome map W (chr2bins is accepted but not used).
	# The non-zero upper-triangular entries are smoothed in log10-log10 space
	# with local_smoothing; distances with no observation are filled in by
	# linear interpolation between the observed ones.
	# Returns (xs_all,expect): the distances (in bases; distance 0 is replaced
	# by bin_size/3) and the expected value at each distance.
	# Side effect: NaN entries are set to 0, in W itself when full(W) does not copy.

	W=full(W);
	W[isnan(W)]=0;

	(u,v,w)=findnz(triu(W));
	d=float(v-u);
	d2=float(d);
	d2[d2.==0]=1/3;#this is the average distance for 2 points drawn from a uniform distribution on [0,1]
	d3=d2*bin_size;

	x=log10(d3);
	y=log10(w);

	xs,ys_smooth=local_smoothing(x,y);

	xs_all=collect(0:1.0:size(W,1)-1);xs_all[1]=1/3;
	xs_all=xs_all*bin_size;
	xs_all_aux=log10(xs_all);

	# observed[k] records whether distance k has a smoothed value. An explicit
	# mask is needed because a legitimate log10 value can be zero or negative
	# (e.g. normalized maps), so the value itself cannot mark "missing".
	ys_all=zeros(size(xs_all));
	observed=falses(length(xs_all));
	for k=1:length(xs_all_aux);
		ik=find(xs.==xs_all_aux[k]);
		if ~isempty(ik)
			ys_all[k]=ys_smooth[ik][1];
			observed[k]=true;
		end
	end

	# interpolate over the bin index, using the observed distances as knots
	A_x=find(observed);
	knots=(A_x,);
	itp=interpolate(knots,ys_all[A_x], Gridded(Linear()));

	A_nz=find(!observed);
	for i=1:length(A_nz);
		ys_all[A_nz[i]]=itp[A_nz[i]];
	end

	expect=10.^ys_all;

	return xs_all, expect;

end
309 | 
function get_expect_vs_d_WG_v0(contact,chr2bins,bin_size);
	# Expected contact frequency as a function of genomic distance, pooled over
	# the intra-chromosomal blocks of chromosomes 1 to 23 of the whole-genome
	# map contact (blocks are located with chr2bins via extract_chr).
	# The pooled non-zero upper-triangular entries are smoothed in log10-log10
	# space with local_smoothing; distances with no observation are filled in by
	# linear interpolation between the observed ones.
	# Returns (xs_all,expect): the distances (in bases; distance 0 is replaced
	# by bin_size/3) up to the longest chromosome, and the expected value at each.

	all_d2=Float64[];
	all_w=Float64[];
	Ltmp=zeros(23);
	for chr_num=1:23

		W=extract_chr(contact,chr2bins,chr_num);
		W=full(W);
		W[isnan(W)]=0;

		(u,v,w)=findnz(triu(W));

		d=float(v-u);
		d2=float(d);
		d2[d2.==0]=1/3;

		all_d2=[all_d2;d2];
		all_w=[all_w;w];
		Ltmp[chr_num]=size(W,1);

	end

	all_d3=all_d2*bin_size;

	x=log10(all_d3);
	y=log10(all_w);

	xs,ys_smooth=local_smoothing(x,y);

	xs_all=collect(0:1.0:maximum(Ltmp)-1);xs_all[1]=1/3;
	xs_all=xs_all*bin_size;
	xs_all_aux=log10(xs_all);

	# observed[k] records whether distance k has a smoothed value. An explicit
	# mask is needed because a legitimate log10 value can be zero or negative
	# (e.g. normalized maps), so the value itself cannot mark "missing".
	ys_all=zeros(size(xs_all));
	observed=falses(length(xs_all));
	for k=1:length(xs_all_aux);
		ik=find(xs.==xs_all_aux[k]);
		if ~isempty(ik)
			ys_all[k]=ys_smooth[ik][1];
			observed[k]=true;
		end
	end

	# interpolate over the bin index, using the observed distances as knots
	A_x=find(observed);
	knots=(A_x,);
	itp=interpolate(knots,ys_all[A_x], Gridded(Linear()));

	A_nz=find(!observed);
	for i=1:length(A_nz);
		ys_all[A_nz[i]]=itp[A_nz[i]];
	end

	expect=10.^ys_all;

	return xs_all, expect;

end
369 | 
370 | 
function extract_chr(A,chr2bins,chr_num);
	# Square sub-block of A belonging to chromosome chr_num. Rows 1 and 2 of
	# chr2bins hold the 0-based first and last bin of each chromosome.
	bins=(1+chr2bins[1,chr_num]):(1+chr2bins[2,chr_num]);
	return A[bins,bins];
end
377 | 
function get_f_W(W,ys);
	# Expected-contact matrix with the same size as W: entry (i,j) is
	# ys[|i-j|+1], rows and columns of bins with a zero column sum in W are
	# zeroed, and the result is rescaled so its entries sum to N_eff^2, N_eff
	# being the number of remaining bins.
	# Side effect: NaN entries of W are set to 0 in place.

	N=size(W,1);
	W[isnan(W)]=0;
	dark_bins=find(sum(W,1).==0);
	N_eff=N-length(dark_bins);

	# write ys[d+1] along the d-th sub-diagonal (stride N+1 in linear indexing)
	f_W=zeros(size(W));
	for d=0:N-1
		f_W[1+d:N+1:end-d*N]=ys[d+1];
	end
	# mirror the strictly lower triangle into the upper one
	f_W=f_W+(f_W-diagm(diag(f_W)))';

	f_W[dark_bins,:]=0;
	f_W[:,dark_bins]=0;

	return f_W/sum(f_W)*N_eff.^2;

end
403 | 
function get_compartment_A_B(W,f_W);
	# A/B compartment calling: divide W by the expected map f_W, take the
	# correlation matrix of the result, and split the bins by the sign of its
	# leading eigenvector.
	# Returns (loc,span,ev_whole,cpt): start bin and length of every run of
	# equal sign, six eigenvectors (leading one in column 1; rows of bins with
	# a zero row sum are NaN), and the sign of the leading eigenvector at the
	# start of each run.

	iz=find(sum(W,2).>0);
	izz=find(sum(W,2).==0);
	Wn=W[iz,iz]./f_W[iz,iz];
	C=cor(Wn);
	(U,V)=eigs(C);
	i_max=indmax(U);
	# Leading eigenvector first, the others in the order eigs returned them.
	# Slicing V[:,i_max:i_max+5] directly runs past the six columns eigs
	# returns whenever the largest eigenvalue is not in column 1.
	order=vcat(i_max,setdiff(1:size(V,2),i_max));
	ev=V[:,order[1:6]];
	ev_whole=zeros(size(W,1),6);
	ev_whole[iz,:]=ev;
	ev_whole[izz,:]=NaN;

	(loc,span)=get_chunks_v2(sign(ev_whole[:,1]),1);#
	loc=round(loc);
	span=round(span);
	cpt=sign(ev_whole[loc,1]);

	return loc,span,ev_whole,cpt;

end
425 | 
426 | #id is the starting loc of a chunk, and d is the length it spans..
function get_chunks_v2(a,singleton=0);
	# Find the runs of equal consecutive values in the vector a.
	# Returns (id,d): the start index and the length of each run. Runs of
	# length 1 are reported only when singleton is 1.
	# adopt from a matlab code by Jiro Doke;
	padded=[NaN; a; NaN];
	# step[i] is 0 where padded[i+1] repeats padded[i], and 1 otherwise
	step=diff(padded);
	step[step.!=0]=1;
	# a 1->0 drop marks the start of a run of length >= 2, a 0->1 rise its end
	edge=diff(step);
	id=find(edge.==-1);

	#Get single-element chunks also
	if singleton.==1
		# what is left set in step after clearing the run starts are the singletons
		step[id]=0;
		ii2=find(step[1:end-1]);
		d=vcat(find(edge.==1) - id + 1, ones(length(ii2)));
		id=[id;ii2];
		v=sortperm(id);
		id=sort(id);
		d=d[v];
	else
		d=find(edge.==1) - id + 1;
	end

	return id,d;
end
454 | 
function report_compartments(hg19_info,bin2loc,loc,span,ev1,chr_num);
	# Turn compartment chunks into genomic coordinates.
	# loc and span are the start bin and length of each chunk within chromosome
	# chr_num, ev1 the eigenvector whose sign labels the compartment.
	# Returns a DataFrame with columns chr, start, end and compartment
	# (sign of ev1 at the chunk start; NaN is reported as 0).

	iz=find(bin2loc[1,:].==chr_num-1);
	# a[k,1] and a[k,2] are the first and last coordinate of the k-th bin
	a=round(bin2loc[2:3,iz]');
	first_bin=round(Int,loc);
	# span can arrive as Float64; the last bin of a chunk is loc+span-1
	last_bin=round(Int,loc+span)-1;
	st=a[first_bin,1];
	# Use the end coordinate of the chunk's own last bin. Indexing a[loc+span]
	# linearly (start of the following bin, minus one) wraps into the end
	# column for the final chunk of the chromosome.
	ed=a[last_bin,2];
	output=DataFrame();
	chr=cell(size(st));
	for i=1:length(chr);
		chr[i]=change_chr(hg19_info,chr_num);
	end
	output[:chr]=chr;
	output[:start]=st;
	output[:end]=ed;
	x=ev1[loc];
	x[isnan(x)]=0;
	output[:compartment]=sign(x);

	return output;
end
475 | 
476 | #########################################################################################################################
477 | 
function generate_arbitrary_mapping_files(hg19_info,bin_size);
	# Bin every chromosome listed in hg19_info (column :length) into
	# consecutive bins of bin_size bases.
	# Returns (chr2bins,bin2loc), both Int64:
	#   chr2bins[1,c], chr2bins[2,c]  0-based first and last bin of chromosome c
	#   bin2loc[1,b]                  0-based chromosome index of bin b
	#   bin2loc[2,b], bin2loc[3,b]    first and last coordinate of bin b

	num_of_chromosomes=size(hg19_info,1);
	chr_length=hg19_info[:length];

	chr2bins=zeros(Int64,2,num_of_chromosomes);
	next_bin=0;
	for c=1:num_of_chromosomes;
		# Number of bin starts in 1:bin_size:chr_length[c]. Using
		# floor(length/bin_size)+1 gives one bin too many when the length is an
		# exact multiple of bin_size, and the fill below then fails.
		num_bins=round(Int64,floor((chr_length[c]-1)/bin_size))+1;
		chr2bins[1,c]=next_bin;
		chr2bins[2,c]=next_bin+num_bins-1;
		next_bin=next_bin+num_bins;
	end

	bin2loc=zeros(Int64,3,next_bin);
	for c=1:num_of_chromosomes;
		cols=(chr2bins[1,c]+1):(chr2bins[2,c]+1);
		starts=round(Int64,collect(1:bin_size:chr_length[c]));
		for (k,col) in enumerate(cols)
			bin2loc[1,col]=c-1;
			bin2loc[2,col]=starts[k];
			# the last bin of a chromosome is cut at the chromosome end
			bin2loc[3,col]=round(Int64,min(starts[k]+bin_size-1,chr_length[c]));
		end
	end
	return chr2bins,bin2loc;

end
498 | 
function define_hg19_genome();
	# DataFrame describing the hg19 assembly: one row per chromosome with
	# columns :id (1 to 25), :chr (name) and :length (bases).

	chr_names=["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10",
	"chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21",
	"chr22","chrX","chrY","chrM"];
	chr_lengths=[249250621,243199373,198022430,191154276,180915260,171115067,159138663,
	146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,
	81195210,78077248,59128983,63025520,48129895,51304566,155270560,59373566,16571];

	hg19_info=DataFrame();
	hg19_info[:id]=1:25;
	hg19_info[:chr]=chr_names;
	hg19_info[:length]=chr_lengths;

	return hg19_info;

end
513 | 
# Translate between the numeric id and the name of a chromosome, using the
# :id and :chr columns of hg19_info. A number returns the name, a string
# returns the id. Dispatching on Real/AbstractString accepts every number and
# string type instead of a fixed list of concrete types.
function change_chr(hg19_info,chr::Real)

	return hg19_info[:chr][hg19_info[:id].==chr][1];

end

function change_chr(hg19_info,chr::AbstractString)

	return hg19_info[:id][hg19_info[:chr].==chr][1];

end
525 | 
526 | 
527 | #input file is generated by HiC-Pro, with 3 columns: row_index, column_index, entry
function read_simple_contact_map(input_file,hg19_info,bin_size);
	# Read a tab-separated, header-less file of (row_index, column_index, entry)
	# triplets into a sparse whole-genome contact map. The map has one
	# row/column per bin of the genome described by hg19_info at bin_size.
	# When the matrix read is not symmetric, the transposed off-diagonal part
	# is added to it.

	X=readtable(input_file,separator='\t',header=false);
	chr2bins,bin2loc=generate_arbitrary_mapping_files(hg19_info,bin_size);
	# bin2loc is 3-by-(number of bins): the bin count is its second dimension
	N=size(bin2loc,2);
	M=sparse(X[:,1],X[:,2],X[:,3],N,N);
	if ~isequal(M,M');
		tmp=M-spdiagm(diag(M));
		M=M+tmp';
	end

	return M;
end
542 | 
function define_ce10_genome();
	# DataFrame describing the ce10 assembly: one row per chromosome with
	# columns :id (1 to 7), :chr (name) and :length (bases).

	chr_names=["chrV","chrX","chrIV","chrII","chrI","chrIII","chrM"];
	chr_lengths=[20924149,17718866,17493793,15279345,15072423,13783700,13794];

	ce10_info=DataFrame();
	ce10_info[:id]=1:7;
	ce10_info[:chr]=chr_names;
	ce10_info[:length]=chr_lengths;
	return ce10_info;

end
552 | 
function define_dm3_genome();
	# DataFrame describing the dm3 assembly: one row per sequence with
	# columns :id (1 to 15), :chr (name) and :length (bases).

	chr_names=["2L","2LHet","2R","2RHet","3L","3LHet","3R","3RHet","4","U","Uextra","X","Xhet","YHet","dmel_mitochondrion_genome"];
	chr_lengths=[23011544,368872,21146708,3288761,24543557,2555491,27905053,2517507,1351857,10049037,29004656,22422827,204112,347038,19517];

	dm3_info=DataFrame();
	dm3_info[:id]=1:15;
	dm3_info[:chr]=chr_names;
	dm3_info[:length]=chr_lengths;

	return dm3_info;

end
563 | 
564 | 
565 | 
566 | 
567 | 
568 | 
569 | 
570 | 
571 | 
572 | 
573 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # HiC-spector
 2 | 
 3 | A matrix library for spectral and reproducibility analysis of Hi-C contact maps. Several useful functions:
 4 | 
 5 | get_reproducibility
 6 |   - to calculate the reproducibility metric between 2 HiC contact maps
 7 |   
 8 | knight_ruiz
 9 |   - Knight Ruiz algorithm for matrix balancing
10 |   
11 | get_expect_vs_d_single_chr_v0
12 |   - to find the average contact frequency as a function of genomic distance
13 |   
14 | get_compartment_A_B
15 |   - to find A/B compartments, using the method described in Lieberman-Aiden et al., Science 2009
16 |   
17 | and a few functions for binning a genome, and reading HiC maps
18 | 
19 | <h3>Installation</h3> 
20 | HiC-spector is mostly written in Julia. It has been tested in Julia 0.4 and 0.5. Both the Julia language (http://julialang.org/) and the required packages have to be installed. Please refer to the beginning of the file HiC_spector.jl for the necessary packages. To get some contact maps for testing the code, please follow the instructions shown in the file data/readme_data.
21 | 
22 | There is a Python script available for quantifying reproducibility. The Python version can read files in genomic coordinates as well as the .hic format (https://github.com/theaidenlab/juicebox/wiki/Data). To do so, please download the Python version of the tool straw (straw.py) developed by the Aiden lab (https://github.com/theaidenlab/straw).
23 | 
24 | <h3>Usage</h3>
25 | The script run_reproducibility.jl is used to get the reproducibility score from a command-line interface. Usage:
26 | > julia run_reproducibility.jl matrix_file1 matrix_file2 
27 | 
28 | The input file here is a simple text delimited format with no header.
29 | 
30 | 1 1 20
31 | 
32 | 1 2 18
33 | 
34 | ...
35 | 
36 | The first and second columns represent the row and column indices of a contact map, whereas the third column is the count. To represent a full matrix, only the upper-triangular component is required. Note that the index should begin with 1. 
37 | 
38 | Please use the files stored in the folder A549 mentioned in ./data/readme_data to test the script run_reproducibility.jl.
39 | 
40 | Julia users can include the file HiC_spector.jl for their own analysis by simply using
41 | > include("./HiC_spector.jl");
42 | 
43 | Please refer to the file hic_spector_tutorial.jl for how to use some of the functions and how to read files in other formats.
44 | 
45 | For non-Julia users, one can use the Python script run_reproducibility.py to obtain the reproducibility score. Usage:
46 | > python run_reproducibility.py -F matrix_file1 matrix_file2
47 | 
48 | If the matrix files are labeled in genomic coordinates of bins, usage:
49 | > python run_reproducibility.py t matrix_file1 matrix_file2 40000
50 | 
51 | where 40000 is the bin size used in the two files
52 | 
53 | In addition to the text delimited input files, the Python script can calculate reproducibility score for contact maps stored in .hic format. Usage:
54 | > python run_reproducibility.py -f hic_file1 hic_file2 chrid resolution
55 | 
56 | A script is provided in the tool straw (https://github.com/theaidenlab/straw/tree/master/python) for reading the headers (including chr id and the available resolutions) in .hic file. 
57 | 
58 | Regarding memory, given two contact maps of human chr1 binned in a bin-size of 10kb, the code works fine in a laptop (16GB memory) from our experience. 
59 | 
60 | <h3>Author/Support</h3>
61 | Koon-Kiu Yan, koonkiu.yan@gmail.com; Mark Gerstein, mark@gersteinlab.org
62 | 
63 | <h3>Reference</h3>
64 | Yan KK, Galip Gürkan Yardımcı, William S Noble and Gerstein M. HiC-Spector: a matrix library for spectral and reproducibility analysis of Hi-C contact maps. Bioinformatics 22 March 2017. https://doi.org/10.1093/bioinformatics/btx152
65 | 


--------------------------------------------------------------------------------
/data/readme_data:
--------------------------------------------------------------------------------
 1 | Because of their sizes, please download the data from the following links to the data folder, and unzip the files.
 2 | 
 3 | http://homes.gersteinlab.org/people/kkyan/sharebox/A549.tar.gz
 4 | 
 5 | http://homes.gersteinlab.org/people/kkyan/sharebox/MCF7-WT.tar.gz
 6 | 
 7 | http://homes.gersteinlab.org/people/kkyan/sharebox/examples_hicpro.tar.gz
 8 | 
 9 | The A549 Hi-C experiments were performed by the ENCODE consortium. The raw reads can be downloaded from the ENCODE portal (https://www.encodeproject.org/) with libraries ENCLB571HTP ENCLB222WYT. The reads were mapped to generate contact maps using the tool HiC-Pro, with bin size 40kb. Please note that the files here contain only intra-chromosomal interactions. The contact maps were not normalized.
10 | 
11 | The MCF7 data were obtained from Barutcu et al. Genome Biology 2015 (GSE66733). The contact maps have bin size 250kb, and have already been normalized by the ICE algorithm.
12 | 
13 | The hicpro files were generated using Hi-C data of hESC and IMR90 from Dixon et al. Nature 2012, with bin size 500kb. They include both intra- and inter-chromosomal interactions.
14 | 
15 | Sample .hic files can be obtained from the Aiden lab.
16 | For code demonstration, please download the files HIC073.hic and HIC074.hic from 
17 | https://bcm.app.box.com/v/aidenlab/1/11406189541
18 | in the folder Rao & Huntley et al. | Cell 2014 /K562
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/hic_spector_tutorial.jl:
--------------------------------------------------------------------------------
include("./HiC_spector.jl");

#####Calculating reproducibility scores using A549 data
# For each chromosome, the two replicate files are read as (row, column, count)
# triplets, turned into sparse matrices and scored with get_reproducibility.

#The contact maps are obtained by binning the human genome in 40kb. 

hg19_info=define_hg19_genome();
bin_size=40000;
chr2bins,bin2loc=generate_arbitrary_mapping_files(hg19_info,bin_size);

#The number of eigenvectors (suggested value=20)

r=20;

#the reproducibility score Q for 23 chromosomes, 1 to 22, and X
Q=zeros(23);

###these arrays are used for benchmark only
elasped_time=zeros(23);
mem=zeros(23);

# NOTE(review): the loop starts at 2, so Q[1] (chr1) keeps its initial value 0
# although Q is described as covering 23 chromosomes - confirm this is intended.
for chr_num=2:23;

	display(chr_num);

	chr_string=change_chr(hg19_info,chr_num);

	map_file1="./data/A549/A549C-HindIII-R1_"*chr_string*".inter";
	map_file2="./data/A549/A549D-HindIII-R2_"*chr_string*".inter";

	X1=readdlm(map_file1,Int64);
	X2=readdlm(map_file2,Int64);

	# number of bins of this chromosome (bin2loc row 1 holds the 0-based chromosome index)
	ib=find(bin2loc[1,:].==chr_num-1);
	N=length(ib);

	M1=sparse(X1[:,1],X1[:,2],X1[:,3],N,N);
	M2=sparse(X2[:,1],X2[:,2],X2[:,3],N,N);

	M1_tmp=M1-spdiagm(diag(M1));
	M2_tmp=M2-spdiagm(diag(M2));
	M1=M1+M1_tmp';
	M2=M2+M2_tmp';

	#Note that each interaction is listed only once in these files, so M1 and M2 as read are asymmetric.
	#We therefore add the transposed off-diagonal part, M=M+M_tmp'.
	#Nevertheless, this is not necessary because get_reproducibility(M1,M2,num_evec) does it for you
	
	evs,a1,a2=get_reproducibility(M1,M2,r);
	Q[chr_num]=evs;

	#######the next few lines can be used for benchmark##############
	#info=@timed get_reproducibility(M1,M2,r);
	#evs=info[1][1];
	#Q[chr_num]=mean(evs);
	#elasped_time[chr_num]=info[2];
	#mem[chr_num]=info[3];

end
 60 | 
###how to read the matrices generated by HiC-Pro###############
# The file is read with read_simple_contact_map, which expects three
# tab-separated columns: row_index, column_index, entry.

input_file="./data/examples_hicpro/reads_500000_2.matrix";
hg19_info=define_hg19_genome();
bin_size=500000;
chr2bins,bin2loc=generate_arbitrary_mapping_files(hg19_info,bin_size);
#the file reads_500000_abs.bed contains the bin size info generated by HiC-Pro
#The information has already been captured by the above 2 arrays..

#this is the WG-WG contact map, NB that HiC-Pro outputs only upper-triangular entries, we have performed symmetrization implicitly
W=read_simple_contact_map(input_file,hg19_info,bin_size);

#to obtain maps for individual chromosomes:
chr_num=10;
W_chr=extract_chr(W,chr2bins,chr_num);
 76 | 
 77 | 
 78 | #####Other analysis using MCF7 data
 79 | 
 80 | data_loc="./data/MCF7-WT/";
 81 | 
 82 | #The contact maps are obtained by binning the human genome in 250kb. 
 83 | 
 84 | hg19_info=define_hg19_genome();
 85 | bin_size=250000;
 86 | chr2bins,bin2loc=generate_arbitrary_mapping_files(hg19_info,bin_size);
 87 | 
 88 | ##Find the distance dependency of intro-chromosomal interaction frequency.
 89 | 
 90 | chr_num=10;
 91 | input_file=data_loc*"HiCStein-MCF7-WT__hg19__genome__C-250000-iced__"*change_chr(hg19_info,chr_num)*"__"*change_chr(hg19_info,chr_num)*"__cis.matrix";
 92 | X=readtable(input_file,header=true,separator='\t');
 93 | W=X[:,2:end];
 94 | W=array(W);
 95 | W[isnan(W)]=0;
 96 | 
 97 | xs_all, expect=get_expect_vs_d_single_chr_v0(W,chr2bins,bin_size);
 98 | 
 99 | using PyPlot;
100 | PyPlot.plot(log10(xs_all),log10(expect));
101 | 
102 | ##Matrix Balancing: turn W to W_balance #####
103 | #the row sums and columns sum of W_balance are all 1, except the empty rows/columns
104 | x,W_balance=knight_ruiz(W);
105 | 
106 | ##Find A/B compartments
107 | 
108 | f_W=get_f_W(W,expect);
109 | loc,span,ev_whole,cpt=get_compartment_A_B(W,f_W);
110 | 
111 | ev1=ev_whole[:,1];
112 | #ev_whole records the leading eigenvector of the covariance matrix, ev1 reported the compartment. 
113 | #bins with +ve and -ve values in ev1 correspond to different compartments. As by convention compartment A refers to the expressed
114 | #part whereas compartment B refers to the lowly expressed part, extra information is required to determine which sign corresponds to which compartment.
115 | #the value of ev1 is zero if the bin have no read mapped.
116 | 
117 | #to obtain the genomic coordinates, use
118 | 
119 | output=report_compartment(hg19_info,bin2loc,loc,span,ev1,chr_num);
120 | 
121 | 
122 | 
123 | 


--------------------------------------------------------------------------------
/run_reproducibility.jl:
--------------------------------------------------------------------------------
 1 | include("./HiC_spector.jl");
 2 | 
 3 | r=20;
 4 | 
 5 | map_file1=ARGS[1];
 6 | map_file2=ARGS[2];
 7 | 
 8 | X1=readdlm(map_file1,Int64);
 9 | X2=readdlm(map_file2,Int64);
10 | 
11 | N=maximum([maximum(X1[:,1:2]),maximum(X2[:,1:2])]);
12 | 
13 | e=minimum([minimum(X1[:,1:2]),minimum(X2[:,1:2])]);
14 | 
15 | if e.>0
16 | 	M1=sparse(X1[:,1],X1[:,2],X1[:,3],N,N);
17 | 	M2=sparse(X2[:,1],X2[:,2],X2[:,3],N,N);
18 | elseif e.==0;
19 | 	N=N+1;
20 | 	M1=sparse(X1[:,1]+1,X1[:,2]+1,X1[:,3],N,N);
21 | 	M2=sparse(X2[:,1]+1,X2[:,2]+1,X2[:,3],N,N);
22 | end
23 | 
24 | if ~isequal(M1,M1');
25 | 	M1_tmp=M1-spdiagm(diag(M1));
26 | 	M1=M1+M1_tmp';
27 | end
28 | 
29 | if ~isequal(M2,M2');
30 | 	M2_tmp=M2-spdiagm(diag(M2));
31 | 	M2=M2+M2_tmp';
32 | end
33 | 
34 | Q,a1,b1=get_reproducibility(M1,M2,r);
35 | 	
36 | println("size of maps:",size(M1,1),"\t","reproducibility score=",Q);
37 | 
38 | 


--------------------------------------------------------------------------------
/run_reproducibility_v2.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import scipy
  3 | import numpy
  4 | import straw
  5 | from scipy.sparse import lil_matrix
  6 | from scipy.sparse.linalg import eigsh
  7 | 
  8 | def Parse_matrix(file1,file2):
  9 |   max_index=0
 10 |   max_index_temp=0
 11 |   with open(file1) as input_file:
 12 |     for line in input_file:
 13 |         x,y,z=map(int, line.split())
 14 |         max_index_temp=max(x,y)
 15 |         if max_index_temp>max_index:
 16 |                max_index=max_index_temp
 17 |   
 18 |   with open(file2) as input_file:
 19 |     for line in input_file:
 20 |         x,y,z=map(int, line.split())
 21 |         max_index_temp=max(x,y)
 22 |         if max_index_temp>max_index:
 23 |                max_index=max_index_temp
 24 | 
 25 | 
 26 |   M1=lil_matrix((max_index,max_index))
 27 |   M2=lil_matrix((max_index,max_index))
 28 |   with open(file1) as input_file:
 29 |     for line in input_file:
 30 |         x,y,z=map(int, line.split())
 31 |         M1[x-1,y-1]=z
 32 |         M1[y-1,x-1]=z
 33 |   with open(file2) as input_file:
 34 |     for line in input_file:
 35 |         x,y,z=map(int, line.split())
 36 |         M2[x-1,y-1]=z
 37 |         M2[y-1,x-1]=z
 38 |   return M1, M2 
 39 | 
 40 | 
 41 | def Parse_matrix_lieberman(file1,file2, resolution):
 42 |   max_index=0
 43 |   max_index_temp=0
 44 |   with open(file1) as input_file:
 45 |     for line in input_file:
 46 |       if line[0]!="#":  
 47 |         x,y,z=map(int, line.split())
 48 |         max_index_temp=max(x,y)
 49 |         if max_index_temp>max_index:
 50 |                max_index=max_index_temp
 51 |   
 52 |   with open(file2) as input_file:
 53 |     for line in input_file:
 54 |       
 55 |       if line[0]!="#":  
 56 |         x,y,z=map(int, line.split())
 57 |         max_index_temp=max(x,y)
 58 |         if max_index_temp>max_index:
 59 |                max_index=max_index_temp
 60 | 
 61 |   max_index=max_index/resolution+1
 62 |   print(max_index)
 63 |   M1=lil_matrix((max_index,max_index))
 64 |   M2=lil_matrix((max_index,max_index))
 65 |   with open(file1) as input_file:
 66 |     for line in input_file:
 67 |       if line[0]!="#":  
 68 |         x,y,z=map(int, line.split())
 69 |         M1[x/resolution,y/resolution]=z
 70 |         M1[y/resolution,x/resolution]=z
 71 |   with open(file2) as input_file:
 72 |     for line in input_file:
 73 |       if line[0]!="#":  
 74 |         x,y,z=map(int, line.split())
 75 |         M2[x/resolution,y/resolution]=z
 76 |         M2[y/resolution,x/resolution]=z
 77 |   return M1, M2 
 78 | 
 79 | 
 80 | def Parse_matrix_hic(file1, file2, chrn, resolution):
 81 |     
 82 |     Table1=straw.straw("NONE",file1, chrn, chrn,"BP",resolution)
 83 |     Table2=straw.straw("NONE",file2, chrn, chrn,"BP",resolution)
 84 |     max_index=max(max(Table1[0]),max(Table1[1]),max(Table2[0]),max(Table2[1]))
 85 |     max_index=max_index/resolution
 86 |     M1=lil_matrix((max_index+1,max_index+1))
 87 |     M2=lil_matrix((max_index+1,max_index+1))
 88 |                      
 89 |     for i in range(len(Table1[0])):
 90 |         M1[Table1[0][i]/resolution,Table1[1][i]/resolution]=Table1[2][i]
 91 |         M1[Table1[1][i]/resolution,Table1[0][i]/resolution]=Table1[2][i]
 92 |                                                              
 93 |     for i in range(len(Table2[0])):
 94 |         M2[Table2[0][i]/resolution,Table2[1][i]/resolution]=Table2[2][i]
 95 |         M2[Table2[1][i]/resolution,Table2[0][i]/resolution]=Table2[2][i]
 96 |     return M1, M2
 97 | 
 98 | 
 99 | def  get_Laplacian(M):
100 |      S=M.sum(1)
101 |      i_nz=numpy.where(S>0)[0]
102 |      S=S[i_nz]
103 |      M=(M[i_nz].T)[i_nz].T
104 |      S=1/numpy.sqrt(S)
105 |      M=S*M
106 |      M=(S*M.T).T
107 |      n=numpy.size(S)
108 |      M=numpy.identity(n)-M
109 |      M=(M+M.T)/2
110 |      return M
111 | 
def evec_distance(v1,v2):
    """Euclidean distance between v1 and v2, ignoring an overall sign flip of v2."""
    minus = v1 - v2
    plus = v1 + v2
    same_sign = numpy.dot(minus, minus)
    flipped = numpy.dot(plus, plus)
    return numpy.sqrt(same_sign if same_sign < flipped else flipped)
120 | 
def get_ipr(evec):
    """Inverse participation ratio of evec: 1 / sum(evec**4)."""
    cubed = evec*evec*evec
    fourth = cubed*evec
    return 1.0/fourth.sum()
124 | 
125 | 
def get_reproducibility(M1,M2,num_evec):
    """Reproducibility score between two sparse contact maps.

    The score compares the `num_evec` smallest-eigenvalue eigenvectors of the
    normalized Laplacians of M1 and M2, after dropping eigenvectors whose
    inverse participation ratio is not above 5.

    Prints either the score summary or a warning that a map looks atypical.

    Returns:
        The score, or NaN when the warning is printed (matching
        get_reproducibility in HiC_spector.jl).
    """
    # densify once; the arrays are needed for both the degree and the diagonal
    A1=M1.A
    A2=M2.A

    # keep bins that are non-empty in at least one map and whose row is not
    # just a single diagonal entry in either map
    k1=numpy.sign(A1).sum(1)
    d1=numpy.diag(A1)
    kd1=~((k1==1)*(d1>0))
    k2=numpy.sign(A2).sum(1)
    d2=numpy.diag(A2)
    kd2=~((k2==1)*(d2>0))
    iz=numpy.nonzero((k1+k2>0)*(kd1>0)*(kd2>0))[0]
    M1b=(A1[iz].T)[iz].T
    M2b=(A2[iz].T)[iz].T

    # rows that still carry signal; get_Laplacian works on exactly these
    i_nz1=numpy.where(M1b.sum(1)>0)[0]
    i_nz2=numpy.where(M2b.sum(1)>0)[0]

    M1b_L=get_Laplacian(M1b)
    M2b_L=get_Laplacian(M2b)

    a1, b1=eigsh(M1b_L,k=num_evec,which="SM")
    a2, b2=eigsh(M2b_L,k=num_evec,which="SM")

    # pad the eigenvectors with zeros on the rows removed by get_Laplacian
    b1_extend=numpy.zeros((numpy.size(M1b,0),num_evec))
    b2_extend=numpy.zeros((numpy.size(M2b,0),num_evec))
    for i in range(num_evec):
        b1_extend[i_nz1,i]=b1[:,i]
        b2_extend[i_nz2,i]=b2[:,i]

    ipr_cut=5
    ipr1=numpy.zeros(num_evec)
    ipr2=numpy.zeros(num_evec)
    for i in range(num_evec):
        ipr1[i]=get_ipr(b1_extend[:,i])
        ipr2[i]=get_ipr(b2_extend[:,i])

    b1_extend_eff=b1_extend[:,ipr1>ipr_cut]
    b2_extend_eff=b2_extend[:,ipr2>ipr_cut]
    # compare only as many eigenvectors as survive the ipr cut in both maps
    num_evec_eff=min(numpy.size(b1_extend_eff,1),numpy.size(b2_extend_eff,1))

    evd=numpy.zeros(num_evec_eff)
    for i in range(num_evec_eff):
        evd[i]=evec_distance(b1_extend_eff[:,i],b2_extend_eff[:,i])

    Sd=evd.sum()
    l=numpy.sqrt(2)
    evs=abs(l-Sd/num_evec_eff)/l

    N=float(M1.shape[1]);
    if (numpy.sum(ipr1>N/100)<=1)|(numpy.sum(ipr2>N/100)<=1):
        print("at least one of the maps does not look like typical Hi-C maps")
        # do not hand back a number that the warning above says is unreliable
        evs=float("nan")
    else:
        print("size of maps: %d" %(numpy.size(M1,0)))
        print("reproducibility score: %6.3f " %(evs))
        print("num_evec_eff: %d" %(num_evec_eff))
    return evs
181 | 
def main():
    """Command-line entry point: pick the parser from the mode flag, then score the two maps.

    Modes (second command-line token): "-F" matrix table files, "-f" .hic
    files, "t" files labeled in genomic coordinates. Any other argument
    combination prints the usage text.
    """
    num_evec=20;
    argv=sys.argv
    argc=len(argv)
    mode=argv[1] if argc>1 else None

    if argc==4 and mode=="-F":
        M1, M2=Parse_matrix(argv[2],argv[3])
    elif argc==6 and mode=="-f":
        M1, M2=Parse_matrix_hic(argv[2],argv[3],argv[4],int(argv[5]))
    elif argc==5 and mode=="t":
        M1, M2=Parse_matrix_lieberman(argv[2],argv[3],int(argv[4]))
    else:
        for usage_line in (
            '3, 4 or 5 arguments required',
            'To use matrix table files as the input:',
            'python run_reproducibility.py -F matrix_file1 matrix_file2',
            'To use .hic files as the input:',
            'python run_reproducibility.py -f hic_file1 hic_file2 chrid resolution[int]',
            'To use lieberman matrix files as the input:',
            'python run_reproducibility.py t matrix_file1 matrix_file2 resolution[int]',
        ):
            print(usage_line)
        return

    get_reproducibility(M1,M2,num_evec)


if __name__ == '__main__':
    main()
205 | 


--------------------------------------------------------------------------------