├── .gitignore
├── LICENSE
├── README.md
├── bin
│   ├── ALLHiC_linkage_distribution.py
│   ├── ALLHiC_mono_allele_minimap.py
│   ├── ALLHiC_partition.py
│   ├── ALLHiC_pip.sh
│   ├── ALLHiC_plot.py
│   ├── ALLHiC_prune
│   ├── ALLHiC_rescue.py
│   └── partition_gmap.py
└── src
    ├── ALLHiC_prune.cpp
    ├── Makefile
    ├── Prune.cpp
    ├── Prune.h
    └── htslib-1.17.tar.bz2

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Ignore Mac system file
.DS_Store
.vscode
htslib-1.17

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2021, Shengcheng Zhang

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Introduction
Components that speed up the original ALLHiC pipeline and reduce its resource cost.

## Dependencies
- Python Modules
  * pysam
  * numpy
  * matplotlib
  * jcvi
  * h5py

## Installation
```bash
git clone https://github.com/sc-zhang/ALLHiC_extensions.git
cd ALLHiC_extensions
chmod +x bin/*.*

# install ALLHiC_prune
cd src/
make && make install
```

## Usage
**ALLHiC_prune** is used for pruning Hi-C signals between allelic contigs. It is a rewrite of the original tool aimed at faster runtime and lower memory usage.

```bash
************************************************************************
    Usage: ./ALLHiC_prune -i Allele.ctg.table -b sorted.bam
      -h : help and usage.
      -i : Allele.ctg.table
      -b : sorted.bam
************************************************************************
```

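Conceptually, the pruner reads `Allele.ctg.table`, treats every pair of contigs that share a row as allelic, and removes Hi-C links between them (the C++ code in `src/Prune.cpp` additionally drops non-best signals). Below is a minimal Python sketch of just the pair-collection step, an illustration rather than the shipped implementation:

```python
# Sketch of the allelic-pair collection step of ALLHiC_prune (illustrative only).
# Each row of Allele.ctg.table is: chromosome, position, then allelic contigs;
# every contig pair within one row becomes a candidate for signal removal.
from itertools import combinations


def allelic_pairs(table_path):
    pairs = set()
    with open(table_path) as fin:
        for line in fin:
            data = line.strip().split('\t')
            # rows with fewer than two contigs after the first two columns
            # carry no pair information (mirrors data.size() <= 3 in Prune.cpp)
            if len(data) <= 3:
                continue
            for ctg1, ctg2 in combinations(data[2:], 2):
                pairs.add(tuple(sorted((ctg1, ctg2))))
    return pairs
```
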
**partition_gmap.py** is used for splitting a bam file and a contig-level fasta by chromosome, guided by an allele table.
```bash
usage: partition_gmap.py [-h] -r REF -g ALLELETABLE [-b BAM] [-d WORKDIR]
                         [-t THREAD]

optional arguments:
  -h, --help            show this help message and exit
  -r REF, --ref REF     reference contig level assembly
  -g ALLELETABLE, --alleletable ALLELETABLE
                        Allele.gene.table
  -b BAM, --bam BAM     bam file, default: prunning.bam
  -d WORKDIR, --workdir WORKDIR
                        work directory, default: wrk_dir
  -t THREAD, --thread THREAD
                        threads, default: 10
```

**ALLHiC_partition.py** is an **experimental** script for clustering contigs into haplotypes.
```bash
usage: ALLHiC_partition.py [-h] -r REF -b BAM -d BED -a ANCHORS -p POLY
                           [-e EXCLUDE] [-o OUT]

optional arguments:
  -h, --help            show this help message and exit
  -r REF, --ref REF     Contig level assembly fasta
  -b BAM, --bam BAM     Prunned bam file
  -d BED, --bed BED     dup.bed
  -a ANCHORS, --anchors ANCHORS
                        anchors file with dup.mono.anchors
  -p POLY, --poly POLY  Ploid count of polyploid
  -e EXCLUDE, --exclude EXCLUDE
                        A list file contains exclude contigs for partition,
                        default=""
  -o OUT, --out OUT     Output directory, default=workdir
```

**ALLHiC_rescue.py** is a new version of rescue that uses jcvi to prevent collinear contigs from being rescued into the same group.
```bash
usage: ALLHiC_rescue.py [-h] -r REF -b BAM -c CLUSTER -n COUNTS -g GFF3 -j
                        JCVI [-e EXCLUDE] [-w WORKDIR]

optional arguments:
  -h, --help            show this help message and exit
  -r REF, --ref REF     Contig level assembly fasta
  -b BAM, --bam BAM     Unprunned bam
  -c CLUSTER, --cluster CLUSTER
                        Cluster file of contigs
  -n COUNTS, --counts COUNTS
                        count REs file
  -g GFF3, --gff3 GFF3  Gff3 file generated by gmap cds to contigs
  -j JCVI, --jcvi JCVI  CDS file for jcvi, bed file with same prefix must
                        exist in the same position
  -e EXCLUDE, --exclude EXCLUDE
                        cluster which need no rescue, default="", split by
                        comma
  -w WORKDIR, --workdir WORKDIR
                        Work directory, default=wrkdir
```

**ALLHiC_plot.py** is used to plot heatmaps of Hi-C signals. Compared with the original version, it uses less memory and makes it easier to re-plot heatmaps at other resolutions (a sketch of the binning scheme follows this section).
```bash
# Notice: bam file must be indexed
usage: ALLHiC_plot.py [-h] -b BAM -l LIST [-a AGP] [-5 H5] [-m MIN_SIZE] [-s SIZE] [-c CMAP] [-o OUTDIR] [--line | --block] [--linecolor LINECOLOR] [-t THREAD]

options:
  -h, --help            show this help message and exit
  -b BAM, --bam BAM     Input bam file
  -l LIST, --list LIST  Chromosome list, contain: ID Length
  -a AGP, --agp AGP     Input AGP file, if bam file is a contig-level mapping, agp file is required
  -5 H5, --h5 H5        h5 file of hic signal, optional, if not exist, it will be generate after reading hic signals, or it will be loaded for drawing other resolution of heatmap
  -m MIN_SIZE, --min_size MIN_SIZE
                        Minium bin size of heatmap, default=50k
  -s SIZE, --size SIZE  Bin size of heatmap, can be a list separated by comma, default=500k, notice: it must be n times of min_size (n is integer) or we will adjust it to nearest one
  -c CMAP, --cmap CMAP  CMAP for drawing heatmap, default="YlOrRd"
  -o OUTDIR, --outdir OUTDIR
                        Output directory, default=workdir
  --line                Draw dash line for each chromosome
  --block               Draw dash block for each chromosome
  --linecolor LINECOLOR
                        Color of dash line or dash block, default="grey"
  -t THREAD, --thread THREAD
                        Threads for reading bam, default=1
```

**Other scripts** are under development and are not recommended for use.
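ALLHiC_plot reads the bam only once at the minimum bin size and derives every coarser heatmap by pooling that base matrix. The following numpy sketch mirrors the pad/reshape/sum pooling used in `draw_heatmap` inside `bin/ALLHiC_plot.py` (the function name `pool_bins` is ours, for illustration):

```python
# Sketch of pooling a min_size-resolution signal matrix into coarser bins.
# ratio = bin_size // min_size; the matrix is zero-padded so its side length
# is a multiple of ratio, then each ratio x ratio block is summed into one bin.
import numpy as np


def pool_bins(mat, ratio):
    n = len(mat)
    m = -(-n // ratio)  # ceil(n / ratio): bin count after padding
    pad = m * ratio - n
    mat = np.pad(mat, ((0, pad), (0, pad)), 'constant', constant_values=0)
    mat = mat.reshape(-1, m, ratio).sum(axis=2)  # sum runs of `ratio` columns
    mat = mat.reshape(m, -1, m).sum(axis=1)      # sum runs of `ratio` rows
    return mat                                   # shape (m, m)
```
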
--------------------------------------------------------------------------------
/bin/ALLHiC_linkage_distribution.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys
import os
import pysam
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt


def get_linkage_dist(in_bam, out_dir):
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    print("Getting linkages between contigs")
    link_db = {}
    with pysam.AlignmentFile(in_bam, 'rb') as fin:
        for line in fin:
            # skip unmapped records, unpaired mates and intra-contig pairs
            if line.is_unmapped or line.mate_is_unmapped:
                continue
            ctg1 = line.reference_name
            ctg2 = line.next_reference_name
            if ctg1 == ctg2:
                continue
            if ctg1 not in link_db:
                link_db[ctg1] = {}
            if ctg2 not in link_db[ctg1]:
                link_db[ctg1][ctg2] = 0
            if ctg2 not in link_db:
                link_db[ctg2] = {}
            if ctg1 not in link_db[ctg2]:
                link_db[ctg2][ctg1] = 0
            link_db[ctg1][ctg2] += 1
            link_db[ctg2][ctg1] += 1

    print("Writing linkage distribution")
    link_list = []
    for ctg in link_db:
        # count distinct linked partners instead of total signal
        sig = len(link_db[ctg])
        link_list.append([ctg, sig])

    link_list = sorted(link_list, key=lambda x: -x[1])
    dist_db = {}
    with open(os.path.join(out_dir, 'linkages.txt'), 'w') as fout:
        for ctg, links in link_list:
            fout.write("%s\t%d\n"%(ctg, links))
            # histogram the counts in bins of 10
            links = int(links/10)
            if links not in dist_db:
                dist_db[links] = 0
            dist_db[links] += 1

    x_vals = []
    y_vals = []
    for links in sorted(dist_db):
        x_vals.append(links)
        y_vals.append(dist_db[links])

    print("Drawing distributions")

    plt.figure(figsize=(10, 8), dpi=100)
    plt.plot(x_vals, y_vals)
    plt.savefig(os.path.join(out_dir, "dist.pdf"), bbox_inches="tight")

    print("Finished")


if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Usage: python %s <in_bam> <out_dir>"%sys.argv[0])
    else:
        in_bam, out_dir = sys.argv[1:]
        get_linkage_dist(in_bam, out_dir)

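The `linkages.txt` written above holds one `contig<TAB>count` row per contig, where count is the number of distinct linked partners. A small hypothetical helper (not part of this repository) for pulling weakly linked contigs out of that file:

```python
# Hypothetical post-processing helper for linkages.txt (illustrative only).
def low_linkage_contigs(linkage_txt, cutoff=10):
    """Return contigs whose distinct-partner count is below cutoff."""
    weak = []
    with open(linkage_txt) as fin:
        for line in fin:
            ctg, cnt = line.strip().split('\t')
            if int(cnt) < cutoff:
                weak.append(ctg)
    return weak
```
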
--------------------------------------------------------------------------------
/bin/ALLHiC_mono_allele_minimap.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import sys
import os
import argparse
import gc
import time


def time_print(info):
    print("\033[32m%s\033[0m %s"%(time.strftime('[%H:%M:%S]',time.localtime(time.time())), info))


def get_opts():
    group = argparse.ArgumentParser()
    group.add_argument("-r", "--reference", help="Chromosome fasta", required=True)
    group.add_argument("-c", "--contig", help="Contig fasta", required=True)
    group.add_argument("-p", "--ploidy", help="Ploidy of genome", required=True, type=int)
    group.add_argument("-o", "--output", help="Output allele table", required=True)
    group.add_argument("-w", "--win", help="Size of window, default: 20k", default="20k")
    group.add_argument("-s", "--step", help="Size of step, default: 10k", default="10k")
    group.add_argument("-d", "--dir", help="Work folder, default: wrk_dir", default="wrk_dir")
    group.add_argument("-t", "--threads", help="Number of threads, default: 1", type=int, default=1)
    return group.parse_args()


def read_fasta(in_fa):
    fa_db = {}
    with open(in_fa, 'r') as fin:
        for line in fin:
            if line[0] == '>':
                id = line.strip().split()[0][1:]
                fa_db[id] = []
            else:
                fa_db[id].append(line.strip())

    for id in fa_db:
        fa_db[id] = ''.join(fa_db[id])
    return fa_db


def gen_sub_seq(fa_db, win_size, step_size, wrk_dir):
    with open(os.path.join(wrk_dir, 'sub_chr.fa'), 'w') as fout:
        for id in sorted(fa_db):
            if 'tig' in id or 'ctg' in id:
                continue
            for i in range(0, len(fa_db[id])-win_size+1, step_size):
                sub_id = "%s-%d"%(id, i+1)
                sub_seq = fa_db[id][i: i+win_size]
                fout.write(">%s\n%s\n"%(sub_id, sub_seq))
    return os.path.join(wrk_dir, 'sub_chr.fa')


def gen_allele_table(ref_fa, ctg_fa, allele_table, ploidy, win_size, step_size, wrk_dir, threads):
    if not os.path.exists(wrk_dir):
        os.mkdir(wrk_dir)
    time_print("Loading reference genome")
    fa_db = read_fasta(ref_fa)

    time_print("Generating sequences")
    sub_chr_fn = gen_sub_seq(fa_db, win_size, step_size, wrk_dir)

    del fa_db
    gc.collect()

    time_print("Mapping")
    paf_fn = os.path.join(wrk_dir, "mapping.paf")
    cmd = "minimap2 -k19 -w19 -t%s %s %s > %s"%(threads, sub_chr_fn, ctg_fa, paf_fn)
    os.system(cmd)

    time_print("Generating allele table")
    map_db = {}
    map_len_db = {}
    with open(paf_fn, 'r') as fin:
        for line in fin:
            data = line.strip().split()
            ctg = data[0]
            tsp = int(data[2])
            tep = int(data[3])
            chrn = data[5]
            chrn_base = chrn.split('-')[0]
            map_len = abs(tsp-tep)+1
            if ctg not in map_db:
                map_db[ctg] = []
            if ctg not in map_len_db:
                map_len_db[ctg] = {}
            if chrn_base not in map_len_db[ctg]:
                map_len_db[ctg][chrn_base] = 0
            map_len_db[ctg][chrn_base] += map_len
            map_db[ctg].append([map_len, chrn])

    new_map_db = {}
    for ctg in map_db:
        best_match = ''
        max_len = 0
        for chrn_base in map_len_db[ctg]:
            if map_len_db[ctg][chrn_base] > max_len:
                best_match = chrn_base
max_len = map_len_db[ctg][chrn_base] 99 | for map_len, chrn in map_db[ctg]: 100 | chrn_base = chrn.split('-')[0] 101 | if chrn_base == best_match: 102 | if chrn not in new_map_db: 103 | new_map_db[chrn] = [] 104 | new_map_db[chrn].append([map_len, ctg]) 105 | 106 | allele_db = {} 107 | tmp_list = [] 108 | for chrn in new_map_db: 109 | allele_db[chrn] = [] 110 | tmp_list = sorted(new_map_db[chrn], reverse=True) 111 | for i in range(0, ploidy): 112 | if i >= len(tmp_list): 113 | break 114 | allele_db[chrn].append(tmp_list[i][1]) 115 | 116 | tmp_list = [] 117 | for chrn in allele_db: 118 | id, idx = chrn.split('-') 119 | idx = int(idx) 120 | tmp_list.append([id, idx, sorted(list(set(allele_db[chrn])))]) 121 | time_print("Generating success") 122 | 123 | time_print("Writing allele table") 124 | with open(allele_table, 'w') as fout: 125 | for id, idx, allele_list in sorted(tmp_list): 126 | fout.write("%s\t%d\t%s\n"%(id, idx, '\t'.join(allele_list))) 127 | time_print("Writing success") 128 | 129 | del allele_db, tmp_list 130 | gc.collect() 131 | 132 | time_print("Finished") 133 | 134 | 135 | if __name__ == "__main__": 136 | opts = get_opts() 137 | ref_fa = opts.reference 138 | ctg_fa = opts.contig 139 | ploidy = opts.ploidy 140 | win_size = opts.win 141 | step_size = opts.step 142 | threads = opts.threads 143 | allele_table = opts.output 144 | wrk_dir = opts.dir 145 | win_size = int(win_size.lower().replace('m', '000000').replace('k', '000')) 146 | step_size = int(step_size.lower().replace('m', '000000').replace('k', '000')) 147 | 148 | gen_allele_table(ref_fa, ctg_fa, allele_table, ploidy, win_size, step_size, wrk_dir, threads) 149 | 150 | -------------------------------------------------------------------------------- /bin/ALLHiC_partition.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import os 4 | 5 | import pysam 6 | import numpy as np 7 | import time 8 | 9 | 10 | class UnionFind(): 11 | def __init__(self, size): 12 | self.__f = [i for i in range(0, size)] 13 | 14 | 15 | def find(self, x): 16 | if(self.__f[x] == x): 17 | return x 18 | self.__f[x] = self.find(self.__f[x]) 19 | return self.__f[x] 20 | 21 | 22 | def union(self, x, y): 23 | fx = self.find(x) 24 | fy = self.find(y) 25 | if fx != fy: 26 | self.__f[fy] = fx 27 | 28 | 29 | def getOpts(): 30 | groups = argparse.ArgumentParser() 31 | groups.add_argument('-r', '--ref', help="Contig level assembly fasta", required=True) 32 | groups.add_argument('-b', '--bam', help="Prunned bam file", required=True) 33 | groups.add_argument('-d', '--bed', help='dup.bed', required=True) 34 | groups.add_argument('-a', '--anchors', help='anchors file with dup.mono.anchors', required=True) 35 | groups.add_argument('-p', '--poly', help="Ploid count of polyploid", type=int, required=True) 36 | groups.add_argument('-e', '--exclude', help="A list file contains exclude contigs for partition, default=\"\"", default="") 37 | groups.add_argument('-o', '--out', help="Output directory, default=workdir", default="workdir") 38 | return groups.parse_args() 39 | 40 | 41 | def getSignal(inBam, seqCount, seqList, qryDB, excludeDB): 42 | seqIdx = {} 43 | for i in range(0, seqCount): 44 | seqIdx[seqList[i]] = i 45 | 46 | seqMat = [[0 for i in range(0, seqCount)] for j in range(0, seqCount)] 47 | with pysam.AlignmentFile(inBam, 'rb') as fin: 48 | for line in fin: 49 | ctg1 = line.reference_name 50 | ctg2 = line.next_reference_name 51 | pos1 = line.reference_start+1 52 | pos2 = 
line.next_reference_start+1 53 | if pos1 == -1 or pos2 == -1 or ctg1 == ctg2 or ctg1 in excludeDB or ctg2 in excludeDB: 54 | continue 55 | idx1 = seqIdx[ctg1] 56 | idx2 = seqIdx[ctg2] 57 | if idx1>idx2: 58 | idx1, idx2 = idx2, idx1 59 | seqMat[idx1][idx2] += 1 60 | 61 | sigList = [] 62 | for idx1 in range(0, seqCount-1): 63 | for idx2 in range(idx1+1, seqCount): 64 | if seqMat[idx1][idx2] >= 10: 65 | ctg1 = seqList[idx1] 66 | ctg2 = seqList[idx2] 67 | if (ctg1 not in qryDB) or (ctg2 not in qryDB) or (len(qryDB[ctg1])+len(qryDB[ctg2]))==0: 68 | ovlp = 0.0 69 | else: 70 | ovlpCount = len(qryDB[ctg1].intersection(qryDB[ctg2])) 71 | ovlp = ovlpCount*2.0/(len(qryDB[ctg1])+len(qryDB[ctg2])) 72 | sigList.append([idx1, idx2, seqMat[idx1][idx2], ovlp]) 73 | return sigList 74 | 75 | 76 | def checkLongestGroups(lengthList, polyCount): 77 | #avgL = np.average(lengthList[: polyCount]) 78 | minL = min(lengthList[: polyCount]) 79 | maxL = max(lengthList[: polyCount]) 80 | print("\tMax %d groups, %s"%(polyCount, ','.join(map(str, lengthList[: polyCount])))) 81 | if maxL<=minL*3: #avgL*1.5>maxL and avgL*0.5': 115 | if seq != "" and id not in excludeDB: 116 | faDB[id] = seq 117 | id = line.strip()[1:] 118 | seq = "" 119 | else: 120 | seq += line.strip() 121 | if id not in excludeDB: 122 | faDB[id] = seq 123 | 124 | # Get overlap 125 | print("Loading anchors") 126 | anchorsDB = {} 127 | with open(anchors, 'r') as fin: 128 | for line in fin: 129 | if line.strip() == '' or line[0] == '#': 130 | continue 131 | data = line.strip().split() 132 | anchorsDB[data[0]] = data[1] 133 | 134 | qryDB = {} 135 | with open(bed, 'r') as fin: 136 | for line in fin: 137 | data = line.strip().split() 138 | tig = data[0] 139 | gene = data[3] 140 | if tig not in qryDB: 141 | qryDB[tig] = set() 142 | if gene not in anchorsDB: 143 | continue 144 | qryDB[tig].add(anchorsDB[gene]) 145 | 146 | # Get signals 147 | print("Getting signals") 148 | seqCount = len(faDB) 149 | seqList = sorted(faDB) 150 | seqLen = [] 151 | for i in range(0, seqCount): 152 | seqLen.append(len(faDB[seqList[i]])) 153 | 154 | sigList = getSignal(inBam, seqCount, seqList, qryDB, excludeDB) 155 | 156 | # Save signal list 157 | print("Saving signal list") 158 | with open("signal.txt", 'w') as fout: 159 | for idx1, idx2, signal, ovlp in sigList: 160 | fout.write("%s\t%s\t%d\t%f\n"%(seqList[idx1], seqList[idx2], signal, ovlp)) 161 | 162 | # Initial UnionFind 163 | sigList = sorted(sigList, key=lambda x: (-x[3], x[2])) 164 | sigCount = len(sigList) 165 | print("Generating Union find") 166 | uf = UnionFind(seqCount) 167 | for idx1, idx2, signal, ovlp in sigList: 168 | uf.union(idx1, idx2) 169 | 170 | # Get current groups 171 | currentGroupCount = 0 172 | for idx in range(0, seqCount): 173 | if uf.find(idx) == idx: 174 | currentGroupCount += 1 175 | 176 | print("\tInitial group count: %d, edge count: %d"%(currentGroupCount, sigCount)) 177 | 178 | groupDB = {} 179 | for idx in range(0, seqCount): 180 | gid = uf.find(idx) 181 | if gid not in groupDB: 182 | groupDB[gid] = [] 183 | groupDB[gid].append(idx) 184 | 185 | lengthList = [] 186 | for gid in groupDB: 187 | curLen = 0 188 | for idx in groupDB[gid]: 189 | curLen += seqLen[idx] 190 | lengthList.append(curLen) 191 | lengthList = sorted(lengthList, reverse=True) 192 | 193 | i = 1 194 | while sigList[i][3] > 0: 195 | i += 1 196 | 197 | print("\tRemoved: %d edges while contigs were overlaped"%i) 198 | 199 | while currentGroupCount < polyCount or checkLongestGroups(lengthList, polyCount): 200 | sig = sigList[i][2] 201 | 
while sigList[i][2] == sig: 202 | i += 1 203 | 204 | uf = UnionFind(seqCount) 205 | for idx in range(i, sigCount): 206 | idx1, idx2, signal, ovlp = sigList[idx] 207 | uf.union(idx1, idx2) 208 | 209 | lengthList = [] 210 | currentGroupCount = 0 211 | for idx in range(0, seqCount): 212 | if uf.find(idx) == idx: 213 | currentGroupCount += 1 214 | 215 | groupDB = {} 216 | for idx in range(0, seqCount): 217 | gid = uf.find(idx) 218 | if gid not in groupDB: 219 | groupDB[gid] = [] 220 | groupDB[gid].append(idx) 221 | 222 | lengthList = [] 223 | for gid in groupDB: 224 | curLen = 0 225 | for idx in groupDB[gid]: 226 | curLen += seqLen[idx] 227 | lengthList.append(curLen) 228 | lengthList = sorted(lengthList, reverse=True) 229 | 230 | print("\tCurrent group count: %d, removed edge count: %d"%(currentGroupCount, i)) 231 | i += 1 232 | 233 | 234 | with open("remove.list", "w") as fout: 235 | for idx in range(0, i): 236 | idx1, idx2, signal, ovlp = sigList[idx] 237 | fout.write("Remove %d: %s, %s, %d, %f\n"%(idx+1, seqList[idx1], seqList[idx2], signal, ovlp)) 238 | 239 | 240 | checkLongestGroups(lengthList, polyCount) 241 | lengthDB = {} 242 | for gid in groupDB: 243 | curLen = 0 244 | for idx in groupDB[gid]: 245 | curLen += seqLen[idx] 246 | lengthDB[gid] = curLen 247 | 248 | groupList = [] 249 | for gid in groupDB: 250 | groupList.append([gid, lengthDB[gid]]) 251 | 252 | groupList = sorted(groupList, key=lambda x: -x[1]) 253 | 254 | print("Writing group list") 255 | with open("group.txt", "w") as fout: 256 | for i in range(0, len(groupList)): 257 | idx = groupList[i][0] 258 | fout.write("group%d\t"%(i+1)) 259 | tmp = [] 260 | for subIdx in sorted(groupDB[idx]): 261 | tmp.append(seqList[subIdx]) 262 | fout.write("%s\n"%'\t'.join(tmp)) 263 | 264 | print("Finished") 265 | 266 | 267 | if __name__ == "__main__": 268 | opts = getOpts() 269 | refFasta = opts.ref 270 | inBam = opts.bam 271 | bed = opts.bed 272 | anchors = opts.anchors 273 | polyCount = opts.poly 274 | exclude = opts.exclude 275 | outDir = opts.out 276 | allHiCPartition(refFasta, inBam, bed, anchors, polyCount, exclude, outDir) 277 | -------------------------------------------------------------------------------- /bin/ALLHiC_pip.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() 4 | { 5 | echo " Usage: `basename $0` -r reference -1 R1.fq -2 R2.fq -k group_count [-e enzyme] [-t threads] [-b bin_size]" 6 | echo " -r: reference genome" 7 | echo " -1: Lib_R1.fq.gz" 8 | echo " -2: Lib_R2.fq.gz" 9 | echo " -k: group_count" 10 | echo " -e: enzyme_sites (HindIII: AAGCTT; MboI: GATC), default: HindIII" 11 | echo " -t: threads, default: 10" 12 | echo " -b: bin_size for hic heatmap, can be divided with comma, default: 500k" 13 | exit 0 14 | } 15 | 16 | ### get options 17 | while getopts ':r:1:2:k:e:t:b:' OPT; do 18 | case $OPT in 19 | r) 20 | ref="$OPTARG";; 21 | 1) 22 | R1="$OPTARG";; 23 | 2) 24 | R2="$OPTARG";; 25 | e) 26 | enzyme="$OPTARG";; 27 | k) 28 | group_count="$OPTARG";; 29 | t) 30 | threads="$OPTARG";; 31 | b) 32 | bin_size="$OPTARG";; 33 | ?) 
34 | usage;; 35 | esac 36 | done 37 | 38 | ### check required variants 39 | if [ -z $ref ] || [ -z $R1 ] || [ -z $R2 ] || [ -z $group_count ]; then 40 | usage 41 | fi 42 | 43 | ### set default values while optional variants were not set 44 | if [ -z $threads ]; then 45 | threads=10 46 | fi 47 | 48 | if [ -z $bin_size ]; then 49 | bin_size=500k 50 | fi 51 | 52 | if [ -z $enzyme ]; then 53 | enzyme=AAGCTT 54 | fi 55 | 56 | enzyme=`echo $enzyme | tr '[a-z]' '[A-Z]'` 57 | 58 | if [ $enzyme = HINDIII ]; then 59 | enzyme=AAGCTT 60 | fi 61 | 62 | if [ $enzyme = MBOI ]; then 63 | enzyme=GATC 64 | fi 65 | 66 | ### link required files 67 | ln -s ${ref} ./seq.fasta 68 | ln -s ${R1} ./Lib_R1.fastq.gz 69 | ln -s ${R2} ./Lib_R2.fastq.gz 70 | 71 | ### index reference genome 72 | /public1/user_program/sentieon-genomics-201711/bin/bwa index seq.fasta 73 | samtools faidx seq.fasta 74 | 75 | 76 | ### 1st round of mapping 77 | /public1/user_program/sentieon-genomics-201711/bin/bwa mem -M -t $threads -K 10000000 seq.fasta Lib_R1.fastq.gz Lib_R2.fastq.gz | /public1/user_program/sentieon-genomics-201711/bin/sentieon util sort -r seq.fasta -o sorted.bam -t $threads --sam2bam -i - 78 | samtools index sorted.bam 79 | 80 | ### correct contig 81 | ALLHiC_corrector -m sorted.bam -r seq.fasta -o seq.HiCcorrected.fasta -t $threads 82 | 83 | ### 2nd round of mapping 84 | /public1/user_program/sentieon-genomics-201711/bin/bwa index seq.HiCcorrected.fasta 85 | samtools faidx seq.HiCcorrected.fasta 86 | /public1/user_program/sentieon-genomics-201711/bin/bwa mem -M -t $threads -K 10000000 seq.HiCcorrected.fasta Lib_R1.fastq.gz Lib_R2.fastq.gz | /public1/user_program/sentieon-genomics-201711/bin/sentieon util sort -r seq.HiCcorrected.fasta -o sample.bwa_mem.bam -t $threads --sam2bam -i - 87 | 88 | ### partition 89 | ALLHiC_partition -r seq.HiCcorrected.fasta -e $enzyme -k $group_count -b sample.bwa_mem.bam 90 | 91 | ### optimize 92 | rm cmd.list 93 | for((K=1;K<=$group_count;K++));do echo "allhic optimize sample.bwa_mem.counts_$enzyme.${group_count}g${K}.txt sample.bwa_mem.clm" >> cmd.list;done 94 | ParaFly -c cmd.list -CPU $group_count 95 | 96 | ### check ParaFly success 97 | i=0 98 | while [ -e "FailedCommands" ] 99 | do 100 | i=$((i+1)) 101 | if [[ $i -gt 100 ]] 102 | then 103 | echo "FATAL ERROR, check all files" 104 | exit -1 105 | fi 106 | mv FailedCommands cmd.list 107 | ParaFly -c cmd.list -CPU $threads 108 | done 109 | 110 | ### build 111 | unset PERL5LIB 112 | ALLHiC_build seq.HiCcorrected.fasta 113 | 114 | ### plot 115 | #perl /public1/user_program/script/getFaLen.pl -i groups.asm.fasta -o len.txt 116 | #grep sample len.txt > chrn.list 117 | #ALLHiC_plot sample.bwa_mem.bam groups.agp chrn.list $bin_size pdf 118 | 119 | -------------------------------------------------------------------------------- /bin/ALLHiC_plot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import numpy as np 4 | import h5py 5 | import matplotlib as mpl 6 | import matplotlib.pyplot as plt 7 | import multiprocessing 8 | import ctypes 9 | import functools 10 | import pysam 11 | import time 12 | import os 13 | 14 | mpl.use("Agg") 15 | 16 | 17 | def time_print(info): 18 | print("\033[32m%s\033[0m %s" % (time.strftime('[%H:%M:%S]', time.localtime(time.time())), info)) 19 | 20 | 21 | def get_opts(): 22 | groups = argparse.ArgumentParser() 23 | groups.add_argument('-b', '--bam', help='Input bam file', required=True) 24 | groups.add_argument('-l', '--list', 
help='Chromosome list, contain: ID\tLength', required=True) 25 | groups.add_argument('-a', '--agp', help='Input AGP file, if bam file is a contig-level mapping, agp file is ' 26 | 'required', default="") 27 | groups.add_argument('-5', '--h5', 28 | help="h5 file of hic signal, optional, if not exist, it will be generate after reading " 29 | "hic signals, or it will be loaded for drawing other resolution of heatmap", 30 | default="") 31 | groups.add_argument('-m', '--min_size', help="Minium bin size of heatmap, default=50k", default="50k") 32 | groups.add_argument('-s', '--size', 33 | help="Bin size of heatmap, can be a list separated by comma, default=500k, notice: it must " 34 | "be n times of min_size (n is integer) or we will adjust it to nearest one", 35 | default="500k") 36 | groups.add_argument('-c', '--cmap', help='CMAP for drawing heatmap, default="YlOrRd"', default='YlOrRd') 37 | groups.add_argument('-o', '--outdir', help='Output directory, default=workdir', default='workdir') 38 | groups_ex = groups.add_mutually_exclusive_group() 39 | groups_ex.add_argument('--line', help='Draw dash line for each chromosome', action='store_true') 40 | groups_ex.add_argument('--block', help='Draw dash block for each chromosome', action='store_true') 41 | groups.add_argument('--linecolor', help='Color of dash line or dash block, default="grey"', default='grey') 42 | groups.add_argument('-t', '--thread', help='Threads for reading bam, default=1', type=int, default=1) 43 | return groups.parse_args() 44 | 45 | 46 | def long2short(bin_size: int) -> str: 47 | bin_size = str(bin_size) 48 | short_bin_size = "" 49 | if bin_size[-9:] == '000000000': 50 | short_bin_size = bin_size[:-9] + 'G' 51 | elif bin_size[-6:] == '000000': 52 | short_bin_size = bin_size[:-6] + 'M' 53 | elif bin_size[-3:] == '000': 54 | short_bin_size = bin_size[:-3] + 'K' 55 | return short_bin_size 56 | 57 | 58 | def short2long(bin_size: str) -> int: 59 | long_bin_size = bin_size.upper() 60 | long_bin_size = long_bin_size.replace('K', '000') 61 | long_bin_size = long_bin_size.replace('M', '000000') 62 | long_bin_size = long_bin_size.replace('G', '000000000') 63 | long_bin_size = int(long_bin_size) 64 | return long_bin_size 65 | 66 | 67 | # Get chromosome length 68 | def get_chr_len(chr_list): 69 | chr_len_db = {} 70 | chr_order = [] 71 | with open(chr_list, 'r') as f_in: 72 | for line in f_in: 73 | if line.strip() == '': 74 | continue 75 | data = line.strip().split() 76 | chr_order.append(data[0]) 77 | chr_len_db[data[0]] = int(data[1]) 78 | return chr_len_db, chr_order 79 | 80 | 81 | # Init global shared array 82 | def init_pool(bin_offset, read_count_whole_genome): 83 | global shared_bin_offset 84 | shared_bin_offset = bin_offset 85 | global shared_read_count_whole_genome 86 | shared_read_count_whole_genome = read_count_whole_genome 87 | 88 | 89 | # agp reader 90 | def load_agp(agp): 91 | ctg_on_chr = {} 92 | with open(agp, 'r') as f_in: 93 | for line in f_in: 94 | if line.strip() == '' or line[0] == '#': 95 | continue 96 | data = line.strip().split() 97 | if data[4] == 'U': 98 | continue 99 | chrn = data[0] 100 | start_pos = int(data[1]) 101 | end_pos = int(data[2]) 102 | ctg = data[5].replace('_pilon', '') 103 | direct = data[-1] 104 | ctg_on_chr[ctg] = [chrn, start_pos, end_pos, direct] 105 | return ctg_on_chr 106 | 107 | 108 | # bam reader with agp 109 | def bam_read_with_agp(agp, chr_list, bam, long_bin_size, total_bin_count, i): 110 | ctg_on_chr = load_agp(agp) 111 | chr_len_db, chr_order = get_chr_len(chr_list) 112 | ctg_list = 
sorted(ctg_on_chr) 113 | with pysam.AlignmentFile(bam, 'rb') as fin: 114 | for line in fin.fetch(contig=ctg_list[i]): 115 | if line.is_unmapped or line.mate_is_unmapped: 116 | continue 117 | ctg1 = line.reference_name 118 | ctg2 = line.next_reference_name 119 | read_pos1 = line.reference_start + 1 120 | read_pos2 = line.next_reference_start + 1 121 | 122 | if ctg1 not in ctg_on_chr or ctg2 not in ctg_on_chr: 123 | continue 124 | chrn1, ctg_start_pos1, ctg_end_pos1, ctg_direct1 = ctg_on_chr[ctg1] 125 | chrn2, ctg_start_pos2, ctg_end_pos2, ctg_direct2 = ctg_on_chr[ctg2] 126 | if ctg_direct1 == '+': 127 | converted_pos1 = ctg_start_pos1 + read_pos1 - 1 128 | else: 129 | converted_pos1 = ctg_end_pos1 - read_pos1 + 1 130 | if ctg_direct2 == '+': 131 | converted_pos2 = ctg_start_pos2 + read_pos2 - 1 132 | else: 133 | converted_pos2 = ctg_end_pos2 - read_pos2 + 1 134 | if chrn1 not in chr_len_db or chrn2 not in chr_len_db: 135 | continue 136 | pos1_index = int(converted_pos1 / long_bin_size) 137 | pos2_index = int(converted_pos2 / long_bin_size) 138 | 139 | chr1_index = chr_order.index(chrn1) 140 | chr2_index = chr_order.index(chrn2) 141 | 142 | bin_offset = np.frombuffer(shared_bin_offset, dtype=ctypes.c_int) 143 | 144 | whole_pos1 = bin_offset[chr1_index] + pos1_index 145 | whole_pos2 = bin_offset[chr2_index] + pos2_index 146 | 147 | read_count_whole_genome = np.frombuffer(shared_read_count_whole_genome, 148 | dtype=ctypes.c_double).reshape(total_bin_count, total_bin_count) 149 | read_count_whole_genome[whole_pos1][whole_pos2] += 1 150 | read_count_whole_genome[whole_pos2][whole_pos1] += 1 151 | 152 | 153 | # bam reader without agp 154 | def bam_read_no_agp(chr_list, bam, long_bin_size, total_bin_count, i): 155 | _, chr_order = get_chr_len(chr_list) 156 | with pysam.AlignmentFile(bam, 'rb') as fin: 157 | for line in fin.fetch(contig=chr_order[i]): 158 | if line.is_unmapped or line.mate_is_unmapped: 159 | continue 160 | chrn1 = line.reference_name 161 | chrn2 = line.next_reference_name 162 | if chrn1 not in chr_order or chrn2 not in chr_order: 163 | continue 164 | 165 | read_pos1 = line.reference_start + 1 166 | read_pos2 = line.next_reference_start + 1 167 | 168 | pos1_index = int(read_pos1 / long_bin_size) 169 | pos2_index = int(read_pos2 / long_bin_size) 170 | 171 | chr1_index = chr_order.index(chrn1) 172 | chr2_index = chr_order.index(chrn2) 173 | 174 | bin_offset = np.frombuffer(shared_bin_offset, dtype=ctypes.c_int) 175 | 176 | whole_pos1 = bin_offset[chr1_index] + pos1_index 177 | whole_pos2 = bin_offset[chr2_index] + pos2_index 178 | read_count_whole_genome = np.frombuffer(shared_read_count_whole_genome, 179 | dtype=ctypes.c_double).reshape(total_bin_count, total_bin_count) 180 | read_count_whole_genome[whole_pos1][whole_pos2] += 1 181 | read_count_whole_genome[whole_pos2][whole_pos1] += 1 182 | 183 | 184 | # Calc read counts on each bin 185 | def calc_read_count_per_min_size(chr_list, bam, agp, min_size, thread): 186 | long_bin_size = min_size 187 | 188 | chr_len_db, chr_order = get_chr_len(chr_list) 189 | bin_offset = [0 for i in range(0, len(chr_order) + 1)] 190 | bin_count = [0 for i in range(0, len(chr_order) + 1)] 191 | total_bin_count = 0 192 | 193 | for chrn in chr_len_db: 194 | bin_count_of_chr = int(round((chr_len_db[chrn] * 1.0 / long_bin_size + 0.51))) 195 | total_bin_count += bin_count_of_chr 196 | bin_count[chr_order.index(chrn) + 1] = bin_count_of_chr 197 | 198 | for i in range(1, len(bin_count)): 199 | bin_offset[i] = bin_count[i] + bin_offset[i - 1] 200 | 201 | 
bin_offset_base = multiprocessing.RawArray(ctypes.c_int, np.array(bin_offset)) 202 | read_count_whole_genome_base = multiprocessing.RawArray(ctypes.c_double, total_bin_count * total_bin_count) 203 | 204 | # because of the hic signal is sparse between chromosomes, means the reads pair read by different process 205 | # unlikely locate in same bin, that means different process unlikely write same bin at same time, so we do 206 | # not use lock to avoid data write in same bin. 207 | if agp: 208 | ctg_cnt = len(load_agp(agp)) 209 | if thread > ctg_cnt: 210 | time_print("Threads is larger than need, reduce to %d" % ctg_cnt) 211 | thread = ctg_cnt 212 | partial_bam_read_with_agp = functools.partial(bam_read_with_agp, agp, chr_list, bam, 213 | long_bin_size, total_bin_count) 214 | pool = multiprocessing.Pool(processes=thread, initializer=init_pool, 215 | initargs=(bin_offset_base, read_count_whole_genome_base)) 216 | pool.map(partial_bam_read_with_agp, range(ctg_cnt)) 217 | else: 218 | chr_cnt = len(chr_order) 219 | if thread > chr_cnt: 220 | time_print("Threads is larger than need, reduce to %d" % chr_cnt) 221 | thread = chr_cnt 222 | partial_bam_read_no_agp = functools.partial(bam_read_no_agp, chr_list, bam, long_bin_size, total_bin_count) 223 | pool = multiprocessing.Pool(processes=thread, initializer=init_pool, 224 | initargs=(bin_offset_base, read_count_whole_genome_base)) 225 | pool.map(partial_bam_read_no_agp, range(chr_cnt)) 226 | 227 | return np.array(bin_offset), np.array(np.frombuffer(read_count_whole_genome_base, 228 | dtype=ctypes.c_double).reshape(total_bin_count, 229 | total_bin_count)) 230 | 231 | 232 | def draw_heatmap(read_count_whole_genome_min_size, bin_offset_min_size, 233 | ratio, chr_order, min_size, cmap, draw_line, draw_block, 234 | line_color): 235 | bin_size = int(ratio * min_size) 236 | short_bin_size = long2short(bin_size) 237 | 238 | total_cnt = len(read_count_whole_genome_min_size) 239 | ratio_cnt = int(round(total_cnt * 1.0 / ratio + 0.51, 0)) 240 | plt_cnt = int(total_cnt * 1.0 / ratio) 241 | 242 | data = read_count_whole_genome_min_size 243 | 244 | data = np.pad(data, ((0, ratio_cnt * ratio - total_cnt), (0, ratio_cnt * ratio - total_cnt)), 'constant', 245 | constant_values=0) 246 | data = data.reshape(-1, ratio_cnt, ratio).sum(axis=2) 247 | data = data.reshape(ratio_cnt, -1, ratio_cnt).sum(axis=1) 248 | 249 | fn = "%s_Whole_genome.pdf" % short_bin_size 250 | cmap = plt.get_cmap(cmap) 251 | ax = plt.gca() 252 | with np.errstate(divide='ignore'): 253 | hmap = ax.imshow(np.log2(data[: plt_cnt, : plt_cnt]), interpolation='nearest', origin='lower', cmap=cmap, 254 | aspect='equal') 255 | 256 | plt.colorbar(mappable=hmap, cax=None, ax=None, shrink=0.5) 257 | plt.tick_params(labelsize=6) 258 | for ticks in ax.get_xticklabels(): 259 | ticks.set_rotation(90) 260 | for ticks in ax.get_yticklabels(): 261 | ticks.set_rotation(0) 262 | title = 'Whole_genome_' + short_bin_size 263 | plt.xlabel("Bins (" + short_bin_size.lower() + "b per bin)", fontsize=8) 264 | if draw_line or draw_block: 265 | idx = 1 266 | x_ticks = [] 267 | y_ticks = [] 268 | for _ in chr_order: 269 | sr = bin_offset_min_size[idx - 1] * 1. / ratio 270 | er = bin_offset_min_size[idx] * 1. / ratio 271 | mr = (sr + er) / 2. 
272 | if draw_line: 273 | plt.plot((sr, sr), (0, plt_cnt), color=line_color, linestyle=':', lw=.5) 274 | plt.plot((er, er), (0, plt_cnt), color=line_color, linestyle=':', lw=.5) 275 | plt.plot((0, plt_cnt), (sr, sr), color=line_color, linestyle=':', lw=.5) 276 | plt.plot((0, plt_cnt), (er, er), color=line_color, linestyle=':', lw=.5) 277 | else: 278 | plt.plot((sr, sr), (sr, er), color=line_color, linestyle=':', lw=.5) 279 | plt.plot((er, er), (sr, er), color=line_color, linestyle=':', lw=.5) 280 | plt.plot((sr, er), (sr, sr), color=line_color, linestyle=':', lw=.5) 281 | plt.plot((sr, er), (er, er), color=line_color, linestyle=':', lw=.5) 282 | x_ticks.append(mr) 283 | y_ticks.append(mr) 284 | idx += 1 285 | 286 | plt.xticks(x_ticks, chr_order) 287 | plt.yticks(y_ticks, chr_order) 288 | plt.xlim(0, plt_cnt) 289 | plt.ylim(0, plt_cnt) 290 | else: 291 | plt.xticks([]) 292 | plt.yticks([]) 293 | plt.title(title, y=1.01, fontsize=12) 294 | plt.savefig(fn, bbox_inches='tight', dpi=200) 295 | plt.close('all') 296 | 297 | chr_cnt = len(chr_order) 298 | row_cnt = int(round(np.sqrt(chr_cnt) + 0.51)) 299 | col_cnt = int(round(chr_cnt * 1.0 / row_cnt + 0.51)) 300 | all_fn = '%s_all_chrs.pdf' % short_bin_size 301 | plt.figure(figsize=(col_cnt * 2, row_cnt * 2)) 302 | idx = 1 303 | for chrn in chr_order: 304 | sr = bin_offset_min_size[idx - 1] 305 | er = bin_offset_min_size[idx] 306 | sub_data = read_count_whole_genome_min_size[sr: er, sr: er] 307 | total_cnt = len(sub_data) 308 | ratio_cnt = int(round(total_cnt * 1.0 / ratio + 0.51, 0)) 309 | plt_cnt = int(total_cnt * 1.0 / ratio) 310 | 311 | sub_data = np.pad(sub_data, ((0, ratio_cnt * ratio - total_cnt), (0, ratio_cnt * ratio - total_cnt)), 312 | 'constant', constant_values=0) 313 | sub_data = sub_data.reshape(-1, ratio_cnt, ratio).sum(axis=2) 314 | sub_data = sub_data.reshape(ratio_cnt, -1, ratio_cnt).sum(axis=1) 315 | 316 | plt.subplot(row_cnt, col_cnt, idx) 317 | ax = plt.gca() 318 | with np.errstate(divide='ignore'): 319 | hmap = ax.imshow(np.log2(sub_data[: plt_cnt, : plt_cnt]), interpolation='nearest', origin='lower', 320 | cmap=cmap, aspect='equal') 321 | plt.colorbar(mappable=hmap, cax=None, ax=None, shrink=0.5) 322 | plt.tick_params(labelsize=5) 323 | plt.title(chrn) 324 | idx += 1 325 | 326 | plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.5, hspace=0.5) 327 | plt.savefig(all_fn, bbox_inches='tight', dpi=200) 328 | plt.close('all') 329 | 330 | 331 | def ALLHiC_plot(bam, agp, chr_list, h5_file, minsize, binsize, cmap, draw_line, draw_block, 332 | line_color, out_dir, thread): 333 | bam_file = os.path.abspath(bam) 334 | if agp: 335 | agp_file = os.path.abspath(agp) 336 | else: 337 | agp_file = agp 338 | chr_list = os.path.abspath(chr_list) 339 | if h5_file != "": 340 | h5_file = os.path.abspath(h5_file) 341 | 342 | if not os.path.exists(out_dir): 343 | os.mkdir(out_dir) 344 | os.chdir(out_dir) 345 | 346 | min_size = short2long(minsize) 347 | 348 | bin_list = binsize.split(',') 349 | bin_ratio = [] 350 | for bin_size in bin_list: 351 | long_bin_size = short2long(bin_size) 352 | bin_ratio.append(int(round(long_bin_size / min_size + 0.01, 0))) 353 | 354 | time_print("Step1: Get chromosome length") 355 | chr_len_db, chr_order = get_chr_len(chr_list) 356 | 357 | time_print("Step2: Get signal matrix") 358 | if h5_file != "" and os.path.exists(h5_file): 359 | h5_data = h5py.File(h5_file, 'r') 360 | bin_offset_min_size = h5_data['bin_offset_min_size'] 361 | read_count_whole_genome_min_size = 
h5_data['read_count_whole_genome_min_size'] 362 | else: 363 | bin_offset_min_size, read_count_whole_genome_min_size = calc_read_count_per_min_size(chr_list, bam_file, 364 | agp_file, min_size, 365 | thread) 366 | if h5_file != "": 367 | h5 = h5py.File(h5_file, 'w') 368 | h5.create_dataset('bin_offset_min_size', data=bin_offset_min_size) 369 | h5.create_dataset('read_count_whole_genome_min_size', data=read_count_whole_genome_min_size) 370 | 371 | time_print("Step3: Draw heatmap") 372 | 373 | for i in range(0, len(bin_ratio)): 374 | ratio = bin_ratio[i] 375 | time_print("Drawing with bin size %s" % bin_list[i]) 376 | draw_heatmap(read_count_whole_genome_min_size, bin_offset_min_size, 377 | ratio, chr_order, min_size, cmap, draw_line, draw_block, 378 | line_color) 379 | os.chdir('..') 380 | time_print("Success") 381 | 382 | 383 | if __name__ == "__main__": 384 | opts = get_opts() 385 | bam = opts.bam 386 | agp = opts.agp 387 | chr_list = opts.list 388 | h5_file = opts.h5 389 | minsize = opts.min_size 390 | binsize = opts.size 391 | cmap = opts.cmap 392 | out_dir = opts.outdir 393 | draw_line = opts.line 394 | draw_block = opts.block 395 | line_color = opts.linecolor 396 | thread = opts.thread 397 | ALLHiC_plot(bam, agp, chr_list, h5_file, minsize, binsize, cmap, draw_line, draw_block, line_color, out_dir, thread) 398 | -------------------------------------------------------------------------------- /bin/ALLHiC_prune: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sc-zhang/ALLHiC_extensions/d309941732844f43a650194505c706b4df71a61f/bin/ALLHiC_prune -------------------------------------------------------------------------------- /bin/ALLHiC_rescue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | from genericpath import exists, getctime 4 | import os 5 | import re 6 | from sys import path 7 | import pysam 8 | import time 9 | 10 | 11 | def time_print(info, type='info'): 12 | if type != 'info': 13 | info = "\033[35m%s\033[0m"%info 14 | print("\033[32m%s\033[0m %s"%(time.strftime('[%H:%M:%S]',time.localtime(time.time())), info)) 15 | 16 | 17 | def get_opts(): 18 | group = argparse.ArgumentParser() 19 | group.add_argument('-r', '--ref', help="Contig level assembly fasta", required=True) 20 | group.add_argument('-b', '--bam', help="Unprunned bam", required=True) 21 | group.add_argument('-c', '--cluster', help="Cluster file of contigs", required=True) 22 | group.add_argument('-n', '--counts', help="count REs file", required=True) 23 | group.add_argument('-g', '--gff3', help="Gff3 file generated by gmap cds to contigs", required=True) 24 | group.add_argument('-j', '--jcvi', help="CDS file for jcvi, bed file with same prefix must exist in the same position", required=True) 25 | group.add_argument('-e', '--exclude', help="cluster which need no rescue, default=\"\", split by comma", default="") 26 | group.add_argument('-w', '--workdir', help="Work directory, default=wrkdir", default="wrkdir") 27 | return group.parse_args() 28 | 29 | 30 | def read_fasta(in_fa): 31 | fa_db = {} 32 | with open(in_fa, 'r') as fin: 33 | for line in fin: 34 | if line[0] == '>': 35 | id = line.strip().split()[0][1:] 36 | fa_db[id] = [] 37 | else: 38 | fa_db[id].append(line.strip()) 39 | for id in fa_db: 40 | fa_db[id] = ''.join(fa_db[id]) 41 | 42 | return fa_db 43 | 44 | 45 | def create_qry_file(source_cds, gff, target_cds, target_bed): 46 | src_cds_db = read_fasta(source_cds) 
47 | idx = 1 48 | qry_db = {} 49 | with open(target_cds, 'w') as fcds: 50 | with open(target_bed, 'w') as fbed: 51 | with open(gff, 'r') as fin: 52 | for line in fin: 53 | if line.strip() == '' or line[0] == '#': 54 | continue 55 | data = line.strip().split() 56 | if data[2] != 'gene': 57 | continue 58 | id = re.findall(r'Name=(.*)', data[8])[0].split(';')[0] 59 | new_id = "%s_%d"%(id, idx) 60 | idx += 1 61 | chrn = data[0] 62 | sp = int(data[3]) 63 | ep = int(data[4]) 64 | if ep <= sp: 65 | continue 66 | direct = data[6] 67 | 68 | if chrn not in qry_db: 69 | qry_db[chrn] = set() 70 | qry_db[chrn].add(new_id) 71 | fcds.write(">%s\n%s\n"%(new_id, src_cds_db[id])) 72 | fbed.write("%s\t%d\t%d\t%s\t0\t%s\n"%(chrn, sp, ep, new_id, direct)) 73 | return qry_db 74 | 75 | 76 | def read_anchors(anchors_file): 77 | anchor_db = {} 78 | with open(anchors_file, 'r') as fin: 79 | for line in fin: 80 | if line.strip() == '' or line[0] == '#': 81 | continue 82 | data = line.strip().split() 83 | qry_gn = data[0] 84 | ref_gn = data[1] 85 | anchor_db[qry_gn] = ref_gn 86 | return anchor_db 87 | 88 | 89 | def convert_query_db(qry_db, anchor_db): 90 | new_qry_db = {} 91 | for chrn in qry_db: 92 | new_qry_db[chrn] = set() 93 | for gn in qry_db[chrn]: 94 | if gn in anchor_db: 95 | new_qry_db[chrn].add(anchor_db[gn]) 96 | return new_qry_db 97 | 98 | 99 | def get_ovlp(qry_set1, qry_set2): 100 | return len(qry_set1.intersection(qry_set2)) 101 | 102 | 103 | def get_clusters(clu): 104 | clu_db = {} 105 | clu_ctgs = {} 106 | with open(clu, 'r') as fin: 107 | for line in fin: 108 | if line[0] == '#': 109 | continue 110 | data = line.strip().split() 111 | chrn = data[0] 112 | ctgs = data[2:] 113 | clu_db[chrn] = ctgs 114 | for ctg in ctgs: 115 | clu_ctgs[ctg] = chrn 116 | return clu_db, clu_ctgs 117 | 118 | 119 | def get_hic_signal(bam): 120 | signals = {} 121 | with pysam.AlignmentFile(bam, 'rb') as fin: 122 | for line in fin: 123 | ctg1 = line.reference_name 124 | pos1 = line.reference_start 125 | ctg2 = line.next_reference_name 126 | pos2 = line.next_reference_start 127 | if pos1==-1 or pos2==-1: 128 | continue 129 | if ctg1 not in signals: 130 | signals[ctg1] = {} 131 | if ctg2 not in signals[ctg1]: 132 | signals[ctg1][ctg2] = 0 133 | signals[ctg1][ctg2] += 1 134 | 135 | if ctg2 not in signals: 136 | signals[ctg2] = {} 137 | if ctg1 not in signals[ctg2]: 138 | signals[ctg2][ctg1] = 0 139 | signals[ctg2][ctg1] += 1 140 | return signals 141 | 142 | 143 | def get_counts(counts): 144 | header = "" 145 | counts_db = {} 146 | with open(counts, 'r') as fin: 147 | for line in fin: 148 | if line[0] == '#': 149 | header = line 150 | else: 151 | ctg = line.strip().split()[0] 152 | counts_db[ctg] = line 153 | 154 | return header, counts_db 155 | 156 | 157 | def ALLHiC_rescue(ref, bam, clu, counts, gff3, jprex, exclude, wrk): 158 | if not os.path.exists(wrk): 159 | os.mkdir(wrk) 160 | 161 | ref = os.path.abspath(ref) 162 | bam = os.path.abspath(bam) 163 | clu = os.path.abspath(clu) 164 | counts = os.path.abspath(counts) 165 | bed = os.path.abspath(jprex+'.bed') 166 | cds = os.path.abspath(jprex+'.cds') 167 | gff3 = os.path.abspath(gff3) 168 | 169 | exclude_set = set() 170 | if exclude != "": 171 | for grp in exclude.split(','): 172 | exclude_set.add(grp) 173 | 174 | jprex = jprex.split('/')[-1] 175 | time_print("Entering: %s"%wrk) 176 | os.chdir(wrk) 177 | 178 | os.system("ln -sf %s %s.cds"%(cds, jprex)) 179 | os.system("ln -sf %s %s.bed"%(bed, jprex)) 180 | new_cds = "dup.cds" 181 | new_bed = "dup.bed" 182 | 183 | qry_db = 
create_qry_file(cds, gff3, new_cds, new_bed) 184 | 185 | if not os.path.exists("dup.%s.anchors"%jprex): 186 | time_print("Running jcvi", type="important") 187 | cmd = "python -m jcvi.compara.catalog ortholog dup %s &> jcvi.log"%jprex 188 | os.system(cmd) 189 | else: 190 | time_print("Anchors file found, skip", type="important") 191 | 192 | time_print("Loading anchors file") 193 | anchor_db = read_anchors("dup.%s.anchors"%jprex) 194 | 195 | time_print("Converting query db") 196 | qry_db = convert_query_db(qry_db, anchor_db) 197 | 198 | time_print("Loading clusters") 199 | clu_db, clu_ctgs = get_clusters(clu) 200 | clu_set = {} 201 | for chrn in clu_db: 202 | clu_set[chrn] = set() 203 | for ctg in clu_db[chrn]: 204 | if ctg not in qry_db: 205 | continue 206 | clu_set[chrn] = clu_set[chrn].union(qry_db[ctg]) 207 | 208 | remain_ctgs = [] 209 | ctg_db = read_fasta(ref) 210 | 211 | for ctg in ctg_db: 212 | if ctg not in clu_ctgs: 213 | remain_ctgs.append([ctg, len(ctg_db[ctg])]) 214 | 215 | time_print("Loading HiC signals") 216 | signal_db = get_hic_signal(bam) 217 | 218 | time_print("Get best matches") 219 | for ctg, ctgl in sorted(remain_ctgs, key=lambda x: x[1], reverse=True): 220 | score_list = [] 221 | if ctg not in signal_db: 222 | continue 223 | for ctg2 in signal_db[ctg]: 224 | if ctg2 not in clu_ctgs: 225 | continue 226 | sig = signal_db[ctg][ctg2] 227 | ovlp = 0 228 | chrn = clu_ctgs[ctg2] 229 | if ctg in qry_db: 230 | ovlp = get_ovlp(qry_db[ctg], clu_set[chrn]) 231 | score_list.append([ovlp, sig, chrn]) 232 | if len(score_list)==0: 233 | continue 234 | for best_match in sorted(score_list, key=lambda x: [x[0], -x[1]]): 235 | if best_match[2] in exclude_set: 236 | continue 237 | else: 238 | break 239 | 240 | if best_match[1] < 10: 241 | continue 242 | time_print("\t%s matched %s, sig: %d, ovlp: %d"%(ctg, best_match[2], best_match[1], best_match[0])) 243 | clu_db[best_match[2]].append(ctg) 244 | if ctg in qry_db: 245 | clu_set[best_match[2]] = clu_set[best_match[2]].union(qry_db[ctg]) 246 | 247 | time_print("Writing new groups") 248 | header, counts_db = get_counts(counts) 249 | for chrn in clu_db: 250 | with open("%s.txt"%chrn, 'w') as fout: 251 | fout.write(header) 252 | for ctg in clu_db[chrn]: 253 | fout.write(counts_db[ctg]) 254 | 255 | os.chdir("..") 256 | time_print("Finished") 257 | 258 | 259 | if __name__ == "__main__": 260 | opts = get_opts() 261 | ref = opts.ref 262 | bam = opts.bam 263 | clu = opts.cluster 264 | counts = opts.counts 265 | gff3 = opts.gff3 266 | jprex = opts.jcvi 267 | jprex = '.'.join(jprex.split('.')[:-1]) 268 | exclude = opts.exclude 269 | wrk = opts.workdir 270 | ALLHiC_rescue(ref, bam, clu, counts, gff3, jprex, exclude, wrk) 271 | -------------------------------------------------------------------------------- /bin/partition_gmap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | import argparse 5 | import multiprocessing 6 | import pysam 7 | 8 | 9 | def get_opt(): 10 | group = argparse.ArgumentParser() 11 | group.add_argument('-r', '--ref', help='reference contig level assembly', required=True) 12 | group.add_argument('-g', '--alleletable', help='Allele.gene.table', required=True) 13 | group.add_argument('-b', '--bam', help='bam file, default: prunning.bam', default='prunning.bam') 14 | group.add_argument('-d', '--workdir', help='work directory, default: wrk_dir', default='wrk_dir') 15 | group.add_argument('-t', '--thread', help='threads, default: 10', type=int, 
default=10) 16 | return group.parse_args() 17 | 18 | 19 | def read_fasta(in_fa): 20 | fa_db = {} 21 | with open(in_fa, 'r') as fin: 22 | for line in fin: 23 | if line[0] == '>': 24 | id = line.strip().split()[0][1:] 25 | fa_db[id] = [] 26 | else: 27 | fa_db[id].append(line.strip()) 28 | for id in fa_db: 29 | fa_db[id] = ''.join(fa_db[id]) 30 | 31 | return fa_db 32 | 33 | 34 | def load_allele(allele_table): 35 | ctg_on_chr = {} 36 | chr_contain_ctg = {} 37 | with open(allele_table, 'r') as fin: 38 | for line in fin: 39 | data = line.strip().split() 40 | chrn = data[0] 41 | if chrn.startswith('tig') or chrn.startswith('scaffold') or chrn.startswith('utg') or chrn.startswith('ctg'): 42 | continue 43 | for ctg in data[2:]: 44 | if ctg not in ctg_on_chr: 45 | ctg_on_chr[ctg] = {} 46 | if chrn not in ctg_on_chr[ctg]: 47 | ctg_on_chr[ctg][chrn] = 0 48 | ctg_on_chr[ctg][chrn] += 1 49 | for ctg in ctg_on_chr: 50 | max_chr = "" 51 | max_cnt = 0 52 | for chrn in ctg_on_chr[ctg]: 53 | if ctg_on_chr[ctg][chrn] > max_cnt: 54 | max_cnt = ctg_on_chr[ctg][chrn] 55 | max_chr = chrn 56 | ctg_on_chr[ctg] = max_chr 57 | if max_chr not in chr_contain_ctg: 58 | chr_contain_ctg[max_chr] = {} 59 | chr_contain_ctg[max_chr][ctg] = 1 60 | return ctg_on_chr, chr_contain_ctg 61 | 62 | 63 | def split_files(chrn, chr_contain_ctg, ctg_on_chr, fa_db, bam_file, wrk_dir): 64 | wrk_dir = os.path.join(wrk_dir, chrn) 65 | if not os.path.exists(wrk_dir): 66 | os.mkdir(wrk_dir) 67 | 68 | print("\tWriting %s"%chrn) 69 | sub_bam = os.path.join(wrk_dir, chrn+'.bam') 70 | sub_fa = os.path.join(wrk_dir, chrn+'.fa') 71 | with open(sub_fa, 'w') as fout: 72 | for ctg in chr_contain_ctg[chrn]: 73 | fout.write(">%s\n%s\n"%(ctg, fa_db[ctg])) 74 | 75 | with pysam.AlignmentFile(bam_file, 'rb') as fin: 76 | with pysam.AlignmentFile(sub_bam, 'wb', template=fin) as fout: 77 | for ctg in chr_contain_ctg[chrn]: 78 | for line in fin.fetch(contig=ctg): 79 | if line.next_reference_name and line.next_reference_name in ctg_on_chr and ctg_on_chr[line.next_reference_name]==chrn: 80 | fout.write(line) 81 | 82 | 83 | def partition_gmap(ref, allele_table, bam, wrkdir, threads): 84 | if not os.path.exists(wrkdir): 85 | os.mkdir(wrkdir) 86 | 87 | print("Loading allele table") 88 | ctg_on_chr, chr_contain_ctg = load_allele(allele_table) 89 | 90 | print("Loading contig fasta") 91 | fa_db = read_fasta(ref) 92 | 93 | bai = bam+'.bai' 94 | if not os.path.exists(bai): 95 | print("BAI file not found, starting index...") 96 | ret = os.system('samtools index %s'%bam) 97 | if ret==0: 98 | print("Index success") 99 | else: 100 | print("Fatal: bam file must be sorted") 101 | sys.exit(-1) 102 | 103 | print("Splitting files") 104 | if len(chr_contain_ctg) < threads: 105 | threads = len(chr_contain_ctg) 106 | pool = multiprocessing.Pool(processes=threads) 107 | for chrn in chr_contain_ctg: 108 | pool.apply_async(split_files, (chrn, chr_contain_ctg, ctg_on_chr, fa_db, bam, wrkdir,)) 109 | pool.close() 110 | pool.join() 111 | print("Notice: If you got errors of \"Length mismatch\" during allhic extract, it is normal because we split bam with the same header, it will not effect the result") 112 | print("Finished") 113 | 114 | 115 | if __name__ == '__main__': 116 | opts = get_opt() 117 | ref = opts.ref 118 | allele_table = opts.alleletable 119 | bam = opts.bam 120 | wrkdir = opts.workdir 121 | threads = opts.thread 122 | partition_gmap(ref, allele_table, bam, wrkdir, threads) 123 | -------------------------------------------------------------------------------- 
/src/ALLHiC_prune.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "Prune.h" 6 | 7 | using namespace std; 8 | 9 | int main(int argc, char* argv[]) { 10 | if (argc != 5) { 11 | cout << "************************************************************************\n"; 12 | cout << " Usage: "< 3 | 4 | 5 | Prune::Prune() { 6 | this->bamfile = ""; 7 | this->table = ""; 8 | } 9 | 10 | Prune::Prune(std::string bamfile, std::string table) { 11 | this->bamfile = bamfile; 12 | this->table = table; 13 | } 14 | 15 | Prune::~Prune(){} 16 | 17 | //Split string by delimiter 18 | bool Prune::Split(std::string source, std::string delim, std::vector&target) { 19 | target.clear(); 20 | char *p; 21 | p = strtok(const_cast(source.c_str()), delim.c_str()); 22 | if (!p) { 23 | return false; 24 | } 25 | while (p) { 26 | target.push_back(p); 27 | p = strtok(NULL, delim.c_str()); 28 | } 29 | return true; 30 | } 31 | 32 | 33 | void Prune::SetParameter(std::string bamfile, std::string table) { 34 | this->bamfile = bamfile; 35 | this->table = table; 36 | } 37 | 38 | 39 | //Read bamfiles and read them by samtools, then create pairdbs and ctgdbs; 40 | bool Prune::GeneratePairsAndCtgs() { 41 | if (bamfile == "" || table == "") { 42 | return false; 43 | } 44 | else { 45 | int ctg1, ctg2; 46 | std::string sctg1, sctg2; 47 | bam1_t *rec = bam_init1(); 48 | htsFile *inbam = hts_open(bamfile.c_str(), "rb"); 49 | sam_hdr_t *hdr = sam_hdr_read(inbam); 50 | int res; 51 | 52 | while((res = sam_read1(inbam, hdr, rec))>=0){ 53 | ctg1 = rec->core.tid; 54 | ctg2 = rec->core.mtid; 55 | if(ctg2==-1){ 56 | continue; 57 | } 58 | sctg1 = hdr->target_name[ctg1]; 59 | sctg2 = hdr->target_name[ctg2]; 60 | 61 | ctgidxdb[sctg1] = ctg1; 62 | ctgidxdb[sctg2] = ctg2; 63 | 64 | sctgdb[ctg1] = sctg1; 65 | sctgdb[ctg2] = sctg2; 66 | 67 | if(ctg1==ctg2){ 68 | continue; 69 | } 70 | if(sctg1.compare(sctg2)>=0){ 71 | int tmp = ctg1; 72 | ctg1 = ctg2; 73 | ctg2 = tmp; 74 | } 75 | pairdb[ctg1][ctg2]++; 76 | ctgdb[ctg1]++; 77 | ctgdb[ctg2]++; 78 | } 79 | hts_close(inbam); 80 | delete rec; 81 | delete hdr; 82 | } 83 | return true; 84 | } 85 | 86 | //Create removedb_Allele.txt, removedb_nonBest.txt and log.txt; 87 | bool Prune::GenerateRemovedb() { 88 | std::ifstream fin; 89 | std::unordered_map retaindb; 90 | std::unordered_map numdb; 91 | std::unordered_map> removedb; 92 | std::vectordata; 93 | std::string temp; 94 | std::string sctg1, sctg2; 95 | int ctg1, ctg2; 96 | long long num_r; 97 | 98 | fin.open(table); 99 | if (fin) { 100 | while (getline(fin, temp)) { 101 | removedb.clear(); 102 | Split(temp, "\t", data); 103 | if (data.size() <= 3) { 104 | continue; 105 | } 106 | for (long i = 2; i < data.size() - 1; i++) { 107 | sctg1 = data[i]; 108 | for (long j = i + 1; j < data.size(); j++) { 109 | sctg2 = data[j]; 110 | ctg1 = ctgidxdb[sctg1]; 111 | ctg2 = ctgidxdb[sctg2]; 112 | if(sctg1.compare(sctg2)>=0){ 113 | int tmp = ctg1; 114 | ctg1 = ctg2; 115 | ctg2 = tmp; 116 | } 117 | removedb[ctg1].insert(ctg2); 118 | allremovedb[ctg1].insert(ctg2); 119 | } 120 | } 121 | retaindb.clear(); 122 | numdb.clear(); 123 | for (long i = 2; i < data.size(); i++) { 124 | sctg1 = data[i]; 125 | ctg1 = ctgidxdb[sctg1]; 126 | for(std::unordered_map::iterator iter=ctgdb.begin(); iter!=ctgdb.end(); iter++){ 127 | ctg2 = iter->first; 128 | sctg2 = sctgdb[ctg2]; 129 | int nctg1=ctg1, nctg2=ctg2; 130 | if(sctg1.compare(sctg2)>=0){ 131 | nctg1 = ctg2; 132 | nctg2 = ctg1; 133 | } 134 | 
if(removedb.count(nctg1) && removedb[nctg1].count(nctg2)){ 135 | continue; 136 | } 137 | if(pairdb.count(nctg1)==0 || pairdb[nctg1].count(nctg2)==0){ 138 | continue; 139 | } 140 | num_r = pairdb[nctg1][nctg2]; 141 | if(retaindb.count(ctg2)==0){ 142 | retaindb[ctg2] = ctg1; 143 | numdb[ctg2] = num_r; 144 | }else{ 145 | int prectg1 = retaindb[ctg2]; 146 | std::string presctg1 = sctgdb[prectg1]; 147 | if(num_r>numdb[ctg2]){ 148 | if(sctg2.compare(presctg1)>=0){ 149 | allremovedb[prectg1].insert(ctg2); 150 | }else{ 151 | allremovedb[ctg2].insert(prectg1); 152 | } 153 | retaindb[ctg2] = ctg1; 154 | numdb[ctg2] = num_r; 155 | }else{ 156 | allremovedb[nctg1].insert(nctg2); 157 | } 158 | } 159 | } 160 | } 161 | } 162 | }else{ 163 | return false; 164 | } 165 | return true; 166 | } 167 | 168 | //Directly to create prunning.bam through pipe with samtools 169 | long long Prune::CreatePrunedBam() { 170 | int ctg1, ctg2; 171 | std::string sctg1, sctg2; 172 | std::string outbam = "prunning.bam"; 173 | htsFile *in = hts_open(bamfile.c_str(), "rb"); 174 | htsFile *out = hts_open(outbam.c_str(), "wb"); 175 | sam_hdr_t *hdr = sam_hdr_read(in); 176 | bam1_t *rec = bam_init1(); 177 | int res; 178 | long long rmcnt = 0; 179 | 180 | if(sam_hdr_write(out, hdr)<0){ 181 | hts_close(in); 182 | hts_close(out); 183 | delete hdr; 184 | delete rec; 185 | return -1; 186 | } 187 | while((res = sam_read1(in, hdr, rec))>=0){ 188 | ctg1 = rec->core.tid; 189 | ctg2 = rec->core.mtid; 190 | sctg1 = sctgdb[ctg1]; 191 | sctg2 = sctgdb[ctg2]; 192 | if(sctg1.compare(sctg2)>=0){ 193 | int tmp = ctg1; 194 | ctg1 = ctg2; 195 | ctg2 = tmp; 196 | } 197 | if(allremovedb.count(ctg1) && allremovedb[ctg1].count(ctg2)){ 198 | rmcnt++; 199 | continue; 200 | } 201 | if(sam_write1(out, hdr, rec)<0){ 202 | hts_close(in); 203 | hts_close(out); 204 | delete hdr; 205 | delete rec; 206 | return -1; 207 | } 208 | } 209 | if(hts_close(in)>=0&&hts_close(out)>=0){ 210 | delete hdr; 211 | delete rec; 212 | return rmcnt; 213 | } 214 | 215 | return -1; 216 | } 217 | -------------------------------------------------------------------------------- /src/Prune.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #ifndef __PRUNE_H__ 3 | #define __PRUNE_H__ 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | class Prune { 13 | private: 14 | std::string bamfile; 15 | std::string table; 16 | std::unordered_map> pairdb; 17 | std::unordered_map ctgdb; 18 | std::unordered_map ctgidxdb; 19 | std::unordered_map sctgdb; 20 | std::unordered_map> allremovedb; 21 | 22 | bool Split(std::string source, std::string delim, std::vector&target); 23 | public: 24 | Prune(); 25 | Prune(std::string bamfile, std::string table); 26 | ~Prune(); 27 | void SetParameter(std::string bamfile, std::string table); 28 | bool GeneratePairsAndCtgs(); 29 | bool GenerateRemovedb(); 30 | long long CreatePrunedBam(); 31 | }; 32 | 33 | #endif -------------------------------------------------------------------------------- /src/htslib-1.17.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sc-zhang/ALLHiC_extensions/d309941732844f43a650194505c706b4df71a61f/src/htslib-1.17.tar.bz2 --------------------------------------------------------------------------------
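A practical way to confirm that ALLHiC_prune removed signal is to compare the number of inter-contig read pairs before and after pruning (`prunning.bam` is the output name hard-coded in `Prune.cpp`; `sorted.bam` is a placeholder for the input). A minimal pysam sketch, not part of the repository:

```python
#!/usr/bin/env python
# Hypothetical sanity check: count inter-contig read pairs in a bam file so
# the input and the pruned output of ALLHiC_prune can be compared.
import pysam


def count_inter_contig_pairs(bam_path):
    cnt = 0
    with pysam.AlignmentFile(bam_path, 'rb') as fin:
        for rec in fin:
            # skip unmapped records and pairs whose mates share one contig
            if rec.is_unmapped or rec.mate_is_unmapped:
                continue
            if rec.reference_id == rec.next_reference_id:
                continue
            cnt += 1
    return cnt


if __name__ == '__main__':
    before = count_inter_contig_pairs("sorted.bam")    # input of ALLHiC_prune
    after = count_inter_contig_pairs("prunning.bam")   # output of ALLHiC_prune
    print("inter-contig pairs: %d -> %d (removed %d)" % (before, after, before - after))
```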