├── .DS_Store ├── README.md ├── hicplus ├── .DS_Store ├── __init__.py ├── __pycache__ │ ├── hicplus.cpython-37.pyc │ ├── model.cpython-36.pyc │ ├── model.cpython-37.pyc │ ├── readutils.cpython-37.pyc │ ├── strawHiCplus.cpython-37.pyc │ ├── utils.cpython-36.pyc │ └── utils.cpython-37.pyc ├── mat2cool.sh ├── mat2hic.sh ├── mm10.chrom.sizes ├── model.py ├── model.pyc ├── pred_chromosome.py ├── pred_genome.py ├── testConvNet.py ├── trainConvNet.py ├── trainConvNet.pyc ├── train_models.py ├── utils.py └── utils.pyc ├── model ├── GM_chr1-83900.model ├── model3400.model └── pytorch_HindIII_model_40000 ├── scripts └── hicplus └── setup.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangjuan001/hicplus/65d263906b3c62cd9e7f543229a38bed2bd52316/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | This is a package for HiCplus, which requires .hic file to train CNN model and with this model you can enhance the resolution of your hic data. High memory and GPU are not necessary when predicting to a high resolution matrix(e.g.10kb). 3 | 4 | ## Citation 5 | Yan Zhang, Lin An, Jie Xu, Bo Zhang, W. Jim Zheng, Ming Hu, Jijun Tang & Feng Yue. Enhancing Hi-C data resolution with deep convolutional neural network HiCPlus. https://doi.org/10.1038/s41467-018-03113-2. 6 | 7 | ### Installation 8 | ``` 9 | conda config --add channels pytorch 10 | conda create -n plus python=3.6 numpy pytorch torchvision scipy 11 | python3 -m pip install hic-straw 12 | source activate plus 13 | git clone https://github.com/wangjuan001/hicplus.git 14 | cd hicplus 15 | python setup.py install 16 | ``` 17 | 18 | ### Usage 19 | ``` 20 | hicplus 21 | 22 | usage: hicplus [-h] {train,pred_chromosome} ... 23 | 24 | Train CNN model with Hi-C data and make predictions for low resolution HiC 25 | data with the model. 26 | 27 | positional arguments: 28 | {train,pred_chromosome} 29 | train Train CNN model per chromosome 30 | pred_chromosome predict high resolution interaction frequencies for 31 | inter and intra chromosomes 32 | 33 | optional arguments: 34 | -h, --help show this help message and exit 35 | 36 | ``` 37 | 38 | HiCplus training process requires GPU nodes. 39 | ``` 40 | hicplus train 41 | 42 | usage: hicplus train [-h] [-i INPUTFILE] [-r SCALERATE] [-c CHROMOSOME] 43 | [-o OUTMODEL] 44 | 45 | optional arguments: 46 | -h, --help show this help message and exit 47 | -i INPUTFILE, --inputfile INPUTFILE 48 | path to a .hic file. 49 | -r SCALERATE, --scalerate SCALERATE 50 | downsampling rate to generate the low resolution 51 | training file 52 | -c CHROMOSOME, --chromosome CHROMOSOME 53 | choose one chromosome to do the model training. 54 | -o OUTMODEL, --outmodel OUTMODEL 55 | output model name. default = model_epochnumber.model 56 | 57 | ``` 58 | e.g. 59 | ``` 60 | hicplus train -i https://hicfiles.s3.amazonaws.com/hiseq/gm12878/in-situ/combined.hic -r 40 -c 19 61 | ``` 62 | You can do prediction on CPUs now. 63 | ``` 64 | hicplus pred_chromosome 65 | usage: hicplus pred_chromosome [-h] [-i INPUTFILE] [-m MODEL] [-b BINSIZE] -c 66 | chrN1 chrN2 67 | 68 | optional arguments: 69 | -h, --help show this help message and exit 70 | -i INPUTFILE, --inputfile INPUTFILE 71 | path to a .hic file. 72 | -o OUTPUTFILE, --outputfile OUTPUTFILE 73 | path to an output file. 74 | -m MODEL, --model MODEL 75 | path to a model file. 76 | -b BINSIZE, --binsize BINSIZE 77 | predicted resolustion, e.g.10kb, 25kb..., 78 | default=10000 79 | -c chrN1 chrN2, --chrN chrN1 chrN2 80 | chromosome number 81 | ``` 82 | e.g. 83 | ``` 84 | hicplus pred_chromosome -i test.hic -o test.out.txt -m ../HiCplus_straw/model/pytorch_HindIII_model_40000 -c 19 22 85 | ``` 86 | 87 | ## Transform output matrix to .cool file 88 | To transform the output matrix to .cool file, you can refer to the bash script hicplus/mat2cool.sh. To run the script, you can take the following steps: 89 | 1. Specify the chromosome size file for the species you work on. In the example, we used the mm10.chrom.sizes and the related file is provided as well. 90 | 2. As the transform is based on cooler https://github.com/mirnylab/cooler, install cooler first if you haven't done so. 91 | ``` 92 | conda install -c conda-forge -c bioconda cooler 93 | ``` 94 | 3. After the above setting, you should be able to run the script: 95 | ``` 96 | sh mat2cool.sh test.out.txt 97 | ``` 98 | 99 | ### Model 100 | It's important to use a suitable model when doing prediction. At this moment we only provide one model, which is suitable for 200~300M reads hic data (downsampling rate at 16). 101 | 102 | For other sequencing depth data, the users need to train models at a different downsampling rate (e.g. 40). For more information about how to select downsampling rate, please refer to the original HiCplus paper. 103 | -------------------------------------------------------------------------------- /hicplus/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangjuan001/hicplus/65d263906b3c62cd9e7f543229a38bed2bd52316/hicplus/.DS_Store -------------------------------------------------------------------------------- /hicplus/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | __version__='1.1.0' 3 | __license__='GPLv3+' 4 | Me = __file__ 5 | -------------------------------------------------------------------------------- /hicplus/__pycache__/hicplus.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangjuan001/hicplus/65d263906b3c62cd9e7f543229a38bed2bd52316/hicplus/__pycache__/hicplus.cpython-37.pyc -------------------------------------------------------------------------------- /hicplus/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangjuan001/hicplus/65d263906b3c62cd9e7f543229a38bed2bd52316/hicplus/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /hicplus/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangjuan001/hicplus/65d263906b3c62cd9e7f543229a38bed2bd52316/hicplus/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /hicplus/__pycache__/readutils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangjuan001/hicplus/65d263906b3c62cd9e7f543229a38bed2bd52316/hicplus/__pycache__/readutils.cpython-37.pyc -------------------------------------------------------------------------------- /hicplus/__pycache__/strawHiCplus.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangjuan001/hicplus/65d263906b3c62cd9e7f543229a38bed2bd52316/hicplus/__pycache__/strawHiCplus.cpython-37.pyc -------------------------------------------------------------------------------- /hicplus/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangjuan001/hicplus/65d263906b3c62cd9e7f543229a38bed2bd52316/hicplus/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /hicplus/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangjuan001/hicplus/65d263906b3c62cd9e7f543229a38bed2bd52316/hicplus/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /hicplus/mat2cool.sh: -------------------------------------------------------------------------------- 1 | dat=$1 ##output from hicplus prediction 2 | chrom=mm10.chrom.sizes ##chrom size file, change to your own species. 3 | 4 | cat $dat | tr ':' '\t'|tr '-' '\t' > ${dat}_tmp 5 | 6 | ###transfrom the matrix file to .cool file 7 | cooler load -f bg2 ${chrom}:10000 ${dat}_tmp ${dat}.cool --input-copy-status duplex 8 | 9 | ## remove the intermediate tmp file. 10 | rm ${dat}_tmp 11 | -------------------------------------------------------------------------------- /hicplus/mat2hic.sh: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dat=$1 ##output from hicplus prediction 5 | 6 | chrom=hg38.chrom.sizes ##chrom size file, change to your own species. 7 | 8 | cat $dat | tr ':' '\t'|tr '-' '\t' | awk '{print 0,$1,$2,0,0,$4,$5,1,$7}' > ${dat}_tmp 9 | 10 | 11 | less ${dat}_tmp |awk '{OFS="\t"; for (i=0; i<$NF; i++) print }' |cut -f1-8 -d" " > ${dat}_tmp2 12 | 13 | java -Xmx40g -jar ~/jwn2291/juicer/scripts/scripts/juicer_tools_1.13.02.jar pre -d -c 18 -r 5000,10000,20000,25000,40000,50000,100000 ${dat}_tmp2 ${dat}.hic hg38 14 | -------------------------------------------------------------------------------- /hicplus/mm10.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 195471971 2 | chr2 182113224 3 | chr3 160039680 4 | chr4 156508116 5 | chr5 151834684 6 | chr6 149736546 7 | chr7 145441459 8 | chr8 129401213 9 | chr9 124595110 10 | chr10 130694993 11 | chr11 122082543 12 | chr12 120129022 13 | chr13 120421639 14 | chr14 124902244 15 | chr15 104043685 16 | chr16 98207768 17 | chr17 94987271 18 | chr18 90702639 19 | chr19 61431566 20 | chrX 171031299 21 | chrY 91744698 22 | -------------------------------------------------------------------------------- /hicplus/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | from torch.utils import data 7 | import gzip 8 | import sys 9 | import torch.optim as optim 10 | conv2d1_filters_numbers = 8 11 | conv2d1_filters_size = 9 12 | conv2d2_filters_numbers = 8 13 | conv2d2_filters_size = 1 14 | conv2d3_filters_numbers = 1 15 | conv2d3_filters_size = 5 16 | 17 | class Net(nn.Module): 18 | def __init__(self, D_in, D_out): 19 | super(Net, self).__init__() 20 | # 1 input image channel, 6 output channels, 5x5 square convolution 21 | # kernel 22 | self.conv1 = nn.Conv2d(1, conv2d1_filters_numbers, conv2d1_filters_size) 23 | self.conv2 = nn.Conv2d(conv2d1_filters_numbers, conv2d2_filters_numbers, conv2d2_filters_size) 24 | self.conv3 = nn.Conv2d(conv2d2_filters_numbers, 1, conv2d3_filters_size) 25 | 26 | def forward(self, x): 27 | #print("start forwardingf") 28 | x = self.conv1(x) 29 | x = F.relu(x) 30 | x = self.conv2(x) 31 | x = F.relu(x) 32 | x = self.conv3(x) 33 | x = F.relu(x) 34 | return x 35 | ''' 36 | def num_flat_features(self, x): 37 | size = x.size()[1:] # all dimensions except the batch dimension 38 | num_features = 1 39 | for s in size: 40 | num_features *= s 41 | return num_features 42 | ''' 43 | ''' 44 | net = Net(40, 24) 45 | 46 | 47 | 48 | #sys.exit() 49 | #low_resolution_samples = low_resolution_samples.reshape((low_resolution_samples.shape[0], 40, 40)) 50 | #print low_resolution_samples[0:1, :,: ,: ].shape 51 | #low_resolution_samples = torch.from_numpy(low_resolution_samples[0:1, :,: ,: ]) 52 | #X = Variable(low_resolution_samples) 53 | #print X 54 | #Y = Variable(torch.from_numpy(Y[0])) 55 | #X = Variable(torch.randn(1, 1, 40, 40)) 56 | #print X 57 | optimizer = optim.SGD(net.parameters(), lr=0.0001, momentum=0.9) 58 | criterion = nn.MSELoss() 59 | for epoch in range(2): # loop over the dataset multiple times 60 | print "epoch", epoch 61 | 62 | running_loss = 0.0 63 | for i, data in enumerate(train_loader, 0): 64 | # get the inputs 65 | inputs, labels = data 66 | #print(inputs.size()) 67 | #print(labels.size()) 68 | #print type(inputs) 69 | 70 | # wrap them in Variable 71 | inputs, labels = Variable(inputs), Variable(labels) 72 | 73 | # zero the parameter gradients 74 | optimizer.zero_grad() 75 | 76 | # forward + backward + optimize 77 | outputs = net(inputs) 78 | #print outputs 79 | loss = criterion(outputs, labels) 80 | 81 | loss.backward() 82 | optimizer.step() 83 | print i 84 | # print statistics 85 | #print type(loss) 86 | #print loss 87 | #print loss.data[0] 88 | #print loss.data 89 | #print type(data), len(data) 90 | #print "the key is ", type(data[0]) 91 | 92 | 93 | 94 | print('Finished Training') 95 | 96 | 97 | output = net(X) 98 | print(output) 99 | print type(output) 100 | 101 | loss = criterion(output, Y) 102 | 103 | 104 | net.zero_grad() # zeroes the gradient buffers of all parameters 105 | 106 | print('conv1.bias.grad before backward') 107 | print(net.conv1.bias.grad) 108 | 109 | loss.backward() 110 | 111 | print('conv1.bias.grad after backward') 112 | print(net.conv1.weight.grad) 113 | 114 | ''' 115 | 116 | -------------------------------------------------------------------------------- /hicplus/model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangjuan001/hicplus/65d263906b3c62cd9e7f543229a38bed2bd52316/hicplus/model.pyc -------------------------------------------------------------------------------- /hicplus/pred_chromosome.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | from torch.utils import data 3 | from hicplus import model 4 | import torch 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.autograd import Variable 8 | import straw 9 | from scipy.sparse import csr_matrix, coo_matrix, vstack, hstack 10 | from scipy import sparse 11 | import numpy as np 12 | from hicplus import utils 13 | from time import gmtime, strftime 14 | from datetime import datetime 15 | import argparse 16 | 17 | startTime = datetime.now() 18 | 19 | use_gpu = 0 #opt.cuda 20 | #if use_gpu and not torch.cuda.is_available(): 21 | # raise Exception("No GPU found, please run without --cuda") 22 | 23 | def predict(M,N,inmodel): 24 | 25 | prediction_1 = np.zeros((N, N)) 26 | 27 | for low_resolution_samples, index in utils.divide(M): 28 | 29 | #print(index.shape) 30 | 31 | batch_size = low_resolution_samples.shape[0] #256 32 | 33 | lowres_set = data.TensorDataset(torch.from_numpy(low_resolution_samples), torch.from_numpy(np.zeros(low_resolution_samples.shape[0]))) 34 | try: 35 | lowres_loader = torch.utils.data.DataLoader(lowres_set, batch_size=batch_size, shuffle=False) 36 | except: 37 | continue 38 | 39 | hires_loader = lowres_loader 40 | 41 | m = model.Net(40, 28) 42 | m.load_state_dict(torch.load(inmodel, map_location=torch.device('cpu'))) 43 | 44 | if torch.cuda.is_available(): 45 | m = m.cuda() 46 | 47 | for i, v1 in enumerate(lowres_loader): 48 | _lowRes, _ = v1 49 | _lowRes = Variable(_lowRes).float() 50 | if use_gpu: 51 | _lowRes = _lowRes.cuda() 52 | y_prediction = m(_lowRes) 53 | 54 | 55 | y_predict = y_prediction.data.cpu().numpy() 56 | 57 | 58 | # recombine samples 59 | length = int(y_predict.shape[2]) 60 | y_predict = np.reshape(y_predict, (y_predict.shape[0], length, length)) 61 | 62 | 63 | for i in range(0, y_predict.shape[0]): 64 | 65 | x = int(index[i][1]) 66 | y = int(index[i][2]) 67 | #print np.count_nonzero(y_predict[i]) 68 | prediction_1[x+6:x+34, y+6:y+34] = y_predict[i] 69 | 70 | return(prediction_1) 71 | 72 | def chr_pred(hicfile, chrN1, chrN2, binsize, inmodel): 73 | M = utils.matrix_extract(chrN1, chrN2, binsize, hicfile) 74 | #print(M.shape) 75 | N = M.shape[0] 76 | 77 | chr_Mat = predict(M, N, inmodel) 78 | 79 | 80 | # if Ncol > Nrow: 81 | # chr_Mat = chr_Mat[:Ncol, :Nrow] 82 | # chr_Mat = chr_Mat.T 83 | # if Nrow > Ncol: 84 | # chr_Mat = chr_Mat[:Nrow, :Ncol] 85 | # print(dat.head()) 86 | return(chr_Mat) 87 | 88 | 89 | 90 | def writeBed(Mat, outname,binsize, chrN1,chrN2): 91 | with open(outname,'w') as chrom: 92 | r, c = Mat.nonzero() 93 | for i in range(r.size): 94 | contact = int(round(Mat[r[i],c[i]])) 95 | if contact == 0: 96 | continue 97 | #if r[i]*binsize > Len1 or (r[i]+1)*binsize > Len1: 98 | # continue 99 | #if c[i]*binsize > Len2 or (c[i]+1)*binsize > Len2: 100 | # continue 101 | line = [chrN1, r[i]*binsize, (r[i]+1)*binsize, 102 | chrN2, c[i]*binsize, (c[i]+1)*binsize, contact] 103 | chrom.write('chr'+str(line[0])+':'+str(line[1])+'-'+str(line[2])+ 104 | '\t'+'chr'+str(line[3])+':'+str(line[4])+'-'+str(line[5])+'\t'+str(line[6])+'\n') 105 | 106 | def main(args): 107 | chrN1, chrN2 = args.chrN 108 | binsize = args.binsize 109 | inmodel = args.model 110 | hicfile = args.inputfile 111 | #name = os.path.basename(inmodel).split('.')[0] 112 | #outname = 'chr'+str(chrN1)+'_'+name+'_'+str(binsize//1000)+'pred.txt' 113 | outname = args.outputfile 114 | Mat = chr_pred(hicfile,chrN1,chrN2,binsize,inmodel) 115 | print(Mat.shape) 116 | writeBed(Mat, outname, binsize,chrN1, chrN2) 117 | #print(enhM.shape) 118 | if __name__ == '__main__': 119 | main() 120 | 121 | print(datetime.now() - startTime) 122 | -------------------------------------------------------------------------------- /hicplus/pred_genome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os,sys 3 | from torch.utils import data 4 | from hicplus import model 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | from torch.autograd import Variable 9 | import straw 10 | from scipy.sparse import csr_matrix, coo_matrix, vstack, hstack 11 | from scipy import sparse 12 | import numpy as np 13 | from hicplus import utils 14 | from time import gmtime, strftime 15 | from datetime import datetime 16 | import argparse 17 | from hicplus import pred_chromosome 18 | 19 | startTime = datetime.now() 20 | 21 | def pred_genome(hicfile, binsize, inmodel): 22 | hic_info = utils.read_hic_header(hicfile) 23 | chromindex = {} 24 | i = 0 25 | for c, Len in hic_info['chromsizes'].items(): 26 | chromindex[c] = i 27 | i += 1 28 | print(hic_info) 29 | 30 | name = os.path.basename(inmodel).split('.')[0] 31 | with open('genome.{}_{}.matrix.txt'.format(int(binsize/1000),name), 'w') as genome: 32 | for c1, Len1 in hic_info['chromsizes'].items(): 33 | for c2, Len2 in hic_info['chromsizes'].items(): 34 | if chromindex[c1] > chromindex[c2]: 35 | continue 36 | if c1 == 'M' or c2 == 'M': 37 | continue 38 | try: 39 | Mat = pred_chromosome.chr_pred(hicfile, c1, c2, binsize, inmodel) 40 | r, c = Mat.nonzero() 41 | for i in range(r.size): 42 | contact = int(round(Mat[r[i],c[i]])) 43 | if contact == 0: 44 | continue 45 | if r[i]*binsize > Len1 or (r[i]+1)*binsize > Len1: 46 | continue 47 | if c[i]*binsize > Len2 or (c[i]+1)*binsize > Len2: 48 | continue 49 | line = [c1, r[i]*binsize, (r[i]+1)*binsize, 50 | c2, c[i]*binsize, (c[i]+1)*binsize, contact] 51 | genome.write('chr'+str(line[0])+':'+str(line[1])+'-'+str(line[2])+ 52 | '\t'+'chr'+str(line[3])+':'+str(line[4])+'-'+str(line[5])+'\t'+str(line[6])+'\n') 53 | except: 54 | pass 55 | 56 | 57 | 58 | 59 | def main(args): 60 | binsize = args.binsize 61 | inmodel = args.model 62 | hicfile = args.inputfile 63 | pred_genome(hicfile, binsize, inmodel) 64 | 65 | if __name__ == '__main__': 66 | main() 67 | 68 | -------------------------------------------------------------------------------- /hicplus/testConvNet.py: -------------------------------------------------------------------------------- 1 | # Author: Yan Zhang 2 | # Email: zhangyan.cse (@) gmail.com 3 | 4 | import sys 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | import pickle 8 | import os 9 | import gzip 10 | import model 11 | from torch.utils import data 12 | import torch 13 | import torch.optim as optim 14 | from torch.autograd import Variable 15 | from time import gmtime, strftime 16 | import sys 17 | import torch.nn as nn 18 | 19 | use_gpu = 1 20 | 21 | conv2d1_filters_numbers = 8 22 | conv2d1_filters_size = 9 23 | conv2d2_filters_numbers = 8 24 | conv2d2_filters_size = 1 25 | conv2d3_filters_numbers = 1 26 | conv2d3_filters_size = 5 27 | 28 | 29 | down_sample_ratio = 16 30 | epochs = 10 31 | HiC_max_value = 100 32 | 33 | 34 | 35 | # This block is the actual training data used in the training. The training data is too large to put on Github, so only toy data is used. 36 | # cell = "GM12878_replicate" 37 | # chrN_range1 = '1_8' 38 | # chrN_range = '1_8' 39 | 40 | # low_resolution_samples = np.load(gzip.GzipFile('/home/zhangyan/SRHiC_samples/'+cell+'down16_chr'+chrN_range+'.npy.gz', "r")).astype(np.float32) * down_sample_ratio 41 | # high_resolution_samples = np.load(gzip.GzipFile('/home/zhangyan/SRHiC_samples/original10k/'+cell+'_original_chr'+chrN_range+'.npy.gz', "r")).astype(np.float32) 42 | 43 | # low_resolution_samples = np.minimum(HiC_max_value, low_resolution_samples) 44 | # high_resolution_samples = np.minimum(HiC_max_value, high_resolution_samples) 45 | 46 | 47 | low_resolution_samples = np.load(gzip.GzipFile('../../data/GM12878_replicate_down16_chr19_22.npy.gz', "r")).astype(np.float32) * down_sample_ratio 48 | 49 | low_resolution_samples = np.minimum(HiC_max_value, low_resolution_samples) 50 | 51 | batch_size = low_resolution_samples.shape[0] 52 | 53 | # Reshape the high-quality Hi-C sample as the target value of the training. 54 | sample_size = low_resolution_samples.shape[-1] 55 | padding = conv2d1_filters_size + conv2d2_filters_size + conv2d3_filters_size - 3 56 | half_padding = padding / 2 57 | output_length = sample_size - padding 58 | 59 | 60 | print(low_resolution_samples.shape) 61 | 62 | lowres_set = data.TensorDataset(torch.from_numpy(low_resolution_samples), torch.from_numpy(np.zeros(low_resolution_samples.shape[0]))) 63 | lowres_loader = torch.utils.data.DataLoader(lowres_set, batch_size=batch_size, shuffle=False) 64 | 65 | production = False 66 | try: 67 | high_resolution_samples = np.load(gzip.GzipFile('../../data/GM12878_replicate_original_chr19_22.npy.gz', "r")).astype(np.float32) 68 | high_resolution_samples = np.minimum(HiC_max_value, high_resolution_samples) 69 | Y = [] 70 | for i in range(high_resolution_samples.shape[0]): 71 | no_padding_sample = high_resolution_samples[i][0][half_padding:(sample_size-half_padding) , half_padding:(sample_size - half_padding)] 72 | Y.append(no_padding_sample) 73 | Y = np.array(Y).astype(np.float32) 74 | hires_set = data.TensorDataset(torch.from_numpy(Y), torch.from_numpy(np.zeros(Y.shape[0]))) 75 | hires_loader = torch.utils.data.DataLoader(hires_set, batch_size=batch_size, shuffle=False) 76 | except: 77 | production = True 78 | hires_loader = lowres_loader 79 | 80 | Net = model.Net(40, 28) 81 | Net.load_state_dict(torch.load('../model/pytorch_model_12000')) 82 | if use_gpu: 83 | Net = Net.cuda() 84 | 85 | _loss = nn.MSELoss() 86 | 87 | 88 | running_loss = 0.0 89 | running_loss_validate = 0.0 90 | reg_loss = 0.0 91 | 92 | 93 | for i, (v1, v2) in enumerate(zip(lowres_loader, hires_loader)): 94 | _lowRes, _ = v1 95 | _highRes, _ = v2 96 | 97 | 98 | _lowRes = Variable(_lowRes) 99 | _highRes = Variable(_highRes) 100 | 101 | 102 | if use_gpu: 103 | _lowRes = _lowRes.cuda() 104 | _highRes = _highRes.cuda() 105 | y_prediction = Net(_lowRes) 106 | if (not production): 107 | loss = _loss(y_prediction, _highRes) 108 | 109 | 110 | running_loss += loss.data[0] 111 | 112 | print('-------', i, running_loss, strftime("%Y-%m-%d %H:%M:%S", gmtime())) 113 | 114 | y_prediction = y_prediction.data.cpu().numpy() 115 | 116 | print(y_prediction.shape) 117 | -------------------------------------------------------------------------------- /hicplus/trainConvNet.py: -------------------------------------------------------------------------------- 1 | # Author: Yan Zhang 2 | # Email: zhangyan.cse (@) gmail.com 3 | 4 | import sys 5 | import numpy as np 6 | #import matplotlib.pyplot as plt 7 | import pickle 8 | import os 9 | import gzip 10 | from hicplus import model 11 | from torch.utils import data 12 | import torch 13 | import torch.optim as optim 14 | from torch.autograd import Variable 15 | from time import gmtime, strftime 16 | import sys 17 | import torch.nn as nn 18 | import argparse 19 | 20 | use_gpu = 1 21 | 22 | conv2d1_filters_numbers = 8 23 | conv2d1_filters_size = 9 24 | conv2d2_filters_numbers = 8 25 | conv2d2_filters_size = 1 26 | conv2d3_filters_numbers = 1 27 | conv2d3_filters_size = 5 28 | 29 | 30 | down_sample_ratio = 16 31 | epochs = 10 32 | HiC_max_value = 100 33 | batch_size = 512 34 | 35 | 36 | # This block is the actual training data used in the training. The training data is too large to put on Github, so only toy data is used. 37 | # cell = "GM12878_replicate" 38 | # chrN_range1 = '1_8' 39 | # chrN_range = '1_8' 40 | 41 | # low_resolution_samples = np.load(gzip.GzipFile('/home/zhangyan/SRHiC_samples/'+cell+'down16_chr'+chrN_range+'.npy.gz', "r")).astype(np.float32) * down_sample_ratio 42 | # high_resolution_samples = np.load(gzip.GzipFile('/home/zhangyan/SRHiC_samples/original10k/'+cell+'_original_chr'+chrN_range+'.npy.gz', "r")).astype(np.float32) 43 | 44 | # low_resolution_samples = np.minimum(HiC_max_value, low_resolution_samples) 45 | # high_resolution_samples = np.minimum(HiC_max_value, high_resolution_samples) 46 | 47 | 48 | #low_resolution_samples = np.load(gzip.GzipFile('../../data/GM12878_replicate_down16_chr19_22.npy.gz', "r")).astype(np.float32) * down_sample_ratio 49 | #high_resolution_samples = np.load(gzip.GzipFile('../../data/GM12878_replicate_original_chr19_22.npy.gz', "r")).astype(np.float32) 50 | 51 | #low_resolution_samples = np.load(gzip.GzipFile('/home/zhangyan/SRHiC_samples/IMR90_down_HINDIII16_chr1_8.npy.gz', "r")).astype(np.float32) * down_sample_ratio 52 | #high_resolution_samples = np.load(gzip.GzipFile('/home/zhangyan/SRHiC_samples/original10k/_IMR90_HindIII_original_chr1_8.npy.gz', "r")).astype(np.float32) 53 | 54 | def train(lowres,highres, outModel): 55 | low_resolution_samples = lowres.astype(np.float32) * down_sample_ratio 56 | 57 | high_resolution_samples = highres.astype(np.float32) 58 | 59 | low_resolution_samples = np.minimum(HiC_max_value, low_resolution_samples) 60 | high_resolution_samples = np.minimum(HiC_max_value, high_resolution_samples) 61 | 62 | 63 | 64 | # Reshape the high-quality Hi-C sample as the target value of the training. 65 | sample_size = low_resolution_samples.shape[-1] 66 | padding = conv2d1_filters_size + conv2d2_filters_size + conv2d3_filters_size - 3 67 | half_padding = padding // 2 68 | output_length = sample_size - padding 69 | Y = [] 70 | for i in range(high_resolution_samples.shape[0]): 71 | no_padding_sample = high_resolution_samples[i][0][half_padding:(sample_size-half_padding) , half_padding:(sample_size - half_padding)] 72 | Y.append(no_padding_sample) 73 | Y = np.array(Y).astype(np.float32) 74 | 75 | print(low_resolution_samples.shape, Y.shape) 76 | 77 | lowres_set = data.TensorDataset(torch.from_numpy(low_resolution_samples), torch.from_numpy(np.zeros(low_resolution_samples.shape[0]))) 78 | lowres_loader = torch.utils.data.DataLoader(lowres_set, batch_size=batch_size, shuffle=False) 79 | 80 | hires_set = data.TensorDataset(torch.from_numpy(Y), torch.from_numpy(np.zeros(Y.shape[0]))) 81 | hires_loader = torch.utils.data.DataLoader(hires_set, batch_size=batch_size, shuffle=False) 82 | 83 | 84 | Net = model.Net(40, 28) 85 | 86 | if use_gpu: 87 | Net = Net.cuda() 88 | 89 | optimizer = optim.SGD(Net.parameters(), lr = 0.00001) 90 | _loss = nn.MSELoss() 91 | Net.train() 92 | 93 | running_loss = 0.0 94 | running_loss_validate = 0.0 95 | reg_loss = 0.0 96 | 97 | # write the log file to record the training process 98 | with open('HindIII_train.txt', 'w') as log: 99 | for epoch in range(0, 3500): 100 | for i, (v1, v2) in enumerate(zip(lowres_loader, hires_loader)): 101 | if (i == len(lowres_loader) - 1): 102 | continue 103 | _lowRes, _ = v1 104 | _highRes, _ = v2 105 | 106 | _lowRes = Variable(_lowRes) 107 | _highRes = Variable(_highRes).unsqueeze(1) 108 | 109 | if use_gpu: 110 | _lowRes = _lowRes.cuda() 111 | _highRes = _highRes.cuda() 112 | optimizer.zero_grad() 113 | y_prediction = Net(_lowRes) 114 | 115 | loss = _loss(y_prediction, _highRes) 116 | loss.backward() 117 | optimizer.step() 118 | 119 | running_loss += loss.item() 120 | 121 | print('-------', i, epoch, running_loss/i, strftime("%Y-%m-%d %H:%M:%S", gmtime())) 122 | 123 | log.write(str(epoch) + ', ' + str(running_loss/i,) +', '+ strftime("%Y-%m-%d %H:%M:%S", gmtime())+ '\n') 124 | running_loss = 0.0 125 | running_loss_validate = 0.0 126 | # save the model every 100 epoches 127 | if (epoch % 100 == 0): 128 | torch.save(Net.state_dict(), outModel + str(epoch) + str('.model')) 129 | pass 130 | -------------------------------------------------------------------------------- /hicplus/trainConvNet.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangjuan001/hicplus/65d263906b3c62cd9e7f543229a38bed2bd52316/hicplus/trainConvNet.pyc -------------------------------------------------------------------------------- /hicplus/train_models.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse as ap 3 | from math import log10 4 | 5 | #import torch 6 | #import torch.nn as nn 7 | #import torch.optim as optim 8 | #from torch.autograd import Variable 9 | #from torch.utils.data import DataLoader 10 | from hicplus import utils 11 | #import model 12 | import argparse 13 | from hicplus import trainConvNet 14 | import numpy as np 15 | 16 | chrs_length = [249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540,102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566] 17 | 18 | #chrN = 21 19 | #scale = 16 20 | 21 | def main(args): 22 | 23 | highres = utils.train_matrix_extract(args.chromosome, 10000, args.inputfile) 24 | 25 | print('dividing, filtering and downsampling files...') 26 | 27 | highres_sub, index = utils.train_divide(highres) 28 | 29 | print(highres_sub.shape) 30 | #np.save(infile+"highres",highres_sub) 31 | 32 | lowres = utils.genDownsample(highres,1/float(args.scalerate)) 33 | lowres_sub,index = utils.train_divide(lowres) 34 | print(lowres_sub.shape) 35 | #np.save(infile+"lowres",lowres_sub) 36 | 37 | print('start training...') 38 | trainConvNet.train(lowres_sub,highres_sub,args.outmodel) 39 | 40 | 41 | print('finished...') 42 | -------------------------------------------------------------------------------- /hicplus/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | #import matplotlib.pyplot as plt 3 | import os,struct 4 | import random 5 | import straw 6 | from scipy.sparse import csr_matrix, coo_matrix, vstack, hstack 7 | from scipy import sparse 8 | import numpy as np 9 | 10 | def readcstr(f): 11 | buf = "" 12 | while True: 13 | b = f.read(1) 14 | b = b.decode('utf-8', 'backslashreplace') 15 | if b is None or b == '\0': 16 | return str(buf) 17 | else: 18 | buf = buf + b 19 | 20 | def read_hic_header(hicfile): 21 | 22 | if not os.path.exists(hicfile): 23 | return None # probably a cool URI 24 | 25 | req = open(hicfile, 'rb') 26 | magic_string = struct.unpack('<3s', req.read(3))[0] 27 | req.read(1) 28 | if (magic_string != b"HIC"): 29 | return None # this is not a valid .hic file 30 | 31 | info = {} 32 | version = struct.unpack('= total_loci or j + subImage_size >= total_loci): 131 | continue 132 | subImage = HiCmatrix[i:i + subImage_size, j:j + subImage_size] 133 | 134 | result.append([subImage, ]) 135 | tag = 'test' 136 | index.append((tag, i, j)) 137 | result = np.array(result) 138 | #print(result.shape) 139 | #result = result.astype(np.double) 140 | index = np.array(index) 141 | yield result, index 142 | 143 | 144 | def train_divide(HiCmatrix): 145 | subImage_size = 40 146 | step = 25 147 | result = [] 148 | index = [] 149 | #chrN = 21 ##need to change. 150 | 151 | total_loci = HiCmatrix.shape[0] 152 | #print(HiCmatrix.shape) 153 | for i in range(0, total_loci, step): 154 | for j in range(0, total_loci, ): 155 | if (abs(i-j)>201 or i + subImage_size >= total_loci or j + subImage_size >= total_loci): 156 | continue 157 | subImage = HiCmatrix[i:i + subImage_size, j:j + subImage_size] 158 | 159 | result.append([subImage, ]) 160 | tag = 'test' 161 | index.append((tag, i, j)) 162 | result = np.array(result) 163 | #print(result.shape) 164 | result = result.astype(np.double) 165 | index = np.array(index) 166 | return result, index 167 | 168 | def genDownsample(original_sample, rate): 169 | result = np.zeros(original_sample.shape).astype(float) 170 | for i in range(0, original_sample.shape[0]): 171 | for j in range(0, original_sample.shape[1]): 172 | for k in range(0, int(original_sample[i][j])): 173 | if (random.random() < rate): 174 | result[i][j] += 1 175 | return result 176 | 177 | 178 | if __name__ == "__main__": 179 | main() 180 | -------------------------------------------------------------------------------- /hicplus/utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangjuan001/hicplus/65d263906b3c62cd9e7f543229a38bed2bd52316/hicplus/utils.pyc -------------------------------------------------------------------------------- /model/GM_chr1-83900.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangjuan001/hicplus/65d263906b3c62cd9e7f543229a38bed2bd52316/model/GM_chr1-83900.model -------------------------------------------------------------------------------- /model/model3400.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangjuan001/hicplus/65d263906b3c62cd9e7f543229a38bed2bd52316/model/model3400.model -------------------------------------------------------------------------------- /model/pytorch_HindIII_model_40000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangjuan001/hicplus/65d263906b3c62cd9e7f543229a38bed2bd52316/model/pytorch_HindIII_model_40000 -------------------------------------------------------------------------------- /scripts/hicplus: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse, sys 3 | from hicplus import pred_chromosome, train_models, pred_genome 4 | def getargs(): 5 | ## Construct an ArgumentParser object for command-line arguments 6 | parser = argparse.ArgumentParser(description='''Train CNN model with Hi-C data and make predictions for low resolution HiC data with the model. 7 | ''', 8 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 9 | subparsers = parser.add_subparsers(dest='subcommands') 10 | subtrain = subparsers.add_parser('train', 11 | help='''Train CNN model per chromosome''') 12 | subtrain.set_defaults(func=train_models.main) 13 | subchrom = subparsers.add_parser('pred_chromosome', 14 | help='''predict high resolution interaction frequencies for inter and intra chromosomes''') 15 | subchrom.set_defaults(func=pred_chromosome.main) 16 | subgen = subparsers.add_parser('pred_genome', 17 | help='''predict high resolution interaction frequencies for genome''') 18 | subgen.set_defaults(func=pred_genome.main) 19 | 20 | #subgen = subparsers.add_parser('score_genome', 21 | # help='''prediction interaction frequencies for the whole genome''') 22 | #subgen.set_defaults(func=score_genome.main) 23 | 24 | subs=[subtrain,subchrom, subgen] 25 | #subpool.add_argument('-l','--lower', 26 | # help = 'Lower bound of report in bp (20000)', 27 | # type = int, default=20000) 28 | #subpool.add_argument('-u','--upper', 29 | # help = 'Upper bound of report in bp (300000)', 30 | # type = int, default=300000) 31 | for i in subs[:1]: 32 | i.add_argument('-i', '--inputfile', 33 | help = 'path to a .hic file.', type = str) 34 | i.add_argument('-r', '--scalerate', 35 | help = 'downsampling rate to generate the low resolution training file', 36 | type = int, default = 16) 37 | i.add_argument('-c', '--chromosome', 38 | help = 'choose one chromosome to do the model training.', 39 | type = int, default = 21) 40 | i.add_argument('-o', '--outmodel', 41 | help = 'output model name. default = model_epochnumber.model', 42 | type = str, default = 'model') 43 | i.add_argument('-l', '--log', 44 | help = 'output log file. default = train_log.txt', 45 | type = str, default = 'train_log' ) 46 | for i in subs[1:-1]: 47 | i.add_argument('-i', '--inputfile', 48 | help = 'path to a .hic file.', type = str) 49 | i.add_argument('-o', '--outputfile', 50 | help = 'path to an output file.', type = str) 51 | i.add_argument('-m', '--model', 52 | help = 'path to a model file.', type = str) 53 | i.add_argument('-b', '--binsize', 54 | help = 'predicted resolustion, e.g.10kb, 25kb..., default=10000', 55 | type = int, default = 10000) 56 | i.add_argument('-c','--chrN', nargs=2, metavar=('chrN1','chrN2'), 57 | type=str,required=True, help='chromosome number') 58 | 59 | 60 | for i in subs[2:]: 61 | i.add_argument('-i', '--inputfile', 62 | help = 'path to a .hic file.', type = str) 63 | i.add_argument('-m', '--model', 64 | help = 'path to a model file.', type = str) 65 | i.add_argument('-b', '--binsize', 66 | help = 'predicted resolustion, e.g.10kb, 25kb..., default=10000', 67 | type = int, default = 10000) 68 | 69 | 70 | ## Parse the command-line arguments 71 | commands = sys.argv[1:] 72 | if ((not commands) or ((commands[0] in ['train', 'pred_chromosome','pred_genome']) 73 | and len(commands) == 1)): 74 | commands.append('-h') 75 | args = parser.parse_args(commands) 76 | 77 | return args, commands 78 | 79 | 80 | def run(): 81 | # Parse Arguments 82 | args, commands = getargs() 83 | # Improve the performance if you don't want to run it 84 | if commands[0] not in ['-h','--help']: 85 | args.func(args) 86 | 87 | 88 | if __name__ == '__main__': 89 | run() 90 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setup script for hicplus. 3 | This is a free software under GPLv3. Therefore, you can modify, redistribute 4 | or even mix it with other GPL-compatible codes. See the file LICENSE 5 | included with the distribution for more details. 6 | """ 7 | import os, sys, hicplus, glob 8 | import setuptools 9 | 10 | if (sys.version_info.major!=3) or (sys.version_info.minor<6): 11 | print('PYTHON 3.5+ IS REQUIRED. YOU ARE CURRENTLY USING PYTHON {}'.format(sys.version.split()[0])) 12 | sys.exit(2) 13 | 14 | # Guarantee Unix Format 15 | for src in glob.glob('scripts/*'): 16 | text = open(src, 'r').read().replace('\r\n', '\n') 17 | open(src, 'w').write(text) 18 | 19 | setuptools.setup( 20 | name = 'hicplus', 21 | version = hicplus.__version__, 22 | packages = setuptools.find_packages(), 23 | scripts = glob.glob('scripts/*'), 24 | long_description = 'test description', 25 | classifiers = [ 26 | 'Programming Language :: Python :: 3.6', 27 | 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)', 28 | 'Operating System :: POSIX', 29 | 'Development Status :: 3 - Alpha', 30 | 'Intended Audience :: Science/Research', 31 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 32 | ] 33 | ) 34 | --------------------------------------------------------------------------------