├── DeepMod_tools ├── cal_EcoliDetPerf.py ├── generate_motif_pos.py ├── hm_cluster_predict.py └── sum_chr_mod.py ├── INSTALL ├── LICENSE ├── README.md ├── bin ├── DeepMod.py └── DeepMod_scripts │ ├── EventTable.py │ ├── MoveTable.py │ ├── __init__.py │ ├── myCom.py │ ├── myDetect.py │ ├── myGetFeatureBasedPos.py │ └── myMultiBiRNN.py ├── docs ├── Description of well-trained models.md ├── Install.md ├── Reproducibility.md ├── Results_explanation.md └── Usage.md ├── setup.py └── train_deepmod ├── na12878_cluster_train_mod-keep_prob0.7-nb25-chr1 ├── Cg.cov5.nb25.data-00000-of-00001 ├── Cg.cov5.nb25.index ├── Cg.cov5.nb25.meta └── checkpoint ├── rnn_conmodA_E1m2wd21_f7ne1u0_4 ├── checkpoint ├── mod_train_conmodA_E1m2wd21_f3ne1u0.data-00000-of-00001 ├── mod_train_conmodA_E1m2wd21_f3ne1u0.index └── mod_train_conmodA_E1m2wd21_f3ne1u0.meta ├── rnn_conmodA_P100wd21_f7ne1u0_4 ├── checkpoint ├── mod_train_conmodA_P100wd21_f3ne1u0.data-00000-of-00001 ├── mod_train_conmodA_P100wd21_f3ne1u0.index └── mod_train_conmodA_P100wd21_f3ne1u0.meta ├── rnn_conmodC_P100wd21_f7ne1u0_4 ├── checkpoint ├── mod_train_conmodC_P100wd21_f3ne1u0.data-00000-of-00001 ├── mod_train_conmodC_P100wd21_f3ne1u0.index └── mod_train_conmodC_P100wd21_f3ne1u0.meta ├── rnn_f7_wd21_chr1to10_4 ├── checkpoint ├── mod_train_f7_wd21_chr1to10.data-00000-of-00001 ├── mod_train_f7_wd21_chr1to10.index └── mod_train_f7_wd21_chr1to10.meta └── rnn_sinmodC_P100wd21_f7ne1u0_4 ├── checkpoint ├── mod_train_sinmodC_P100wd21_f3ne1u0.data-00000-of-00001 ├── mod_train_sinmodC_P100wd21_f3ne1u0.index └── mod_train_sinmodC_P100wd21_f3ne1u0.meta /DeepMod_tools/cal_EcoliDetPerf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, sys, time 4 | from collections import defaultdict 5 | import glob 6 | import copy 7 | import numpy as np 8 | 9 | import matplotlib 10 | matplotlib.use('Agg') 11 | import matplotlib.pyplot as plt 12 | from itertools import cycle 13 | 
14 | from sklearn.metrics import roc_curve, auc 15 | from sklearn.metrics import precision_recall_curve, average_precision_score 16 | from sklearn.metrics import matthews_corrcoef 17 | 18 | import rpy2.robjects as robjects 19 | from rpy2.robjects.packages import importr 20 | from pkg_resources import resource_string 21 | 22 | from scipy.stats import binom 23 | import copy 24 | 25 | ggplot = importr('ggplot2') 26 | importr('gridExtra') 27 | importr('plyr') 28 | 29 | na4com = {'A':'T', 'C':'G', 'T':'A', 'G':'C'} 30 | 31 | def readFA(mfa, mpat='Cg', mposinpat=0, t_chr=None, t_start=None, t_end=None): 32 | pos_dict = defaultdict(int) 33 | 34 | pat3 = copy.deepcopy(mpat.upper()) 35 | comp_pat3 = ''.join([na4com[curna] for curna in pat3][::-1]) 36 | comp_mposinpat = len(comp_pat3)-1-mposinpat 37 | 38 | fadict = defaultdict(); 39 | with open(mfa, 'r') as mr: 40 | cur_chr = None; 41 | 42 | line = mr.readline(); 43 | while line: 44 | line = line.strip(); 45 | if len(line)>0: 46 | if line[0]=='>': 47 | if not cur_chr==None: 48 | fadict[cur_chr] = ''.join(fadict[cur_chr]) 49 | cur_chr = line[1:].split()[0] 50 | if t_chr in [None, cur_chr]: 51 | fadict[cur_chr] = [] 52 | else: 53 | if t_chr in [None, cur_chr]: 54 | fadict[cur_chr].append(line) 55 | line = mr.readline(); 56 | if not cur_chr==None: 57 | fadict[cur_chr] = ''.join(fadict[cur_chr]) 58 | fakeys = fadict.keys(); 59 | cpgdict = defaultdict(int); cpgnum = [0, 0] 60 | for fak in fakeys: 61 | cpgdict[fak] = defaultdict() 62 | for i in range(len(fadict[fak])): 63 | if (t_start==None or i>=t_start) and (t_end==None or i<=t_end): 64 | if i-mposinpat>=0 and i+len(comp_pat3)-1-mposinpat=0 and i+len(comp_pat3)-1-comp_mposinpat0: 83 | lsp = line.split(); 84 | cur_chr = lsp[0]; 85 | cur_pos = int(lsp[1]); 86 | cur_strand = lsp[5]; 87 | 88 | cur_cov = int(lsp[9]); 89 | cur_m_p = int(lsp[10]); 90 | cur_m_c = int(lsp[11]); 91 | 92 | if not ((t_start==None or cur_pos>=t_start) and (t_end==None or cur_pos<=t_end)): 93 | line = 
mr.readline(); 94 | continue; 95 | 96 | if not (mna==lsp[3] and lsp[3]==(cpgdict[cur_chr][(cur_strand, cur_pos)][1] if cur_strand=='+' else na4com[cpgdict[cur_chr][(cur_strand, cur_pos)][1]])): 97 | print ('Error !! NA not equal %s == %s == %s %s' % (mna, lsp[3], cpgdict[cur_chr][(cur_strand, cur_pos)][1], modf)) 98 | 99 | if (cur_chr, cur_pos, cur_strand) not in pred_dict: 100 | pred_dict[(cur_chr, cur_pos, cur_strand)] = [cur_cov, cur_m_p, cur_m_c, lsp[3]] 101 | else: 102 | pred_dict[(cur_chr, cur_pos, cur_strand)][0] += cur_cov 103 | pred_dict[(cur_chr, cur_pos, cur_strand)][2] += cur_m_c 104 | pred_dict[(cur_chr, cur_pos, cur_strand)][1] = int(pred_dict[(cur_chr, cur_pos, cur_strand)][2]*100/pred_dict[(cur_chr, cur_pos, cur_strand)][0]) if pred_dict[(cur_chr, cur_pos, cur_strand)][0]>0 else 0 105 | 106 | def add_from_dict(cpgdict, pred_dict, label, pred_list, mna, tp_fp_tn_fn, mpat='Cg', mposinpat=0): 107 | for posk in pred_dict: 108 | cur_chr, cur_pos, cur_strand = posk 109 | cur_cov, cur_m_p, cur_m_c, lsp3 = pred_dict[posk] 110 | 111 | iscpg = False; 112 | if cpgdict[cur_chr][(cur_strand, cur_pos)][0]==1: 113 | iscpg = True; 114 | pred_list.append((label, cur_cov, cur_m_p, cur_m_c, mpat, np.log(binom.pmf(cur_m_c, cur_cov, 0.05)) )) 115 | if (lsp3==mpat[mposinpat]): pass 116 | else: print ('Error not methylated pos %s %s %s' % (mna, cur_strand)) 117 | if not iscpg: 118 | isclosec = False; 119 | for i in range(-3, 4): 120 | if (cur_strand, cur_pos+i) in cpgdict[cur_chr] and cpgdict[cur_chr][(cur_strand, cur_pos+i)][0]==1: 121 | isclosec = True; break; 122 | if lsp3==mpat[mposinpat]: 123 | pred_list.append((0, cur_cov, cur_m_p, cur_m_c, mpat+'_n'+str(abs(i))+mpat[mposinpat] if isclosec else 'Other'+mpat[mposinpat], np.log(binom.pmf(cur_m_c, cur_cov, 0.05)) )) 124 | else: 125 | pred_list.append((0, cur_cov, cur_m_p, cur_m_c, mpat+'_nb' if isclosec else 'Other', np.log(binom.pmf(cur_m_c, cur_cov, 0.05)) )) 126 | if pred_list[-1][0]==0: 127 | tp_fp_tn_fn[2] += 
cur_cov - cur_m_c 128 | tp_fp_tn_fn[1] += cur_m_c 129 | else: 130 | tp_fp_tn_fn[0] += cur_m_c 131 | tp_fp_tn_fn[3] += cur_cov - cur_m_c 132 | 133 | 134 | def readmodf(cpgdict, modf, label, pred_list, mna, tp_fp_tn_fn, mpat='Cg', mposinpat=0, t_start=None, t_end=None): 135 | with open(modf, 'r') as mr: 136 | line = mr.readline(); 137 | while line: 138 | line = line.strip(); 139 | if len(line)>0: 140 | lsp = line.split(); 141 | cur_chr = lsp[0]; 142 | cur_pos = int(lsp[1]); 143 | cur_strand = lsp[5]; 144 | 145 | cur_cov = int(lsp[9]); 146 | cur_m_p = int(lsp[10]); 147 | cur_m_c = int(lsp[11]); 148 | 149 | if not ((t_start==None or cur_pos>=t_start) and (t_end==None or cur_pos<=t_end)): 150 | line = mr.readline(); 151 | continue; 152 | 153 | if not (mna==lsp[3] and lsp[3]==(cpgdict[cur_chr][(cur_strand, cur_pos)][1] if cur_strand=='+' else na4com[cpgdict[cur_chr][(cur_strand, cur_pos)][1]])): 154 | print ('Error !! NA not equal %s == %s == %s %s' % (mna, lsp[3], cpgdict[cur_chr][(cur_strand, cur_pos)][1], modf)) 155 | iscpg = False; 156 | if cpgdict[cur_chr][(cur_strand, cur_pos)][0]==1: 157 | iscpg = True; 158 | pred_list.append((label, cur_cov, cur_m_p, cur_m_c, mpat, np.log(binom.pmf(cur_m_c, cur_cov, 0.05)) )) 159 | if (lsp[3]==mpat[mposinpat]): pass 160 | else: print ('Error not methylated pos %s %s %s' % (mna, cur_strand, modf)) 161 | if not iscpg: 162 | isclosec = False; 163 | for i in range(-3, 4): 164 | if (cur_strand, cur_pos+i) in cpgdict[cur_chr] and cpgdict[cur_chr][(cur_strand, cur_pos+i)][0]==1: 165 | isclosec = True; break; 166 | if lsp[3]==mpat[mposinpat]: 167 | pred_list.append((0, cur_cov, cur_m_p, cur_m_c, mpat+'_n'+str(abs(i))+mpat[mposinpat] if isclosec else 'Other'+mpat[mposinpat], np.log(binom.pmf(cur_m_c, cur_cov, 0.05)) )) 168 | else: 169 | pred_list.append((0, cur_cov, cur_m_p, cur_m_c, mpat+'_nb' if isclosec else 'Other', np.log(binom.pmf(cur_m_c, cur_cov, 0.05)) )) 170 | if pred_list[-1][0]==0: 171 | tp_fp_tn_fn[2] += cur_cov - cur_m_c 172 
| tp_fp_tn_fn[1] += cur_m_c 173 | else: 174 | tp_fp_tn_fn[0] += cur_m_c 175 | tp_fp_tn_fn[3] += cur_cov - cur_m_c 176 | line = mr.readline(); 177 | 178 | 179 | sssfolder = sys.argv[1]; # 180 | mreffile = sys.argv[2]; # 181 | mpat=sys.argv[3]; # Cg 182 | mposinpat=int(sys.argv[4]);# 0 183 | 184 | chrofinterest = sys.argv[5]; 185 | if chrofinterest=='': chrofinterest = None; 186 | stposofinterest = int(sys.argv[6]); 187 | if stposofinterest<0: stposofinterest = None; 188 | edposofinterest = int(sys.argv[7]); 189 | if edposofinterest<0: edposofinterest = None; 190 | 191 | basefig = sys.argv[8] 192 | hastwoclass = 1; 193 | 194 | sssfiles = {mpat[mposinpat]:glob.glob(os.path.join(sssfolder, 'mod_pos.*.'+mpat[mposinpat]+'.bed'))} 195 | sssfiles[mpat[mposinpat]].extend(glob.glob(os.path.join(sssfolder, '*/mod_pos.*.'+mpat[mposinpat]+'.bed'))) 196 | sssfiles[mpat[mposinpat]].extend(glob.glob(os.path.join(sssfolder, '*/*/mod_pos.*.'+mpat[mposinpat]+'.bed'))) 197 | print(str(len(sssfiles[mpat[mposinpat]])) + " " + str(sssfolder)) 198 | 199 | ## for negative; 200 | umrfiles = [] 201 | for cur_umr_f in sys.argv[9].split(','): 202 | if not os.path.isdir(cur_umr_f): 203 | print("No prediction folder {}".format(cur_umr_f)) 204 | sys.exit(1); 205 | umrfiles.extend(glob.glob(os.path.join(cur_umr_f, '*/*/mod_pos.*.'+mpat[mposinpat]+'.bed'))) 206 | umrfiles.extend(glob.glob(os.path.join(cur_umr_f, '*/mod_pos.*.'+mpat[mposinpat]+'.bed'))) 207 | umrfiles.extend(glob.glob(os.path.join(cur_umr_f, 'mod_pos.*.'+mpat[mposinpat]+'.bed'))) 208 | print(str(len(umrfiles)) + " " + str(sys.argv[9].split(','))) 209 | sys.stdout.flush() 210 | 211 | for sa in sssfiles: 212 | print (sa) 213 | for nf in sssfiles[sa]: 214 | print ('\t'+nf) 215 | 216 | cpgdict = readFA(mreffile, mpat, mposinpat, chrofinterest, stposofinterest, edposofinterest) 217 | 218 | pred_dict = defaultdict(); 219 | for modf in umrfiles: 220 | readmodf_dict(cpgdict, modf, pred_dict, mpat[mposinpat], stposofinterest, 
edposofinterest) 221 | 222 | baseinfo = [mpat, mpat+'_n1'+mpat[mposinpat], mpat+'_n2'+mpat[mposinpat], mpat+'_n3'+mpat[mposinpat], 'Other'+mpat[mposinpat], mpat+'_nb', 'Other'] 223 | 224 | classify_m = ['Methylation_Percentage'] 225 | classify_types = [baseinfo, [mpat]] 226 | filename = [['all_mp','motif_mp'] ] 227 | cov_thr = [1, 5] 228 | mlinestyle = {1:'bo-', 3:'gx--', 5:'r*-.', 7:'cs-', 10:'md--', 15:'k+-.'} 229 | 230 | pred_list = []; tp_fp_tn_fn = [0, 0, 0, 0] 231 | 232 | add_from_dict(cpgdict, pred_dict, 0, pred_list, mpat[mposinpat], tp_fp_tn_fn, mpat, mposinpat) 233 | 234 | if True: 235 | for na4 in sssfiles: 236 | for cur_f in sssfiles[na4]: 237 | print('%s %s' % (na4, cur_f)); sys.stdout.flush(); 238 | readmodf(cpgdict, cur_f, hastwoclass, pred_list, na4, tp_fp_tn_fn, mpat, mposinpat, stposofinterest, edposofinterest); 239 | pred_list = np.array(pred_list, dtype=[('Methylation', np.uint), ('Coverage', np.uint64), ('Methylation_Percentage', np.uint64), ('Methylation_Coverage', np.uint64), ('BaseInfo', 'U20'), ('logp', np.float64)]) 240 | 241 | if hastwoclass==1: 242 | cov_plot_thr = [1, 5] 243 | for ct_ind in range(len(classify_types)): 244 | ct = classify_types[ct_ind] 245 | cur_ct_data = pred_list[np.isin(pred_list['BaseInfo'], ct)] 246 | for cm_ind in range(len(classify_m)): 247 | print('basetype={} classify_measure={}'.format(ct, classify_m[cm_ind])) 248 | cm = classify_m[cm_ind] 249 | 250 | # 1 for roc, 2: pr; 251 | roc_or_pr = 2; roc_or_pr=0 252 | for roc_or_pr in range(1,3): 253 | if roc_or_pr>0: 254 | mfig= plt.figure() 255 | if roc_or_pr==2: 256 | cur_fn = basefig+'/ap_plot_met_pr_'+filename[cm_ind][ct_ind]+'.png' 257 | xylab = ['Recall', 'Precision']; leg_mpos = "lower left" 258 | for covt in cov_plot_thr: 259 | precision, recall, thresholds = precision_recall_curve(cur_ct_data['Methylation'][cur_ct_data['Coverage']>=covt], cur_ct_data[cm][cur_ct_data['Coverage']>=covt]) 260 | ap_pr = 
average_precision_score(cur_ct_data['Methylation'][cur_ct_data['Coverage']>=covt], cur_ct_data[cm][cur_ct_data['Coverage']>=covt]) 261 | plt.plot(recall, precision, mlinestyle[covt], lw=2, label='Coverage>=%d (AP=%0.3f)' % (covt, ap_pr)) 262 | print('\t\t %s %d ap=%.5f' % (cur_fn, covt, ap_pr)) 263 | elif roc_or_pr==1: 264 | xylab = ['False Positive Rate', 'True Positive Rate']; leg_mpos = "lower right" 265 | cur_fn = basefig+'/roc_plot_met_roc_'+filename[cm_ind][ct_ind]+'.png' 266 | prev = 0; prev_ind = -1 267 | for covt in cov_plot_thr: 268 | fpr, tpr, mthr = roc_curve(cur_ct_data['Methylation'][cur_ct_data['Coverage']>=covt], cur_ct_data[cm][cur_ct_data['Coverage']>=covt]) 269 | #print(','.join([str(np.round(t1, 5)) for t1 in mthr])) 270 | roc_auc = auc(fpr, tpr) 271 | if (not np.isnan(roc_auc)) and (abs(roc_auc - prev)>=0.02 or (covt>10 and abs(roc_auc - prev)>=0.005) or (cov_plot_thr.index(covt)-prev_ind>1 and abs(roc_auc - prev)>=0.005)): 272 | plt.plot(fpr, tpr, mlinestyle[covt], lw=2, label='Coverage>=%d (AUC=%0.3f)' % (covt, roc_auc)) 273 | prev = roc_auc; prev_ind = cov_plot_thr.index(covt) 274 | if not np.isnan(roc_auc): 275 | print ('\t\t %s %d %.7f' % (cur_fn, covt, roc_auc)) 276 | plt.plot([0, 1], [0, 1]) 277 | if roc_or_pr>0: 278 | plt.xlim([0.0, 1.0]); plt.ylim([0.0, 1.0]) 279 | plt.xlabel(xylab[0]); plt.ylabel(xylab[1]) 280 | plt.legend(loc=leg_mpos) 281 | mfig.savefig(cur_fn, dpi=300); plt.close(mfig) 282 | 283 | 284 | -------------------------------------------------------------------------------- /DeepMod_tools/generate_motif_pos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, sys, string, time 4 | from collections import defaultdict; 5 | import multiprocessing 6 | 7 | 8 | 9 | def read_genome(mfafile): 10 | ref_genome = defaultdict(); 11 | with open(mfafile, 'r') as mr: 12 | cur_chr = None; 13 | while True: 14 | line = mr.readline(); 15 | if not line: break; 16 | 
line = line.strip(); 17 | if len(line)==0: continue; 18 | if line[0]=='>': 19 | if not cur_chr==None: 20 | ref_genome[cur_chr] = ''.join(seqlist); 21 | cur_chr = line[1:].split()[0]; 22 | seqlist = [] 23 | else: 24 | seqlist.append(line.upper()) 25 | ref_genome[cur_chr] = ''.join(seqlist); 26 | print("Total chr: {}".format(len(ref_genome))); sys.stdout.flush() 27 | return ref_genome 28 | #ref_genome = read_genome(ref_fa); 29 | 30 | def handle_motif_pos(run_Q): 31 | handli = 0; 32 | while not run_Q.empty(): 33 | try: 34 | rgkey, ref_genome, res_folder, na_bp, curna, curmotif = run_Q.get(block=False) 35 | #print(rgkey, ref_genome, res_folder, na_bp, curna, curmotif); continue 36 | except: 37 | break; 38 | 39 | #curna_dict = defaultdict(); 40 | #curmotif_dict = defaultdict(); 41 | nafile = '%sna_%s_%s.bed' % (res_folder, rgkey, curna) 42 | motiffile = '%smotif_%s_%s.bed' % (res_folder, rgkey, curna) 43 | mw_na = open(nafile, 'w') 44 | mw_motif = open(motiffile, 'w') 45 | 46 | batchsize = 500000 47 | print("get motif for {}={}".format(rgkey, len(ref_genome))); sys.stdout.flush() 48 | cur_hi = 0; start_time = time.time(); 49 | for na_ind in range(len(ref_genome)): 50 | cur_hi += 1 51 | if cur_hi % batchsize == 0: 52 | print('\t time consuming ({})= {} {}'.format( rgkey, cur_hi, time.time() - start_time ) ) 53 | sys.stdout.flush() 54 | start_time = time.time() 55 | 56 | if (ref_genome[na_ind]==curna or na_bp[ref_genome[na_ind]]==curna): 57 | #curna_dict[(rgkey, na_ind, '+' if ref_genome[rgkey][na_ind]==curna else '-')] = True; 58 | mw_na.write('%s\t%s\t%s\n' % (rgkey, na_ind, '+' if ref_genome[na_ind]==curna else '-')) 59 | if ref_genome[na_ind]==curna and (not curmotif==None): 60 | for cur_mot in curmotif: 61 | is_mot = True; mot_ind = 0; 62 | for cur_sub_r_ind in range(na_ind - curmotif[cur_mot], na_ind + len(cur_mot) - curmotif[cur_mot] ): 63 | if cur_sub_r_ind<0 or cur_sub_r_ind>len(ref_genome)-1: 64 | is_mot = False; break; 65 | if not ref_genome[cur_sub_r_ind] == 
cur_mot[mot_ind]: 66 | is_mot = False; break; 67 | mot_ind += 1 68 | if is_mot: 69 | mw_motif.write('%s\t%s\t%s\n' % (rgkey, na_ind, '+')) 70 | mw_motif.write('%s\t%s\t%s\n' % (rgkey, na_ind+1, '-')) 71 | break; 72 | mw_na.close(); mw_motif.close() 73 | 74 | 75 | 76 | ref_fa = 'ref/hg38.fa' 77 | ref_fa = sys.argv[1] 78 | res_folder = 'genome.motif/C/' 79 | res_folder = sys.argv[2]+'/' 80 | if not os.path.isdir(res_folder): 81 | os.system('mkdir -p '+res_folder) 82 | 83 | curna='C' 84 | curmotif={'CG':0} 85 | curna = sys.argv[3]; 86 | curmotif = {sys.argv[4]:int(sys.argv[5])} 87 | 88 | if len(sys.argv)>6: 89 | chrkeys = ["chr%s" % cid for cid in sys.argv[6].split(',')] 90 | else: 91 | chrkeys = [] 92 | for i in range(1, 23): 93 | chrkeys.append("chr%d" % i) 94 | chrkeys.append("chrX") 95 | chrkeys.append("chrY") 96 | chrkeys.append("chrM") 97 | 98 | chrkeys = set(chrkeys) 99 | 100 | 101 | na_bp = {"A":"T", \ 102 | "C":"G", \ 103 | "G":"C", \ 104 | "T":"A", \ 105 | "a":"t", \ 106 | "c":"g", \ 107 | "g":"c", \ 108 | "t":"a", \ 109 | "N":"N", \ 110 | "n":"n" \ 111 | } 112 | 113 | 114 | ref_genome = read_genome(ref_fa); 115 | 116 | ############################## 117 | pmanager = multiprocessing.Manager(); 118 | run_Q = pmanager.Queue(); 119 | for curk in chrkeys: 120 | run_Q.put((curk, ref_genome[curk], res_folder, na_bp, curna, curmotif)) 121 | 122 | mhandlers = []; 123 | share_var = (run_Q, ) 124 | m_thread_num = len(chrkeys); 125 | for i in range(m_thread_num): 126 | p = multiprocessing.Process(target=handle_motif_pos, args=share_var) 127 | p.start(); 128 | mhandlers.append(p); 129 | while any(p.is_alive() for p in mhandlers): 130 | try: 131 | time.sleep(1); 132 | except: 133 | time.sleep(1); 134 | continue; 135 | 136 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /DeepMod_tools/hm_cluster_predict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python 2 | 3 | import os, sys, time 4 | from collections import defaultdict 5 | import glob 6 | 7 | import numpy as np 8 | 9 | from scipy import stats 10 | 11 | import locale 12 | locale.setlocale(locale.LC_ALL, 'en_US') 13 | 14 | import tensorflow as tf 15 | 16 | batch_size = 4096 17 | 18 | cov_thrd = 5 19 | 20 | def readBed(bedfile, t_chr=None, t_start=None, t_end=None): 21 | print('read {}'.format(bedfile)); sys.stdout.flush() 22 | beddict = defaultdict() 23 | with open(bedfile, 'r') as bedreader: 24 | start_time = time.time(); 25 | line = bedreader.readline(); 26 | while True: 27 | line = bedreader.readline(); 28 | if not line: break; 29 | 30 | line = line.strip(); 31 | if len(line)>20: 32 | mchr, start_pos, end_pos, _, _, m_strand, _, _, _, true_cov, meth_perc = line.split() 33 | start_pos = int(start_pos) 34 | true_cov = int(true_cov) 35 | if true_cov < cov_thrd: continue; 36 | meth_perc = round(int(meth_perc)/100.0, 3) 37 | if (t_chr not in [None, mchr]) or (not ((t_start==None or start_pos>=t_start) and (t_end==None or start_pos<=t_end))): 38 | continue; 39 | if true_cov==0: continue 40 | beddict[(mchr, m_strand, start_pos)] = meth_perc 41 | return beddict 42 | 43 | def readpredmod(predmodf, preddict, t_chr=None, t_start=None, t_end=None, cgCposdict=None): 44 | print('read {}'.format(predmodf)); sys.stdout.flush() 45 | with open(predmodf, 'r') as mr: 46 | while True: 47 | line = mr.readline(); 48 | if not line: break; 49 | line = line.strip(); 50 | if len(line)>0: 51 | lsp = line.split(); 52 | cur_chr = lsp[0]; 53 | cur_pos = int(lsp[1]); 54 | cur_strand = lsp[5]; 55 | 56 | if not (cgCposdict==None or (cur_chr, cur_strand, cur_pos) in cgCposdict): continue 57 | 58 | cur_cov = int(lsp[9]); 59 | cur_m_p = int(lsp[10]); 60 | cur_m_c = int(lsp[11]); 61 | if (t_chr not in [None, cur_chr]) or (not ((t_start==None or cur_pos>=t_start) and (t_end==None or cur_pos<=t_end))): 62 | continue; 63 | if cur_cov==0: continue; 64 | 65 | if (cur_chr, cur_strand, cur_pos) not 
in preddict: 66 | preddict[(cur_chr, cur_strand, cur_pos)] = [cur_cov, round(cur_m_p/100.0, 3), cur_m_c, line] 67 | else: 68 | print("Warning_duplicate {}".format(predmodf)) 69 | preddict[(cur_chr, cur_strand, cur_pos)][0] += cur_cov 70 | preddict[(cur_chr, cur_strand, cur_pos)][2] += cur_m_c 71 | if preddict[(cur_chr, cur_strand, cur_pos)][0]>0: 72 | preddict[(cur_chr, cur_strand, cur_pos)][1] = round(preddict[(cur_chr, cur_strand, cur_pos)][2]/float(preddict[(cur_chr, cur_strand, cur_pos)][0]), 3) 73 | 74 | 75 | 76 | pred_file = sys.argv[1]+'.%s.C.bed' 77 | save_file = sys.argv[1]+'_clusterCpG.%s.C.bed' 78 | gmotfolder = sys.argv[2] 79 | 80 | mpat = 'Cg'; mposinpat=0 81 | stposofinterest = None; edposofinterest = None; 82 | 83 | nbsize = 25; 84 | train_mod = 'DeepMod/train_mod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/{}.cov{}.nb{}'.format(mpat, cov_thrd, nbsize) 85 | 86 | chrkeys = [] 87 | for i in range(1, 23): 88 | chrkeys.append("chr%d" % i) 89 | chrkeys.append("chrX") 90 | chrkeys.append("chrY") 91 | chrkeys.append("chrM") 92 | 93 | 94 | new_saver = tf.train.import_meta_graph(train_mod+'.meta') 95 | print(new_saver); sys.stdout.flush() 96 | with tf.Session() as sess: 97 | print("restore model: {} {}".format(train_mod+'.meta', train_mod[:train_mod.rindex('/')+1])) 98 | print(new_saver.restore(sess,tf.train.latest_checkpoint(train_mod[:train_mod.rindex('/')+1]))); sys.stdout.flush() 99 | 100 | mgraph = tf.get_default_graph() 101 | output = mgraph.get_tensor_by_name('output:0') 102 | X = mgraph.get_tensor_by_name('X:0') 103 | keep_prob = mgraph.get_tensor_by_name('keep_prob:0') 104 | 105 | for chrofinterest in chrkeys: 106 | #read pred 107 | preddict = defaultdict() 108 | 109 | cur_cg_pos = '%s/motif_%s_C.bed' % (gmotfolder, chrofinterest) 110 | if not os.path.isfile(cur_cg_pos): 111 | print("Warning_motif!!! no file {}".format(cur_cg_pos)) 112 | continue; 113 | if not os.path.isfile(pred_file % chrofinterest): 114 | print("Warning_pred!!! 
no file {}".format(pred_file % chrofinterest)) 115 | continue; 116 | 117 | cgposdict = defaultdict(); 118 | with open(cur_cg_pos, 'r') as mr: 119 | while True: 120 | line = mr.readline(); 121 | if not line: break; 122 | lsp = line.split(); 123 | cgposdict[ (lsp[0], lsp[2], int(lsp[1]) ) ] = True 124 | print("{}: read {} done! ".format(len(cgposdict), cur_cg_pos)); sys.stdout.flush() 125 | readpredmod(pred_file % chrofinterest, preddict, chrofinterest, cgCposdict=cgposdict) 126 | print("size={} vs ".format(len(preddict), len(cgposdict) )); sys.stdout.flush() 127 | 128 | train_data = [] 129 | pdkeys = sorted(list( preddict.keys() )) 130 | for cspk in pdkeys: # preddict: 131 | if cspk not in cgposdict: 132 | print("not in cpg warning!!! {} {}".format(chrofinterest, cspk)) 133 | 134 | partner_pos = (cspk[0], '-' if cspk[1]=='+' else '+', cspk[2]+1 if cspk[1]=='+' else cspk[2]-1) 135 | cur_x = [preddict[cspk][1], preddict[partner_pos][1] if partner_pos in preddict else 0] 136 | for pdis in range(11): 137 | cur_x.append(0) 138 | cur_x.append(0) 139 | if len(train_data)<10: print("test") 140 | for rpos in range(cspk[2]-nbsize, cspk[2]+nbsize+1): 141 | if rpos in [cspk[2], partner_pos[2]]: continue; 142 | 143 | if (cspk[0], '+', rpos) in cgposdict and (cspk[0], '+', rpos) in preddict: 144 | cur_x[int(preddict[(cspk[0], '+', rpos)][1]/0.1+0.5) + 3] += 1 145 | cur_x[2] += 1 146 | if len(train_data)<10: print("\t\t{}: {}".format((cspk[0], '+', rpos), preddict[(cspk[0], '+', rpos)])) 147 | elif (cspk[0], '-', rpos) in cgposdict and (cspk[0], '-', rpos) in preddict: 148 | cur_x[int(preddict[(cspk[0], '-', rpos)][1]/0.1+0.5) + 3] += 1 149 | cur_x[2] += 1 150 | if len(train_data)<10: print("\t\t{}: {}".format((cspk[0], '-', rpos), preddict[(cspk[0], '-', rpos)])) 151 | for i in range(3, len(cur_x)): 152 | if cur_x[2]>0: cur_x[i] = round(cur_x[i]/float(cur_x[2]), 3) 153 | if len(train_data)<10: print('\t{}'.format(cur_x)); sys.stdout.flush() 154 | train_data.append(cur_x) 155 | 
156 | print("format data: data={}; {}".format(len(train_data), len(train_data[0]))); sys.stdout.flush() 157 | 158 | batch_data = np.array_split(train_data, int(len(train_data)/batch_size) if len(train_data)>batch_size else 2) 159 | m_pred_new_per = [] 160 | for i in range(len(batch_data)): 161 | moutp = sess.run([output], feed_dict={X:batch_data[i], keep_prob:1}) 162 | for mpind in moutp: 163 | for curpd in mpind: 164 | m_pred_new_per.append(curpd) 165 | print("new per: {}, {} {} {}".format(len(pdkeys), len(train_data), len(m_pred_new_per), curpd )) 166 | for wind in range(10): 167 | print("'{}' <{}> {}".format(preddict[pdkeys[wind]][-1], m_pred_new_per[wind], train_data[wind])) 168 | with open(save_file % chrofinterest, 'w') as mwriter: 169 | for wind in range(len(pdkeys)): 170 | mwriter.write("{} {}\n".format(preddict[pdkeys[wind]][-1], int(m_pred_new_per[wind]*100))) 171 | 172 | 173 | -------------------------------------------------------------------------------- /DeepMod_tools/sum_chr_mod.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, sys, string 4 | import multiprocessing 5 | import time; 6 | import glob 7 | from collections import defaultdict; 8 | 9 | 10 | def mprint(mstr): 11 | print(mstr); sys.stdout.flush() 12 | 13 | if len(sys.argv)<4: 14 | print ("Usage: python {} pred_folder-of-DeepMod Base-of-interest unique-fileid-in-sum-file [chr-list]".format(sys.argv[0])) 15 | print (" pred_folder-of-DeepMod: the prediction must in its sub-folder.") 16 | sys.exit(1) 17 | 18 | pred_folder = sys.argv[1] 19 | baseofint = sys.argv[2] 20 | sum_fileid = sys.argv[3] 21 | 22 | 23 | if len(sys.argv)>4: 24 | chrkeys = ["%s" % cid for cid in sys.argv[4].split(',')] 25 | else: 26 | chrkeys = [] 27 | for i in range(1, 23): 28 | chrkeys.append("chr%d" % i) 29 | chrkeys.append("chrX") 30 | chrkeys.append("chrY") 31 | chrkeys.append("chrM") 32 | 33 | chrkeys = set(chrkeys) 34 | 35 | 
##################################### 36 | def readbed2(bedf): 37 | cur_ad = defaultdict(); 38 | with open(bedf, 'r') as mr: 39 | while True: 40 | line = mr.readline(); 41 | if not line: break; 42 | line = line.strip(); 43 | lsp = line.split(); 44 | cur_ad[(lsp[0], int(lsp[1]), lsp[5])] = [int(lsp[9]), int(lsp[11])] 45 | return cur_ad 46 | 47 | def mergeMod(g_ad, cur_ad): 48 | for k in cur_ad: 49 | if k in g_ad: 50 | g_ad[k][0] += cur_ad[k][0] 51 | g_ad[k][1] += cur_ad[k][1] 52 | else: g_ad[k] = cur_ad[k] 53 | 54 | def save_mod(res_file, amod_dict, baseOfin): 55 | poskeys = sorted(list(amod_dict.keys())) 56 | for pk in poskeys: 57 | if amod_dict[pk][1]==0: del amod_dict[pk] 58 | 59 | poskeys = list(amod_dict.keys()) 60 | poskeys = sorted(poskeys); 61 | with open(res_file, 'w') as mw: 62 | for pk in poskeys: 63 | mw.write('%s %d %d %s %d %s %d %d 0,0,0 %d %d %d\n' % (pk[0], pk[1],pk[1]+1, baseOfin,amod_dict[pk][0] if amod_dict[pk][0]<1000 else 1000,pk[2], pk[1],pk[1]+1, amod_dict[pk][0], int(amod_dict[pk][1]*100/amod_dict[pk][0]) if amod_dict[pk][0]>0 else 0, amod_dict[pk][1] )) 64 | 65 | 66 | def sum_amod_handler(run_Q): 67 | handli = 0; 68 | while not run_Q.empty(): 69 | try: 70 | ck, pred_folder, baseOfin = run_Q.get(block=False) 71 | #print (ck, pred_folder, baseOfin); continue 72 | except: 73 | break; 74 | 75 | allbedfiles = glob.glob(os.path.join(pred_folder, ("*/*/*/*.%s-.%s.bed" % (ck, baseOfin)) )) 76 | allbedfiles.extend(glob.glob(os.path.join(pred_folder, ("*/*/*.%s-.%s.bed" % (ck, baseOfin)) ))) 77 | allbedfiles.extend(glob.glob(os.path.join(pred_folder, ("*/*.%s-.%s.bed" % (ck, baseOfin)) ))) 78 | mprint ("%s - %s: %d" % (ck, baseOfin, len(allbedfiles) )) 79 | allbedfiles.extend(glob.glob(os.path.join(pred_folder, ("*/*/*/*.%s+.%s.bed" % (ck, baseOfin)) ))) 80 | allbedfiles.extend(glob.glob(os.path.join(pred_folder, ("*/*/*.%s+.%s.bed" % (ck, baseOfin)) ))) 81 | allbedfiles.extend(glob.glob(os.path.join(pred_folder, ("*/*.%s+.%s.bed" % (ck, baseOfin)) 
))) 82 | mprint ("%s -+ %s: %d" % (ck, baseOfin, len(allbedfiles) )) 83 | 84 | # 0 1 2 3 4 5 6 7 8 9 0 1 85 | #chr1 949802 949803 T 1 - 949802 949803 0,0,0 1 0 0 86 | amod_dict = defaultdict(); 87 | res_file = "%s/%s.%s.%s.bed" % (pred_folder, sum_fileid, ck, baseOfin) 88 | for bedf_ind in range(len(allbedfiles)): 89 | mprint("\t %s %s %d/%d" % (ck, baseOfin, bedf_ind+1, len(allbedfiles))) 90 | cur_ad = readbed2(allbedfiles[bedf_ind]) 91 | mergeMod(amod_dict, cur_ad) 92 | 93 | save_mod(res_file, amod_dict, baseOfin) 94 | 95 | ############################## 96 | pmanager = multiprocessing.Manager(); 97 | run_Q = pmanager.Queue(); 98 | for ck in chrkeys: 99 | run_Q.put((ck, pred_folder, baseofint)) 100 | 101 | mhandlers = []; 102 | share_var = (run_Q, ) 103 | m_thread_num = len(chrkeys); 104 | for i in range(m_thread_num+1): 105 | p = multiprocessing.Process(target=sum_amod_handler, args=share_var) 106 | p.start(); 107 | mhandlers.append(p); 108 | while any(p.is_alive() for p in mhandlers): 109 | try: 110 | time.sleep(1); 111 | except: 112 | time.sleep(1); 113 | continue; 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | 2 | Please refer to [docs/Install.md](https://github.com/WGLab/DeepMod/blob/master/docs/Install.md) 3 | 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | DeepMod is available under GPLv3. 2 | 3 | Copyright (C) <2018> 4 | 5 | DeepMod (refer to as this tool below) is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation with either version 3 of the License, or later version. 
6 | 7 | This tool is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 8 | 9 | You should have received a copy of the GNU General Public License along with this tool. If not, please refer to <https://www.gnu.org/licenses/>. 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepMod: a deep-learning tool for genomic-scale, strand-sensitive and single-nucleotide based detection of DNA modifications 2 | 3 | ## Methodology of DeepMod 4 | 5 | DeepMod is a computational tool which takes long-read signals as input and outputs a modification summary for each genomic position in a reference genome, together with a modification prediction for each base in a long read. The modification prediction model in DeepMod is a well-trained bidirectional recurrent neural network (RNN) with long short-term memory (LSTM) units. LSTM RNN is a class of artificial neural network for modeling sequential behaviors, with LSTM to preclude the vanishing gradient problem. To detect DNA modifications, normalized signals of events in a long read are rescaled to between -5 and 5, and the signal mean, standard deviation and number of signals, together with base information (denoted as the 7-feature description), are obtained for each event as input of an LSTM unit with 100 hidden nodes. DeepMod uses 3 hidden layers in its RNN. The predicted modification summary for each position is generated in BED format, indicating how many reads cover each genomic position, how many mapped bases in long reads were predicted to be modified, and the coverage percentage of predicted modifications. This modification prediction by DeepMod is strand-sensitive and single-nucleotide based. 
6 | 7 | ### Inputs of DeepMod 8 | 9 | The input of DeepMod is Nanopore long read data together with a reference genome. 10 | 11 | Please note that the default model is trained on Metrichore basecalled data. While it has reasonable performance on Albacore v1 basecalled data, it should not be used in Albacore v2 (they require different sets of models) or any Guppy basecalled data, due to the differences in basecalling approaches. We tested the newly trained model on move table basecalled data with the guppy branch of this repository and have good performance. Therefore, in 2022, we released [DeepMod2](https://github.com/WGLab/DeepMod2) which supports two modes: a Guppy basecalling mode, and a Tombo-resquiggled mode. Please note that models will be flowcell specific; right now our models are trained on R9, but if you use an R10 flowcell then you need to wait until we release the R10 model. (UPDATE: in October 2022 we released R10 models, so DeepMod2 is now the only open source software that handles R10 flowcells.) If you use DeepMod in your research, please be mindful that different basecallers/flowcells can generate very different signal properties, so the correct model (rather than the default model) in DeepMod/DeepMod2 needs to be used for your specific data set. 12 | 13 | ## System Requirements 14 | ### Hardware requirements 15 | DeepMod is based on a deep learning framework, and needs to access the raw data of Nanopore sequencing. Thus, it needs enough RAM to support the deep learning framework and enough hard drive space for the raw data of Nanopore sequencing. A GPU can substantially speed up the detection process. For optimal performance, we recommend a computer with: 16 | * RAM: 20+ GB per thread 17 | * GPU or CPU with 8+ cores 18 | * HDD or better with SSD. 
Dependent on how large raw data is (for 30X E coli data, it might need 10+GB, while for 30X human data, it might need 10+TB) 19 | 20 | ### Software requirements 21 | The developmental version of DeepMod has been tested on Linux operating system: CentOS 7.0 with both CPU and GPU machines. 22 | 23 | ### Future improvement 24 | Now, DeepMod supports basecalled data with either event tables or move tables (***Due to confusion to users who do not know the difference between main branch and guppy branch, in 2022 we made the decision to just create a DeepMod2 repository solely for move tables generated by Guppy basecaller; therefore, please do NOT use DeepMod for Guppy-basecalled data!***). But it does not support multi-fast5. For multi-fast5 issue, one can use API at https://github.com/nanoporetech/ont_fast5_api to convert multi-fast5 to single fast5 file, and then re-basecall to get event information as input of DeepMod. We have been working on improvement of DeepMod to support multi-fast5. 25 | 26 | ## Installation 27 | Please refer to [Installation](https://github.com/WGLab/DeepMod/blob/master/docs/Install.md) for how to install DeepMod. 28 | 29 | ## Usage 30 | 31 | Please refer to [Usage](https://github.com/WGLab/DeepMod/blob/master/docs/Usage.md) for how to use DeepMod. 32 | 33 | ## Examples and Reproducibility of our analysis. 34 | 35 | Please refer to [Examples and Reproducibility](https://github.com/WGLab/DeepMod/blob/master/docs/Reproducibility.md) for examples of how to run DeepMod. 36 | 37 | ## Revision History 38 | 39 | For release history, please visit [here](https://github.com/WGLab/NanoDeepMod/releases). For details, please go [here](https://github.com/WGLab/DeepMod/blob/master/README.md). 40 | 41 | ## Contact 42 | 43 | If you have any questions/issues/bugs, please post them on [GitHub](https://github.com/WGLab/DeepMod/issues). They would also be helpful to other users. 
44 | 45 | ## Reference 46 | **Please cite the publication below if you use our tool:** 47 | 48 | Q. Liu, L. Fang, G. Yu, D. Wang, C. Xiao, K. Wang. Detection of DNA base modifications by deep recurrent neural network on Oxford Nanopore sequencing data. Nat. Commun 10, 2019. Online at https://www.nature.com/articles/s41467-019-10168-2. 49 | -------------------------------------------------------------------------------- /bin/DeepMod.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os; 4 | import sys; 5 | 6 | import string; 7 | 8 | from collections import defaultdict 9 | 10 | import argparse; 11 | from argparse import RawTextHelpFormatter 12 | 13 | from DeepMod_scripts.myCom import * 14 | 15 | 16 | 17 | # three modules in DeepMod 18 | parser = argparse.ArgumentParser(description="Detect nucleotide modification from nanopore signals data.", epilog="For example, \n \ 19 | \tpython %(prog)s train: Training a modification classifier.\n \ 20 | \tpython %(prog)s detect: Detect modification by integrating all long reads. \n \ 21 | \tpython %(prog)s getfeatures: Get features for training a model. \n \ 22 | ", formatter_class=RawTextHelpFormatter); 23 | 24 | 25 | # 26 | # Return error message when a value<1 27 | # Return an empty string otherwise 28 | # 29 | def non_negative(i, mstr): 30 | if i<1: return (("\n\tError %d could not be negative(%d)" % (mstr, i))) 31 | else: return '' 32 | 33 | # 34 | # Print all parameters in stdout 35 | # 36 | def printParameters(moptions): 37 | mpkeys = moptions.keys(); #mpkeys.sort() 38 | sorted(mpkeys) 39 | print('%30s: %s' % ('Current directory', os.getcwd())) 40 | for mpk in mpkeys: 41 | print ('%30s: %s' % (mpk, str(moptions[mpk]))) 42 | sys.stdout.flush() 43 | 44 | # 45 | # Got common argument provided by users or default values. 
46 | # 47 | # 48 | def mCommonParam(margs): 49 | 50 | ErrorMessage = "" 51 | moptions = defaultdict() 52 | # how to output running message: need more control now. 53 | moptions['outLevel'] = margs.outLevel 54 | # the input working base 55 | moptions["wrkBase"] = margs.wrkBase 56 | if moptions["wrkBase"]==None: 57 | ErrorMessage = ErrorMessage + ("\n\tThe input folder is None.") 58 | 59 | # An unique ID for output 60 | # Usefull for run the program in parallel 61 | moptions["FileID"] = margs.FileID 62 | # output folder; 63 | # make it if the output folder does not exist 64 | moptions['outFolder'] = margs.outFolder 65 | moptions['outFolder'] = format_last_letter_of_folder(moptions['outFolder']) 66 | if moptions['outFolder']==None or (not os.path.isdir(moptions['outFolder'])): 67 | try: 68 | os.system('mkdir -p '+moptions['outFolder']); 69 | except: 70 | ErrorMessage = ErrorMessage + ("\n\tThe output folder (%s) does not exist and cannot be created." % moptions['outFolder']) 71 | 72 | # check all data in a recurive way 73 | moptions['recursive'] = margs.recursive 74 | # the number of threads used and the number of files handled by each thread. 
75 | moptions['files_per_thread'] = margs.files_per_thread 76 | if moptions['files_per_thread']<2: moptions['files_per_thread'] = 2 77 | # the number of threads used 78 | moptions['threads'] = margs.threads 79 | if moptions['threads']<1: moptions['threads'] = 1 80 | 81 | # windowsize: default=21 82 | moptions['windowsize'] = margs.windowsize 83 | ErrorMessage = ErrorMessage + non_negative(moptions['windowsize'], 'windowsize') 84 | if moptions['windowsize']<1: moptions['windowsize'] = 1 85 | 86 | # aligners: bwa-mem or minimap2 87 | moptions['alignStr'] = margs.alignStr; 88 | 89 | moptions['SignalGroup'] = margs.SignalGroup; 90 | 91 | moptions['move'] = margs.move 92 | 93 | return [moptions, ErrorMessage] 94 | 95 | # 96 | # detect modification for bases of interests 97 | # input is a list of fast5 files, a reference genome and a well-trained model. 98 | # 99 | def mDetect(margs): 100 | # get common parameters 101 | moptions, ErrorMessage = mCommonParam(margs) 102 | 103 | # path for basecall information in fast5 files 104 | moptions['basecall_1d'] = margs.basecall_1d 105 | moptions['basecall_2strand'] = margs.basecall_2strand 106 | # Whether consider those chromosome which contain -_:/ 107 | # default: yes; 108 | moptions['ConUnk'] = margs.ConUnk 109 | # output layer information for deep learning 110 | moptions['outputlayer'] = margs.outputlayer 111 | # base of interest 112 | moptions['Base'] = margs.Base 113 | # whether take cluster effect of methylation into consideration 114 | moptions['mod_cluster'] = margs.mod_cluster 115 | # base of interest 116 | if moptions['Base'] in ["", None]: 117 | ErrorMessage = ErrorMessage + ("\n\t Please provide a base of interest.") 118 | 119 | # predict medification for bases of interest in long reads first 120 | # only summarize them for each genomic position of interest . 
121 | moptions['predDet'] = margs.predDet 122 | if moptions['predDet']: 123 | # path to reference genome 124 | moptions['Ref'] = margs.Ref 125 | if moptions['Ref']==None or (not os.path.isfile(moptions['Ref'])): 126 | ErrorMessage = ErrorMessage + ("\n\t reference file does not exist (%s)" % moptions['Ref']) 127 | 128 | # the number of feature for each event 129 | moptions['fnum'] = margs.fnum 130 | ErrorMessage = ErrorMessage + non_negative(moptions['fnum'], 'fnum') 131 | # the number of hidden nodes 132 | moptions['hidden'] = margs.hidden 133 | ErrorMessage = ErrorMessage + non_negative(moptions['hidden'], 'hidden') 134 | # the well-trained model 135 | moptions['modfile'] = margs.modfile 136 | if moptions['modfile']==None: 137 | print("No mod file is provided. The default one is used") 138 | moptions['modfile'] = ('train_deepmod/rnn_P90wd%d_f53/mod_train_P90wd%d_f53' % (moptions['windowsize'], moptions['windowsize'])) 139 | if (not os.path.isfile(moptions['modfile']+'.meta')): 140 | moptions['modfile'] = ('{}/lib/python{}.{}/site-packages/DeepMod/train_deepmod/rnn_P90wd{}_f53/mod_train_P90wd{}_f53'.format(sys.prefix,sys.version_info.major,sys.version_info.minor, moptions['windowsize'], moptions['windowsize'])) 141 | if (not os.path.isfile(moptions['modfile']+'.meta')): 142 | ErrorMessage = ErrorMessage + ("\n\tThe meta file (%s) does not exist" % (moptions['modfile']+'.meta' if not moptions['modfile']==None else "")) 143 | else: 144 | # already done the prediction process? 
145 | # Yes: summarize the results only 146 | moptions['predpath'] = margs.predpath 147 | if moptions['predpath']==None or (not os.path.isdir(moptions['predpath'])): 148 | ErrorMessage = ErrorMessage + ("\n\tThe predpath does not exist") 149 | 150 | # specify region of interest 151 | # not consider bases outside regions in a reference genome 152 | # None: all bases of interest 153 | moptions['region'] = [ ] 154 | if margs.region == None or len(margs.region)==0: 155 | moptions['region'].append([None, None, None]) 156 | else: 157 | mregionlist = margs.region.split(';') 158 | for mr in mregionlist: 159 | mr_sp = mr.split(':') 160 | moptions['region'].append([mr_sp[0], int(mr_sp[1]) if len(mr_sp)>1 else None, int(mr_sp[2]) if len(mr_sp)>2 else None ]) 161 | 162 | print("\nNanopore sequencing data analysis is resourece-intensive and time consuming. ") 163 | print("Some potential strong recommendations are below:") 164 | print("\tIf your reference genome is large as human genome and your Nanopore data is huge,") 165 | print("\tIt would be faster to run this program parallelly to speed up.") 166 | print("\tYou might run different input folders of your fast5 files and ") 167 | print("\tgive different output names (--FileID) or folders (--outFolder)") 168 | print("\tA good way for this is to run different chromosome individually.\n") 169 | 170 | # print help information if any necessary options are not provided. 171 | printParameters(moptions) 172 | if not ErrorMessage=="": 173 | ErrorMessage = "Please provide correct parameters" + ErrorMessage 174 | print(ErrorMessage) 175 | parser.print_help(); 176 | parser.parse_args(['detect', '--help']); 177 | sys.exit(1) 178 | 179 | from DeepMod_scripts import myDetect 180 | myDetect.mDetect_manager(moptions) 181 | 182 | # 183 | # Train a model 184 | # Need to get features first. 
185 | # 186 | def mTrain(margs): 187 | from DeepMod_scripts import myMultiBiRNN 188 | 189 | # gent common options 190 | moptions, ErrorMessage = mCommonParam(margs) 191 | 192 | # network setting: the number of features and the number of hidden nodes 193 | moptions['fnum'] = margs.fnum 194 | ErrorMessage = ErrorMessage + non_negative(moptions['fnum'], 'fnum') 195 | moptions['hidden'] = margs.hidden 196 | ErrorMessage = ErrorMessage + non_negative(moptions['hidden'], 'hidden') 197 | 198 | # the output function of the deep learning model 199 | moptions['outputlayer'] = margs.outputlayer 200 | # whether using different class weights 201 | moptions['unbalanced'] = margs.unbalanced 202 | 203 | # re-load trained model and continue to train 204 | moptions['modfile'] = margs.modfile 205 | if moptions['modfile']==None: pass; 206 | elif (not os.path.isfile(moptions['modfile']+'.meta')): 207 | ErrorMessage = ErrorMessage + ("\n\tThe meta file (%s) does not exist" % (moptions['modfile']+'.meta' if not moptions['modfile']==None else "")) 208 | 209 | # read-based or region based independent training 210 | # E: region-based 211 | # P: read-based. 212 | if not margs.test==None: 213 | moptions['test'] = margs.test.split(',') 214 | if moptions['test'][0] == 'E': moptions['test'][0] = '-' 215 | elif moptions['test'][0] == 'P': moptions['test'][0] = '0' 216 | else: 217 | ErrorMessage = ErrorMessage + "Unknown option for test: the first character must be E or P "+margs.test 218 | if moptions['test'][0] in ['-']: 219 | moptions['test'][1] = int(moptions['test'][1]) * (10**6) 220 | moptions['test'][2] = int(moptions['test'][2]) * (10**6) 221 | else: moptions['test'][1] = int(moptions['test'][1])/100.0 222 | else: moptions['test'] = ['N', '100'] 223 | 224 | # print help document if necessary options are not provided. 
225 | print("Train") 226 | printParameters(moptions) 227 | if not ErrorMessage=="": 228 | ErrorMessage = "Please provide correct parameters" + ErrorMessage 229 | print(ErrorMessage) 230 | parser.print_help(); 231 | parser.parse_args(['train', '--help']); 232 | sys.exit(2) 233 | 234 | myMultiBiRNN.mMult_RNN_LSTM_train(moptions) 235 | 236 | # 237 | # get features for training 238 | # 239 | # 240 | def mGetFeatures(margs): 241 | from DeepMod_scripts import myGetFeatureBasedPos 242 | 243 | # get common options 244 | moptions, ErrorMessage = mCommonParam(margs) 245 | # motif-based data: positive or negative control data 246 | moptions['posneg'] = margs.posneg 247 | # the number of features: 7-description or 57-description 248 | moptions['fnum'] = margs.fnum 249 | ErrorMessage = ErrorMessage + non_negative(moptions['fnum'], 'fnum') 250 | # size of each bacth to store features 251 | moptions['size_per_batch'] = margs.size_per_batch 252 | if moptions['size_per_batch'] < 0.001: moptions['size_per_batch'] = 0.001 253 | 254 | # path to basecall inform in fast5 files 255 | moptions['basecall_1d'] = margs.basecall_1d 256 | moptions['basecall_2strand'] = margs.basecall_2strand 257 | 258 | # regions of interest 259 | moptions['region'] = [None, None, None] 260 | if not (margs.region==None or margs.region.strip()==''): 261 | rsp = margs.region.split(':') 262 | for rv_ind in range(len(rsp)): 263 | rsp[rv_ind] = rsp[rv_ind].strip(); 264 | if not rsp[rv_ind]=='': 265 | moptions['region'][rv_ind] = rsp[rv_ind] 266 | 267 | # referene genome 268 | moptions['Ref'] = margs.Ref 269 | if moptions['Ref']==None or (not os.path.isfile(moptions['Ref'])): 270 | ErrorMessage = ErrorMessage + ("\n\t reference file does not exist (%s)" % moptions['Ref']) 271 | 272 | # get motif-based modification 273 | # or specify by --fulmod/--anymod/--nomod 274 | moptions['motifORPos'] = margs.motifORPos 275 | if margs.motifORPos==1: 276 | moptions['motif'] = [margs.motif.upper(), margs.ModinMotif] 277 | elif 
margs.motifORPos==2: 278 | moptions['fulmod'] = margs.fulmod 279 | if moptions['fulmod']==None: # completely modificated positions 280 | ErrorMessage = ErrorMessage + ("\t There is no parameter for --fulmod.") 281 | moptions['anymod'] = margs.anymod 282 | if moptions['anymod'] == None: # patially modificated positions 283 | ErrorMessage = ErrorMessage + ("\t There is no parameter for --anymod.") 284 | moptions['nomod'] = margs.nomod 285 | if moptions['nomod'] == None: # completely unmodified posisionts 286 | ErrorMessage = ErrorMessage + ("\t There is no parameter for --nomod.") 287 | else: 288 | ErrorMessage = ErrorMessage + ("\tmotifORPos value (%d) is not supported." % margs.motifORPos) 289 | 290 | # print help document if any required options are not provided. 291 | printParameters(moptions) 292 | if not ErrorMessage=="": 293 | ErrorMessage = "Please provide correct parameters" + ErrorMessage 294 | print(ErrorMessage) 295 | parser.print_help(); 296 | parser.parse_args(['getfeatures', '--help']); 297 | sys.exit(1) 298 | 299 | myGetFeatureBasedPos.getFeature_manager(moptions) 300 | 301 | 302 | ##################################################################################### 303 | 304 | subparsers = parser.add_subparsers() 305 | parent_parser = argparse.ArgumentParser(add_help=False) 306 | 307 | # add common options 308 | com_group_for_comparison = parent_parser.add_argument_group('Common options.') 309 | com_group_for_comparison.add_argument("--outLevel", type=int, choices=[OUTPUT_DEBUG, OUTPUT_INFO, OUTPUT_WARNING, OUTPUT_ERROR], default=OUTPUT_WARNING, help=("The level for output: %d for DEBUG, %d for INFO, %d for WARNING, %d for ERROR. 
Default: %d" % (OUTPUT_DEBUG, OUTPUT_INFO, OUTPUT_WARNING, OUTPUT_ERROR, OUTPUT_WARNING))) 310 | com_group_for_comparison.add_argument("--wrkBase", help="The base folder for FAST5 files.") 311 | com_group_for_comparison.add_argument("--FileID", default="mod", help="The unique string for intermediate files and final output files. Default: 'mod'") 312 | com_group_for_comparison.add_argument("--outFolder", default='./mod_output', help="The default folder for outputing the results. Default: ./mod_output") 313 | com_group_for_comparison.add_argument("--recursive", type=int, default=1, choices=[0,1], help="Recurise to find fast5 files. Default:1") 314 | com_group_for_comparison.add_argument("--threads", type=int, default=4, help="The number of threads used (not for train). Default:4") 315 | com_group_for_comparison.add_argument("--files_per_thread", type=int, default=1000, help="The number of fast5 files for each thread (not for train). Default:500") 316 | com_group_for_comparison.add_argument("--windowsize", type=int, default=21, help="The window size to extract features. Default: 21") 317 | com_group_for_comparison.add_argument("--alignStr", type=str, default='minimap2', choices=["bwa","minimap2"], help="Alignment tools (bwa or minimap2 is supported). Default: minimap2") 318 | com_group_for_comparison.add_argument("--SignalGroup", type=str, default='simple', choices=["simple","rundif"], help="How to associate signals to each called bases. Default: simple") 319 | com_group_for_comparison.add_argument("--move", default=False, action="store_true", help="Whether the basecalled data use move tables instead of event tables. 
Default: False") 320 | 321 | # add detection options 322 | parser_detect = subparsers.add_parser('detect', parents=[parent_parser], help="Detect modifications at a genomic scale", description="Detect modifications by integrating all long reads for a genome", epilog="For example, \n \ 323 | python %(prog)s --wrkBase ctrl_oligo_SpeI_cut --FileID mod_det --outFolder ./mod_output/detect3 \n \ 324 | ", formatter_class=RawTextHelpFormatter) 325 | parser_detect.add_argument("--Ref", help="The reference sequence") 326 | parser_detect.add_argument("--predDet", type=int, default=1, choices=[0,1], help="pred first and then detect (1) or only detect (0). Default: 1") 327 | parser_detect.add_argument("--predpath", default=None, help="The file path of predictions for each fast5 file. The file pattern is *_*.detail. Default: './mod_output/pred2/'") 328 | parser_detect.add_argument("--modfile", type=str, default=None, help="The path to load training model. Default: 'mod_output/'") 329 | parser_detect.add_argument("--fnum", type=int, default=7, help="The number of features. Default: 7") 330 | parser_detect.add_argument("--hidden", type=int, default=100, help="The number of hidden node. Default: 100") 331 | parser_detect.add_argument("--basecall_1d", default="Basecall_1D_000", help="Path for basecall_1d. Default: Basecall_1D_000") 332 | parser_detect.add_argument("--basecall_2strand", default="BaseCalled_template", help="Path for basecall_2strand. 
Default: BaseCalled_template") 333 | parser_detect.add_argument("--region", default=None, help="The region of interest: for example, chr:1:100000;chr2:10000"); 334 | parser_detect.add_argument("--ConUnk", default=True, choices=[False, True], help="Whether contain unknown chromosome"); 335 | parser_detect.add_argument("--outputlayer", default="", choices=["", "sigmoid"], help="how to put activation function for output layer") 336 | parser_detect.add_argument("--Base", type=str, default='C', choices=['A', 'C', 'G', 'T'], help="Interest of bases"); 337 | parser_detect.add_argument("--mod_cluster", default=0, choices=[0,1], help="1: CpG cluster effect; 0: not"); 338 | parser_detect.set_defaults(func=mDetect) 339 | 340 | # add training options 341 | parser_training = subparsers.add_parser('train', parents=[parent_parser], help="Training a modification classifier", description="Training a modification classifier", epilog="For example, \n \ 342 | python %(prog)s --wrkBase umr --wrkBase2 sss --FileID mod_train --outFolder ./mod_output/train1 \n \ 343 | ", formatter_class=RawTextHelpFormatter) 344 | parser_training.add_argument("--wrkBase2", help="The base folder for long reads without any modifications.") 345 | parser_training.add_argument("--fnum", type=int, default=7, help="The number of features. Default: 7") 346 | parser_training.add_argument("--hidden", type=int, default=100, help="The number of hidden node. Default: 100") 347 | parser_training.add_argument("--modfile", type=str, default=None, help="The path to load training model. Default: 'mod_output/'") 348 | parser_training.add_argument("--test", help="The number of E Coli genomic position for testing. 
Default: 'E,1,2'") 349 | parser_training.add_argument("--outputlayer", default="", choices=["", "sigmoid"], help="how to put activation function for output layer") 350 | parser_training.add_argument("--unbalanced", type=int, default=0, choices=[1, 0, None], help="Whether data is unbalanced"); 351 | parser_training.set_defaults(func=mTrain) 352 | 353 | # add get-feature options 354 | parser_getfeatures = subparsers.add_parser('getfeatures', parents=[parent_parser], help="Get features for all fast5 files", description="Get features for all fast5 files", epilog="For example, \n \ 355 | python %(prog)s --wrkBase umr/160617_ecolilowinput_UMR9/called/pass --threads 48 --recursive 0 --posneg 0 --outFolder umr \n \ 356 | python %(prog)s --wrkBase sss/160617_ecolilowinput_sssiR9/called/pass --threads 48 --recursive 0 --posneg 1 --outFolder sss \n \ 357 | ", formatter_class=RawTextHelpFormatter) 358 | parser_getfeatures.add_argument("--posneg", type=int, default=0, choices=[0,1], help="The positive(1) or negative(0) class. Default: 0") 359 | parser_getfeatures.add_argument("--size_per_batch", type=int, default=1, help="The size (unit: 10^7=10M) of a feature file. Default: 1") 360 | parser_getfeatures.add_argument("--fnum", type=int, default=7, help="The number of features. Default: 7") 361 | parser_getfeatures.add_argument("--region", type=str, help="The region of interest. Set to None or empty for all. Format is chr:start_pos:end_pos") 362 | parser_getfeatures.add_argument("--basecall_1d", default="Basecall_1D_000", help="Path for basecall_1d. Default: Basecall_1D_000") 363 | parser_getfeatures.add_argument("--basecall_2strand", default="BaseCalled_template", help="Path for basecall_2strand. Default: BaseCalled_template") 364 | 365 | parser_getfeatures.add_argument("--motifORPos", type=int, default=1, help="Use Motif (1) or pos (2) for modified bases. 
Default: 1") 366 | 367 | parser_getfeatures.add_argument("--motif", default='CG', type=str, help="The motif of interest") 368 | parser_getfeatures.add_argument("--ModinMotif", default=0, type=int, help="The position of modified base in the motif of interest") 369 | parser_getfeatures.add_argument("--Ref", help="The reference sequence") 370 | 371 | parser_getfeatures.add_argument("--fulmod", type=str, help="The file pattern for full modification: bisultfiteseq/chr20_C*_0.95.txt") 372 | parser_getfeatures.add_argument("--anymod", type=str, help="The file pattern for any modification: bisultfiteseq/chr20_any_0.95.txt") 373 | parser_getfeatures.add_argument("--nomod", type=str, help="The file pattern for any modification: bisultfiteseq/chr20_no1_0.95.txt") 374 | 375 | parser_getfeatures.set_defaults(func=mGetFeatures) 376 | 377 | # no provided argument 378 | # print help document 379 | if len(sys.argv)<2: 380 | parser.print_help(); 381 | else: 382 | args = parser.parse_args() 383 | args.func(args); 384 | -------------------------------------------------------------------------------- /bin/DeepMod_scripts/EventTable.py: -------------------------------------------------------------------------------- 1 | 2 | import os,sys 3 | import numpy as np 4 | import h5py 5 | 6 | 7 | def get_extreme_N(m_signal_dif, n_splits, p_signal_start, p_signal_end, moptions, sp_param): 8 | cu_region_sort_pos = m_signal_dif[int(p_signal_start-sp_param['min_signal_num']+0.5):int(p_signal_end-sp_param['min_signal_num']+0.5)].argsort()[::-1]+p_signal_start; 9 | m_nb_pos = set(); 10 | # print n_splits, type(n_splits), p_signal_start, type(p_signal_start), p_signal_end, type(p_signal_end), sp_param['min_signal_num'], type( sp_param['min_signal_num']), type(p_signal_start+sp_param['min_signal_num']-1) 11 | m_nb_pos.update(range(p_signal_start, int(p_signal_start+sp_param['min_signal_num']-0.5))); 12 | m_nb_pos.update(range(int(p_signal_end-sp_param['min_signal_num']+1.5), p_signal_end)); 13 | 
split_points_list = [] 14 | for c_pos in cu_region_sort_pos: 15 | if c_pos not in m_nb_pos: 16 | split_points_list.append(c_pos); 17 | if (len(split_points_list)==n_splits): break; 18 | m_nb_pos.update(range(c_pos-sp_param['min_signal_num']+1, c_pos+sp_param['min_signal_num']+1)); 19 | return sorted(split_points_list); 20 | 21 | def getEvent_Info(moptions, sp_param, events_data): 22 | event_info = [] 23 | sp_param['min_signal_num'] = 4; 24 | 25 | signal_sum = np.cumsum(np.insert(np.round(sp_param['raw_signals']/50.0,5), 0, 0)); 26 | m_signal_dif = np.abs(signal_sum[sp_param['min_signal_num']:-sp_param['min_signal_num']]*2 - signal_sum[:-2*sp_param['min_signal_num']] - signal_sum[2*sp_param['min_signal_num']:]) 27 | #print (sp_param['raw_signals'][:20]); 28 | #print (np.round(sp_param['raw_signals']/50.0,5)[:20]); 29 | #print (signal_sum[:20]); 30 | #print (m_signal_dif[:20]) 31 | # sp_param['fq_seq'] = fq_data[1] 32 | last_ev_i = 0; 33 | last_signal_i = events_data[0]['start']; 34 | fq_seq_i = 2; 35 | c_move_num = 1 36 | incrrt_event_list = [] 37 | for ev_i in range(1, len(events_data)): 38 | if (events_data['move'][ev_i])==0: 39 | pass; 40 | else: 41 | c_move_num += events_data['move'][ev_i] 42 | split_points = get_extreme_N(m_signal_dif, c_move_num-1, last_signal_i, events_data[ev_i]['start']+events_data[ev_i]['length'], moptions, sp_param); 43 | #print c_move_num-1, last_signal_i, ev_i, events_data[ev_i]['start']+events_data[ev_i]['length'], split_points 44 | #for s_i in range(last_signal_i, events_data[ev_i]['start']+events_data[ev_i]['length']): 45 | # if s_i in split_points: 46 | # print '|', 47 | # print sp_param['raw_signals'][s_i], 48 | #print ''; 49 | for c_m_i in range(c_move_num-1): 50 | if c_m_i < len(split_points): 51 | h_m_i = c_m_i; 52 | c_e_p = split_points[h_m_i] 53 | else: 54 | h_m_i = len(split_points)-1 55 | c_e_p = last_signal_i + sp_param['min_signal_num'] 56 | incrrt_event_list.append(len(event_info)); 57 | 58 | c_mnn = 
np.mean(sp_param['raw_signals'][last_signal_i:c_e_p]); 59 | c_std = np.std(sp_param['raw_signals'][last_signal_i:c_e_p]); 60 | c_start = last_signal_i; 61 | c_length = c_e_p - last_signal_i; 62 | c_mode = sp_param['fq_seq'][fq_seq_i-2:fq_seq_i+3]; 63 | event_info.append((c_mnn, c_std, c_start, c_length, c_mode)) 64 | 65 | last_signal_i = split_points[h_m_i] 66 | fq_seq_i += 1; 67 | 68 | c_move_num = 1; 69 | ev_i = len(events_data)-1 70 | c_e_p = events_data[ev_i]['start'] + events_data[ev_i]['length'] 71 | c_mnn = np.mean(sp_param['raw_signals'][last_signal_i:c_e_p]); 72 | c_std = np.std(sp_param['raw_signals'][last_signal_i:c_e_p]); 73 | c_start = last_signal_i; 74 | c_length = c_e_p - last_signal_i; 75 | c_mode = sp_param['fq_seq'][fq_seq_i-2:fq_seq_i+3]; 76 | event_info.append((c_mnn, c_std, c_start, c_length, c_mode)) 77 | 78 | event_info = np.array(event_info, dtype=[('mean', '0): 105 | # print ("%d/%s %d-%d vs %d-%d %s=%s%s" % (ev_i, ei_i,events_data[ev_i]['start'], events_data[ev_i]['start']+events_data[ev_i]['length'], event_info[ei_i]['start'], event_info[ei_i]['start']+event_info[ei_i]['length'], events_data[ev_i]['model_state'][2],event_info[ei_i]['model_state'][2],sp_param['fq_seq'][ei_i+2])) 106 | # ei_i += events_data[ev_i]['move'] 107 | 108 | return event_info 109 | 110 | 111 | if __name__=='__main__': 112 | moptions = {} 113 | sp_param = {} 114 | 115 | exple_data = ['/home/liuq1/project/DeepNanoRepeat/scripts/fortest/f6343e53-9454-41ae-8398-7be6e1b7557d.fast5', \ 116 | 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/000a7916-373c-4cc3-a3f2-6bed205b09cb.fast5', \ 117 | 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/00264c38-4945-4263-ae0d-253e6c6a39ba.fast5', \ 118 | 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/0039f109-46ac-4a81-883d-b55900924dd4.fast5', \ 119 | 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/0045bf1d-d7be-44b1-9b6c-9bb76a634e0f.fast5' \ 120 | ] 121 | 122 | 
sp_param['f5reader'] = h5py.File(sys.argv[1] if len(sys.argv)>1 else exple_data[0], 'r'); 123 | 124 | fq_str = '/Analyses/Basecall_1D_000/BaseCalled_template/Fastq' 125 | ev_str = '/Analyses/Basecall_1D_000/BaseCalled_template/Events' 126 | fq_str = '/Analyses/Basecall_1D_001/BaseCalled_template/Fastq' 127 | ev_str = '/Analyses/Basecall_1D_001/BaseCalled_template/Events' 128 | sg_str = '/Raw/Reads/' 129 | 130 | sp_param['fq_seq'] = sp_param['f5reader'][fq_str][()].split('\n')[1]; 131 | sp_param['raw_signals'] = sp_param['f5reader'][sg_str].values()[0]['Signal'].value 132 | events_data = sp_param['f5reader'][ev_str].value; 133 | 134 | getEvent_Info(moptions, sp_param, events_data) 135 | 136 | sp_param['f5reader'].close(); 137 | 138 | -------------------------------------------------------------------------------- /bin/DeepMod_scripts/MoveTable.py: -------------------------------------------------------------------------------- 1 | 2 | import os,sys 3 | import numpy as np 4 | import h5py 5 | 6 | 7 | def getMove_Info(moptions, sp_param, move_data): 8 | ''' 9 | sp_param.keys: fq_seq, raw_signals, first_sample_template, duration_template 10 | ''' 11 | 12 | #sp_param['first_sample_template'] = sp_param['f5reader']['/Analyses/Segmentation_001/Summary/segmentation'].attrs['first_sample_template'] 13 | #sp_param['duration_template'] = sp_param['f5reader']['/Analyses/Segmentation_001/Summary/segmentation'].attrs['duration_template'] 14 | 15 | seg = "Segmentation_" + moptions['basecall_1d'].split('_')[-1] 16 | attr_path = '/'.join(['', 'Analyses', seg, 'Summary', 'segmentation']) 17 | #mv_str = '/'.join(['', 'Analyses', moptions['basecall_1d'], moptions['basecall_2strand'], 'Move']) 18 | sp_param['first_sample_template'] = sp_param['f5reader'][attr_path].attrs['first_sample_template'] 19 | sp_param['duration_template'] = sp_param['f5reader'][attr_path].attrs['duration_template'] 20 | #move_data = sp_param['f5reader'][mv_str][()] 21 | nrow = len(sp_param['fq_seq']) # row 
number of event_info; equals to the base number 22 | nsig = len(sp_param['raw_signals']) 23 | first = int(sp_param['first_sample_template']) 24 | duration = int(sp_param['duration_template']) 25 | move_info = np.empty([nrow], dtype=[('mean', '', f5k, '\n', f5data[f5k][0], '\n'])) 38 | temp_fa.flush(); 39 | if moptions['outLevel']<=myCom.OUTPUT_DEBUG: 40 | end_time = time.time(); 41 | print ("Write consuming time %d" % (end_time-start_time)) 42 | 43 | # alignment using bwa-mem or minimap2 44 | temp_sam = tempfile.NamedTemporaryFile() 45 | if moptions['alignStr']=='bwa': 46 | cmd_opt = ['mem', '-x', 'ont2d', '-v', '1', '-t', '1', moptions['Ref'], temp_fa.name] 47 | else: 48 | cmd_opt = ['-ax', 'map-ont', moptions['Ref'], temp_fa.name] 49 | returncode = subprocess.call([moptions['alignStr'],]+cmd_opt, stdout=temp_sam) 50 | if not returncode==0: 51 | print ('Fatal Error!!! returncode is non-zero(%d) for "%s"' % (returncode, curcmd)) 52 | errkey = "Cannot running aligment" 53 | for f5k in f5keys: 54 | sp_options["Error"][errkey].append(f5data[f5k][3]) 55 | return; 56 | 57 | temp_fa.close(); 58 | temp_sam.seek(0); 59 | # get sam information 60 | align_info = temp_sam.readlines() 61 | align_info = [str(align_info[i], 'utf-8').strip() for i in range(len(align_info))] 62 | temp_sam.close(); 63 | 64 | sp_param = defaultdict(); 65 | sp_param['f5data'] = f5data 66 | 67 | # for alignment 68 | f5align = defaultdict() 69 | f5keydict = defaultdict(); 70 | sp_param['ref_info'] = defaultdict() 71 | 72 | if moptions['outLevel']<=myCom.OUTPUT_DEBUG:start_time = time.time(); 73 | ilid = 0; 74 | # for each record in sam, get alignment information 75 | while ilid < len(align_info): 76 | if len(align_info[ilid])==0 or align_info[ilid][0]=='@': 77 | ilid += 1 78 | continue; 79 | 80 | sp_param['f5status'] = ""; 81 | sp_param['line'] = align_info[ilid] 82 | qname = handle_line(moptions, sp_param, f5align) 83 | if sp_param['f5status'] == "": 84 | f5keydict[qname] = True; 85 | ilid += 1 86 | 
87 | # for unmapped reads 88 | for f5k in f5keys: 89 | if f5k not in f5keydict: 90 | sp_options["Error"]["Not in alignment sam"].append(f5data[f5k][3]) 91 | 92 | if moptions['outLevel']<=myCom.OUTPUT_DEBUG: 93 | end_time = time.time(); 94 | print ("Get BAM consuming time %d" % (end_time-start_time)) 95 | 96 | sp_param['f5status']= "" 97 | sp_param['line'] = "" 98 | if moptions['outLevel']<=myCom.OUTPUT_DEBUG:start_time = time.time(); 99 | # handle each alignment 100 | handle_record(moptions, sp_options, sp_param, f5align, f5data) 101 | if moptions['outLevel']<=myCom.OUTPUT_DEBUG: 102 | end_time = time.time(); 103 | print ("Analyze & annotate & save consuming time %d" % (end_time-start_time)) 104 | 105 | # 106 | # get mapping information 107 | # then call another function to get feature of each base in a long read 108 | # 109 | def handle_record(moptions, sp_options, sp_param, f5align, f5data): 110 | alignkeys = list(f5align.keys()); 111 | # alignment detail 112 | numreg = re.compile('\d+') 113 | mdireg = re.compile('[MIDNSHPX=]{1}') 114 | 115 | feat_file_ind_dict = [] 116 | feat_list = None; feat_file_ind = 0 117 | start_c_time = time.time(); 118 | 119 | for readk in alignkeys: 120 | if len(feat_file_ind_dict)>0 and feat_list.nbytes > moptions['size_per_batch']: 121 | # save features when the size is larger than the defined size 122 | cur_feat_file_base = sp_options['ctfolder'] + '/'+str(feat_file_ind) 123 | np.savetxt(cur_feat_file_base+'.xy.gz', feat_list, fmt='%.3f') 124 | with open(cur_feat_file_base+'.xy.ind', 'w') as ind_mw: 125 | for f_ind in feat_file_ind_dict: 126 | ind_mw.write('%d %s\n' % (f_ind[1], f_ind[0])) 127 | print ("\t%s-%d Total consuming time %d" % (sp_options['ctfolder'][sp_options['ctfolder'].rfind('/'):], feat_file_ind, time.time()-start_c_time)); sys.stdout.flush() 128 | feat_file_ind_dict = [] 129 | feat_list = None; 130 | feat_file_ind += 1 131 | 132 | # get alignment detail 133 | mapq, flag, rname, pos, cigar, readseq = f5align[readk] 
134 | 135 | if not ( (rname in moptions['fulmodlist'] and len(moptions['fulmodlist'][rname])>0) or \ 136 | ((not moptions['anymodlist']==None) and rname in moptions['anymodlist'] and len(moptions['anymodlist'][rname])>0) or \ 137 | ((not moptions['nomodlist']==None) and rname in moptions['nomodlist'] and len(moptions['nomodlist'][rname])>0) ): 138 | continue; 139 | 140 | # get reference sequece 141 | if rname not in sp_param['ref_info']: 142 | myDetect.getRefSeq(moptions, sp_param, rname) 143 | refseq = sp_param['ref_info'][rname] 144 | 145 | # mapped starting position and strand 146 | pos = pos - 1 147 | forward_reverse = '-' if flag&0x10 else '+' 148 | 149 | numinfo = numreg.findall(cigar); 150 | mdiinfo = mdireg.findall(cigar) 151 | numinfo = [int(numinfo[i]) for i in range(len(numinfo))] #map(int, numinfo) 152 | 153 | # remove clip from both tails 154 | leftclip = 0; rightclip = 0; 155 | while mdiinfo[0] in ['I', 'D', 'N', 'S', 'H', 'P', 'X']: 156 | if mdiinfo[0] in ['I', 'S', 'X']: 157 | leftclip += numinfo[0]; readseq = readseq[numinfo[0]:] 158 | if mdiinfo[0] in ['H']: leftclip += numinfo[0] 159 | if mdiinfo[0] in ['D', 'N', 'X']: 160 | pos += numinfo[0] 161 | numinfo = numinfo[1:]; mdiinfo = mdiinfo[1:] 162 | while mdiinfo[-1] in ['I', 'D', 'N', 'S', 'H', 'P', 'X']: 163 | if mdiinfo[-1] in ['I', 'S', 'X']: 164 | rightclip += numinfo[-1]; readseq = readseq[:-numinfo[-1]] 165 | if mdiinfo[-1] in ['H']: rightclip += numinfo[-1] 166 | numinfo = numinfo[:-1]; mdiinfo = mdiinfo[:-1] 167 | if forward_reverse=='+': 168 | if rightclip>0: m_event = f5data[readk][1][leftclip:-rightclip] 169 | else: m_event = f5data[readk][1][leftclip:] 170 | else: 171 | if leftclip>0: m_event = f5data[readk][1][rightclip:-leftclip] 172 | else: m_event = f5data[readk][1][rightclip:] 173 | 174 | # is in region of interest if provided 175 | isinreg = False; 176 | if (moptions['region'][0] in ['', None, rname]) and \ 177 | (moptions['region'][1] in ['', None] or pos>moptions['region'][1]) 
and \ 178 | (moptions['region'][2] in ['', None] or pos+len(m_event)1: rightclip += len(m_event)-lastmatch-1 255 | # remove events whose bases are not mapped. 256 | if forward_reverse=='+': 257 | if len(m_event)-lastmatch>1: 258 | m_event = m_event[firstmatch:(lastmatch+1-len(m_event))] 259 | elif firstmatch>0: m_event = m_event[firstmatch:] 260 | else: 261 | if firstmatch>0: m_event = m_event[(len(m_event)-1-lastmatch):-firstmatch] 262 | elif len(m_event)-lastmatch>1: m_event = m_event[(len(m_event)-1-lastmatch):] 263 | # print detail if unexpected error occurs 264 | if firstmatch>0 or len(base_map_info)-last_al_match>1: 265 | if moptions['outLevel']<=myCom.OUTPUT_WARNING and ((firstmatch>0) or (len(base_map_info)-last_al_match>1 and refseq[last_match_pos+1] not in ['N'])): 266 | print ("Errorfast5"+f5data[readk][3]) 267 | print ('Warning!!! first not match', firstmatch, lastmatch, first_al_match, last_al_match, len(base_map_info), numinfo[lasmtind-2:(lasmtind+5)], mdiinfo[lasmtind-2:(lasmtind+5)], lasmtind, len(numinfo)) 268 | print('\tref='+refseq[last_match_pos:last_match_pos+20]+"\n\tred="+readseq[lastmatch:lastmatch+20]) 269 | if firstmatch>0: 270 | print('\tref='+refseq[(first_match_pos-20 if first_match_pos-20>0 else 0):first_match_pos]+"\n\tred="+readseq[(firstmatch-20 if firstmatch-20>0 else 0):firstmatch]) 271 | print('\tf=%d, chr=%s, p=%d, c=%s, s=%s' % (flag, rname, pos, cigar, readseq)) # flag, rname, pos, cigar, readseq 272 | 273 | if len(base_map_info)-last_al_match>1: 274 | base_map_info = base_map_info[first_al_match:(last_al_match+1-len(base_map_info))] 275 | elif first_al_match>0: 276 | base_map_info = base_map_info[first_al_match:] 277 | 278 | # post-process mapping information 279 | base_map_info = np.array(base_map_info, dtype=[('refbase', 'U1'), ('readbase', 'U1'), ('refbasei', np.uint64), ('readbasei', np.uint64)]) 280 | if forward_reverse=='-': 281 | base_map_info = np.flipud(base_map_info) 282 | for bmii in range(len(base_map_info)): 283 
| base_map_info['refbase'][bmii] = get_complement(base_map_info['refbase'][bmii]) 284 | base_map_info['readbase'][bmii] = get_complement(base_map_info['readbase'][bmii]) 285 | leftclip, rightclip = rightclip, leftclip 286 | if False: #True: # for test base_map_info ### for check consistency 287 | ref_align_key = '/Analyses/NanomoCorrected_000/BaseCalled_template/Alignment/genome_alignment' 288 | read_align_key = '/Analyses/NanomoCorrected_000/BaseCalled_template/Alignment/read_alignment' 289 | with h5py.File(f5data[readk][3], 'r') as mf5: 290 | read_align_list = [bt.decode(encoding="utf-8") for bt in mf5[read_align_key]] 291 | ref_align_list = [bt.decode(encoding="utf-8") for bt in mf5[ref_align_key]] 292 | for rali in range(len(read_align_list)): 293 | if not read_align_list[rali]==base_map_info['readbase'][rali]: 294 | print ("Error not equal1! %s %s %d %s" % (read_align_list[rali], base_map_info['readbase'][rali], rali, f5data[readk][3])) 295 | if not ref_align_list[rali]==base_map_info['refbase'][rali]: 296 | print ("Error not equal2! 
%s %s %d %s" % (ref_align_list[rali], base_map_info['refbase'][rali], rali, f5data[readk][3])) 297 | # 298 | # handle map like 299 | # CCG or CGG 300 | # C-G C-G 301 | # 302 | if 'motif' in moptions and moptions['motif'][0]=='CG': 303 | for ali in range(len(base_map_info)): 304 | if base_map_info['refbase'][ali]=='C' and base_map_info['readbase'][ali]=='C': 305 | if ali+1-1 and base_map_info['readbase'][ali-1]=='-' and base_map_info['refbase'][ali-1]=='C': 314 | addali = 2; 315 | while ali - addali >-1: 316 | if base_map_info['readbase'][ali-addali]=='-' and base_map_info['refbase'][ali-addali]=='C': addali += 1; 317 | else: break; 318 | if ali - addali>-1 and base_map_info['readbase'][ali-addali]=='C' and base_map_info['refbase'][ali-addali]=='C': 319 | base_map_info['readbase'][ali-1], base_map_info['readbase'][ali-addali] = base_map_info['readbase'][ali-addali], base_map_info['readbase'][ali-1] 320 | # too short reads 321 | if len(m_event)<500: 322 | sp_options["Error"]["Less(<500) events"].append(f5data[readk][3]) 323 | continue; 324 | 325 | # get feautre 326 | mfeatures,isdif = get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, leftclip, rightclip, base_map_info, forward_reverse, rname, first_match_pos, numinsert, numdel) 327 | if isdif and moptions['outLevel']<=myCom.OUTPUT_WARNING: 328 | print("Dif is true") 329 | print([lastmatch, firstmatch, first_match_pos, last_match_pos, first_al_match, last_al_match, lasmtind, len(base_map_info), nummismatch, numinsert, numdel, len(base_map_info)-nummismatch-numinsert-numdel]) 330 | 331 | # merge to previously handled features of other fast5 files 332 | if len(mfeatures)>0: 333 | if len(feat_file_ind_dict)==0: 334 | feat_file_ind_dict.append((f5data[readk][3], 0)); 335 | feat_list = mfeatures 336 | else: 337 | feat_file_ind_dict.append((f5data[readk][3], len(feat_list))) 338 | feat_list = np.concatenate((feat_list, mfeatures), axis=0) 339 | 340 | # store the last feature data. 
341 | if len(feat_file_ind_dict)>0: 342 | cur_feat_file_base = sp_options['ctfolder'] + '/'+str(feat_file_ind) 343 | np.savetxt(cur_feat_file_base+'.xy.gz', feat_list, fmt='%.3f') 344 | with open(cur_feat_file_base+'.xy.ind', 'w') as ind_mw: 345 | for f_ind in feat_file_ind_dict: 346 | ind_mw.write('%d %s\n' % (f_ind[1], f_ind[0])) 347 | print ("\t%s-%d Total consuming time %d" % (sp_options['ctfolder'][sp_options['ctfolder'].rfind('/'):], feat_file_ind, time.time()-start_c_time)); sys.stdout.flush() 348 | feat_file_ind_dict = [] 349 | feat_list = None; 350 | feat_file_ind += 1 351 | 352 | # 353 | # get feature for each base of interest in long reads according to raw signals and mapping information 354 | # 355 | def get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, start_clip, end_clip, base_map_info, forward_reverse, rname, mapped_start_pos, num_insertions, num_deletions): 356 | # event information 357 | modevents = sp_param['f5data'][readk][1] 358 | # class number, bin num and bin length 359 | clnum = 2; binnum = 50; binlen = 0.2; 360 | if forward_reverse=='+': 361 | align_ref_pos = mapped_start_pos 362 | else: 363 | align_ref_pos = mapped_start_pos + len(base_map_info) - num_insertions - 1 364 | 365 | # initialize feature matrix for all events. 
366 | if moptions['fnum']==57: 367 | #mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (binnum+3+3+4))); 368 | mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (binnum+3+3+4))); 369 | else: mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (3+3+4))); 370 | 371 | # filter poor alignment 372 | checkneighbornums = [3,6] 373 | checkratios = {3:[6,5,4,2], 6:[11,10,9,3]} 374 | checkratios = {3:[6,5,4,2], 6:[12,10,9,3]} 375 | cgpos = [[], []] 376 | affectneighbor = 1; # 2; 377 | for aligni in range(len(base_map_info)): 378 | # for methylated positions and not-used adjacent positions 379 | if 'motif' in moptions and base_map_info['readbase'][aligni]==moptions['motif'][0][moptions['motif'][1]]: 380 | m_a_st = aligni-moptions['motif'][1]; m_a_end = aligni+len(moptions['motif'][0])-moptions['motif'][1] 381 | if m_a_st>-1 and m_a_end<=len(base_map_info) and ''.join(base_map_info['readbase'][m_a_st:m_a_end])==moptions['motif'][0] and (not ''.join(base_map_info['refbase'][m_a_st:m_a_end])==moptions['motif'][0]): 382 | cgpos[1].extend([(forward_reverse, base_map_info['refbasei'][addi]) for addi in range(aligni-affectneighbor if aligni-affectneighbor>-1 else 0, aligni+affectneighbor+1 if aligni+affectneighbor+1-1 and checki-1 else 0, nextnogap+affectneighbor if nextnogap+affectneighbor-1: 420 | if not base_map_info['refbase'][prenogap]=='-': break; 421 | prenogap -= 1 422 | 423 | read0 = aligni; read1 = aligni 424 | for _ in range(affectneighbor): 425 | read0 -= 1 426 | while read0>-1: 427 | if base_map_info['readbase'][read0]=='-': read0 -= 1 428 | else: break; 429 | read1 += 1 430 | while read1-1: prenogap = read0 436 | else: prenogap = 0 437 | if read1>nextnogap: 438 | if read1> %d %d, %d-%d=%d' % (forward_reverse, f5data[readk][3], len(cgpos[0]), len(cgpos[1]), len(modevents)-end_clip-start_clip, start_clip, len(modevents), end_clip, len(modevents)-end_clip)) 447 | 448 | aligni = 0; isdif = False; 449 | for ie in 
#
# get the complementary bases
#
def get_complement(na):
    """Return the Watson-Crick complement of base `na` via myCom.na_bp;
    symbols outside myCom.acgt (e.g. '-', 'N') pass through unchanged."""
    return myCom.na_bp[na] if na in myCom.acgt else na
540 | # 541 | def handle_line(moptions, sp_param, f5align): 542 | lsp = sp_param['line'].split('\t') 543 | qname, flag, rname, pos, mapq, cigar, _, _, _, seq, _ = lsp[:11] 544 | # checked query name 545 | if qname=='*': sp_param['f5status'] = "qname is *" 546 | # check mapping quality 547 | elif int(mapq)==255: sp_param['f5status'] = "mapq is 255" 548 | # check mapped positions 549 | elif int(pos)==0: sp_param['f5status'] = "pos is 0" 550 | # check mapped string 551 | elif cigar=='*': sp_param['f5status'] = "cigar is *" 552 | # check reference name 553 | elif rname=='*': sp_param['f5status'] = "rname is *" 554 | if not sp_param['f5status']=="": return qname 555 | 556 | if (qname not in f5align) or f5align[qname][0]0: 597 | if line[0]=='>': # for each chromosome line 598 | if (not cur_chr==None) and (t_chr in [None, cur_chr]): 599 | fadict[cur_chr] = ''.join(fadict[cur_chr]) 600 | cur_chr = line[1:].split()[0] 601 | if t_chr in [None, cur_chr]: 602 | fadict[cur_chr] = [] 603 | else: # for sub-sequence line in a reference file 604 | if t_chr in [None, cur_chr]: 605 | fadict[cur_chr].append(line.upper()) 606 | line = mr.readline(); 607 | # for the last chromosome in the file 608 | if (not cur_chr==None) and (t_chr in [None, cur_chr]): 609 | fadict[cur_chr] = ''.join(fadict[cur_chr]) 610 | return fadict 611 | 612 | # 613 | # get reference positions for motif-based modifications 614 | # 615 | def readMotifMod(fadict, mpat='Cg', mposinpat=0, t_chr=None, t_start=None, t_end=None): 616 | pos_dict = defaultdict(int) 617 | 618 | # get motif and complementary motif 619 | pat3 = copy.deepcopy(mpat.upper()) 620 | comp_pat3 = ''.join([get_complement(curna) for curna in pat3][::-1]) 621 | comp_mposinpat = len(comp_pat3)-1-mposinpat 622 | 623 | fakeys = fadict.keys(); 624 | cpgdict = defaultdict(int); 625 | all_a = defaultdict() 626 | for fak in fakeys: 627 | cpgnum = [0, 0] 628 | # motif-based reference positions 629 | cpgdict[fak] = defaultdict() 630 | # position of bases of 
interest 631 | all_a[fak] = defaultdict() 632 | for i in range(len(fadict[fak])): 633 | if (t_start==None or i>=t_start) and (t_end==None or i<=t_end): 634 | if fadict[fak][i]==mpat[mposinpat]: # for forward strand 635 | all_a[fak][('+', i)] = True; 636 | elif get_complement(fadict[fak][i])==mpat[mposinpat]: # for reverse strand 637 | all_a[fak][('-', i)] = True; 638 | 639 | # check motif in forward strand 640 | if i-mposinpat>=0 and i+len(comp_pat3)-1-mposinpat=0 and i+len(comp_pat3)-1-comp_mposinpat0: 696 | tchr, tstrand, tpos = line.split()[:3] 697 | curmeth[tchr][(tstrand, int(tpos))] = [1-mthi, fadict[tchr][int(tpos)]]; 698 | line = mreader.readline(); 699 | for tchr in moptions['fulmodlist'] if moptions['anymodlist']==None else moptions['anymodlist']: 700 | if len(moptions['fulmodlist'][tchr])>0 or ((not moptions['anymodlist']==None) and len(moptions['anymodlist'][tchr])>0): 701 | print ('%s fulmod=%d anymod=%d nomod=%d' % (tchr, len(moptions['fulmodlist'][tchr]), len(moptions['anymodlist'][tchr]) if (not moptions['anymodlist']==None) else -1, len(moptions['nomodlist'][tchr]) if (not moptions['nomodlist']==None) else -1)) 702 | 703 | if True: #False: 704 | # get all input fast5 files 705 | f5files = glob.glob(os.path.join(moptions['wrkBase'],"*.fast5" )) 706 | if moptions['recursive']==1: 707 | f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*.fast5" ))) 708 | f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*/*.fast5" ))) 709 | f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*/*/*.fast5" ))) 710 | 711 | 712 | print('Total files=%d' % len(f5files)) 713 | h5files_Q = pmanager.Queue(); 714 | failed_Q = pmanager.Queue() 715 | version_Q = pmanager.Queue() 716 | 717 | # split input fast5 files into different batch 718 | h5_batch = []; h5batchind = 0; 719 | for f5f in f5files: 720 | h5_batch.append(f5f); 721 | if len(h5_batch)==moptions['files_per_thread']: 722 | h5files_Q.put((h5_batch, h5batchind)) 723 | h5batchind += 1 724 | 
h5_batch = []; #break; ### feature500 725 | if len(h5_batch)>0: 726 | h5files_Q.put((h5_batch, h5batchind)) 727 | 728 | # each thread handle a batch a time and repeat for all batches. 729 | share_var = (moptions, h5files_Q, failed_Q, version_Q) 730 | handlers = [] 731 | for hid in range(moptions['threads']): 732 | p = multiprocessing.Process(target=getFeature_handler, args=share_var); 733 | p.start(); 734 | handlers.append(p); 735 | 736 | # get failed files. 737 | failed_files = defaultdict(list); 738 | version_default = defaultdict(lambda: defaultdict(int)); 739 | while any(p.is_alive() for p in handlers): 740 | try: 741 | errk, fns = failed_Q.get(block=False); 742 | failed_files[errk].extend(fns) 743 | curv, curv_num = version_Q.get(block=False); 744 | version_default[curv] += curv_num 745 | except: 746 | time.sleep(1); 747 | continue; 748 | 749 | # output failure information 750 | if len(failed_files)>0: 751 | print ('Error information for different fast5 files:') 752 | for errtype, errfiles in failed_files.items(): 753 | print ('\t%s %d' % (errtype, len(errfiles))) 754 | print("abversion info {}".format(str(version_default))) 755 | sys.stdout.flush() 756 | end_time = time.time(); 757 | print ("Total consuming time %d" % (end_time-start_time)) 758 | 759 | 760 | 761 | # for indepdent testing of code 762 | if __name__=='__main__': 763 | # if len(sys.argv)>4: 764 | moptions = {} 765 | moptions['basecall_1d'] = 'Basecall_1D_000' 766 | moptions['basecall_1d'] = ['Basecall_1D_000'] 767 | moptions['basecall_2strand'] = 'BaseCalled_template' 768 | 769 | moptions['outLevel'] = myCom.OUTPUT_WARNING 770 | moptions['outLevel'] = myCom.OUTPUT_INFO 771 | 772 | moptions['modfile'] = '../../mod_output/train1/2/mod_train' 773 | 774 | moptions['fnum'] = 53; 775 | moptions['hidden'] = 100; 776 | moptions['windowsize'] = 21; 777 | 778 | moptions['threads'] = 8 779 | moptions['threads'] = 1 780 | moptions['files_per_thread'] = 500 781 | 782 | mDetect_manager(moptions) 783 | 
#
# create a RNN with LSTM
# define performance evaluation operation
#
def mCreateSession(num_input, num_hidden, timesteps, moptions):
    """Build the TF1 computation graph for a 3-layer bidirectional LSTM classifier.

    Parameters:
        num_input: number of features per timestep (size of the last input axis).
        num_hidden: LSTM hidden-state size per direction.
        timesteps: window length; the prediction is taken at the center step.
        moptions: option dict; reads 'outputlayer' and optionally 'unbalanced'.

    Returns a 12-tuple of graph ops/tensors:
        (init, init_l, loss_op, accuracy, train_op, X, Y, saver,
         auc_op, mpre, mspf, mfpred)
    where X/Y are the input placeholders and mfpred is the argmax class id.
    """
    # two classes only
    num_classes = 2;
    # the number of layers
    numlayers = 3;
    # learning rate
    learning_rate = 0.001

    # define input and output
    X = tf.placeholder("float", [None, timesteps, num_input]);
    Y = tf.placeholder("float", [None, num_classes]);

    # for last layers
    weights = {'out': tf.Variable(tf.truncated_normal([2*num_hidden, num_classes]))};
    biases = {'out': tf.Variable(tf.truncated_normal([num_classes]))}

    # define a bidirectional RNN
    def BiRNN(x, weights, biases):
        # unstack (batch, timesteps, num_input) into a python list of
        # `timesteps` tensors as required by the static RNN API
        x = tf.unstack(x, timesteps, 1);

        # define the LSTM cells
        lstm_fw_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(num_hidden, forget_bias=1.0) for _ in range(numlayers)]);
        lstm_bw_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(num_hidden, forget_bias=1.0) for _ in range(numlayers)]);

        # define bidirectional RNN
        # NOTE(review): the except branch presumably handles an older TF API
        # that returned only `outputs` — confirm against the pinned TF version
        try:
            outputs, _, _ = rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, x, dtype=tf.float32);
        except Exception:
            outputs = rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, x, dtype=tf.float32);

        # define output layer: only the CENTER timestep's output is classified
        if moptions['outputlayer'] in ['sigmoid']:
            return tf.contrib.layers.fully_connected(outputs[int(timesteps/2)], num_outputs=num_classes, activation_fn=tf.nn.sigmoid);
        else:
            return tf.matmul(outputs[int(timesteps/2)], weights['out']) + biases['out']

    # get prediction
    logits = BiRNN(X, weights, biases);
    prediction = tf.nn.softmax(logits)

    mfpred=tf.argmax(prediction,1)

    ## with different class-weights or not
    # class_weights is the module-level constant tf.constant([0.1,0.9])
    if 'unbalanced' in moptions and (not moptions['unbalanced']==None) and moptions['unbalanced']==1: # class_weights
        loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=tf.multiply(logits, class_weights), labels=Y))
    else:
        loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=Y))
    #

    # for optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate);
    train_op = optimizer.minimize(loss_op);

    # get accuracy
    correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1));
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32));

    # AUC
    auc_op = tf.metrics.auc(Y, prediction)
    # precision
    mpre = tf.metrics.precision(tf.argmax(Y, 1), tf.argmax(prediction, 1))
    # recall
    mspf = tf.metrics.recall(tf.argmax(Y, 1), tf.argmax(prediction, 1))

    # initialization of variables
    init = tf.global_variables_initializer();
    init_l = tf.local_variables_initializer()

    saver = tf.train.Saver();

    return (init, init_l, loss_op, accuracy, train_op, X, Y, saver, auc_op, mpre, mspf, mfpred)
95 | # 96 | def train_save_model(filelists, num_input, mhidden, timesteps, moptions): 97 | training_steps = 4 98 | #training_steps = 40 99 | 100 | init, init_l, loss_op, accuracy, train_op, X, Y, saver, auc_op, mpre, mspf, mfpred = mCreateSession(num_input, mhidden, timesteps, moptions) 101 | 102 | # display step 103 | desplay_files = len(filelists[0])/100 104 | if desplay_files<2: desplay_files = 2; 105 | if desplay_files>10: desplay_files = int(desplay_files/10) * 10; #desplay_files=2 106 | if desplay_files>100: desplay_files = 100 107 | file_group_id = [0 for _ in range(len(filelists))]; 108 | sumpsize = 25; 109 | 110 | # for configuration 111 | config = tf.ConfigProto() 112 | if (timesteps>61 and num_input>50): 113 | config.gpu_options.per_process_gpu_memory_fraction = 0.5 114 | else: config.gpu_options.allow_growth = True 115 | with tf.Session(config=config) as sess: 116 | # initialization 117 | sess.run(init); 118 | sess.run(init_l) 119 | start_time = time.time(); start_c_time = time.time(); 120 | io_time = 0; 121 | 122 | # for each epoch 123 | for step in range(1, training_steps+1): 124 | print('===%d=====================step========================%d/%d' % (desplay_files, step, training_steps)) 125 | sys.stdout.flush() 126 | last_desplay_files_num = -1; 127 | file_group_id[0] = 0 128 | while file_group_id[0] < len(filelists[0]): 129 | io_start_time = time.time(); 130 | 131 | # for each input groups. 132 | # usually two groups: one positive group and one negative group 133 | # might also one group containing both positive and negative labelling data 134 | featurelist = [[[], []] for _ in range(len(filelists))]; 135 | minsize = None; cur_batch_num = None; 136 | # get data from all groups until 'minsize' data is loaded. 
137 | for ifl in range(len(filelists)): 138 | if ifl==0: 139 | minsize = batchsize * sumpsize 140 | else: minsize = batchsize * cur_batch_num; 141 | while len(featurelist[ifl][0])0: 148 | if len(featurelist[ifl][0])==0: 149 | featurelist[ifl][0] = batch_2_x 150 | featurelist[ifl][1] = batch_2_y 151 | else: 152 | # merge current loading data with previously loading data 153 | featurelist[ifl][0] = np.concatenate((featurelist[ifl][0], batch_2_x), axis=0) 154 | featurelist[ifl][1] = np.concatenate((featurelist[ifl][1], batch_2_y), axis=0) 155 | file_group_id[ifl] += 1; 156 | # split for small groups for training 157 | if ifl==0: 158 | featurelist[ifl][0] = np.array_split(featurelist[ifl][0], int(len(featurelist[ifl][0])/batchsize)) 159 | featurelist[ifl][1] = np.array_split(featurelist[ifl][1], int(len(featurelist[ifl][1])/batchsize)) 160 | cur_batch_num = len(featurelist[ifl][0]) 161 | if len(featurelist[0][0])1: 169 | for ifl in range(1, len(filelists)): 170 | #if (file_group_id[0]+1) - last_desplay_files_num >= desplay_files: msizeprint.append(str(len(featurelist[ifl][0]))) 171 | featurelist[ifl][0] = np.array_split(featurelist[ifl][0], len(featurelist[0][0])) 172 | featurelist[ifl][1] = np.array_split(featurelist[ifl][1], len(featurelist[0][0])) 173 | io_time += (time.time() - io_start_time) 174 | 175 | ifl=3 if len(featurelist)>3 else len(featurelist)-1 176 | if (file_group_id[0]+1) - last_desplay_files_num >= desplay_files: 177 | sess.run(init_l) 178 | try: 179 | # print some testing information as progress indicators 180 | loss, aucm, acc, p, r = sess.run([loss_op, auc_op[1], accuracy, mpre[1], mspf[1]], feed_dict={X:featurelist[ifl][0][0], Y:featurelist[ifl][1][0]}) 181 | print(">>>Tratin#files "+str(file_group_id[0]+1)+",loss="+"{:.3f}".format(loss)+",AUC="+"{:.3f}".format(aucm)+",acc="+"{:.3f}".format(acc)+",p="+"{:.3f}".format(p)+",r="+"{:.3f}".format(r)+(" Comsuming time: %d(current=%d) IO=%d(%.3f)" % (time.time()-start_time, time.time()-start_c_time, 
#
# get all data files in two separate folders
#
def _getTFiles_from_folder(folder, moptions, tag):
    # Collect *.xy.gz feature files directly under `folder`, and up to 4
    # sub-folder levels deep when moptions['recursive']==1; then optionally
    # keep only a head/tail fraction for read-based independent testing.
    tfiles = glob.glob(os.path.join(folder, "*.xy.gz"))
    if moptions['recursive']==1:
        for pat in ("*/*.xy.gz", "*/*/*.xy.gz", "*/*/*/*.xy.gz", "*/*/*/*/*.xy.gz"):
            tfiles.extend(glob.glob(os.path.join(folder, pat)))
    print(tag);
    # for read-based independent testing
    print(tfiles.__sizeof__(), len(tfiles))
    if moptions['test'][0] == '0':
        # keep the first fraction when >0.5, otherwise the last fraction,
        # so train and test splits come from opposite ends of the file list
        if moptions['test'][1]>0.5:
            tfiles = tfiles[:int(len(tfiles)*moptions['test'][1])]
        else: tfiles = tfiles[-int(len(tfiles)*moptions['test'][1]):]
    print(tfiles.__sizeof__(), len(tfiles))
    sys.stdout.flush();
    return tfiles

def getTFiles(folder1, folder2, moptions):
    """Collect *.xy.gz feature files from two separate folders.

    Parameters:
        folder1: first feature folder (required).
        folder2: second feature folder, or None to skip it.
        moptions: option dict; reads 'recursive' (1 => search up to 4 levels
            deep) and 'test' (read-based split: when moptions['test'][0]=='0',
            keep fraction moptions['test'][1] of the files).

    Returns (t1files, t2files); t2files is [] when folder2 is None.
    The previously duplicated per-folder logic is factored into
    _getTFiles_from_folder.
    """
    t1files = _getTFiles_from_folder(folder1, moptions, "Get folder1")
    if folder2==None: t2files = []
    else:
        t2files = _getTFiles_from_folder(folder2, moptions, "Get folder2")
    return t1files, t2files
#
# get index information for each fast5 file in a data file
#
def getGZFilePos(gzfile):
    """Parse the companion '.ind' index of `gzfile` (a *.xy.gz feature file).

    Each index line holds '<start_row> <fast5_filename>'; returns a
    defaultdict mapping the starting row (int) in the feature matrix to the
    fast5 file whose features begin there. Lines with fewer than two fields
    are ignored.
    """
    mfind = defaultdict()
    ind_name = gzfile[:-len('.gz')] + '.ind'
    with open(ind_name, 'r') as reader:
        for raw_line in reader:
            fields = raw_line.strip().split()
            if len(fields) > 1:
                mfind[int(fields[0])] = fields[1]
    return mfind
range(len(test_gzfile)): 394 | # get feature files for prediction 395 | test_gzfeature, test_gzlabel, _ = getDataFromFile_new(test_gzfile[test_fn_ind], moptions) 396 | if len(test_gzfeature)<1: continue; 397 | 398 | ftlist = np.array_split(test_gzfeature, int(len(test_gzfeature)/batchsize)+1) 399 | lblist = np.array_split(test_gzlabel, int(len(test_gzlabel)/batchsize)+1) 400 | for fti in range(len(ftlist)): 401 | sess.run(init_l) 402 | mfpred_output = sess.run([ mfpred], feed_dict={X:ftlist[fti], Y:lblist[fti]}) 403 | # get ture positive, false positive, false negative and true negative 404 | tp, fp, fn, tn = 0, 0, 0, 0 405 | for pi in range(len(mfpred_output[0])): 406 | if mfpred_output[0][pi]==1: 407 | if lblist[fti][pi][1]==1: tp += 1 408 | else: fp += 1 409 | else: 410 | if lblist[fti][pi][1]==1: fn += 1 411 | else: tn += 1 412 | pfwriter.write('tp=%d fp=%d fn=%d tn=%d %s\n' % (tp, fp, fn, tn, test_gzfile[test_fn_ind])) 413 | pfwriter.flush() 414 | pfwriter.close(); 415 | 416 | # 417 | # entry for prediction 418 | # 419 | def pred_prepare(moptions, test_file, accuracy, X, Y, auc_op, mpre, mspf, init_l, mfpred): 420 | mPred(moptions['modfile'][0], moptions['modfile'][1], accuracy, X, Y, test_file, moptions['outFolder']+moptions['FileID']+'_mpred.txt', moptions['fnum'], auc_op, mpre, mspf, init_l, mfpred, moptions['windowsize'], moptions) 421 | 422 | # 423 | # prepare training process 424 | # 425 | def mMult_RNN_LSTM_train(moptions): 426 | # get folders of feature files 427 | filegroups = moptions['wrkBase'].split(';') 428 | for i in range(len(filegroups)): 429 | filegroups[i] = filegroups[i].split(',') 430 | 431 | print(filegroups) 432 | 433 | # get feature files 434 | filelists = [[] for _ in range(len(filegroups))] 435 | for i in range(len(filegroups)): 436 | for fgj in range(len(filegroups[i])): 437 | if not len(filegroups[i][fgj])>0: continue 438 | filelists[i].extend(getTFiles1(filegroups[i][fgj], moptions)) 439 | mostnum, mostid = 0, -1; 440 | # set seed 
so that random setting 441 | np.random.seed(3) 442 | for i in range(len(filelists)): 443 | np.random.shuffle(filelists[i]) 444 | if len(filelists[i])>mostnum: 445 | mostnum = len(filelists[i]) 446 | mostid = i; 447 | 448 | # set seed so that random setting 449 | np.random.seed(7) 450 | # path for model file 451 | if 'modfile' in moptions and (not moptions['modfile']==None): 452 | if moptions['modfile'].rfind('/')==-1: 453 | moptions['modfile'] = [moptions['modfile'], './'] 454 | else: 455 | moptions['modfile'] = [moptions['modfile'], moptions['modfile'][:moptions['modfile'].rfind('/')+1]] 456 | 457 | if not mostid==0: 458 | filelists[mostid], filelists[0] = filelists[0], filelists[mostid] 459 | 460 | accuracy, X, Y, auc_op, mpre, mspf, init_l, mfpred = train_save_model(filelists, moptions['fnum'], moptions['hidden'], moptions['windowsize'], moptions) 461 | 462 | # 463 | # prepare prediction process 464 | # 465 | def pred_entry(moptions): 466 | # get feature files 467 | tfiles = [getTFiles1(moptions['wrkBase'], moptions)] 468 | 469 | init, init_l, loss_op, accuracy, train_op, X, Y, saver, auc_op, mpre, mspf, mfpred = mCreateSession(moptions['fnum'], moptions['hidden'], moptions['windowsize'], moptions) 470 | 471 | # path for model file 472 | if moptions['modfile'].rfind('/')==-1: 473 | moptions['modfile'] = [moptions['modfile'], './'] 474 | else: 475 | moptions['modfile'] = [moptions['modfile'], moptions['modfile'][:moptions['modfile'].rfind('/')+1]] 476 | 477 | pred_prepare(moptions, tfiles, accuracy, X, Y, auc_op, mpre, mspf, init_l, mfpred) 478 | -------------------------------------------------------------------------------- /docs/Description of well-trained models.md: -------------------------------------------------------------------------------- 1 | This file contains description of well-trained model in `train_mod` directory. The meaning of the short name of the data set can be found in the paper. 
***Warnings: the well-trained models below are NEVER retrained with Guppy. You will get unpredictable results if you use them to make the prediction with the data basecalled with Guppy.*** 2 | 3 | # 1. Modification prediction model 4 | These are several trained models of LSTM(Long short-term memory)-based RNN (Recurrent neural network), and their settings are given below. 5 | 6 | ## 1.1 `rnn_sinmodC_P100wd21_f7ne1u0_4` 7 | This model is trained on E. Coli data with synthetically introduced 5mC. 8 | ``` 9 | Window size: 21 10 | #Feature per event: 7 11 | Base of interest: C 12 | #Training epoch of negative data: 4 13 | Training data: SSS and UMR 14 | ``` 15 | Please refer to our paper for the meanings of `SSS` and `UMR`. 16 | 17 | ## 1.2 `rnn_conmodC_P100wd21_f7ne1u0_4` 18 | This model is trained on E. Coli data with synthetically introduced 5mC. 19 | ``` 20 | Window size: 21 21 | #Feature per event: 7 22 | Base of interest: C 23 | #Training epoch of negative data: 4 24 | Training data: positive control (SSS, Cg_sssl, Cg_mpel and gCgC), and negative control: (UMR, con1 and con2) 25 | ``` 26 | Please refer to our paper for the meanings of `SSS`, `UMR`, `con1`, `con2`, `Cg_sssl`, `Cg_mpel` and `gCgC`. 27 | 28 | ## 1.3 `rnn_conmodA_P100wd21_f7ne1u0_4` 29 | This model is trained on E. Coli data with synthetically introduced 6mA. 30 | ``` 31 | Window size: 21 32 | #Feature per event: 7 33 | Base of interest: A 34 | #Training epoch of negative data: 4 35 | Training data: Three positive control of 6mA and the negative control (con1, con2) 36 | ``` 37 | 38 | ## 1.4 `rnn_conmodA_E1m2wd21_f7ne1u0_4` 39 | This is a region-based training model. Reads and bases mapped to 1,000,000 on E. Coli are used for testing and others for training.
40 | ``` 41 | Window size: 21 42 | #Feature per event: 7 43 | Base of interest: A 44 | #Training epoch of negative data: 4 45 | Training data: Three positive control of 6mA and the negative control (con1, con2) 46 | ``` 47 | 48 | ## 1.5 `rnn_f7_wd21_chr1to10_4` 49 | This model is trained on Chr 1 to 10 of NA12878 with completely methylated positions (>90% methylation percentage in both replicates of bisulfite sequences) and completely un-methylated positions (<=0% methylation percentage in both replicates of bisulfite sequences). Bases in long reads mapped to the two groups of reference positions are used in training process. This model is used to make 5mC prediction on HX1, and both HX1 and NA12878 are basecalled using Albacore v2.3.1. 50 | ``` 51 | Window size: 21 52 | #Feature per event: 7 53 | Base of interest: C 54 | #Training epoch of all data: 4 55 | Training data: Chr 1 to 10 of NA12878 56 | ``` 57 | 58 | # 2. Cluster-effect model (the second neural network) 59 | This is the model of the second neural network to consider modificatoin cluster of 5mC. 60 | ## 2.1 `na12878_cluster_train_mod-keep_prob0.7-nb25-chr1` 61 | The model is only trained on Chr1 of NA12878. 
62 | ``` 63 | Window size: 25 64 | #Feature per event: 11 65 | Base of interest: C 66 | #Training epoch of negative data: 1 67 | Training data: Chr 1 of NA12878 68 | ``` 69 | -------------------------------------------------------------------------------- /docs/Install.md: -------------------------------------------------------------------------------- 1 | 2 | # Installation Guide 3 | 4 | ## Prerequisites: 5 | ### The required packages for running DeepMod are listed below: 6 | * Python 3.6 7 | * python packages: 8 | + h5py 9 | + numpy 10 | + scipy 11 | * BWA MEM/minimap2 12 | * SAMTOOLS 13 | * HDF5 14 | * Tensorflow 15 | * If running performance evaluation, you might need 16 | + python packages: rpy2, matplotlib, scikit-learn 17 | + R packages: ggplot2, gridExtra, plyr, RColorBrewer 18 | 19 | The packages I used are 20 | ``` 21 | h5py 2.7.1 22 | hdf5 1.10.1 23 | numpy 1.14.0 24 | scikit-learn 0.19.1 25 | scipy 1.0.0 26 | tensorflow 1.7.0 27 | samtools 1.5 28 | minimap2 2.12 29 | bwa 0.7.15 30 | matplotlib 2.1.2 31 | rpy2 2.8.6 32 | r 3.4.2 33 | ``` 34 | 35 | ### Package installation 36 | Users have their own choice of how to install required package above. But to efficiently manage the installation packages, Anaconda is recommended. After installing Annocoda, it would also be an good option to use virtual environment in annocoda. `conda create -n mdeepmod python=3.6` can be used to create a virtual environment, and `source activate mdeepmod` can be used to activate the virtual environment, and then install required packages or run DeepMod. If users want to exit the virtual environment, simply type `source deactivate`. 
37 | 38 | After creating `mdeepmod` virtual environment using Anaconda, the following commands can install the majority of necessary packages: 39 | 40 | ``` 41 | source activate mdeepmod 42 | conda install -c anaconda h5py hdf5 numpy scipy scikit-learn 43 | conda install -c bioconda minimap2 bwa samtools 44 | conda install -c r rpy2 r-ggplot2 r-gridextra r-plyr r-rcolorbrewer 45 | ``` 46 | Users might need to install `tensorflow` with version 1.13 by themselves or by using `conda install -c conda-forge tensorflow matplotlib` (if there are conflicts when using this command, users need to give up and install `tensorflow` and `matplotlib` by themselves). 47 | 48 | ### Additional notes 49 | 50 | Some users reported that the compression format of their fast5 is vbz instead of gzip. Installing `ont-vbz-hdf-plugin` solves the problem. Otherwise, an error message such as "Error!!! No Raw_reads/Signal data" will be shown. 51 | 52 | 53 | ## Install DeepMod: 54 | * git clone https://github.com/WGLab/DeepMod 55 | * The script to be run is in bin/DeepMod.py: 56 | + Run `python bin/DeepMod.py` for the help document or 57 | + Run `python bin/DeepMod.py detect` for the detection help document. 58 | Users might need to run `source activate mdeepmod` if the virtual environment and required packages are installed with the commands above. 59 | 60 | ## Installation time 61 | Without GPU-version tensorflow, it would take ~30 minutes to install required packages and DeepMod. 62 | 63 | ## Usage: 64 | For how to use them, please refer to [Usage](https://github.com/WGLab/DeepMod/blob/master/docs/Usage.md) 65 | 66 | -------------------------------------------------------------------------------- /docs/Reproducibility.md: -------------------------------------------------------------------------------- 1 | # Demo and Examples 2 | This section contains examples and also the reproducibility process of our analysis. It is assumed that DeepMod has been successfully installed.
If not, please install it and its required packages first according to the [instruction](https://github.com/WGLab/DeepMod/blob/master/docs/Install.md). It is also assumed that the virtual environment `mdeepmod` is used. Please remove `source activate mdeepmod` if virtual environment is not used in your system. 3 | 4 | To prepare to run DeepMod, it is assumed that there is a directory which has a *DeepMod* directory for our DeepMod tool, a *data* directory for storing Nanopore sequencing data, and a *ref* directory for reference data. 5 | 6 | ## Reference genomes 7 | Please use `bwa` to index the genome before running `DeepMod` 8 | ### E. coli reference genome 9 | The E. coli reference fasta could be downloaded from https://www.ncbi.nlm.nih.gov/nuccore/556503834. We assumed the reference file name under the *ref* directory is 'Ecoli_k12_mg1655.fasta' and indexed by `bwa` 10 | 11 | ### Human reference genome 12 | Hg38 was used for human Nanopore sequencing data. We assumed the human reference fasta file name under the *ref* directory is 'hg38' and indexed by `bwa` 13 | 14 | ## Nanopore data 15 | DeepMod needs a group of FAST5 files generated by a Nanopore sequencer after basecalling. Thus, before you run DeepMod, you might need to run `Albacore` first on your Nanopore data. When you run `Albacore` on your Nanopore data, please make sure that `fastq,fast5` is used so that `event` information can be stored in FAST5 files. 16 | 17 | ## Run on E. coli data 18 | The Nanopore data sets are large, and thus users need to contact the original authors1 to get the downloading URL. 19 | ### Example 1: 5mC detection 20 | #### Step 1. datasets 21 | Please download Nanopore sequencing data for *con1*, *con2* and *CG* motif with SSSl, and untar them into separate sub-folders under the *data* directory, and assumed their sub-folder names are *Control_lib1*, *Control_lib3* and *meth10_lib3* respectively. 22 | 23 | #### Step 2.
24 | ``` 25 | mkdir ecoli_pred/ 26 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/meth10_lib3/ --Ref ref/Ecoli_k12_mg1655.fasta --FileID Cgsss --modfile DeepMod/train_mod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 27 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/Control_lib1/ --Ref ref/Ecoli_k12_mg1655.fasta --FileID con1 --modfile DeepMod/train_mod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 28 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/Control_lib3/ --Ref ref/Ecoli_k12_mg1655.fasta --FileID con2 --modfile DeepMod/train_mod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 29 | ``` 30 | The three commands might take ~41 minutes, ~84 minutes and ~120 minutes to be done. After that, you will found the results under the directory of *ecoli_pred* with *--FileID* as the sub-folder name. The results are in the *bed* format with the file names as *mod_pos.NC_000913.3-.C.bed* and *mod_pos.NC_000913.3+.C.bed*. The detail of the methylation prediction for each long read is also provided. 31 | 32 | One can run DeepMod on other datasets with the commands below. 33 | ``` 34 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/meth8_lib2/ --Ref ref/Ecoli_k12_mg1655.fasta --FileID gCgc --modfile DeepMod/train_mod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 35 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/meth9_lib2/ --Ref ref/Ecoli_k12_mg1655.fasta --FileID Cgmpe --modfile DeepMod/train_mod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 36 | ``` 37 | 38 | #### Step 3. 39 | The following commands then can be used to calculate average precision and AUC values of DeepMod. 
40 | ``` 41 | python DeepMod/tools/cal_EcoliDetPerf.py ecoli_pred/Cgmpe ref/Ecoli_k12_mg1655.fasta Cg 0 '' -1 -1 ecoli_pred/Cgmpe/ ecoli_pred/con1,ecoli_pred/con2 42 | python DeepMod/tools/cal_EcoliDetPerf.py ecoli_pred/Cgsss ref/Ecoli_k12_mg1655.fasta Cg 0 '' -1 -1 ecoli_pred/Cgsss/ ecoli_pred/con1,ecoli_pred/con2 43 | python DeepMod/tools/cal_EcoliDetPerf.py ecoli_pred/gCgc ref/Ecoli_k12_mg1655.fasta gCgc 1 '' -1 -1 ecoli_pred/gCgc/ ecoli_pred/con1,ecoli_pred/con2 44 | ``` 45 | The commands above will generate AP plots and AUC plots under the directory of *ecoli_pred/Cgmpe/*, *ecoli_pred/Cgsss/*, *ecoli_pred/gCgc/* which are performance as shown in Figure 2 (a), (c) and (d). 46 | 47 | ### Example 2: 6mA detection 48 | #### Step 1. datasets 49 | You need contact the original authors1 to get the downloading URL. 50 | 51 | Please download Nanopore sequencing data for *con1*, *con2* and *gAtc* motif, *tcgA* and *gaAttc*, and untar them into seperate sub-folder under the *data* directory, and assumed their sub-folder names are *Control_lib1*, *Control_lib3*, *meth11_lib3*, *meth1_lib1* and *meth4_lib1* respectively. If you have run Example 1, you might already have the dataset for *con1* and *con2*. 52 | 53 | #### Step 2. 
54 | ``` 55 | mkdir ecoli_pred/ (if not exist) 56 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/meth11_lib3/ --Ref ref/Ecoli_k12_mg1655.fasta --Base A --FileID gAtc --modfile DeepMod/train_mod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 57 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/meth1_lib1/ --Ref ref/Ecoli_k12_mg1655.fasta --Base A --FileID tcgA --modfile DeepMod/train_mod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 58 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/meth4_lib1/ --Ref ref/Ecoli_k12_mg1655.fasta --Base A --FileID gaAttc --modfile DeepMod/train_mod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 59 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/Control_lib1/ --Ref ref/Ecoli_k12_mg1655.fasta --Base A --FileID con1a --modfile DeepMod/train_mod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 60 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/Control_lib3/ --Ref ref/Ecoli_k12_mg1655.fasta --Base A --FileID con2a --modfile DeepMod/train_mod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 61 | ``` 62 | #### Step 3. 63 | The following commands then can be used to calculate average precision and AUC values of DeepMod. 
64 | ``` 65 | python DeepMod/tools/cal_EcoliDetPerf.py ecoli_pred/gAtc ref/Ecoli_k12_mg1655.fasta gAtc 1 '' 1000000 2000000 ecoli_pred/gAtc/ ecoli_pred/con1a,ecoli_pred/con2a 66 | python DeepMod/tools/cal_EcoliDetPerf.py ecoli_pred/tcgA ref/Ecoli_k12_mg1655.fasta tcgA 3 '' 1000000 2000000 ecoli_pred/tcgA/ ecoli_pred/con1a,ecoli_pred/con2a 67 | python DeepMod/tools/cal_EcoliDetPerf.py ecoli_pred/gaAttc ref/Ecoli_k12_mg1655.fasta gaAttc 2 '' 1000000 2000000 ecoli_pred/gaAttc/ ecoli_pred/con1a,ecoli_pred/con2a 68 | ``` 69 | The commands above will generate AP plots and AUC plots under the directory of *ecoli_pred/gAtc/*, *ecoli_pred/tcgA/*, *ecoli_pred/gaAttc/* which are performance as shown in Figure 3 (a) and the supplementary Figure 6. 70 | 71 | ## Example 3: Detect 5mC on Na12878 72 | ### Step 1. datasets 73 | You might need to [Na12878 Nanopore sequencing data](https://github.com/nanopore-wgs-consortium/NA12878/blob/master/nanopore-human-genome/rel_3_4.md) to download fast5 files. Please note that the whole dataset is ~30TB. 74 | 75 | ### Step 2. 76 | Since it is very large for NA12878 Nanopore sequencing data, users can run each of tar files (each chromomsome has 1 to 9 tar files) separately to speed up the detection process. An example of running DeepMod on a template tar file is given below: 77 | ``` 78 | mkdir na12878_pred 79 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/chr1/tar1 --Ref ref/hg38.fasta --FileID chr1_tar1 --modfile DeepMod/train_mod/rnn_conmodC_P100wd21_f7ne1u0_4/mod_train_conmodC_P100wd21_f3ne1u0 --threads 15 --outFolder na12878_pred/ 80 | ``` 81 | 82 | ### Step 3. 83 | Then, the following command can be used to merge all results in Step 2. 84 | ``` 85 | python DeepMod/tools/sum_chr_mod.py na12878_pred/ C na12878_C 86 | ``` 87 | Then, the results will be under the directory of *na12878_pred/* and the result file names start with *na12878_C* and end with '.bed' in a bed format. The results are grouped by chromosomes. 
88 | 89 | ### Step 4 (optional) 90 | This step is to consider the cluster effect of 5mC in human genome. To do that, a CpG index in a human genome will be generated. 91 | ``` 92 | python DeepMod/tools/generate_motif_pos.py ref/hg38.fa genome_motif/C C CG 0 93 | ``` 94 | The results are under the directory of *genome_motif/C*. 95 | 96 | After that, a second deep learning process will be used to consider cluster effect. 97 | ``` 98 | python DeepMod/tools/hm_cluster_predict.py na12878_pred/na12878_C genome_motif/C 99 | ``` 100 | The script will take all *na12878_pred/na12878_C.chr[12....].C.bed* as input, and output the bed files with the file name format of *na12878_pred/na12878_C_clusterCpG.chr[12....].C.bed* 101 | 102 | ### Step 5 103 | To evaluate DeepMod's performance on NA12878, users might use bisulfite sequencing results from https://www.encodeproject.org/experiments/ENCSR890UQO/. Due to the heterogeneity of sequenced samples, completely methylated and completely un-methylated bases could be used for the evaluation: a genomic position of a base was considered to be completely methylated if its methylation percentage >=90% in both replicates of bisulfite sequencing with coverage>=c (c could be 1, 5 or 10), and to be completely un-methylated if its methylation percentage is 0% in both replicates. 104 | 105 | The modification detection on HX1 can be run in a similar way to that on NA12878 but with a different `--modfile`: *DeepMod/train_mod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10*. 106 | 107 | ## Reference 108 | 1. Stoiber MH, et al. De novo Identification of DNA Modifications Enabled by Genome-Guided Nanopore Signal Processing. bioRxiv 10.1101/094672, (2017). 109 | -------------------------------------------------------------------------------- /docs/Results_explanation.md: -------------------------------------------------------------------------------- 1 | This document describes the format of the output of DeepMod. 2 | 3 | # 1.
Check whether the running of DeepMod is successful. 4 | After finishing the running of DeepMod, usually you will find a "\*.done" file under the output folder. If you cannot see that file, some errors might happen and the running of DeepMod is not successful. One common cause behind this is the memory issue, that is, DeepMod needs more memory than what you requested or have. Increasing memory generally solve the issue. 5 | 6 | # 2 Format of output 7 | The output is in a BED format like below. The first six columns are `Chr`, `Start pos`, `End pos`, `Base`, `Capped coverage`, and `Strand`, and the last three columns are `Real coverage`, `Mehylation percentage` and `Methylation coverage`. 8 | 9 | ``` 10 | chr6 148655 148656 C 10 - 148655 148656 0,0,0 10 10 1 11 | chr6 148657 148658 C 12 + 148657 148658 0,0,0 12 8 1 12 | chr6 148674 148675 C 14 - 148674 148675 0,0,0 14 7 1 13 | chr6 148675 148676 C 15 - 148675 148676 0,0,0 15 6 1 14 | chr6 148676 148677 C 14 - 148676 148677 0,0,0 14 7 1 15 | chr6 148684 148685 C 12 - 148684 148685 0,0,0 12 25 3 16 | chr6 148685 148686 C 16 - 148685 148686 0,0,0 16 6 1 17 | chr6 148689 148690 C 11 + 148689 148690 0,0,0 11 72 8 18 | chr6 148691 148692 C 10 + 148691 148692 0,0,0 10 50 5 19 | chr6 148693 148694 C 8 + 148693 148694 0,0,0 8 100 8 20 | chr6 148694 148695 C 11 - 148694 148695 0,0,0 11 54 6 21 | chr6 148695 148696 C 10 + 148695 148696 0,0,0 10 90 9 22 | chr6 148697 148698 C 12 + 148697 148698 0,0,0 12 50 6 23 | chr6 148699 148700 C 9 + 148699 148700 0,0,0 9 22 2 24 | chr6 148701 148702 C 13 - 148701 148702 0,0,0 13 7 1 25 | chr6 148703 148704 C 13 - 148703 148704 0,0,0 13 15 2 26 | chr6 148706 148707 C 9 - 148706 148707 0,0,0 9 22 2 27 | ``` 28 | -------------------------------------------------------------------------------- /docs/Usage.md: -------------------------------------------------------------------------------- 1 | **This is an explanation of how to use DeepMod without examples. 
If you want to run some examples, please refer to [demo](https://github.com/WGLab/DeepMod/blob/master/docs/Reproducibility.md).** 2 | 3 | 4 | The inputs of DeepMod are a group of FAST5 files and a reference genome. FAST5 files need to be basecalled already, and `Events` data must be available in FAST5 files. 5 | 6 | DeepMod has a functional module called "detect" which will detect a specific modification in a single run. However, if the dataset and genome size is very large or one wants to have the results soon, it would be better to run "detect" in separate processes simultaneously and then merge them together. For some special cases, if there is cluster effect between modifications (such as 5mC in CpG cluster), a third process would be used for additional prediction after "detect". How to use the three functional modules is described below. 7 | 8 | # 1. How to detect modifications from FAST5 files. 9 | The command for modification detection is to run `python DeepMod.py detect`. Without any other parameters, the help document will be shown. An example of how to use it is given below. 10 | 11 | ``` 12 | python DeepMod/bin/DeepMod.py detect --wrkBase FAST5-Folder --Ref Ref_genome_path --outFolder out_folder --Base C --modfile train_mod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10 --FileID User_Uniq_name --threads 4 13 | ``` 14 | where users need to provide where the FAST5 files are (`--wrkBase`), where the reference genome is (`--Ref`), where the output folder is (`--outFolder`), and also the base of interest and the mod file. Users can optionally specify a unique string for the results file names (`--FileID`) and how many threads are used (`--threads`). 15 | 16 | If you want to make the prediction for base `A`, the following command could be used.
17 | ``` 18 | python DeepMod/bin/DeepMod.py detect --wrkBase FAST5-Folder --Ref Ref_genome_path --outFolder out_folder --Base A --modfile train_mod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0 --FileID User_Uniq_name --threads 4 19 | ``` 20 | 21 | 22 | # 2. How to merge different runs of modification detection 23 | Some projects might generate very large Nanopore sequencing data. For example, [NA12878 Nanopore sequencing data](https://github.com/nanopore-wgs-consortium/NA12878/blob/master/nanopore-human-genome/rel_3_4.md) was ~30TB. To speed up the detection of modification, users can run DeepMod with different `--FileID` and folders where fast5 files are (`--wrkBase`) but the same output folder (`--outFolder`). Then, the following script can be used to merge modification detection grouped by chromosomes for human genome. 24 | ``` 25 | python DeepMod/tools/sum_chr_mod.py outFolder base-of-interest res-unique-filename chromosomes 26 | ``` 27 | The last parameter is optional if running on human genome; otherwise, the chromosomes should be provided by a string where chromosome names are seperated by ','. `res-unique-filename` is a unique prefix of the summary file under `outFolder`. `outFolder` is the output folder of `DeepMod` and the prediction of `DeepMod` must be in the sub-folders under `outFolder`. 28 | 29 | # 3. How to consider modification cluster effect. 30 | 5mC in CpG motifs has cluster effect in human genome. To consider cluster effect, a second deep learning process was designed to improve the 5mC detection performance. To do that, additional commands below are used 31 | 32 | ## Output C in CpG motifs in a genome 33 | ``` 34 | python DeepMod/tools/generate_motif_pos.py ref-genome result-folder C CG 0 35 | ``` 36 | The result files were generated under the directory of *result-folder*. 37 | 38 | ### Generated clustered results. 
39 | ``` 40 | python DeepMod/tools/hm_cluster_predict.py prefix-merged-bed-files genome_motif_folder-in-last-step 41 | ``` 42 | The output files will be under the same directory of *prefix-merged-bed-files* but with the prefix of *prefix-merged-bed-files* by appending "_clusterCpG". 43 | 44 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os,sys 4 | import setuptools 5 | 6 | 7 | with open("README.md", "r") as fh: 8 | long_description = fh.read() 9 | 10 | 11 | setuptools.setup( 12 | name="DeepMod", # Replace with your own username 13 | version="0.1.3", 14 | author="Qian Liu", 15 | author_email="", 16 | description="A deep-learning tool to detect DNA modifications using Nanopore long-read sequencing data", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | url="https://github.com/WGLab/DeepMod", 20 | #packages=setuptools.find_packages(), 21 | packages=['DeepMod_scripts'], 22 | package_dir={'DeepMod_scripts': 'bin/DeepMod_scripts'}, 23 | scripts=['bin/DeepMod.py', 'DeepMod_tools/cal_EcoliDetPerf.py', 'DeepMod_tools/generate_motif_pos.py', 'DeepMod_tools/hm_cluster_predict.py', 'DeepMod_tools/sum_chr_mod.py'], 24 | package_data={'train_deepmod': ['train_deepmod/*/*']}, 25 | #data_files=[('train_deepmod', ['train_deepmod/*'])], 26 | classifiers=[ 27 | "Programming Language :: Python", 28 | 'Intended Audience :: Science/Research', 29 | "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)", 30 | "Operating System :: OS Independent", 31 | ], 32 | ) 33 | -------------------------------------------------------------------------------- /train_deepmod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/Cg.cov5.nb25.data-00000-of-00001: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/Cg.cov5.nb25.data-00000-of-00001 -------------------------------------------------------------------------------- /train_deepmod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/Cg.cov5.nb25.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/Cg.cov5.nb25.index -------------------------------------------------------------------------------- /train_deepmod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/Cg.cov5.nb25.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/Cg.cov5.nb25.meta -------------------------------------------------------------------------------- /train_deepmod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "Cg.cov5.nb25" 2 | all_model_checkpoint_paths: "Cg.cov5.nb25" 3 | -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_E1m2wd21_f7ne1u0_4/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "mod_train_conmodA_E1m2wd21_f3ne1u0" 2 | all_model_checkpoint_paths: "../2/mod_train_conmodA_E1m2wd21_f3ne1u0" 3 | all_model_checkpoint_paths: "../2.50/mod_train_conmodA_E1m2wd21_f3ne1u0" 4 | all_model_checkpoint_paths: "../3/mod_train_conmodA_E1m2wd21_f3ne1u0" 5 | all_model_checkpoint_paths: "../3.50/mod_train_conmodA_E1m2wd21_f3ne1u0" 6 | all_model_checkpoint_paths: 
"mod_train_conmodA_E1m2wd21_f3ne1u0" 7 | -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_E1m2wd21_f7ne1u0_4/mod_train_conmodA_E1m2wd21_f3ne1u0.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodA_E1m2wd21_f7ne1u0_4/mod_train_conmodA_E1m2wd21_f3ne1u0.data-00000-of-00001 -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_E1m2wd21_f7ne1u0_4/mod_train_conmodA_E1m2wd21_f3ne1u0.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodA_E1m2wd21_f7ne1u0_4/mod_train_conmodA_E1m2wd21_f3ne1u0.index -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_E1m2wd21_f7ne1u0_4/mod_train_conmodA_E1m2wd21_f3ne1u0.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodA_E1m2wd21_f7ne1u0_4/mod_train_conmodA_E1m2wd21_f3ne1u0.meta -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_P100wd21_f7ne1u0_4/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "mod_train_conmodA_P100wd21_f3ne1u0" 2 | all_model_checkpoint_paths: "../2/mod_train_conmodA_P100wd21_f3ne1u0" 3 | all_model_checkpoint_paths: "../2.50/mod_train_conmodA_P100wd21_f3ne1u0" 4 | all_model_checkpoint_paths: "../3/mod_train_conmodA_P100wd21_f3ne1u0" 5 | all_model_checkpoint_paths: "../3.50/mod_train_conmodA_P100wd21_f3ne1u0" 6 | all_model_checkpoint_paths: "mod_train_conmodA_P100wd21_f3ne1u0" 7 
| -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0.data-00000-of-00001 -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0.index -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0.meta -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodC_P100wd21_f7ne1u0_4/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "mod_train_conmodC_P100wd21_f3ne1u0" 2 | all_model_checkpoint_paths: "../2/mod_train_conmodC_P100wd21_f3ne1u0" 3 | all_model_checkpoint_paths: "../2.50/mod_train_conmodC_P100wd21_f3ne1u0" 4 | all_model_checkpoint_paths: "../3/mod_train_conmodC_P100wd21_f3ne1u0" 5 | all_model_checkpoint_paths: "../3.50/mod_train_conmodC_P100wd21_f3ne1u0" 6 | all_model_checkpoint_paths: "mod_train_conmodC_P100wd21_f3ne1u0" 7 | 
-------------------------------------------------------------------------------- /train_deepmod/rnn_conmodC_P100wd21_f7ne1u0_4/mod_train_conmodC_P100wd21_f3ne1u0.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodC_P100wd21_f7ne1u0_4/mod_train_conmodC_P100wd21_f3ne1u0.data-00000-of-00001 -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodC_P100wd21_f7ne1u0_4/mod_train_conmodC_P100wd21_f3ne1u0.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodC_P100wd21_f7ne1u0_4/mod_train_conmodC_P100wd21_f3ne1u0.index -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodC_P100wd21_f7ne1u0_4/mod_train_conmodC_P100wd21_f3ne1u0.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodC_P100wd21_f7ne1u0_4/mod_train_conmodC_P100wd21_f3ne1u0.meta -------------------------------------------------------------------------------- /train_deepmod/rnn_f7_wd21_chr1to10_4/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "mod_train_f7_wd21_chr1to10" 2 | all_model_checkpoint_paths: "../30.6/mod_train_f7_wd21_chr1to10" 3 | all_model_checkpoint_paths: "../30.7/mod_train_f7_wd21_chr1to10" 4 | all_model_checkpoint_paths: "../30.8/mod_train_f7_wd21_chr1to10" 5 | all_model_checkpoint_paths: "../30.9/mod_train_f7_wd21_chr1to10" 6 | all_model_checkpoint_paths: "mod_train_f7_wd21_chr1to10" 7 | -------------------------------------------------------------------------------- 
/train_deepmod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10.data-00000-of-00001 -------------------------------------------------------------------------------- /train_deepmod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10.index -------------------------------------------------------------------------------- /train_deepmod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10.meta -------------------------------------------------------------------------------- /train_deepmod/rnn_sinmodC_P100wd21_f7ne1u0_4/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "mod_train_sinmodC_P100wd21_f3ne1u0" 2 | all_model_checkpoint_paths: "../2/mod_train_sinmodC_P100wd21_f3ne1u0" 3 | all_model_checkpoint_paths: "../2.50/mod_train_sinmodC_P100wd21_f3ne1u0" 4 | all_model_checkpoint_paths: "../3/mod_train_sinmodC_P100wd21_f3ne1u0" 5 | all_model_checkpoint_paths: "../3.50/mod_train_sinmodC_P100wd21_f3ne1u0" 6 | all_model_checkpoint_paths: "mod_train_sinmodC_P100wd21_f3ne1u0" 7 | -------------------------------------------------------------------------------- /train_deepmod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0.data-00000-of-00001: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0.data-00000-of-00001 -------------------------------------------------------------------------------- /train_deepmod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0.index -------------------------------------------------------------------------------- /train_deepmod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0.meta --------------------------------------------------------------------------------