├── DeepMod_tools ├── cal_EcoliDetPerf.py ├── generate_motif_pos.py ├── hm_cluster_predict.py └── sum_chr_mod.py ├── INSTALL ├── LICENSE ├── README.md ├── bin ├── DeepMod.py └── DeepMod_scripts │ ├── EventTable.py │ ├── MoveTable.py │ ├── __init__.py │ ├── myCom.py │ ├── myDetect.py │ ├── myGetFeatureBasedPos.py │ └── myMultiBiRNN.py ├── docs ├── Description of well-trained models.md ├── Install.md ├── Reproducibility.md ├── Results_explanation.md └── Usage.md ├── setup.py └── train_deepmod ├── na12878_cluster_train_mod-keep_prob0.7-nb25-chr1 ├── Cg.cov5.nb25.data-00000-of-00001 ├── Cg.cov5.nb25.index ├── Cg.cov5.nb25.meta └── checkpoint ├── rnn_conmodA_E1m2wd21_f7ne1u0_4 ├── checkpoint ├── mod_train_conmodA_E1m2wd21_f3ne1u0.data-00000-of-00001 ├── mod_train_conmodA_E1m2wd21_f3ne1u0.index └── mod_train_conmodA_E1m2wd21_f3ne1u0.meta ├── rnn_conmodA_P100wd21_f7ne1u0_4 ├── checkpoint ├── mod_train_conmodA_P100wd21_f3ne1u0.data-00000-of-00001 ├── mod_train_conmodA_P100wd21_f3ne1u0.index └── mod_train_conmodA_P100wd21_f3ne1u0.meta ├── rnn_conmodC_P100wd21_f7ne1u0_4 ├── checkpoint ├── mod_train_conmodC_P100wd21_f3ne1u0.data-00000-of-00001 ├── mod_train_conmodC_P100wd21_f3ne1u0.index └── mod_train_conmodC_P100wd21_f3ne1u0.meta ├── rnn_f7_wd21_chr1to10_4 ├── checkpoint ├── mod_train_f7_wd21_chr1to10.data-00000-of-00001 ├── mod_train_f7_wd21_chr1to10.index └── mod_train_f7_wd21_chr1to10.meta └── rnn_sinmodC_P100wd21_f7ne1u0_4 ├── checkpoint ├── mod_train_sinmodC_P100wd21_f3ne1u0.data-00000-of-00001 ├── mod_train_sinmodC_P100wd21_f3ne1u0.index └── mod_train_sinmodC_P100wd21_f3ne1u0.meta /DeepMod_tools/cal_EcoliDetPerf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, sys, time 4 | from collections import defaultdict 5 | import glob 6 | import copy 7 | import numpy as np 8 | 9 | import matplotlib 10 | matplotlib.use('Agg') 11 | import matplotlib.pyplot as plt 12 | from itertools import cycle 13 | 
14 | from sklearn.metrics import roc_curve, auc 15 | from sklearn.metrics import precision_recall_curve, average_precision_score 16 | from sklearn.metrics import matthews_corrcoef 17 | 18 | import rpy2.robjects as robjects 19 | from rpy2.robjects.packages import importr 20 | from pkg_resources import resource_string 21 | 22 | from scipy.stats import binom 23 | import copy 24 | 25 | ggplot = importr('ggplot2') 26 | importr('gridExtra') 27 | importr('plyr') 28 | 29 | na4com = {'A':'T', 'C':'G', 'T':'A', 'G':'C'} 30 | 31 | def readFA(mfa, mpat='Cg', mposinpat=0, t_chr=None, t_start=None, t_end=None): 32 | pos_dict = defaultdict(int) 33 | 34 | pat3 = copy.deepcopy(mpat.upper()) 35 | comp_pat3 = ''.join([na4com[curna] for curna in pat3][::-1]) 36 | comp_mposinpat = len(comp_pat3)-1-mposinpat 37 | 38 | fadict = defaultdict(); 39 | with open(mfa, 'r') as mr: 40 | cur_chr = None; 41 | 42 | line = mr.readline(); 43 | while line: 44 | line = line.strip(); 45 | if len(line)>0: 46 | if line[0]=='>': 47 | if not cur_chr==None: 48 | fadict[cur_chr] = ''.join(fadict[cur_chr]) 49 | cur_chr = line[1:].split()[0] 50 | if t_chr in [None, cur_chr]: 51 | fadict[cur_chr] = [] 52 | else: 53 | if t_chr in [None, cur_chr]: 54 | fadict[cur_chr].append(line) 55 | line = mr.readline(); 56 | if not cur_chr==None: 57 | fadict[cur_chr] = ''.join(fadict[cur_chr]) 58 | fakeys = fadict.keys(); 59 | cpgdict = defaultdict(int); cpgnum = [0, 0] 60 | for fak in fakeys: 61 | cpgdict[fak] = defaultdict() 62 | for i in range(len(fadict[fak])): 63 | if (t_start==None or i>=t_start) and (t_end==None or i<=t_end): 64 | if i-mposinpat>=0 and i+len(comp_pat3)-1-mposinpat=0 and i+len(comp_pat3)-1-comp_mposinpat0: 83 | lsp = line.split(); 84 | cur_chr = lsp[0]; 85 | cur_pos = int(lsp[1]); 86 | cur_strand = lsp[5]; 87 | 88 | cur_cov = int(lsp[9]); 89 | cur_m_p = int(lsp[10]); 90 | cur_m_c = int(lsp[11]); 91 | 92 | if not ((t_start==None or cur_pos>=t_start) and (t_end==None or cur_pos<=t_end)): 93 | line = 
mr.readline(); 94 | continue; 95 | 96 | if not (mna==lsp[3] and lsp[3]==(cpgdict[cur_chr][(cur_strand, cur_pos)][1] if cur_strand=='+' else na4com[cpgdict[cur_chr][(cur_strand, cur_pos)][1]])): 97 | print ('Error !! NA not equal %s == %s == %s %s' % (mna, lsp[3], cpgdict[cur_chr][(cur_strand, cur_pos)][1], modf)) 98 | 99 | if (cur_chr, cur_pos, cur_strand) not in pred_dict: 100 | pred_dict[(cur_chr, cur_pos, cur_strand)] = [cur_cov, cur_m_p, cur_m_c, lsp[3]] 101 | else: 102 | pred_dict[(cur_chr, cur_pos, cur_strand)][0] += cur_cov 103 | pred_dict[(cur_chr, cur_pos, cur_strand)][2] += cur_m_c 104 | pred_dict[(cur_chr, cur_pos, cur_strand)][1] = int(pred_dict[(cur_chr, cur_pos, cur_strand)][2]*100/pred_dict[(cur_chr, cur_pos, cur_strand)][0]) if pred_dict[(cur_chr, cur_pos, cur_strand)][0]>0 else 0 105 | 106 | def add_from_dict(cpgdict, pred_dict, label, pred_list, mna, tp_fp_tn_fn, mpat='Cg', mposinpat=0): 107 | for posk in pred_dict: 108 | cur_chr, cur_pos, cur_strand = posk 109 | cur_cov, cur_m_p, cur_m_c, lsp3 = pred_dict[posk] 110 | 111 | iscpg = False; 112 | if cpgdict[cur_chr][(cur_strand, cur_pos)][0]==1: 113 | iscpg = True; 114 | pred_list.append((label, cur_cov, cur_m_p, cur_m_c, mpat, np.log(binom.pmf(cur_m_c, cur_cov, 0.05)) )) 115 | if (lsp3==mpat[mposinpat]): pass 116 | else: print ('Error not methylated pos %s %s %s' % (mna, cur_strand)) 117 | if not iscpg: 118 | isclosec = False; 119 | for i in range(-3, 4): 120 | if (cur_strand, cur_pos+i) in cpgdict[cur_chr] and cpgdict[cur_chr][(cur_strand, cur_pos+i)][0]==1: 121 | isclosec = True; break; 122 | if lsp3==mpat[mposinpat]: 123 | pred_list.append((0, cur_cov, cur_m_p, cur_m_c, mpat+'_n'+str(abs(i))+mpat[mposinpat] if isclosec else 'Other'+mpat[mposinpat], np.log(binom.pmf(cur_m_c, cur_cov, 0.05)) )) 124 | else: 125 | pred_list.append((0, cur_cov, cur_m_p, cur_m_c, mpat+'_nb' if isclosec else 'Other', np.log(binom.pmf(cur_m_c, cur_cov, 0.05)) )) 126 | if pred_list[-1][0]==0: 127 | tp_fp_tn_fn[2] += 
cur_cov - cur_m_c 128 | tp_fp_tn_fn[1] += cur_m_c 129 | else: 130 | tp_fp_tn_fn[0] += cur_m_c 131 | tp_fp_tn_fn[3] += cur_cov - cur_m_c 132 | 133 | 134 | def readmodf(cpgdict, modf, label, pred_list, mna, tp_fp_tn_fn, mpat='Cg', mposinpat=0, t_start=None, t_end=None): 135 | with open(modf, 'r') as mr: 136 | line = mr.readline(); 137 | while line: 138 | line = line.strip(); 139 | if len(line)>0: 140 | lsp = line.split(); 141 | cur_chr = lsp[0]; 142 | cur_pos = int(lsp[1]); 143 | cur_strand = lsp[5]; 144 | 145 | cur_cov = int(lsp[9]); 146 | cur_m_p = int(lsp[10]); 147 | cur_m_c = int(lsp[11]); 148 | 149 | if not ((t_start==None or cur_pos>=t_start) and (t_end==None or cur_pos<=t_end)): 150 | line = mr.readline(); 151 | continue; 152 | 153 | if not (mna==lsp[3] and lsp[3]==(cpgdict[cur_chr][(cur_strand, cur_pos)][1] if cur_strand=='+' else na4com[cpgdict[cur_chr][(cur_strand, cur_pos)][1]])): 154 | print ('Error !! NA not equal %s == %s == %s %s' % (mna, lsp[3], cpgdict[cur_chr][(cur_strand, cur_pos)][1], modf)) 155 | iscpg = False; 156 | if cpgdict[cur_chr][(cur_strand, cur_pos)][0]==1: 157 | iscpg = True; 158 | pred_list.append((label, cur_cov, cur_m_p, cur_m_c, mpat, np.log(binom.pmf(cur_m_c, cur_cov, 0.05)) )) 159 | if (lsp[3]==mpat[mposinpat]): pass 160 | else: print ('Error not methylated pos %s %s %s' % (mna, cur_strand, modf)) 161 | if not iscpg: 162 | isclosec = False; 163 | for i in range(-3, 4): 164 | if (cur_strand, cur_pos+i) in cpgdict[cur_chr] and cpgdict[cur_chr][(cur_strand, cur_pos+i)][0]==1: 165 | isclosec = True; break; 166 | if lsp[3]==mpat[mposinpat]: 167 | pred_list.append((0, cur_cov, cur_m_p, cur_m_c, mpat+'_n'+str(abs(i))+mpat[mposinpat] if isclosec else 'Other'+mpat[mposinpat], np.log(binom.pmf(cur_m_c, cur_cov, 0.05)) )) 168 | else: 169 | pred_list.append((0, cur_cov, cur_m_p, cur_m_c, mpat+'_nb' if isclosec else 'Other', np.log(binom.pmf(cur_m_c, cur_cov, 0.05)) )) 170 | if pred_list[-1][0]==0: 171 | tp_fp_tn_fn[2] += cur_cov - cur_m_c 172 
| tp_fp_tn_fn[1] += cur_m_c 173 | else: 174 | tp_fp_tn_fn[0] += cur_m_c 175 | tp_fp_tn_fn[3] += cur_cov - cur_m_c 176 | line = mr.readline(); 177 | 178 | 179 | sssfolder = sys.argv[1]; # 180 | mreffile = sys.argv[2]; # 181 | mpat=sys.argv[3]; # Cg 182 | mposinpat=int(sys.argv[4]);# 0 183 | 184 | chrofinterest = sys.argv[5]; 185 | if chrofinterest=='': chrofinterest = None; 186 | stposofinterest = int(sys.argv[6]); 187 | if stposofinterest<0: stposofinterest = None; 188 | edposofinterest = int(sys.argv[7]); 189 | if edposofinterest<0: edposofinterest = None; 190 | 191 | basefig = sys.argv[8] 192 | hastwoclass = 1; 193 | 194 | sssfiles = {mpat[mposinpat]:glob.glob(os.path.join(sssfolder, 'mod_pos.*.'+mpat[mposinpat]+'.bed'))} 195 | sssfiles[mpat[mposinpat]].extend(glob.glob(os.path.join(sssfolder, '*/mod_pos.*.'+mpat[mposinpat]+'.bed'))) 196 | sssfiles[mpat[mposinpat]].extend(glob.glob(os.path.join(sssfolder, '*/*/mod_pos.*.'+mpat[mposinpat]+'.bed'))) 197 | print(str(len(sssfiles[mpat[mposinpat]])) + " " + str(sssfolder)) 198 | 199 | ## for negative; 200 | umrfiles = [] 201 | for cur_umr_f in sys.argv[9].split(','): 202 | if not os.path.isdir(cur_umr_f): 203 | print("No prediction folder {}".format(cur_umr_f)) 204 | sys.exit(1); 205 | umrfiles.extend(glob.glob(os.path.join(cur_umr_f, '*/*/mod_pos.*.'+mpat[mposinpat]+'.bed'))) 206 | umrfiles.extend(glob.glob(os.path.join(cur_umr_f, '*/mod_pos.*.'+mpat[mposinpat]+'.bed'))) 207 | umrfiles.extend(glob.glob(os.path.join(cur_umr_f, 'mod_pos.*.'+mpat[mposinpat]+'.bed'))) 208 | print(str(len(umrfiles)) + " " + str(sys.argv[9].split(','))) 209 | sys.stdout.flush() 210 | 211 | for sa in sssfiles: 212 | print (sa) 213 | for nf in sssfiles[sa]: 214 | print ('\t'+nf) 215 | 216 | cpgdict = readFA(mreffile, mpat, mposinpat, chrofinterest, stposofinterest, edposofinterest) 217 | 218 | pred_dict = defaultdict(); 219 | for modf in umrfiles: 220 | readmodf_dict(cpgdict, modf, pred_dict, mpat[mposinpat], stposofinterest, 
edposofinterest) 221 | 222 | baseinfo = [mpat, mpat+'_n1'+mpat[mposinpat], mpat+'_n2'+mpat[mposinpat], mpat+'_n3'+mpat[mposinpat], 'Other'+mpat[mposinpat], mpat+'_nb', 'Other'] 223 | 224 | classify_m = ['Methylation_Percentage'] 225 | classify_types = [baseinfo, [mpat]] 226 | filename = [['all_mp','motif_mp'] ] 227 | cov_thr = [1, 5] 228 | mlinestyle = {1:'bo-', 3:'gx--', 5:'r*-.', 7:'cs-', 10:'md--', 15:'k+-.'} 229 | 230 | pred_list = []; tp_fp_tn_fn = [0, 0, 0, 0] 231 | 232 | add_from_dict(cpgdict, pred_dict, 0, pred_list, mpat[mposinpat], tp_fp_tn_fn, mpat, mposinpat) 233 | 234 | if True: 235 | for na4 in sssfiles: 236 | for cur_f in sssfiles[na4]: 237 | print('%s %s' % (na4, cur_f)); sys.stdout.flush(); 238 | readmodf(cpgdict, cur_f, hastwoclass, pred_list, na4, tp_fp_tn_fn, mpat, mposinpat, stposofinterest, edposofinterest); 239 | pred_list = np.array(pred_list, dtype=[('Methylation', np.uint), ('Coverage', np.uint64), ('Methylation_Percentage', np.uint64), ('Methylation_Coverage', np.uint64), ('BaseInfo', 'U20'), ('logp', np.float64)]) 240 | 241 | if hastwoclass==1: 242 | cov_plot_thr = [1, 5] 243 | for ct_ind in range(len(classify_types)): 244 | ct = classify_types[ct_ind] 245 | cur_ct_data = pred_list[np.isin(pred_list['BaseInfo'], ct)] 246 | for cm_ind in range(len(classify_m)): 247 | print('basetype={} classify_measure={}'.format(ct, classify_m[cm_ind])) 248 | cm = classify_m[cm_ind] 249 | 250 | # 1 for roc, 2: pr; 251 | roc_or_pr = 2; roc_or_pr=0 252 | for roc_or_pr in range(1,3): 253 | if roc_or_pr>0: 254 | mfig= plt.figure() 255 | if roc_or_pr==2: 256 | cur_fn = basefig+'/ap_plot_met_pr_'+filename[cm_ind][ct_ind]+'.png' 257 | xylab = ['Recall', 'Precision']; leg_mpos = "lower left" 258 | for covt in cov_plot_thr: 259 | precision, recall, thresholds = precision_recall_curve(cur_ct_data['Methylation'][cur_ct_data['Coverage']>=covt], cur_ct_data[cm][cur_ct_data['Coverage']>=covt]) 260 | ap_pr = 
average_precision_score(cur_ct_data['Methylation'][cur_ct_data['Coverage']>=covt], cur_ct_data[cm][cur_ct_data['Coverage']>=covt]) 261 | plt.plot(recall, precision, mlinestyle[covt], lw=2, label='Coverage>=%d (AP=%0.3f)' % (covt, ap_pr)) 262 | print('\t\t %s %d ap=%.5f' % (cur_fn, covt, ap_pr)) 263 | elif roc_or_pr==1: 264 | xylab = ['False Positive Rate', 'True Positive Rate']; leg_mpos = "lower right" 265 | cur_fn = basefig+'/roc_plot_met_roc_'+filename[cm_ind][ct_ind]+'.png' 266 | prev = 0; prev_ind = -1 267 | for covt in cov_plot_thr: 268 | fpr, tpr, mthr = roc_curve(cur_ct_data['Methylation'][cur_ct_data['Coverage']>=covt], cur_ct_data[cm][cur_ct_data['Coverage']>=covt]) 269 | #print(','.join([str(np.round(t1, 5)) for t1 in mthr])) 270 | roc_auc = auc(fpr, tpr) 271 | if (not np.isnan(roc_auc)) and (abs(roc_auc - prev)>=0.02 or (covt>10 and abs(roc_auc - prev)>=0.005) or (cov_plot_thr.index(covt)-prev_ind>1 and abs(roc_auc - prev)>=0.005)): 272 | plt.plot(fpr, tpr, mlinestyle[covt], lw=2, label='Coverage>=%d (AUC=%0.3f)' % (covt, roc_auc)) 273 | prev = roc_auc; prev_ind = cov_plot_thr.index(covt) 274 | if not np.isnan(roc_auc): 275 | print ('\t\t %s %d %.7f' % (cur_fn, covt, roc_auc)) 276 | plt.plot([0, 1], [0, 1]) 277 | if roc_or_pr>0: 278 | plt.xlim([0.0, 1.0]); plt.ylim([0.0, 1.0]) 279 | plt.xlabel(xylab[0]); plt.ylabel(xylab[1]) 280 | plt.legend(loc=leg_mpos) 281 | mfig.savefig(cur_fn, dpi=300); plt.close(mfig) 282 | 283 | 284 | -------------------------------------------------------------------------------- /DeepMod_tools/generate_motif_pos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, sys, string, time 4 | from collections import defaultdict; 5 | import multiprocessing 6 | 7 | 8 | 9 | def read_genome(mfafile): 10 | ref_genome = defaultdict(); 11 | with open(mfafile, 'r') as mr: 12 | cur_chr = None; 13 | while True: 14 | line = mr.readline(); 15 | if not line: break; 16 | 
line = line.strip(); 17 | if len(line)==0: continue; 18 | if line[0]=='>': 19 | if not cur_chr==None: 20 | ref_genome[cur_chr] = ''.join(seqlist); 21 | cur_chr = line[1:].split()[0]; 22 | seqlist = [] 23 | else: 24 | seqlist.append(line.upper()) 25 | ref_genome[cur_chr] = ''.join(seqlist); 26 | print("Total chr: {}".format(len(ref_genome))); sys.stdout.flush() 27 | return ref_genome 28 | #ref_genome = read_genome(ref_fa); 29 | 30 | def handle_motif_pos(run_Q): 31 | handli = 0; 32 | while not run_Q.empty(): 33 | try: 34 | rgkey, ref_genome, res_folder, na_bp, curna, curmotif = run_Q.get(block=False) 35 | #print(rgkey, ref_genome, res_folder, na_bp, curna, curmotif); continue 36 | except: 37 | break; 38 | 39 | #curna_dict = defaultdict(); 40 | #curmotif_dict = defaultdict(); 41 | nafile = '%sna_%s_%s.bed' % (res_folder, rgkey, curna) 42 | motiffile = '%smotif_%s_%s.bed' % (res_folder, rgkey, curna) 43 | mw_na = open(nafile, 'w') 44 | mw_motif = open(motiffile, 'w') 45 | 46 | batchsize = 500000 47 | print("get motif for {}={}".format(rgkey, len(ref_genome))); sys.stdout.flush() 48 | cur_hi = 0; start_time = time.time(); 49 | for na_ind in range(len(ref_genome)): 50 | cur_hi += 1 51 | if cur_hi % batchsize == 0: 52 | print('\t time consuming ({})= {} {}'.format( rgkey, cur_hi, time.time() - start_time ) ) 53 | sys.stdout.flush() 54 | start_time = time.time() 55 | 56 | if (ref_genome[na_ind]==curna or na_bp[ref_genome[na_ind]]==curna): 57 | #curna_dict[(rgkey, na_ind, '+' if ref_genome[rgkey][na_ind]==curna else '-')] = True; 58 | mw_na.write('%s\t%s\t%s\n' % (rgkey, na_ind, '+' if ref_genome[na_ind]==curna else '-')) 59 | if ref_genome[na_ind]==curna and (not curmotif==None): 60 | for cur_mot in curmotif: 61 | is_mot = True; mot_ind = 0; 62 | for cur_sub_r_ind in range(na_ind - curmotif[cur_mot], na_ind + len(cur_mot) - curmotif[cur_mot] ): 63 | if cur_sub_r_ind<0 or cur_sub_r_ind>len(ref_genome)-1: 64 | is_mot = False; break; 65 | if not ref_genome[cur_sub_r_ind] == 
cur_mot[mot_ind]: 66 | is_mot = False; break; 67 | mot_ind += 1 68 | if is_mot: 69 | mw_motif.write('%s\t%s\t%s\n' % (rgkey, na_ind, '+')) 70 | mw_motif.write('%s\t%s\t%s\n' % (rgkey, na_ind+1, '-')) 71 | break; 72 | mw_na.close(); mw_motif.close() 73 | 74 | 75 | 76 | ref_fa = 'ref/hg38.fa' 77 | ref_fa = sys.argv[1] 78 | res_folder = 'genome.motif/C/' 79 | res_folder = sys.argv[2]+'/' 80 | if not os.path.isdir(res_folder): 81 | os.system('mkdir -p '+res_folder) 82 | 83 | curna='C' 84 | curmotif={'CG':0} 85 | curna = sys.argv[3]; 86 | curmotif = {sys.argv[4]:int(sys.argv[5])} 87 | 88 | if len(sys.argv)>6: 89 | chrkeys = ["chr%s" % cid for cid in sys.argv[6].split(',')] 90 | else: 91 | chrkeys = [] 92 | for i in range(1, 23): 93 | chrkeys.append("chr%d" % i) 94 | chrkeys.append("chrX") 95 | chrkeys.append("chrY") 96 | chrkeys.append("chrM") 97 | 98 | chrkeys = set(chrkeys) 99 | 100 | 101 | na_bp = {"A":"T", \ 102 | "C":"G", \ 103 | "G":"C", \ 104 | "T":"A", \ 105 | "a":"t", \ 106 | "c":"g", \ 107 | "g":"c", \ 108 | "t":"a", \ 109 | "N":"N", \ 110 | "n":"n" \ 111 | } 112 | 113 | 114 | ref_genome = read_genome(ref_fa); 115 | 116 | ############################## 117 | pmanager = multiprocessing.Manager(); 118 | run_Q = pmanager.Queue(); 119 | for curk in chrkeys: 120 | run_Q.put((curk, ref_genome[curk], res_folder, na_bp, curna, curmotif)) 121 | 122 | mhandlers = []; 123 | share_var = (run_Q, ) 124 | m_thread_num = len(chrkeys); 125 | for i in range(m_thread_num): 126 | p = multiprocessing.Process(target=handle_motif_pos, args=share_var) 127 | p.start(); 128 | mhandlers.append(p); 129 | while any(p.is_alive() for p in mhandlers): 130 | try: 131 | time.sleep(1); 132 | except: 133 | time.sleep(1); 134 | continue; 135 | 136 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /DeepMod_tools/hm_cluster_predict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python 2 | 3 | import os, sys, time 4 | from collections import defaultdict 5 | import glob 6 | 7 | import numpy as np 8 | 9 | from scipy import stats 10 | 11 | import locale 12 | locale.setlocale(locale.LC_ALL, 'en_US') 13 | 14 | import tensorflow as tf 15 | 16 | batch_size = 4096 17 | 18 | cov_thrd = 5 19 | 20 | def readBed(bedfile, t_chr=None, t_start=None, t_end=None): 21 | print('read {}'.format(bedfile)); sys.stdout.flush() 22 | beddict = defaultdict() 23 | with open(bedfile, 'r') as bedreader: 24 | start_time = time.time(); 25 | line = bedreader.readline(); 26 | while True: 27 | line = bedreader.readline(); 28 | if not line: break; 29 | 30 | line = line.strip(); 31 | if len(line)>20: 32 | mchr, start_pos, end_pos, _, _, m_strand, _, _, _, true_cov, meth_perc = line.split() 33 | start_pos = int(start_pos) 34 | true_cov = int(true_cov) 35 | if true_cov < cov_thrd: continue; 36 | meth_perc = round(int(meth_perc)/100.0, 3) 37 | if (t_chr not in [None, mchr]) or (not ((t_start==None or start_pos>=t_start) and (t_end==None or start_pos<=t_end))): 38 | continue; 39 | if true_cov==0: continue 40 | beddict[(mchr, m_strand, start_pos)] = meth_perc 41 | return beddict 42 | 43 | def readpredmod(predmodf, preddict, t_chr=None, t_start=None, t_end=None, cgCposdict=None): 44 | print('read {}'.format(predmodf)); sys.stdout.flush() 45 | with open(predmodf, 'r') as mr: 46 | while True: 47 | line = mr.readline(); 48 | if not line: break; 49 | line = line.strip(); 50 | if len(line)>0: 51 | lsp = line.split(); 52 | cur_chr = lsp[0]; 53 | cur_pos = int(lsp[1]); 54 | cur_strand = lsp[5]; 55 | 56 | if not (cgCposdict==None or (cur_chr, cur_strand, cur_pos) in cgCposdict): continue 57 | 58 | cur_cov = int(lsp[9]); 59 | cur_m_p = int(lsp[10]); 60 | cur_m_c = int(lsp[11]); 61 | if (t_chr not in [None, cur_chr]) or (not ((t_start==None or cur_pos>=t_start) and (t_end==None or cur_pos<=t_end))): 62 | continue; 63 | if cur_cov==0: continue; 64 | 65 | if (cur_chr, cur_strand, cur_pos) not 
in preddict: 66 | preddict[(cur_chr, cur_strand, cur_pos)] = [cur_cov, round(cur_m_p/100.0, 3), cur_m_c, line] 67 | else: 68 | print("Warning_duplicate {}".format(predmodf)) 69 | preddict[(cur_chr, cur_strand, cur_pos)][0] += cur_cov 70 | preddict[(cur_chr, cur_strand, cur_pos)][2] += cur_m_c 71 | if preddict[(cur_chr, cur_strand, cur_pos)][0]>0: 72 | preddict[(cur_chr, cur_strand, cur_pos)][1] = round(preddict[(cur_chr, cur_strand, cur_pos)][2]/float(preddict[(cur_chr, cur_strand, cur_pos)][0]), 3) 73 | 74 | 75 | 76 | pred_file = sys.argv[1]+'.%s.C.bed' 77 | save_file = sys.argv[1]+'_clusterCpG.%s.C.bed' 78 | gmotfolder = sys.argv[2] 79 | 80 | mpat = 'Cg'; mposinpat=0 81 | stposofinterest = None; edposofinterest = None; 82 | 83 | nbsize = 25; 84 | train_mod = 'DeepMod/train_mod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/{}.cov{}.nb{}'.format(mpat, cov_thrd, nbsize) 85 | 86 | chrkeys = [] 87 | for i in range(1, 23): 88 | chrkeys.append("chr%d" % i) 89 | chrkeys.append("chrX") 90 | chrkeys.append("chrY") 91 | chrkeys.append("chrM") 92 | 93 | 94 | new_saver = tf.train.import_meta_graph(train_mod+'.meta') 95 | print(new_saver); sys.stdout.flush() 96 | with tf.Session() as sess: 97 | print("restore model: {} {}".format(train_mod+'.meta', train_mod[:train_mod.rindex('/')+1])) 98 | print(new_saver.restore(sess,tf.train.latest_checkpoint(train_mod[:train_mod.rindex('/')+1]))); sys.stdout.flush() 99 | 100 | mgraph = tf.get_default_graph() 101 | output = mgraph.get_tensor_by_name('output:0') 102 | X = mgraph.get_tensor_by_name('X:0') 103 | keep_prob = mgraph.get_tensor_by_name('keep_prob:0') 104 | 105 | for chrofinterest in chrkeys: 106 | #read pred 107 | preddict = defaultdict() 108 | 109 | cur_cg_pos = '%s/motif_%s_C.bed' % (gmotfolder, chrofinterest) 110 | if not os.path.isfile(cur_cg_pos): 111 | print("Warning_motif!!! no file {}".format(cur_cg_pos)) 112 | continue; 113 | if not os.path.isfile(pred_file % chrofinterest): 114 | print("Warning_pred!!! 
no file {}".format(pred_file % chrofinterest)) 115 | continue; 116 | 117 | cgposdict = defaultdict(); 118 | with open(cur_cg_pos, 'r') as mr: 119 | while True: 120 | line = mr.readline(); 121 | if not line: break; 122 | lsp = line.split(); 123 | cgposdict[ (lsp[0], lsp[2], int(lsp[1]) ) ] = True 124 | print("{}: read {} done! ".format(len(cgposdict), cur_cg_pos)); sys.stdout.flush() 125 | readpredmod(pred_file % chrofinterest, preddict, chrofinterest, cgCposdict=cgposdict) 126 | print("size={} vs ".format(len(preddict), len(cgposdict) )); sys.stdout.flush() 127 | 128 | train_data = [] 129 | pdkeys = sorted(list( preddict.keys() )) 130 | for cspk in pdkeys: # preddict: 131 | if cspk not in cgposdict: 132 | print("not in cpg warning!!! {} {}".format(chrofinterest, cspk)) 133 | 134 | partner_pos = (cspk[0], '-' if cspk[1]=='+' else '+', cspk[2]+1 if cspk[1]=='+' else cspk[2]-1) 135 | cur_x = [preddict[cspk][1], preddict[partner_pos][1] if partner_pos in preddict else 0] 136 | for pdis in range(11): 137 | cur_x.append(0) 138 | cur_x.append(0) 139 | if len(train_data)<10: print("test") 140 | for rpos in range(cspk[2]-nbsize, cspk[2]+nbsize+1): 141 | if rpos in [cspk[2], partner_pos[2]]: continue; 142 | 143 | if (cspk[0], '+', rpos) in cgposdict and (cspk[0], '+', rpos) in preddict: 144 | cur_x[int(preddict[(cspk[0], '+', rpos)][1]/0.1+0.5) + 3] += 1 145 | cur_x[2] += 1 146 | if len(train_data)<10: print("\t\t{}: {}".format((cspk[0], '+', rpos), preddict[(cspk[0], '+', rpos)])) 147 | elif (cspk[0], '-', rpos) in cgposdict and (cspk[0], '-', rpos) in preddict: 148 | cur_x[int(preddict[(cspk[0], '-', rpos)][1]/0.1+0.5) + 3] += 1 149 | cur_x[2] += 1 150 | if len(train_data)<10: print("\t\t{}: {}".format((cspk[0], '-', rpos), preddict[(cspk[0], '-', rpos)])) 151 | for i in range(3, len(cur_x)): 152 | if cur_x[2]>0: cur_x[i] = round(cur_x[i]/float(cur_x[2]), 3) 153 | if len(train_data)<10: print('\t{}'.format(cur_x)); sys.stdout.flush() 154 | train_data.append(cur_x) 155 | 
156 | print("format data: data={}; {}".format(len(train_data), len(train_data[0]))); sys.stdout.flush() 157 | 158 | batch_data = np.array_split(train_data, int(len(train_data)/batch_size) if len(train_data)>batch_size else 2) 159 | m_pred_new_per = [] 160 | for i in range(len(batch_data)): 161 | moutp = sess.run([output], feed_dict={X:batch_data[i], keep_prob:1}) 162 | for mpind in moutp: 163 | for curpd in mpind: 164 | m_pred_new_per.append(curpd) 165 | print("new per: {}, {} {} {}".format(len(pdkeys), len(train_data), len(m_pred_new_per), curpd )) 166 | for wind in range(10): 167 | print("'{}' <{}> {}".format(preddict[pdkeys[wind]][-1], m_pred_new_per[wind], train_data[wind])) 168 | with open(save_file % chrofinterest, 'w') as mwriter: 169 | for wind in range(len(pdkeys)): 170 | mwriter.write("{} {}\n".format(preddict[pdkeys[wind]][-1], int(m_pred_new_per[wind]*100))) 171 | 172 | 173 | -------------------------------------------------------------------------------- /DeepMod_tools/sum_chr_mod.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, sys, string 4 | import multiprocessing 5 | import time; 6 | import glob 7 | from collections import defaultdict; 8 | 9 | 10 | def mprint(mstr): 11 | print(mstr); sys.stdout.flush() 12 | 13 | if len(sys.argv)<4: 14 | print ("Usage: python {} pred_folder-of-DeepMod Base-of-interest unique-fileid-in-sum-file [chr-list]".format(sys.argv[0])) 15 | print (" pred_folder-of-DeepMod: the prediction must in its sub-folder.") 16 | sys.exit(1) 17 | 18 | pred_folder = sys.argv[1] 19 | baseofint = sys.argv[2] 20 | sum_fileid = sys.argv[3] 21 | 22 | 23 | if len(sys.argv)>4: 24 | chrkeys = ["%s" % cid for cid in sys.argv[4].split(',')] 25 | else: 26 | chrkeys = [] 27 | for i in range(1, 23): 28 | chrkeys.append("chr%d" % i) 29 | chrkeys.append("chrX") 30 | chrkeys.append("chrY") 31 | chrkeys.append("chrM") 32 | 33 | chrkeys = set(chrkeys) 34 | 35 | 
##################################### 36 | def readbed2(bedf): 37 | cur_ad = defaultdict(); 38 | with open(bedf, 'r') as mr: 39 | while True: 40 | line = mr.readline(); 41 | if not line: break; 42 | line = line.strip(); 43 | lsp = line.split(); 44 | cur_ad[(lsp[0], int(lsp[1]), lsp[5])] = [int(lsp[9]), int(lsp[11])] 45 | return cur_ad 46 | 47 | def mergeMod(g_ad, cur_ad): 48 | for k in cur_ad: 49 | if k in g_ad: 50 | g_ad[k][0] += cur_ad[k][0] 51 | g_ad[k][1] += cur_ad[k][1] 52 | else: g_ad[k] = cur_ad[k] 53 | 54 | def save_mod(res_file, amod_dict, baseOfin): 55 | poskeys = sorted(list(amod_dict.keys())) 56 | for pk in poskeys: 57 | if amod_dict[pk][1]==0: del amod_dict[pk] 58 | 59 | poskeys = list(amod_dict.keys()) 60 | poskeys = sorted(poskeys); 61 | with open(res_file, 'w') as mw: 62 | for pk in poskeys: 63 | mw.write('%s %d %d %s %d %s %d %d 0,0,0 %d %d %d\n' % (pk[0], pk[1],pk[1]+1, baseOfin,amod_dict[pk][0] if amod_dict[pk][0]<1000 else 1000,pk[2], pk[1],pk[1]+1, amod_dict[pk][0], int(amod_dict[pk][1]*100/amod_dict[pk][0]) if amod_dict[pk][0]>0 else 0, amod_dict[pk][1] )) 64 | 65 | 66 | def sum_amod_handler(run_Q): 67 | handli = 0; 68 | while not run_Q.empty(): 69 | try: 70 | ck, pred_folder, baseOfin = run_Q.get(block=False) 71 | #print (ck, pred_folder, baseOfin); continue 72 | except: 73 | break; 74 | 75 | allbedfiles = glob.glob(os.path.join(pred_folder, ("*/*/*/*.%s-.%s.bed" % (ck, baseOfin)) )) 76 | allbedfiles.extend(glob.glob(os.path.join(pred_folder, ("*/*/*.%s-.%s.bed" % (ck, baseOfin)) ))) 77 | allbedfiles.extend(glob.glob(os.path.join(pred_folder, ("*/*.%s-.%s.bed" % (ck, baseOfin)) ))) 78 | mprint ("%s - %s: %d" % (ck, baseOfin, len(allbedfiles) )) 79 | allbedfiles.extend(glob.glob(os.path.join(pred_folder, ("*/*/*/*.%s+.%s.bed" % (ck, baseOfin)) ))) 80 | allbedfiles.extend(glob.glob(os.path.join(pred_folder, ("*/*/*.%s+.%s.bed" % (ck, baseOfin)) ))) 81 | allbedfiles.extend(glob.glob(os.path.join(pred_folder, ("*/*.%s+.%s.bed" % (ck, baseOfin)) 
))) 82 | mprint ("%s -+ %s: %d" % (ck, baseOfin, len(allbedfiles) )) 83 | 84 | # 0 1 2 3 4 5 6 7 8 9 0 1 85 | #chr1 949802 949803 T 1 - 949802 949803 0,0,0 1 0 0 86 | amod_dict = defaultdict(); 87 | res_file = "%s/%s.%s.%s.bed" % (pred_folder, sum_fileid, ck, baseOfin) 88 | for bedf_ind in range(len(allbedfiles)): 89 | mprint("\t %s %s %d/%d" % (ck, baseOfin, bedf_ind+1, len(allbedfiles))) 90 | cur_ad = readbed2(allbedfiles[bedf_ind]) 91 | mergeMod(amod_dict, cur_ad) 92 | 93 | save_mod(res_file, amod_dict, baseOfin) 94 | 95 | ############################## 96 | pmanager = multiprocessing.Manager(); 97 | run_Q = pmanager.Queue(); 98 | for ck in chrkeys: 99 | run_Q.put((ck, pred_folder, baseofint)) 100 | 101 | mhandlers = []; 102 | share_var = (run_Q, ) 103 | m_thread_num = len(chrkeys); 104 | for i in range(m_thread_num+1): 105 | p = multiprocessing.Process(target=sum_amod_handler, args=share_var) 106 | p.start(); 107 | mhandlers.append(p); 108 | while any(p.is_alive() for p in mhandlers): 109 | try: 110 | time.sleep(1); 111 | except: 112 | time.sleep(1); 113 | continue; 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | 2 | Please refer to [docs/Install.md](https://github.com/WGLab/DeepMod/blob/master/docs/Install.md) 3 | 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | DeepMod is available under GPLv3. 2 | 3 | Copyright (C) <2018> 4 | 5 | DeepMod (refer to as this tool below) is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation with either version 3 of the License, or later version. 
6 | 7 | This tool is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 8 | 9 | You should have received a copy of the GNU General Public License along with this tool. If not, please refer to <https://www.gnu.org/licenses/>. 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepMod: a deep-learning tool for genomic-scale, strand-sensitive and single-nucleotide based detection of DNA modifications 2 | 3 | ## Methodology of DeepMod 4 | 5 | DeepMod is a computational tool which takes long-read signals as input and outputs a modification summary for each genomic position in a reference genome, together with a modification prediction for each base in a long read. The modification prediction model in DeepMod is a well-trained bidirectional recurrent neural network (RNN) with long short-term memory (LSTM) units. LSTM RNN is a class of artificial neural network for modeling sequential behaviors, with LSTM to preclude the vanishing gradient problem. To detect DNA modifications, normalized signals of events in a long read are rescaled to between -5 and 5, and the signal mean, standard deviation and number of signals, together with base information (denoted as the 7-feature description), are obtained for each event as input of an LSTM unit with 100 hidden nodes. DeepMod uses 3 hidden layers in its RNN. The predicted modification summary for each position is generated in BED format, indicating how many reads cover each genomic position, how many mapped bases in long reads were predicted to be modified, and the coverage percentage of predicted modifications. This modification prediction by DeepMod is strand-sensitive and single-nucleotide based. 
6 | 7 | ### Inputs of DeepMod 8 | 9 | The input of DeepMod is Nanopore long read data together with a reference genome. 10 | 11 | Please note that the default model is trained on Metrichore basecalled data. While it has reasonable performance on Albacore v1 basecalled data, it should not be used in Albacore v2 (they require different sets of models) or any Guppy basecalled data, due to the differences in basecalling approaches. We tested the newly trained model on move table basecalled data with the guppy branch of this repository and have good performance. Therefore, in 2022, we released [DeepMod2](https://github.com/WGLab/DeepMod2) which supports two modes: a Guppy basecalling mode, and a Tombo-resquiggled mode. Please note that models will be flowcell specific; right now our models are trained on R9, but if you use an R10 flowcell then you need to wait until we release the R10 model. (UPDATE: in October 2022 we released R10 models, so DeepMod2 is now the only open source software that handles R10 flowcells.) If you use DeepMod in your research, please be mindful that different basecallers/flowcells can generate very different signal properties, so the correct model (rather than the default model) in DeepMod/DeepMod2 needs to be used for your specific data set. 12 | 13 | ## System Requirements 14 | ### Hardware requirements 15 | DeepMod is based on a deep learning framework, and needs to access the raw data of Nanopore sequencing. Thus, it needs enough RAM to support the deep learning framework and enough hard drive space for the raw data of Nanopore sequencing. A GPU can substantially speed up the detection process. For optimal performance, we recommend a computer with: 16 | * RAM: 20+ GB per thread 17 | * GPU or CPU with 8+ cores 18 | * HDD or better with SSD. 
Dependent on how large raw data is (for 30X E coli data, it might need 10+GB, while for 30X human data, it might need 10+TB) 19 | 20 | ### Software requirements 21 | The developmental version of DeepMod has been tested on Linux operating system: CentOS 7.0 with both CPU and GPU machines. 22 | 23 | ### Future improvement 24 | Now, DeepMod supports basecalled data with either event tables or move tables (***Due to confusion to users who do not know the difference between main branch and guppy branch, in 2022 we made the decision to just create a DeepMod2 repository solely for move tables generated by Guppy basecaller; therefore, please do NOT use DeepMod for Guppy-basecalled data!***). But it does not support multi-fast5. For multi-fast5 issue, one can use API at https://github.com/nanoporetech/ont_fast5_api to convert multi-fast5 to single fast5 file, and then re-basecall to get event information as input of DeepMod. We have been working on improvement of DeepMod to support multi-fast5. 25 | 26 | ## Installation 27 | Please refer to [Installation](https://github.com/WGLab/DeepMod/blob/master/docs/Install.md) for how to install DeepMod. 28 | 29 | ## Usage 30 | 31 | Please refer to [Usage](https://github.com/WGLab/DeepMod/blob/master/docs/Usage.md) for how to use DeepMod. 32 | 33 | ## Examples and Reproducibility of our analysis. 34 | 35 | Please refer to [Examples and Reproducibility](https://github.com/WGLab/DeepMod/blob/master/docs/Reproducibility.md) for examples of how to run DeepMod. 36 | 37 | ## Revision History 38 | 39 | For release history, please visit [here](https://github.com/WGLab/NanoDeepMod/releases). For details, please go [here](https://github.com/WGLab/DeepMod/blob/master/README.md). 40 | 41 | ## Contact 42 | 43 | If you have any questions/issues/bugs, please post them on [GitHub](https://github.com/WGLab/DeepMod/issues). They would also be helpful to other users. 
44 | 45 | ## Reference 46 | **Please cite the publication below if you use our tool:** 47 | 48 | Q. Liu, L. Fang, G. Yu, D. Wang, C. Xiao, K. Wang. Detection of DNA base modifications by deep recurrent neural network on Oxford Nanopore sequencing data. Nat. Commun 10, 2019. Online at https://www.nature.com/articles/s41467-019-10168-2. 49 | -------------------------------------------------------------------------------- /bin/DeepMod.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os; 4 | import sys; 5 | 6 | import string; 7 | 8 | from collections import defaultdict 9 | 10 | import argparse; 11 | from argparse import RawTextHelpFormatter 12 | 13 | from DeepMod_scripts.myCom import * 14 | 15 | 16 | 17 | # three modules in DeepMod 18 | parser = argparse.ArgumentParser(description="Detect nucleotide modification from nanopore signals data.", epilog="For example, \n \ 19 | \tpython %(prog)s train: Training a modification classifier.\n \ 20 | \tpython %(prog)s detect: Detect modification by integrating all long reads. \n \ 21 | \tpython %(prog)s getfeatures: Get features for training a model. \n \ 22 | ", formatter_class=RawTextHelpFormatter); 23 | 24 | 25 | # 26 | # Return error message when a value<1 27 | # Return an empty string otherwise 28 | # 29 | def non_negative(i, mstr): 30 | if i<1: return (("\n\tError %d could not be negative(%d)" % (mstr, i))) 31 | else: return '' 32 | 33 | # 34 | # Print all parameters in stdout 35 | # 36 | def printParameters(moptions): 37 | mpkeys = moptions.keys(); #mpkeys.sort() 38 | sorted(mpkeys) 39 | print('%30s: %s' % ('Current directory', os.getcwd())) 40 | for mpk in mpkeys: 41 | print ('%30s: %s' % (mpk, str(moptions[mpk]))) 42 | sys.stdout.flush() 43 | 44 | # 45 | # Got common argument provided by users or default values. 
46 | # 47 | # 48 | def mCommonParam(margs): 49 | 50 | ErrorMessage = "" 51 | moptions = defaultdict() 52 | # how to output running message: need more control now. 53 | moptions['outLevel'] = margs.outLevel 54 | # the input working base 55 | moptions["wrkBase"] = margs.wrkBase 56 | if moptions["wrkBase"]==None: 57 | ErrorMessage = ErrorMessage + ("\n\tThe input folder is None.") 58 | 59 | # An unique ID for output 60 | # Usefull for run the program in parallel 61 | moptions["FileID"] = margs.FileID 62 | # output folder; 63 | # make it if the output folder does not exist 64 | moptions['outFolder'] = margs.outFolder 65 | moptions['outFolder'] = format_last_letter_of_folder(moptions['outFolder']) 66 | if moptions['outFolder']==None or (not os.path.isdir(moptions['outFolder'])): 67 | try: 68 | os.system('mkdir -p '+moptions['outFolder']); 69 | except: 70 | ErrorMessage = ErrorMessage + ("\n\tThe output folder (%s) does not exist and cannot be created." % moptions['outFolder']) 71 | 72 | # check all data in a recurive way 73 | moptions['recursive'] = margs.recursive 74 | # the number of threads used and the number of files handled by each thread. 
75 | moptions['files_per_thread'] = margs.files_per_thread 76 | if moptions['files_per_thread']<2: moptions['files_per_thread'] = 2 77 | # the number of threads used 78 | moptions['threads'] = margs.threads 79 | if moptions['threads']<1: moptions['threads'] = 1 80 | 81 | # windowsize: default=21 82 | moptions['windowsize'] = margs.windowsize 83 | ErrorMessage = ErrorMessage + non_negative(moptions['windowsize'], 'windowsize') 84 | if moptions['windowsize']<1: moptions['windowsize'] = 1 85 | 86 | # aligners: bwa-mem or minimap2 87 | moptions['alignStr'] = margs.alignStr; 88 | 89 | moptions['SignalGroup'] = margs.SignalGroup; 90 | 91 | moptions['move'] = margs.move 92 | 93 | return [moptions, ErrorMessage] 94 | 95 | # 96 | # detect modification for bases of interests 97 | # input is a list of fast5 files, a reference genome and a well-trained model. 98 | # 99 | def mDetect(margs): 100 | # get common parameters 101 | moptions, ErrorMessage = mCommonParam(margs) 102 | 103 | # path for basecall information in fast5 files 104 | moptions['basecall_1d'] = margs.basecall_1d 105 | moptions['basecall_2strand'] = margs.basecall_2strand 106 | # Whether consider those chromosome which contain -_:/ 107 | # default: yes; 108 | moptions['ConUnk'] = margs.ConUnk 109 | # output layer information for deep learning 110 | moptions['outputlayer'] = margs.outputlayer 111 | # base of interest 112 | moptions['Base'] = margs.Base 113 | # whether take cluster effect of methylation into consideration 114 | moptions['mod_cluster'] = margs.mod_cluster 115 | # base of interest 116 | if moptions['Base'] in ["", None]: 117 | ErrorMessage = ErrorMessage + ("\n\t Please provide a base of interest.") 118 | 119 | # predict medification for bases of interest in long reads first 120 | # only summarize them for each genomic position of interest . 
121 | moptions['predDet'] = margs.predDet 122 | if moptions['predDet']: 123 | # path to reference genome 124 | moptions['Ref'] = margs.Ref 125 | if moptions['Ref']==None or (not os.path.isfile(moptions['Ref'])): 126 | ErrorMessage = ErrorMessage + ("\n\t reference file does not exist (%s)" % moptions['Ref']) 127 | 128 | # the number of feature for each event 129 | moptions['fnum'] = margs.fnum 130 | ErrorMessage = ErrorMessage + non_negative(moptions['fnum'], 'fnum') 131 | # the number of hidden nodes 132 | moptions['hidden'] = margs.hidden 133 | ErrorMessage = ErrorMessage + non_negative(moptions['hidden'], 'hidden') 134 | # the well-trained model 135 | moptions['modfile'] = margs.modfile 136 | if moptions['modfile']==None: 137 | print("No mod file is provided. The default one is used") 138 | moptions['modfile'] = ('train_deepmod/rnn_P90wd%d_f53/mod_train_P90wd%d_f53' % (moptions['windowsize'], moptions['windowsize'])) 139 | if (not os.path.isfile(moptions['modfile']+'.meta')): 140 | moptions['modfile'] = ('{}/lib/python{}.{}/site-packages/DeepMod/train_deepmod/rnn_P90wd{}_f53/mod_train_P90wd{}_f53'.format(sys.prefix,sys.version_info.major,sys.version_info.minor, moptions['windowsize'], moptions['windowsize'])) 141 | if (not os.path.isfile(moptions['modfile']+'.meta')): 142 | ErrorMessage = ErrorMessage + ("\n\tThe meta file (%s) does not exist" % (moptions['modfile']+'.meta' if not moptions['modfile']==None else "")) 143 | else: 144 | # already done the prediction process? 
145 | # Yes: summarize the results only 146 | moptions['predpath'] = margs.predpath 147 | if moptions['predpath']==None or (not os.path.isdir(moptions['predpath'])): 148 | ErrorMessage = ErrorMessage + ("\n\tThe predpath does not exist") 149 | 150 | # specify region of interest 151 | # not consider bases outside regions in a reference genome 152 | # None: all bases of interest 153 | moptions['region'] = [ ] 154 | if margs.region == None or len(margs.region)==0: 155 | moptions['region'].append([None, None, None]) 156 | else: 157 | mregionlist = margs.region.split(';') 158 | for mr in mregionlist: 159 | mr_sp = mr.split(':') 160 | moptions['region'].append([mr_sp[0], int(mr_sp[1]) if len(mr_sp)>1 else None, int(mr_sp[2]) if len(mr_sp)>2 else None ]) 161 | 162 | print("\nNanopore sequencing data analysis is resourece-intensive and time consuming. ") 163 | print("Some potential strong recommendations are below:") 164 | print("\tIf your reference genome is large as human genome and your Nanopore data is huge,") 165 | print("\tIt would be faster to run this program parallelly to speed up.") 166 | print("\tYou might run different input folders of your fast5 files and ") 167 | print("\tgive different output names (--FileID) or folders (--outFolder)") 168 | print("\tA good way for this is to run different chromosome individually.\n") 169 | 170 | # print help information if any necessary options are not provided. 171 | printParameters(moptions) 172 | if not ErrorMessage=="": 173 | ErrorMessage = "Please provide correct parameters" + ErrorMessage 174 | print(ErrorMessage) 175 | parser.print_help(); 176 | parser.parse_args(['detect', '--help']); 177 | sys.exit(1) 178 | 179 | from DeepMod_scripts import myDetect 180 | myDetect.mDetect_manager(moptions) 181 | 182 | # 183 | # Train a model 184 | # Need to get features first. 
185 | # 186 | def mTrain(margs): 187 | from DeepMod_scripts import myMultiBiRNN 188 | 189 | # gent common options 190 | moptions, ErrorMessage = mCommonParam(margs) 191 | 192 | # network setting: the number of features and the number of hidden nodes 193 | moptions['fnum'] = margs.fnum 194 | ErrorMessage = ErrorMessage + non_negative(moptions['fnum'], 'fnum') 195 | moptions['hidden'] = margs.hidden 196 | ErrorMessage = ErrorMessage + non_negative(moptions['hidden'], 'hidden') 197 | 198 | # the output function of the deep learning model 199 | moptions['outputlayer'] = margs.outputlayer 200 | # whether using different class weights 201 | moptions['unbalanced'] = margs.unbalanced 202 | 203 | # re-load trained model and continue to train 204 | moptions['modfile'] = margs.modfile 205 | if moptions['modfile']==None: pass; 206 | elif (not os.path.isfile(moptions['modfile']+'.meta')): 207 | ErrorMessage = ErrorMessage + ("\n\tThe meta file (%s) does not exist" % (moptions['modfile']+'.meta' if not moptions['modfile']==None else "")) 208 | 209 | # read-based or region based independent training 210 | # E: region-based 211 | # P: read-based. 212 | if not margs.test==None: 213 | moptions['test'] = margs.test.split(',') 214 | if moptions['test'][0] == 'E': moptions['test'][0] = '-' 215 | elif moptions['test'][0] == 'P': moptions['test'][0] = '0' 216 | else: 217 | ErrorMessage = ErrorMessage + "Unknown option for test: the first character must be E or P "+margs.test 218 | if moptions['test'][0] in ['-']: 219 | moptions['test'][1] = int(moptions['test'][1]) * (10**6) 220 | moptions['test'][2] = int(moptions['test'][2]) * (10**6) 221 | else: moptions['test'][1] = int(moptions['test'][1])/100.0 222 | else: moptions['test'] = ['N', '100'] 223 | 224 | # print help document if necessary options are not provided. 
225 | print("Train") 226 | printParameters(moptions) 227 | if not ErrorMessage=="": 228 | ErrorMessage = "Please provide correct parameters" + ErrorMessage 229 | print(ErrorMessage) 230 | parser.print_help(); 231 | parser.parse_args(['train', '--help']); 232 | sys.exit(2) 233 | 234 | myMultiBiRNN.mMult_RNN_LSTM_train(moptions) 235 | 236 | # 237 | # get features for training 238 | # 239 | # 240 | def mGetFeatures(margs): 241 | from DeepMod_scripts import myGetFeatureBasedPos 242 | 243 | # get common options 244 | moptions, ErrorMessage = mCommonParam(margs) 245 | # motif-based data: positive or negative control data 246 | moptions['posneg'] = margs.posneg 247 | # the number of features: 7-description or 57-description 248 | moptions['fnum'] = margs.fnum 249 | ErrorMessage = ErrorMessage + non_negative(moptions['fnum'], 'fnum') 250 | # size of each bacth to store features 251 | moptions['size_per_batch'] = margs.size_per_batch 252 | if moptions['size_per_batch'] < 0.001: moptions['size_per_batch'] = 0.001 253 | 254 | # path to basecall inform in fast5 files 255 | moptions['basecall_1d'] = margs.basecall_1d 256 | moptions['basecall_2strand'] = margs.basecall_2strand 257 | 258 | # regions of interest 259 | moptions['region'] = [None, None, None] 260 | if not (margs.region==None or margs.region.strip()==''): 261 | rsp = margs.region.split(':') 262 | for rv_ind in range(len(rsp)): 263 | rsp[rv_ind] = rsp[rv_ind].strip(); 264 | if not rsp[rv_ind]=='': 265 | moptions['region'][rv_ind] = rsp[rv_ind] 266 | 267 | # referene genome 268 | moptions['Ref'] = margs.Ref 269 | if moptions['Ref']==None or (not os.path.isfile(moptions['Ref'])): 270 | ErrorMessage = ErrorMessage + ("\n\t reference file does not exist (%s)" % moptions['Ref']) 271 | 272 | # get motif-based modification 273 | # or specify by --fulmod/--anymod/--nomod 274 | moptions['motifORPos'] = margs.motifORPos 275 | if margs.motifORPos==1: 276 | moptions['motif'] = [margs.motif.upper(), margs.ModinMotif] 277 | elif 
margs.motifORPos==2: 278 | moptions['fulmod'] = margs.fulmod 279 | if moptions['fulmod']==None: # completely modificated positions 280 | ErrorMessage = ErrorMessage + ("\t There is no parameter for --fulmod.") 281 | moptions['anymod'] = margs.anymod 282 | if moptions['anymod'] == None: # patially modificated positions 283 | ErrorMessage = ErrorMessage + ("\t There is no parameter for --anymod.") 284 | moptions['nomod'] = margs.nomod 285 | if moptions['nomod'] == None: # completely unmodified posisionts 286 | ErrorMessage = ErrorMessage + ("\t There is no parameter for --nomod.") 287 | else: 288 | ErrorMessage = ErrorMessage + ("\tmotifORPos value (%d) is not supported." % margs.motifORPos) 289 | 290 | # print help document if any required options are not provided. 291 | printParameters(moptions) 292 | if not ErrorMessage=="": 293 | ErrorMessage = "Please provide correct parameters" + ErrorMessage 294 | print(ErrorMessage) 295 | parser.print_help(); 296 | parser.parse_args(['getfeatures', '--help']); 297 | sys.exit(1) 298 | 299 | myGetFeatureBasedPos.getFeature_manager(moptions) 300 | 301 | 302 | ##################################################################################### 303 | 304 | subparsers = parser.add_subparsers() 305 | parent_parser = argparse.ArgumentParser(add_help=False) 306 | 307 | # add common options 308 | com_group_for_comparison = parent_parser.add_argument_group('Common options.') 309 | com_group_for_comparison.add_argument("--outLevel", type=int, choices=[OUTPUT_DEBUG, OUTPUT_INFO, OUTPUT_WARNING, OUTPUT_ERROR], default=OUTPUT_WARNING, help=("The level for output: %d for DEBUG, %d for INFO, %d for WARNING, %d for ERROR. 
Default: %d" % (OUTPUT_DEBUG, OUTPUT_INFO, OUTPUT_WARNING, OUTPUT_ERROR, OUTPUT_WARNING))) 310 | com_group_for_comparison.add_argument("--wrkBase", help="The base folder for FAST5 files.") 311 | com_group_for_comparison.add_argument("--FileID", default="mod", help="The unique string for intermediate files and final output files. Default: 'mod'") 312 | com_group_for_comparison.add_argument("--outFolder", default='./mod_output', help="The default folder for outputing the results. Default: ./mod_output") 313 | com_group_for_comparison.add_argument("--recursive", type=int, default=1, choices=[0,1], help="Recurise to find fast5 files. Default:1") 314 | com_group_for_comparison.add_argument("--threads", type=int, default=4, help="The number of threads used (not for train). Default:4") 315 | com_group_for_comparison.add_argument("--files_per_thread", type=int, default=1000, help="The number of fast5 files for each thread (not for train). Default:500") 316 | com_group_for_comparison.add_argument("--windowsize", type=int, default=21, help="The window size to extract features. Default: 21") 317 | com_group_for_comparison.add_argument("--alignStr", type=str, default='minimap2', choices=["bwa","minimap2"], help="Alignment tools (bwa or minimap2 is supported). Default: minimap2") 318 | com_group_for_comparison.add_argument("--SignalGroup", type=str, default='simple', choices=["simple","rundif"], help="How to associate signals to each called bases. Default: simple") 319 | com_group_for_comparison.add_argument("--move", default=False, action="store_true", help="Whether the basecalled data use move tables instead of event tables. 
Default: False") 320 | 321 | # add detection options 322 | parser_detect = subparsers.add_parser('detect', parents=[parent_parser], help="Detect modifications at a genomic scale", description="Detect modifications by integrating all long reads for a genome", epilog="For example, \n \ 323 | python %(prog)s --wrkBase ctrl_oligo_SpeI_cut --FileID mod_det --outFolder ./mod_output/detect3 \n \ 324 | ", formatter_class=RawTextHelpFormatter) 325 | parser_detect.add_argument("--Ref", help="The reference sequence") 326 | parser_detect.add_argument("--predDet", type=int, default=1, choices=[0,1], help="pred first and then detect (1) or only detect (0). Default: 1") 327 | parser_detect.add_argument("--predpath", default=None, help="The file path of predictions for each fast5 file. The file pattern is *_*.detail. Default: './mod_output/pred2/'") 328 | parser_detect.add_argument("--modfile", type=str, default=None, help="The path to load training model. Default: 'mod_output/'") 329 | parser_detect.add_argument("--fnum", type=int, default=7, help="The number of features. Default: 7") 330 | parser_detect.add_argument("--hidden", type=int, default=100, help="The number of hidden node. Default: 100") 331 | parser_detect.add_argument("--basecall_1d", default="Basecall_1D_000", help="Path for basecall_1d. Default: Basecall_1D_000") 332 | parser_detect.add_argument("--basecall_2strand", default="BaseCalled_template", help="Path for basecall_2strand. 
Default: BaseCalled_template") 333 | parser_detect.add_argument("--region", default=None, help="The region of interest: for example, chr:1:100000;chr2:10000"); 334 | parser_detect.add_argument("--ConUnk", default=True, choices=[False, True], help="Whether contain unknown chromosome"); 335 | parser_detect.add_argument("--outputlayer", default="", choices=["", "sigmoid"], help="how to put activation function for output layer") 336 | parser_detect.add_argument("--Base", type=str, default='C', choices=['A', 'C', 'G', 'T'], help="Interest of bases"); 337 | parser_detect.add_argument("--mod_cluster", default=0, choices=[0,1], help="1: CpG cluster effect; 0: not"); 338 | parser_detect.set_defaults(func=mDetect) 339 | 340 | # add training options 341 | parser_training = subparsers.add_parser('train', parents=[parent_parser], help="Training a modification classifier", description="Training a modification classifier", epilog="For example, \n \ 342 | python %(prog)s --wrkBase umr --wrkBase2 sss --FileID mod_train --outFolder ./mod_output/train1 \n \ 343 | ", formatter_class=RawTextHelpFormatter) 344 | parser_training.add_argument("--wrkBase2", help="The base folder for long reads without any modifications.") 345 | parser_training.add_argument("--fnum", type=int, default=7, help="The number of features. Default: 7") 346 | parser_training.add_argument("--hidden", type=int, default=100, help="The number of hidden node. Default: 100") 347 | parser_training.add_argument("--modfile", type=str, default=None, help="The path to load training model. Default: 'mod_output/'") 348 | parser_training.add_argument("--test", help="The number of E Coli genomic position for testing. 
Default: 'E,1,2'") 349 | parser_training.add_argument("--outputlayer", default="", choices=["", "sigmoid"], help="how to put activation function for output layer") 350 | parser_training.add_argument("--unbalanced", type=int, default=0, choices=[1, 0, None], help="Whether data is unbalanced"); 351 | parser_training.set_defaults(func=mTrain) 352 | 353 | # add get-feature options 354 | parser_getfeatures = subparsers.add_parser('getfeatures', parents=[parent_parser], help="Get features for all fast5 files", description="Get features for all fast5 files", epilog="For example, \n \ 355 | python %(prog)s --wrkBase umr/160617_ecolilowinput_UMR9/called/pass --threads 48 --recursive 0 --posneg 0 --outFolder umr \n \ 356 | python %(prog)s --wrkBase sss/160617_ecolilowinput_sssiR9/called/pass --threads 48 --recursive 0 --posneg 1 --outFolder sss \n \ 357 | ", formatter_class=RawTextHelpFormatter) 358 | parser_getfeatures.add_argument("--posneg", type=int, default=0, choices=[0,1], help="The positive(1) or negative(0) class. Default: 0") 359 | parser_getfeatures.add_argument("--size_per_batch", type=int, default=1, help="The size (unit: 10^7=10M) of a feature file. Default: 1") 360 | parser_getfeatures.add_argument("--fnum", type=int, default=7, help="The number of features. Default: 7") 361 | parser_getfeatures.add_argument("--region", type=str, help="The region of interest. Set to None or empty for all. Format is chr:start_pos:end_pos") 362 | parser_getfeatures.add_argument("--basecall_1d", default="Basecall_1D_000", help="Path for basecall_1d. Default: Basecall_1D_000") 363 | parser_getfeatures.add_argument("--basecall_2strand", default="BaseCalled_template", help="Path for basecall_2strand. Default: BaseCalled_template") 364 | 365 | parser_getfeatures.add_argument("--motifORPos", type=int, default=1, help="Use Motif (1) or pos (2) for modified bases. 
Default: 1") 366 | 367 | parser_getfeatures.add_argument("--motif", default='CG', type=str, help="The motif of interest") 368 | parser_getfeatures.add_argument("--ModinMotif", default=0, type=int, help="The position of modified base in the motif of interest") 369 | parser_getfeatures.add_argument("--Ref", help="The reference sequence") 370 | 371 | parser_getfeatures.add_argument("--fulmod", type=str, help="The file pattern for full modification: bisultfiteseq/chr20_C*_0.95.txt") 372 | parser_getfeatures.add_argument("--anymod", type=str, help="The file pattern for any modification: bisultfiteseq/chr20_any_0.95.txt") 373 | parser_getfeatures.add_argument("--nomod", type=str, help="The file pattern for any modification: bisultfiteseq/chr20_no1_0.95.txt") 374 | 375 | parser_getfeatures.set_defaults(func=mGetFeatures) 376 | 377 | # no provided argument 378 | # print help document 379 | if len(sys.argv)<2: 380 | parser.print_help(); 381 | else: 382 | args = parser.parse_args() 383 | args.func(args); 384 | -------------------------------------------------------------------------------- /bin/DeepMod_scripts/EventTable.py: -------------------------------------------------------------------------------- 1 | 2 | import os,sys 3 | import numpy as np 4 | import h5py 5 | 6 | 7 | def get_extreme_N(m_signal_dif, n_splits, p_signal_start, p_signal_end, moptions, sp_param): 8 | cu_region_sort_pos = m_signal_dif[int(p_signal_start-sp_param['min_signal_num']+0.5):int(p_signal_end-sp_param['min_signal_num']+0.5)].argsort()[::-1]+p_signal_start; 9 | m_nb_pos = set(); 10 | # print n_splits, type(n_splits), p_signal_start, type(p_signal_start), p_signal_end, type(p_signal_end), sp_param['min_signal_num'], type( sp_param['min_signal_num']), type(p_signal_start+sp_param['min_signal_num']-1) 11 | m_nb_pos.update(range(p_signal_start, int(p_signal_start+sp_param['min_signal_num']-0.5))); 12 | m_nb_pos.update(range(int(p_signal_end-sp_param['min_signal_num']+1.5), p_signal_end)); 13 | 
split_points_list = [] 14 | for c_pos in cu_region_sort_pos: 15 | if c_pos not in m_nb_pos: 16 | split_points_list.append(c_pos); 17 | if (len(split_points_list)==n_splits): break; 18 | m_nb_pos.update(range(c_pos-sp_param['min_signal_num']+1, c_pos+sp_param['min_signal_num']+1)); 19 | return sorted(split_points_list); 20 | 21 | def getEvent_Info(moptions, sp_param, events_data): 22 | event_info = [] 23 | sp_param['min_signal_num'] = 4; 24 | 25 | signal_sum = np.cumsum(np.insert(np.round(sp_param['raw_signals']/50.0,5), 0, 0)); 26 | m_signal_dif = np.abs(signal_sum[sp_param['min_signal_num']:-sp_param['min_signal_num']]*2 - signal_sum[:-2*sp_param['min_signal_num']] - signal_sum[2*sp_param['min_signal_num']:]) 27 | #print (sp_param['raw_signals'][:20]); 28 | #print (np.round(sp_param['raw_signals']/50.0,5)[:20]); 29 | #print (signal_sum[:20]); 30 | #print (m_signal_dif[:20]) 31 | # sp_param['fq_seq'] = fq_data[1] 32 | last_ev_i = 0; 33 | last_signal_i = events_data[0]['start']; 34 | fq_seq_i = 2; 35 | c_move_num = 1 36 | incrrt_event_list = [] 37 | for ev_i in range(1, len(events_data)): 38 | if (events_data['move'][ev_i])==0: 39 | pass; 40 | else: 41 | c_move_num += events_data['move'][ev_i] 42 | split_points = get_extreme_N(m_signal_dif, c_move_num-1, last_signal_i, events_data[ev_i]['start']+events_data[ev_i]['length'], moptions, sp_param); 43 | #print c_move_num-1, last_signal_i, ev_i, events_data[ev_i]['start']+events_data[ev_i]['length'], split_points 44 | #for s_i in range(last_signal_i, events_data[ev_i]['start']+events_data[ev_i]['length']): 45 | # if s_i in split_points: 46 | # print '|', 47 | # print sp_param['raw_signals'][s_i], 48 | #print ''; 49 | for c_m_i in range(c_move_num-1): 50 | if c_m_i < len(split_points): 51 | h_m_i = c_m_i; 52 | c_e_p = split_points[h_m_i] 53 | else: 54 | h_m_i = len(split_points)-1 55 | c_e_p = last_signal_i + sp_param['min_signal_num'] 56 | incrrt_event_list.append(len(event_info)); 57 | 58 | c_mnn = 
np.mean(sp_param['raw_signals'][last_signal_i:c_e_p]); 59 | c_std = np.std(sp_param['raw_signals'][last_signal_i:c_e_p]); 60 | c_start = last_signal_i; 61 | c_length = c_e_p - last_signal_i; 62 | c_mode = sp_param['fq_seq'][fq_seq_i-2:fq_seq_i+3]; 63 | event_info.append((c_mnn, c_std, c_start, c_length, c_mode)) 64 | 65 | last_signal_i = split_points[h_m_i] 66 | fq_seq_i += 1; 67 | 68 | c_move_num = 1; 69 | ev_i = len(events_data)-1 70 | c_e_p = events_data[ev_i]['start'] + events_data[ev_i]['length'] 71 | c_mnn = np.mean(sp_param['raw_signals'][last_signal_i:c_e_p]); 72 | c_std = np.std(sp_param['raw_signals'][last_signal_i:c_e_p]); 73 | c_start = last_signal_i; 74 | c_length = c_e_p - last_signal_i; 75 | c_mode = sp_param['fq_seq'][fq_seq_i-2:fq_seq_i+3]; 76 | event_info.append((c_mnn, c_std, c_start, c_length, c_mode)) 77 | 78 | event_info = np.array(event_info, dtype=[('mean', '0): 105 | # print ("%d/%s %d-%d vs %d-%d %s=%s%s" % (ev_i, ei_i,events_data[ev_i]['start'], events_data[ev_i]['start']+events_data[ev_i]['length'], event_info[ei_i]['start'], event_info[ei_i]['start']+event_info[ei_i]['length'], events_data[ev_i]['model_state'][2],event_info[ei_i]['model_state'][2],sp_param['fq_seq'][ei_i+2])) 106 | # ei_i += events_data[ev_i]['move'] 107 | 108 | return event_info 109 | 110 | 111 | if __name__=='__main__': 112 | moptions = {} 113 | sp_param = {} 114 | 115 | exple_data = ['/home/liuq1/project/DeepNanoRepeat/scripts/fortest/f6343e53-9454-41ae-8398-7be6e1b7557d.fast5', \ 116 | 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/000a7916-373c-4cc3-a3f2-6bed205b09cb.fast5', \ 117 | 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/00264c38-4945-4263-ae0d-253e6c6a39ba.fast5', \ 118 | 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/0039f109-46ac-4a81-883d-b55900924dd4.fast5', \ 119 | 'data/alb231/S_053119TrainSeq3ctrloligoSpeIcut/workspace/pass/0/0045bf1d-d7be-44b1-9b6c-9bb76a634e0f.fast5' \ 120 | ] 121 | 122 | 
sp_param['f5reader'] = h5py.File(sys.argv[1] if len(sys.argv)>1 else exple_data[0], 'r'); 123 | 124 | fq_str = '/Analyses/Basecall_1D_000/BaseCalled_template/Fastq' 125 | ev_str = '/Analyses/Basecall_1D_000/BaseCalled_template/Events' 126 | fq_str = '/Analyses/Basecall_1D_001/BaseCalled_template/Fastq' 127 | ev_str = '/Analyses/Basecall_1D_001/BaseCalled_template/Events' 128 | sg_str = '/Raw/Reads/' 129 | 130 | sp_param['fq_seq'] = sp_param['f5reader'][fq_str][()].split('\n')[1]; 131 | sp_param['raw_signals'] = sp_param['f5reader'][sg_str].values()[0]['Signal'].value 132 | events_data = sp_param['f5reader'][ev_str].value; 133 | 134 | getEvent_Info(moptions, sp_param, events_data) 135 | 136 | sp_param['f5reader'].close(); 137 | 138 | -------------------------------------------------------------------------------- /bin/DeepMod_scripts/MoveTable.py: -------------------------------------------------------------------------------- 1 | 2 | import os,sys 3 | import numpy as np 4 | import h5py 5 | 6 | 7 | def getMove_Info(moptions, sp_param, move_data): 8 | ''' 9 | sp_param.keys: fq_seq, raw_signals, first_sample_template, duration_template 10 | ''' 11 | 12 | #sp_param['first_sample_template'] = sp_param['f5reader']['/Analyses/Segmentation_001/Summary/segmentation'].attrs['first_sample_template'] 13 | #sp_param['duration_template'] = sp_param['f5reader']['/Analyses/Segmentation_001/Summary/segmentation'].attrs['duration_template'] 14 | 15 | seg = "Segmentation_" + moptions['basecall_1d'].split('_')[-1] 16 | attr_path = '/'.join(['', 'Analyses', seg, 'Summary', 'segmentation']) 17 | #mv_str = '/'.join(['', 'Analyses', moptions['basecall_1d'], moptions['basecall_2strand'], 'Move']) 18 | sp_param['first_sample_template'] = sp_param['f5reader'][attr_path].attrs['first_sample_template'] 19 | sp_param['duration_template'] = sp_param['f5reader'][attr_path].attrs['duration_template'] 20 | #move_data = sp_param['f5reader'][mv_str][()] 21 | nrow = len(sp_param['fq_seq']) # row 
number of event_info; equals to the base number 22 | nsig = len(sp_param['raw_signals']) 23 | first = int(sp_param['first_sample_template']) 24 | duration = int(sp_param['duration_template']) 25 | move_info = np.empty([nrow], dtype=[('mean', '', f5k, '\n', f5data[f5k][0], '\n'])) 38 | temp_fa.flush(); 39 | if moptions['outLevel']<=myCom.OUTPUT_DEBUG: 40 | end_time = time.time(); 41 | print ("Write consuming time %d" % (end_time-start_time)) 42 | 43 | # alignment using bwa-mem or minimap2 44 | temp_sam = tempfile.NamedTemporaryFile() 45 | if moptions['alignStr']=='bwa': 46 | cmd_opt = ['mem', '-x', 'ont2d', '-v', '1', '-t', '1', moptions['Ref'], temp_fa.name] 47 | else: 48 | cmd_opt = ['-ax', 'map-ont', moptions['Ref'], temp_fa.name] 49 | returncode = subprocess.call([moptions['alignStr'],]+cmd_opt, stdout=temp_sam) 50 | if not returncode==0: 51 | print ('Fatal Error!!! returncode is non-zero(%d) for "%s"' % (returncode, curcmd)) 52 | errkey = "Cannot running aligment" 53 | for f5k in f5keys: 54 | sp_options["Error"][errkey].append(f5data[f5k][3]) 55 | return; 56 | 57 | temp_fa.close(); 58 | temp_sam.seek(0); 59 | # get sam information 60 | align_info = temp_sam.readlines() 61 | align_info = [str(align_info[i], 'utf-8').strip() for i in range(len(align_info))] 62 | temp_sam.close(); 63 | 64 | sp_param = defaultdict(); 65 | sp_param['f5data'] = f5data 66 | 67 | # for alignment 68 | f5align = defaultdict() 69 | f5keydict = defaultdict(); 70 | sp_param['ref_info'] = defaultdict() 71 | 72 | if moptions['outLevel']<=myCom.OUTPUT_DEBUG:start_time = time.time(); 73 | ilid = 0; 74 | # for each record in sam, get alignment information 75 | while ilid < len(align_info): 76 | if len(align_info[ilid])==0 or align_info[ilid][0]=='@': 77 | ilid += 1 78 | continue; 79 | 80 | sp_param['f5status'] = ""; 81 | sp_param['line'] = align_info[ilid] 82 | qname = handle_line(moptions, sp_param, f5align) 83 | if sp_param['f5status'] == "": 84 | f5keydict[qname] = True; 85 | ilid += 1 86 | 
87 | # for unmapped reads 88 | for f5k in f5keys: 89 | if f5k not in f5keydict: 90 | sp_options["Error"]["Not in alignment sam"].append(f5data[f5k][3]) 91 | 92 | if moptions['outLevel']<=myCom.OUTPUT_DEBUG: 93 | end_time = time.time(); 94 | print ("Get BAM consuming time %d" % (end_time-start_time)) 95 | 96 | sp_param['f5status']= "" 97 | sp_param['line'] = "" 98 | if moptions['outLevel']<=myCom.OUTPUT_DEBUG:start_time = time.time(); 99 | # handle each alignment 100 | handle_record(moptions, sp_options, sp_param, f5align, f5data) 101 | if moptions['outLevel']<=myCom.OUTPUT_DEBUG: 102 | end_time = time.time(); 103 | print ("Analyze & annotate & save consuming time %d" % (end_time-start_time)) 104 | 105 | # 106 | # get mapping information 107 | # then call another function to get feature of each base in a long read 108 | # 109 | def handle_record(moptions, sp_options, sp_param, f5align, f5data): 110 | alignkeys = list(f5align.keys()); 111 | # alignment detail 112 | numreg = re.compile('\d+') 113 | mdireg = re.compile('[MIDNSHPX=]{1}') 114 | 115 | feat_file_ind_dict = [] 116 | feat_list = None; feat_file_ind = 0 117 | start_c_time = time.time(); 118 | 119 | for readk in alignkeys: 120 | if len(feat_file_ind_dict)>0 and feat_list.nbytes > moptions['size_per_batch']: 121 | # save features when the size is larger than the defined size 122 | cur_feat_file_base = sp_options['ctfolder'] + '/'+str(feat_file_ind) 123 | np.savetxt(cur_feat_file_base+'.xy.gz', feat_list, fmt='%.3f') 124 | with open(cur_feat_file_base+'.xy.ind', 'w') as ind_mw: 125 | for f_ind in feat_file_ind_dict: 126 | ind_mw.write('%d %s\n' % (f_ind[1], f_ind[0])) 127 | print ("\t%s-%d Total consuming time %d" % (sp_options['ctfolder'][sp_options['ctfolder'].rfind('/'):], feat_file_ind, time.time()-start_c_time)); sys.stdout.flush() 128 | feat_file_ind_dict = [] 129 | feat_list = None; 130 | feat_file_ind += 1 131 | 132 | # get alignment detail 133 | mapq, flag, rname, pos, cigar, readseq = f5align[readk] 
134 | 135 | if not ( (rname in moptions['fulmodlist'] and len(moptions['fulmodlist'][rname])>0) or \ 136 | ((not moptions['anymodlist']==None) and rname in moptions['anymodlist'] and len(moptions['anymodlist'][rname])>0) or \ 137 | ((not moptions['nomodlist']==None) and rname in moptions['nomodlist'] and len(moptions['nomodlist'][rname])>0) ): 138 | continue; 139 | 140 | # get reference sequece 141 | if rname not in sp_param['ref_info']: 142 | myDetect.getRefSeq(moptions, sp_param, rname) 143 | refseq = sp_param['ref_info'][rname] 144 | 145 | # mapped starting position and strand 146 | pos = pos - 1 147 | forward_reverse = '-' if flag&0x10 else '+' 148 | 149 | numinfo = numreg.findall(cigar); 150 | mdiinfo = mdireg.findall(cigar) 151 | numinfo = [int(numinfo[i]) for i in range(len(numinfo))] #map(int, numinfo) 152 | 153 | # remove clip from both tails 154 | leftclip = 0; rightclip = 0; 155 | while mdiinfo[0] in ['I', 'D', 'N', 'S', 'H', 'P', 'X']: 156 | if mdiinfo[0] in ['I', 'S', 'X']: 157 | leftclip += numinfo[0]; readseq = readseq[numinfo[0]:] 158 | if mdiinfo[0] in ['H']: leftclip += numinfo[0] 159 | if mdiinfo[0] in ['D', 'N', 'X']: 160 | pos += numinfo[0] 161 | numinfo = numinfo[1:]; mdiinfo = mdiinfo[1:] 162 | while mdiinfo[-1] in ['I', 'D', 'N', 'S', 'H', 'P', 'X']: 163 | if mdiinfo[-1] in ['I', 'S', 'X']: 164 | rightclip += numinfo[-1]; readseq = readseq[:-numinfo[-1]] 165 | if mdiinfo[-1] in ['H']: rightclip += numinfo[-1] 166 | numinfo = numinfo[:-1]; mdiinfo = mdiinfo[:-1] 167 | if forward_reverse=='+': 168 | if rightclip>0: m_event = f5data[readk][1][leftclip:-rightclip] 169 | else: m_event = f5data[readk][1][leftclip:] 170 | else: 171 | if leftclip>0: m_event = f5data[readk][1][rightclip:-leftclip] 172 | else: m_event = f5data[readk][1][rightclip:] 173 | 174 | # is in region of interest if provided 175 | isinreg = False; 176 | if (moptions['region'][0] in ['', None, rname]) and \ 177 | (moptions['region'][1] in ['', None] or pos>moptions['region'][1]) 
and \ 178 | (moptions['region'][2] in ['', None] or pos+len(m_event)1: rightclip += len(m_event)-lastmatch-1 255 | # remove events whose bases are not mapped. 256 | if forward_reverse=='+': 257 | if len(m_event)-lastmatch>1: 258 | m_event = m_event[firstmatch:(lastmatch+1-len(m_event))] 259 | elif firstmatch>0: m_event = m_event[firstmatch:] 260 | else: 261 | if firstmatch>0: m_event = m_event[(len(m_event)-1-lastmatch):-firstmatch] 262 | elif len(m_event)-lastmatch>1: m_event = m_event[(len(m_event)-1-lastmatch):] 263 | # print detail if unexpected error occurs 264 | if firstmatch>0 or len(base_map_info)-last_al_match>1: 265 | if moptions['outLevel']<=myCom.OUTPUT_WARNING and ((firstmatch>0) or (len(base_map_info)-last_al_match>1 and refseq[last_match_pos+1] not in ['N'])): 266 | print ("Errorfast5"+f5data[readk][3]) 267 | print ('Warning!!! first not match', firstmatch, lastmatch, first_al_match, last_al_match, len(base_map_info), numinfo[lasmtind-2:(lasmtind+5)], mdiinfo[lasmtind-2:(lasmtind+5)], lasmtind, len(numinfo)) 268 | print('\tref='+refseq[last_match_pos:last_match_pos+20]+"\n\tred="+readseq[lastmatch:lastmatch+20]) 269 | if firstmatch>0: 270 | print('\tref='+refseq[(first_match_pos-20 if first_match_pos-20>0 else 0):first_match_pos]+"\n\tred="+readseq[(firstmatch-20 if firstmatch-20>0 else 0):firstmatch]) 271 | print('\tf=%d, chr=%s, p=%d, c=%s, s=%s' % (flag, rname, pos, cigar, readseq)) # flag, rname, pos, cigar, readseq 272 | 273 | if len(base_map_info)-last_al_match>1: 274 | base_map_info = base_map_info[first_al_match:(last_al_match+1-len(base_map_info))] 275 | elif first_al_match>0: 276 | base_map_info = base_map_info[first_al_match:] 277 | 278 | # post-process mapping information 279 | base_map_info = np.array(base_map_info, dtype=[('refbase', 'U1'), ('readbase', 'U1'), ('refbasei', np.uint64), ('readbasei', np.uint64)]) 280 | if forward_reverse=='-': 281 | base_map_info = np.flipud(base_map_info) 282 | for bmii in range(len(base_map_info)): 283 
| base_map_info['refbase'][bmii] = get_complement(base_map_info['refbase'][bmii]) 284 | base_map_info['readbase'][bmii] = get_complement(base_map_info['readbase'][bmii]) 285 | leftclip, rightclip = rightclip, leftclip 286 | if False: #True: # for test base_map_info ### for check consistency 287 | ref_align_key = '/Analyses/NanomoCorrected_000/BaseCalled_template/Alignment/genome_alignment' 288 | read_align_key = '/Analyses/NanomoCorrected_000/BaseCalled_template/Alignment/read_alignment' 289 | with h5py.File(f5data[readk][3], 'r') as mf5: 290 | read_align_list = [bt.decode(encoding="utf-8") for bt in mf5[read_align_key]] 291 | ref_align_list = [bt.decode(encoding="utf-8") for bt in mf5[ref_align_key]] 292 | for rali in range(len(read_align_list)): 293 | if not read_align_list[rali]==base_map_info['readbase'][rali]: 294 | print ("Error not equal1! %s %s %d %s" % (read_align_list[rali], base_map_info['readbase'][rali], rali, f5data[readk][3])) 295 | if not ref_align_list[rali]==base_map_info['refbase'][rali]: 296 | print ("Error not equal2! 
%s %s %d %s" % (ref_align_list[rali], base_map_info['refbase'][rali], rali, f5data[readk][3])) 297 | # 298 | # handle map like 299 | # CCG or CGG 300 | # C-G C-G 301 | # 302 | if 'motif' in moptions and moptions['motif'][0]=='CG': 303 | for ali in range(len(base_map_info)): 304 | if base_map_info['refbase'][ali]=='C' and base_map_info['readbase'][ali]=='C': 305 | if ali+1-1 and base_map_info['readbase'][ali-1]=='-' and base_map_info['refbase'][ali-1]=='C': 314 | addali = 2; 315 | while ali - addali >-1: 316 | if base_map_info['readbase'][ali-addali]=='-' and base_map_info['refbase'][ali-addali]=='C': addali += 1; 317 | else: break; 318 | if ali - addali>-1 and base_map_info['readbase'][ali-addali]=='C' and base_map_info['refbase'][ali-addali]=='C': 319 | base_map_info['readbase'][ali-1], base_map_info['readbase'][ali-addali] = base_map_info['readbase'][ali-addali], base_map_info['readbase'][ali-1] 320 | # too short reads 321 | if len(m_event)<500: 322 | sp_options["Error"]["Less(<500) events"].append(f5data[readk][3]) 323 | continue; 324 | 325 | # get feautre 326 | mfeatures,isdif = get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, leftclip, rightclip, base_map_info, forward_reverse, rname, first_match_pos, numinsert, numdel) 327 | if isdif and moptions['outLevel']<=myCom.OUTPUT_WARNING: 328 | print("Dif is true") 329 | print([lastmatch, firstmatch, first_match_pos, last_match_pos, first_al_match, last_al_match, lasmtind, len(base_map_info), nummismatch, numinsert, numdel, len(base_map_info)-nummismatch-numinsert-numdel]) 330 | 331 | # merge to previously handled features of other fast5 files 332 | if len(mfeatures)>0: 333 | if len(feat_file_ind_dict)==0: 334 | feat_file_ind_dict.append((f5data[readk][3], 0)); 335 | feat_list = mfeatures 336 | else: 337 | feat_file_ind_dict.append((f5data[readk][3], len(feat_list))) 338 | feat_list = np.concatenate((feat_list, mfeatures), axis=0) 339 | 340 | # store the last feature data. 
341 | if len(feat_file_ind_dict)>0: 342 | cur_feat_file_base = sp_options['ctfolder'] + '/'+str(feat_file_ind) 343 | np.savetxt(cur_feat_file_base+'.xy.gz', feat_list, fmt='%.3f') 344 | with open(cur_feat_file_base+'.xy.ind', 'w') as ind_mw: 345 | for f_ind in feat_file_ind_dict: 346 | ind_mw.write('%d %s\n' % (f_ind[1], f_ind[0])) 347 | print ("\t%s-%d Total consuming time %d" % (sp_options['ctfolder'][sp_options['ctfolder'].rfind('/'):], feat_file_ind, time.time()-start_c_time)); sys.stdout.flush() 348 | feat_file_ind_dict = [] 349 | feat_list = None; 350 | feat_file_ind += 1 351 | 352 | # 353 | # get feature for each base of interest in long reads according to raw signals and mapping information 354 | # 355 | def get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, start_clip, end_clip, base_map_info, forward_reverse, rname, mapped_start_pos, num_insertions, num_deletions): 356 | # event information 357 | modevents = sp_param['f5data'][readk][1] 358 | # class number, bin num and bin length 359 | clnum = 2; binnum = 50; binlen = 0.2; 360 | if forward_reverse=='+': 361 | align_ref_pos = mapped_start_pos 362 | else: 363 | align_ref_pos = mapped_start_pos + len(base_map_info) - num_insertions - 1 364 | 365 | # initialize feature matrix for all events. 
366 | if moptions['fnum']==57: 367 | #mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (binnum+3+3+4))); 368 | mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (binnum+3+3+4))); 369 | else: mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (3+3+4))); 370 | 371 | # filter poor alignment 372 | checkneighbornums = [3,6] 373 | checkratios = {3:[6,5,4,2], 6:[11,10,9,3]} 374 | checkratios = {3:[6,5,4,2], 6:[12,10,9,3]} 375 | cgpos = [[], []] 376 | affectneighbor = 1; # 2; 377 | for aligni in range(len(base_map_info)): 378 | # for methylated positions and not-used adjacent positions 379 | if 'motif' in moptions and base_map_info['readbase'][aligni]==moptions['motif'][0][moptions['motif'][1]]: 380 | m_a_st = aligni-moptions['motif'][1]; m_a_end = aligni+len(moptions['motif'][0])-moptions['motif'][1] 381 | if m_a_st>-1 and m_a_end<=len(base_map_info) and ''.join(base_map_info['readbase'][m_a_st:m_a_end])==moptions['motif'][0] and (not ''.join(base_map_info['refbase'][m_a_st:m_a_end])==moptions['motif'][0]): 382 | cgpos[1].extend([(forward_reverse, base_map_info['refbasei'][addi]) for addi in range(aligni-affectneighbor if aligni-affectneighbor>-1 else 0, aligni+affectneighbor+1 if aligni+affectneighbor+1-1 and checki-1 else 0, nextnogap+affectneighbor if nextnogap+affectneighbor-1: 420 | if not base_map_info['refbase'][prenogap]=='-': break; 421 | prenogap -= 1 422 | 423 | read0 = aligni; read1 = aligni 424 | for _ in range(affectneighbor): 425 | read0 -= 1 426 | while read0>-1: 427 | if base_map_info['readbase'][read0]=='-': read0 -= 1 428 | else: break; 429 | read1 += 1 430 | while read1-1: prenogap = read0 436 | else: prenogap = 0 437 | if read1>nextnogap: 438 | if read1> %d %d, %d-%d=%d' % (forward_reverse, f5data[readk][3], len(cgpos[0]), len(cgpos[1]), len(modevents)-end_clip-start_clip, start_clip, len(modevents), end_clip, len(modevents)-end_clip)) 447 | 448 | aligni = 0; isdif = False; 449 | for ie in 
#
# get the complementary bases
#
def get_complement(na):
    """Return the Watson-Crick complement of base `na` via myCom.na_bp;
    symbols outside myCom.acgt (e.g. '-', 'N') pass through unchanged."""
    return myCom.na_bp[na] if na in myCom.acgt else na
540 | # 541 | def handle_line(moptions, sp_param, f5align): 542 | lsp = sp_param['line'].split('\t') 543 | qname, flag, rname, pos, mapq, cigar, _, _, _, seq, _ = lsp[:11] 544 | # checked query name 545 | if qname=='*': sp_param['f5status'] = "qname is *" 546 | # check mapping quality 547 | elif int(mapq)==255: sp_param['f5status'] = "mapq is 255" 548 | # check mapped positions 549 | elif int(pos)==0: sp_param['f5status'] = "pos is 0" 550 | # check mapped string 551 | elif cigar=='*': sp_param['f5status'] = "cigar is *" 552 | # check reference name 553 | elif rname=='*': sp_param['f5status'] = "rname is *" 554 | if not sp_param['f5status']=="": return qname 555 | 556 | if (qname not in f5align) or f5align[qname][0]0: 597 | if line[0]=='>': # for each chromosome line 598 | if (not cur_chr==None) and (t_chr in [None, cur_chr]): 599 | fadict[cur_chr] = ''.join(fadict[cur_chr]) 600 | cur_chr = line[1:].split()[0] 601 | if t_chr in [None, cur_chr]: 602 | fadict[cur_chr] = [] 603 | else: # for sub-sequence line in a reference file 604 | if t_chr in [None, cur_chr]: 605 | fadict[cur_chr].append(line.upper()) 606 | line = mr.readline(); 607 | # for the last chromosome in the file 608 | if (not cur_chr==None) and (t_chr in [None, cur_chr]): 609 | fadict[cur_chr] = ''.join(fadict[cur_chr]) 610 | return fadict 611 | 612 | # 613 | # get reference positions for motif-based modifications 614 | # 615 | def readMotifMod(fadict, mpat='Cg', mposinpat=0, t_chr=None, t_start=None, t_end=None): 616 | pos_dict = defaultdict(int) 617 | 618 | # get motif and complementary motif 619 | pat3 = copy.deepcopy(mpat.upper()) 620 | comp_pat3 = ''.join([get_complement(curna) for curna in pat3][::-1]) 621 | comp_mposinpat = len(comp_pat3)-1-mposinpat 622 | 623 | fakeys = fadict.keys(); 624 | cpgdict = defaultdict(int); 625 | all_a = defaultdict() 626 | for fak in fakeys: 627 | cpgnum = [0, 0] 628 | # motif-based reference positions 629 | cpgdict[fak] = defaultdict() 630 | # position of bases of 
interest 631 | all_a[fak] = defaultdict() 632 | for i in range(len(fadict[fak])): 633 | if (t_start==None or i>=t_start) and (t_end==None or i<=t_end): 634 | if fadict[fak][i]==mpat[mposinpat]: # for forward strand 635 | all_a[fak][('+', i)] = True; 636 | elif get_complement(fadict[fak][i])==mpat[mposinpat]: # for reverse strand 637 | all_a[fak][('-', i)] = True; 638 | 639 | # check motif in forward strand 640 | if i-mposinpat>=0 and i+len(comp_pat3)-1-mposinpat=0 and i+len(comp_pat3)-1-comp_mposinpat0: 696 | tchr, tstrand, tpos = line.split()[:3] 697 | curmeth[tchr][(tstrand, int(tpos))] = [1-mthi, fadict[tchr][int(tpos)]]; 698 | line = mreader.readline(); 699 | for tchr in moptions['fulmodlist'] if moptions['anymodlist']==None else moptions['anymodlist']: 700 | if len(moptions['fulmodlist'][tchr])>0 or ((not moptions['anymodlist']==None) and len(moptions['anymodlist'][tchr])>0): 701 | print ('%s fulmod=%d anymod=%d nomod=%d' % (tchr, len(moptions['fulmodlist'][tchr]), len(moptions['anymodlist'][tchr]) if (not moptions['anymodlist']==None) else -1, len(moptions['nomodlist'][tchr]) if (not moptions['nomodlist']==None) else -1)) 702 | 703 | if True: #False: 704 | # get all input fast5 files 705 | f5files = glob.glob(os.path.join(moptions['wrkBase'],"*.fast5" )) 706 | if moptions['recursive']==1: 707 | f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*.fast5" ))) 708 | f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*/*.fast5" ))) 709 | f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*/*/*.fast5" ))) 710 | 711 | 712 | print('Total files=%d' % len(f5files)) 713 | h5files_Q = pmanager.Queue(); 714 | failed_Q = pmanager.Queue() 715 | version_Q = pmanager.Queue() 716 | 717 | # split input fast5 files into different batch 718 | h5_batch = []; h5batchind = 0; 719 | for f5f in f5files: 720 | h5_batch.append(f5f); 721 | if len(h5_batch)==moptions['files_per_thread']: 722 | h5files_Q.put((h5_batch, h5batchind)) 723 | h5batchind += 1 724 | 
h5_batch = []; #break; ### feature500 725 | if len(h5_batch)>0: 726 | h5files_Q.put((h5_batch, h5batchind)) 727 | 728 | # each thread handle a batch a time and repeat for all batches. 729 | share_var = (moptions, h5files_Q, failed_Q, version_Q) 730 | handlers = [] 731 | for hid in range(moptions['threads']): 732 | p = multiprocessing.Process(target=getFeature_handler, args=share_var); 733 | p.start(); 734 | handlers.append(p); 735 | 736 | # get failed files. 737 | failed_files = defaultdict(list); 738 | version_default = defaultdict(lambda: defaultdict(int)); 739 | while any(p.is_alive() for p in handlers): 740 | try: 741 | errk, fns = failed_Q.get(block=False); 742 | failed_files[errk].extend(fns) 743 | curv, curv_num = version_Q.get(block=False); 744 | version_default[curv] += curv_num 745 | except: 746 | time.sleep(1); 747 | continue; 748 | 749 | # output failure information 750 | if len(failed_files)>0: 751 | print ('Error information for different fast5 files:') 752 | for errtype, errfiles in failed_files.items(): 753 | print ('\t%s %d' % (errtype, len(errfiles))) 754 | print("abversion info {}".format(str(version_default))) 755 | sys.stdout.flush() 756 | end_time = time.time(); 757 | print ("Total consuming time %d" % (end_time-start_time)) 758 | 759 | 760 | 761 | # for indepdent testing of code 762 | if __name__=='__main__': 763 | # if len(sys.argv)>4: 764 | moptions = {} 765 | moptions['basecall_1d'] = 'Basecall_1D_000' 766 | moptions['basecall_1d'] = ['Basecall_1D_000'] 767 | moptions['basecall_2strand'] = 'BaseCalled_template' 768 | 769 | moptions['outLevel'] = myCom.OUTPUT_WARNING 770 | moptions['outLevel'] = myCom.OUTPUT_INFO 771 | 772 | moptions['modfile'] = '../../mod_output/train1/2/mod_train' 773 | 774 | moptions['fnum'] = 53; 775 | moptions['hidden'] = 100; 776 | moptions['windowsize'] = 21; 777 | 778 | moptions['threads'] = 8 779 | moptions['threads'] = 1 780 | moptions['files_per_thread'] = 500 781 | 782 | mDetect_manager(moptions) 783 | 
#
# create a RNN with LSTM
# define performance evaluation operation
#
def mCreateSession(num_input, num_hidden, timesteps, moptions):
    """Build the TF1 computation graph for a 3-layer bidirectional LSTM classifier.

    Parameters:
        num_input: number of features per timestep (size of the last input axis).
        num_hidden: LSTM hidden-state size per direction.
        timesteps: window length; the prediction is taken at the center step.
        moptions: option dict; reads 'outputlayer' and optionally 'unbalanced'.

    Returns a 12-tuple of graph ops/tensors:
        (init, init_l, loss_op, accuracy, train_op, X, Y, saver,
         auc_op, mpre, mspf, mfpred)
    where X/Y are the input placeholders and mfpred is the argmax class id.
    """
    # two classes only
    num_classes = 2;
    # the number of layers
    numlayers = 3;
    # learning rate
    learning_rate = 0.001

    # define input and output
    X = tf.placeholder("float", [None, timesteps, num_input]);
    Y = tf.placeholder("float", [None, num_classes]);

    # for last layers
    weights = {'out': tf.Variable(tf.truncated_normal([2*num_hidden, num_classes]))};
    biases = {'out': tf.Variable(tf.truncated_normal([num_classes]))}

    # define a bidirectional RNN
    def BiRNN(x, weights, biases):
        # unstack (batch, timesteps, num_input) into a python list of
        # `timesteps` tensors as required by the static RNN API
        x = tf.unstack(x, timesteps, 1);

        # define the LSTM cells
        lstm_fw_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(num_hidden, forget_bias=1.0) for _ in range(numlayers)]);
        lstm_bw_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(num_hidden, forget_bias=1.0) for _ in range(numlayers)]);

        # define bidirectional RNN
        # NOTE(review): the except branch presumably handles an older TF API
        # that returned only `outputs` — confirm against the pinned TF version
        try:
            outputs, _, _ = rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, x, dtype=tf.float32);
        except Exception:
            outputs = rnn.static_bidirectional_rnn(lstm_fw_cell, lstm_bw_cell, x, dtype=tf.float32);

        # define output layer: only the CENTER timestep's output is classified
        if moptions['outputlayer'] in ['sigmoid']:
            return tf.contrib.layers.fully_connected(outputs[int(timesteps/2)], num_outputs=num_classes, activation_fn=tf.nn.sigmoid);
        else:
            return tf.matmul(outputs[int(timesteps/2)], weights['out']) + biases['out']

    # get prediction
    logits = BiRNN(X, weights, biases);
    prediction = tf.nn.softmax(logits)

    mfpred=tf.argmax(prediction,1)

    ## with different class-weights or not
    # class_weights is the module-level constant tf.constant([0.1,0.9])
    if 'unbalanced' in moptions and (not moptions['unbalanced']==None) and moptions['unbalanced']==1: # class_weights
        loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=tf.multiply(logits, class_weights), labels=Y))
    else:
        loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=Y))
    #

    # for optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate);
    train_op = optimizer.minimize(loss_op);

    # get accuracy
    correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y, 1));
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32));

    # AUC
    auc_op = tf.metrics.auc(Y, prediction)
    # precision
    mpre = tf.metrics.precision(tf.argmax(Y, 1), tf.argmax(prediction, 1))
    # recall
    mspf = tf.metrics.recall(tf.argmax(Y, 1), tf.argmax(prediction, 1))

    # initialization of variables
    init = tf.global_variables_initializer();
    init_l = tf.local_variables_initializer()

    saver = tf.train.Saver();

    return (init, init_l, loss_op, accuracy, train_op, X, Y, saver, auc_op, mpre, mspf, mfpred)
95 | # 96 | def train_save_model(filelists, num_input, mhidden, timesteps, moptions): 97 | training_steps = 4 98 | #training_steps = 40 99 | 100 | init, init_l, loss_op, accuracy, train_op, X, Y, saver, auc_op, mpre, mspf, mfpred = mCreateSession(num_input, mhidden, timesteps, moptions) 101 | 102 | # display step 103 | desplay_files = len(filelists[0])/100 104 | if desplay_files<2: desplay_files = 2; 105 | if desplay_files>10: desplay_files = int(desplay_files/10) * 10; #desplay_files=2 106 | if desplay_files>100: desplay_files = 100 107 | file_group_id = [0 for _ in range(len(filelists))]; 108 | sumpsize = 25; 109 | 110 | # for configuration 111 | config = tf.ConfigProto() 112 | if (timesteps>61 and num_input>50): 113 | config.gpu_options.per_process_gpu_memory_fraction = 0.5 114 | else: config.gpu_options.allow_growth = True 115 | with tf.Session(config=config) as sess: 116 | # initialization 117 | sess.run(init); 118 | sess.run(init_l) 119 | start_time = time.time(); start_c_time = time.time(); 120 | io_time = 0; 121 | 122 | # for each epoch 123 | for step in range(1, training_steps+1): 124 | print('===%d=====================step========================%d/%d' % (desplay_files, step, training_steps)) 125 | sys.stdout.flush() 126 | last_desplay_files_num = -1; 127 | file_group_id[0] = 0 128 | while file_group_id[0] < len(filelists[0]): 129 | io_start_time = time.time(); 130 | 131 | # for each input groups. 132 | # usually two groups: one positive group and one negative group 133 | # might also one group containing both positive and negative labelling data 134 | featurelist = [[[], []] for _ in range(len(filelists))]; 135 | minsize = None; cur_batch_num = None; 136 | # get data from all groups until 'minsize' data is loaded. 
137 | for ifl in range(len(filelists)): 138 | if ifl==0: 139 | minsize = batchsize * sumpsize 140 | else: minsize = batchsize * cur_batch_num; 141 | while len(featurelist[ifl][0])0: 148 | if len(featurelist[ifl][0])==0: 149 | featurelist[ifl][0] = batch_2_x 150 | featurelist[ifl][1] = batch_2_y 151 | else: 152 | # merge current loading data with previously loading data 153 | featurelist[ifl][0] = np.concatenate((featurelist[ifl][0], batch_2_x), axis=0) 154 | featurelist[ifl][1] = np.concatenate((featurelist[ifl][1], batch_2_y), axis=0) 155 | file_group_id[ifl] += 1; 156 | # split for small groups for training 157 | if ifl==0: 158 | featurelist[ifl][0] = np.array_split(featurelist[ifl][0], int(len(featurelist[ifl][0])/batchsize)) 159 | featurelist[ifl][1] = np.array_split(featurelist[ifl][1], int(len(featurelist[ifl][1])/batchsize)) 160 | cur_batch_num = len(featurelist[ifl][0]) 161 | if len(featurelist[0][0])1: 169 | for ifl in range(1, len(filelists)): 170 | #if (file_group_id[0]+1) - last_desplay_files_num >= desplay_files: msizeprint.append(str(len(featurelist[ifl][0]))) 171 | featurelist[ifl][0] = np.array_split(featurelist[ifl][0], len(featurelist[0][0])) 172 | featurelist[ifl][1] = np.array_split(featurelist[ifl][1], len(featurelist[0][0])) 173 | io_time += (time.time() - io_start_time) 174 | 175 | ifl=3 if len(featurelist)>3 else len(featurelist)-1 176 | if (file_group_id[0]+1) - last_desplay_files_num >= desplay_files: 177 | sess.run(init_l) 178 | try: 179 | # print some testing information as progress indicators 180 | loss, aucm, acc, p, r = sess.run([loss_op, auc_op[1], accuracy, mpre[1], mspf[1]], feed_dict={X:featurelist[ifl][0][0], Y:featurelist[ifl][1][0]}) 181 | print(">>>Tratin#files "+str(file_group_id[0]+1)+",loss="+"{:.3f}".format(loss)+",AUC="+"{:.3f}".format(aucm)+",acc="+"{:.3f}".format(acc)+",p="+"{:.3f}".format(p)+",r="+"{:.3f}".format(r)+(" Comsuming time: %d(current=%d) IO=%d(%.3f)" % (time.time()-start_time, time.time()-start_c_time, 
#
# get all data files in two separate folders
#
def _getTFiles_from_folder(folder, moptions, tag):
    # Collect *.xy.gz feature files directly under `folder`, and up to 4
    # sub-folder levels deep when moptions['recursive']==1; then optionally
    # keep only a head/tail fraction for read-based independent testing.
    tfiles = glob.glob(os.path.join(folder, "*.xy.gz"))
    if moptions['recursive']==1:
        for pat in ("*/*.xy.gz", "*/*/*.xy.gz", "*/*/*/*.xy.gz", "*/*/*/*/*.xy.gz"):
            tfiles.extend(glob.glob(os.path.join(folder, pat)))
    print(tag);
    # for read-based independent testing
    print(tfiles.__sizeof__(), len(tfiles))
    if moptions['test'][0] == '0':
        # keep the first fraction when >0.5, otherwise the last fraction,
        # so train and test splits come from opposite ends of the file list
        if moptions['test'][1]>0.5:
            tfiles = tfiles[:int(len(tfiles)*moptions['test'][1])]
        else: tfiles = tfiles[-int(len(tfiles)*moptions['test'][1]):]
    print(tfiles.__sizeof__(), len(tfiles))
    sys.stdout.flush();
    return tfiles

def getTFiles(folder1, folder2, moptions):
    """Collect *.xy.gz feature files from two separate folders.

    Parameters:
        folder1: first feature folder (required).
        folder2: second feature folder, or None to skip it.
        moptions: option dict; reads 'recursive' (1 => search up to 4 levels
            deep) and 'test' (read-based split: when moptions['test'][0]=='0',
            keep fraction moptions['test'][1] of the files).

    Returns (t1files, t2files); t2files is [] when folder2 is None.
    The previously duplicated per-folder logic is factored into
    _getTFiles_from_folder.
    """
    t1files = _getTFiles_from_folder(folder1, moptions, "Get folder1")
    if folder2==None: t2files = []
    else:
        t2files = _getTFiles_from_folder(folder2, moptions, "Get folder2")
    return t1files, t2files
#
# get index information for each fast5 file in a data file
#
def getGZFilePos(gzfile):
    """Parse the companion '.ind' index of `gzfile` (a *.xy.gz feature file).

    Each index line holds '<start_row> <fast5_filename>'; returns a
    defaultdict mapping the starting row (int) in the feature matrix to the
    fast5 file whose features begin there. Lines with fewer than two fields
    are ignored.
    """
    mfind = defaultdict()
    ind_name = gzfile[:-len('.gz')] + '.ind'
    with open(ind_name, 'r') as reader:
        for raw_line in reader:
            fields = raw_line.strip().split()
            if len(fields) > 1:
                mfind[int(fields[0])] = fields[1]
    return mfind
range(len(test_gzfile)): 394 | # get feature files for prediction 395 | test_gzfeature, test_gzlabel, _ = getDataFromFile_new(test_gzfile[test_fn_ind], moptions) 396 | if len(test_gzfeature)<1: continue; 397 | 398 | ftlist = np.array_split(test_gzfeature, int(len(test_gzfeature)/batchsize)+1) 399 | lblist = np.array_split(test_gzlabel, int(len(test_gzlabel)/batchsize)+1) 400 | for fti in range(len(ftlist)): 401 | sess.run(init_l) 402 | mfpred_output = sess.run([ mfpred], feed_dict={X:ftlist[fti], Y:lblist[fti]}) 403 | # get ture positive, false positive, false negative and true negative 404 | tp, fp, fn, tn = 0, 0, 0, 0 405 | for pi in range(len(mfpred_output[0])): 406 | if mfpred_output[0][pi]==1: 407 | if lblist[fti][pi][1]==1: tp += 1 408 | else: fp += 1 409 | else: 410 | if lblist[fti][pi][1]==1: fn += 1 411 | else: tn += 1 412 | pfwriter.write('tp=%d fp=%d fn=%d tn=%d %s\n' % (tp, fp, fn, tn, test_gzfile[test_fn_ind])) 413 | pfwriter.flush() 414 | pfwriter.close(); 415 | 416 | # 417 | # entry for prediction 418 | # 419 | def pred_prepare(moptions, test_file, accuracy, X, Y, auc_op, mpre, mspf, init_l, mfpred): 420 | mPred(moptions['modfile'][0], moptions['modfile'][1], accuracy, X, Y, test_file, moptions['outFolder']+moptions['FileID']+'_mpred.txt', moptions['fnum'], auc_op, mpre, mspf, init_l, mfpred, moptions['windowsize'], moptions) 421 | 422 | # 423 | # prepare training process 424 | # 425 | def mMult_RNN_LSTM_train(moptions): 426 | # get folders of feature files 427 | filegroups = moptions['wrkBase'].split(';') 428 | for i in range(len(filegroups)): 429 | filegroups[i] = filegroups[i].split(',') 430 | 431 | print(filegroups) 432 | 433 | # get feature files 434 | filelists = [[] for _ in range(len(filegroups))] 435 | for i in range(len(filegroups)): 436 | for fgj in range(len(filegroups[i])): 437 | if not len(filegroups[i][fgj])>0: continue 438 | filelists[i].extend(getTFiles1(filegroups[i][fgj], moptions)) 439 | mostnum, mostid = 0, -1; 440 | # set seed 
so that random setting 441 | np.random.seed(3) 442 | for i in range(len(filelists)): 443 | np.random.shuffle(filelists[i]) 444 | if len(filelists[i])>mostnum: 445 | mostnum = len(filelists[i]) 446 | mostid = i; 447 | 448 | # set seed so that random setting 449 | np.random.seed(7) 450 | # path for model file 451 | if 'modfile' in moptions and (not moptions['modfile']==None): 452 | if moptions['modfile'].rfind('/')==-1: 453 | moptions['modfile'] = [moptions['modfile'], './'] 454 | else: 455 | moptions['modfile'] = [moptions['modfile'], moptions['modfile'][:moptions['modfile'].rfind('/')+1]] 456 | 457 | if not mostid==0: 458 | filelists[mostid], filelists[0] = filelists[0], filelists[mostid] 459 | 460 | accuracy, X, Y, auc_op, mpre, mspf, init_l, mfpred = train_save_model(filelists, moptions['fnum'], moptions['hidden'], moptions['windowsize'], moptions) 461 | 462 | # 463 | # prepare prediction process 464 | # 465 | def pred_entry(moptions): 466 | # get feature files 467 | tfiles = [getTFiles1(moptions['wrkBase'], moptions)] 468 | 469 | init, init_l, loss_op, accuracy, train_op, X, Y, saver, auc_op, mpre, mspf, mfpred = mCreateSession(moptions['fnum'], moptions['hidden'], moptions['windowsize'], moptions) 470 | 471 | # path for model file 472 | if moptions['modfile'].rfind('/')==-1: 473 | moptions['modfile'] = [moptions['modfile'], './'] 474 | else: 475 | moptions['modfile'] = [moptions['modfile'], moptions['modfile'][:moptions['modfile'].rfind('/')+1]] 476 | 477 | pred_prepare(moptions, tfiles, accuracy, X, Y, auc_op, mpre, mspf, init_l, mfpred) 478 | -------------------------------------------------------------------------------- /docs/Description of well-trained models.md: -------------------------------------------------------------------------------- 1 | This file contains description of well-trained model in `train_mod` directory. The meaning of the short name of the data set can be found in the paper. 
***Warnings: the well-trained models below are NEVER retrained with Guppy. You will get unpredictable results if you use them to make the prediction with the data basecalled with Guppy.*** 2 | 3 | # 1. Modification prediction model 4 | These are several trained models of LSTM(Long short-term memory)-based RNN (Recurrent neural network), and their settings are given below. 5 | 6 | ## 1.1 `rnn_sinmodC_P100wd21_f7ne1u0_4` 7 | This model is trained on E. Coli data with synthetically introduced 5mC. 8 | ``` 9 | Window size: 21 10 | #Feature per event: 7 11 | Base of interest: C 12 | #Training epoch of negative data: 4 13 | Training data: SSS and UMR 14 | ``` 15 | Please refer to our paper for the meanings of `SSS` and `UMR`. 16 | 17 | ## 1.2 `rnn_conmodC_P100wd21_f7ne1u0_4` 18 | This model is trained on E. Coli data with synthetically introduced 5mC. 19 | ``` 20 | Window size: 21 21 | #Feature per event: 7 22 | Base of interest: C 23 | #Training epoch of negative data: 4 24 | Training data: positive control (SSS, Cg_sssl, Cg_mpel and gCgC), and negative control: (UMR, con1 and con2) 25 | ``` 26 | Please refer to our paper for the meanings of `SSS`, `UMR`, `con1`, `con2`, `Cg_sssl`, `Cg_mpel` and `gCgC`. 27 | 28 | ## 1.3 `rnn_conmodA_P100wd21_f7ne1u0_4` 29 | This model is trained on E. Coli data with synthetically introduced 6mA. 30 | ``` 31 | Window size: 21 32 | #Feature per event: 7 33 | Base of interest: A 34 | #Training epoch of negative data: 4 35 | Training data: Three positive control of 6mA and the negative control (con1, con2) 36 | ``` 37 | 38 | ## 1.4 `rnn_conmodA_E1m2wd21_f7ne1u0_4` 39 | This is a region-based training model. Reads and bases mapped to 1,000,000 on E. Coli are used for testing and others for training.
40 | ``` 41 | Window size: 21 42 | #Feature per event: 7 43 | Base of interest: A 44 | #Training epoch of negative data: 4 45 | Training data: Three positive control of 6mA and the negative control (con1, con2) 46 | ``` 47 | 48 | ## 1.5 `rnn_f7_wd21_chr1to10_4` 49 | This model is trained on Chr 1 to 10 of NA12878 with completely methylated positions (>90% methylation percentage in both replicates of bisulfite sequences) and completely un-methylated positions (<=0% methylation percentage in both replicates of bisulfite sequences). Bases in long reads mapped to the two groups of reference positions are used in training process. This model is used to make 5mC prediction on HX1, and both HX1 and NA12878 are basecalled using Albacore v2.3.1. 50 | ``` 51 | Window size: 21 52 | #Feature per event: 7 53 | Base of interest: C 54 | #Training epoch of all data: 4 55 | Training data: Chr 1 to 10 of NA12878 56 | ``` 57 | 58 | # 2. Cluster-effect model (the second neural network) 59 | This is the model of the second neural network to consider modificatoin cluster of 5mC. 60 | ## 2.1 `na12878_cluster_train_mod-keep_prob0.7-nb25-chr1` 61 | The model is only trained on Chr1 of NA12878. 
62 | ``` 63 | Window size: 25 64 | #Feature per event: 11 65 | Base of interest: C 66 | #Training epoch of negative data: 1 67 | Training data: Chr 1 of NA12878 68 | ``` 69 | -------------------------------------------------------------------------------- /docs/Install.md: -------------------------------------------------------------------------------- 1 | 2 | # Installation Guide 3 | 4 | ## Prerequisites: 5 | ### The required packages for running DeepMod are listed below: 6 | * Python 3.6 7 | * python packages: 8 | + h5py 9 | + numpy 10 | + scipy 11 | * BWA MEM/minimap2 12 | * SAMTOOLS 13 | * HDF5 14 | * Tensorflow 15 | * If running performance evaluation, you might need 16 | + python packages: rpy2, matplotlib, scikit-learn 17 | + R packages: ggplot2, gridExtra, plyr, RColorBrewer 18 | 19 | The packages I used are 20 | ``` 21 | h5py 2.7.1 22 | hdf5 1.10.1 23 | numpy 1.14.0 24 | scikit-learn 0.19.1 25 | scipy 1.0.0 26 | tensorflow 1.7.0 27 | samtools 1.5 28 | minimap2 2.12 29 | bwa 0.7.15 30 | matplotlib 2.1.2 31 | rpy2 2.8.6 32 | r 3.4.2 33 | ``` 34 | 35 | ### Package installation 36 | Users have their own choice of how to install required package above. But to efficiently manage the installation packages, Anaconda is recommended. After installing Annocoda, it would also be an good option to use virtual environment in annocoda. `conda create -n mdeepmod python=3.6` can be used to create a virtual environment, and `source activate mdeepmod` can be used to activate the virtual environment, and then install required packages or run DeepMod. If users want to exit the virtual environment, simply type `source deactivate`. 
37 | 38 | After creating `mdeepmod` virtual environment using Anaconda, the following commands can install the majority of necessary packages: 39 | 40 | ``` 41 | source activate mdeepmod 42 | conda install -c anaconda h5py hdf5 numpy scipy scikit-learn 43 | conda install -c bioconda minimap2 bwa samtools 44 | conda install -c r rpy2 r-ggplot2 r-gridextra r-plyr r-rcolorbrewer 45 | ``` 46 | Users might need to install `tensorflow` with version 1.13 by themselves or by using `conda install -c conda-forge tensorflow matplotlib` (if there are conflicts when using this command, users need to give up and install `tensorflow` and `matplotlib` by themselves). 47 | 48 | ### Additional notes 49 | 50 | Some users reported that the compression format of their fast5 is vbz instead of gzip. Installing `ont-vbz-hdf-plugin` solves the problem. Otherwise, an error message such as "Error!!! No Raw_reads/Signal data" will be shown. 51 | 52 | 53 | ## Install DeepMod: 54 | * git clone https://github.com/WGLab/DeepMod 55 | * The script to be run is in bin/DeepMod.py: 56 | + Run `python bin/DeepMod.py` for the help document or 57 | + Run `python bin/DeepMod.py detect` for the detection help document. 58 | Users might need to run `source activate mdeepmod` if the virtual environment and required packages are installed with the commands above. 59 | 60 | ## Installation time 61 | Without GPU-version tensorflow, it would take ~30 minutes to install required packages and DeepMod. 62 | 63 | ## Usage: 64 | For how to use them, please refer to [Usage](https://github.com/WGLab/DeepMod/blob/master/docs/Usage.md) 65 | 66 | -------------------------------------------------------------------------------- /docs/Reproducibility.md: -------------------------------------------------------------------------------- 1 | # Demo and Examples 2 | This section contains examples and also the reproducibility process of our analysis. It is assumed that DeepMod has been successfully installed.
If not, please install it and its required packages first according to the [instruction](https://github.com/WGLab/DeepMod/blob/master/docs/Install.md). It is also assumed that the virtual environment `mdeepmod` is used. Please remove `source activate mdeepmod` if virtual environment is not used in your system. 3 | 4 | To prepare to run DeepMod, it is assumed that there is a directory which has a *DeepMod* directory for our DeepMod tool, a *data* directory for storing Nanopore sequencing data, and a *ref* directory for reference data. 5 | 6 | ## Reference genomes 7 | Please use `bwa` to index the genome before running `DeepMod` 8 | ### E. coli reference genome 9 | The E. coli reference fasta could be downloaded from https://www.ncbi.nlm.nih.gov/nuccore/556503834. We assumed the reference file name under the *ref* directory is 'Ecoli_k12_mg1655.fasta' and indexed by `bwa` 10 | 11 | ### Human reference genome 12 | Hg38 was used for human Nanopore sequencing data. We assumed the human reference fasta file name under the *ref* directory is 'hg38' and indexed by `bwa` 13 | 14 | ## Nanopore data 15 | DeepMod needs a group of FAST5 files generated by a Nanopore sequencer after basecalling. Thus, before you run DeepMod, you might need to run `Albacore` first on your Nanopore data. When you run `Albacore` on your Nanopore data, please make sure that `fastq,fast5` is used so that `event` information can be stored in FAST5 files. 16 | 17 | ## Run on E. coli data 18 | The Nanopore data sets are large, and thus users need to contact the original authors1 to get the downloading URL. 19 | ### Example 1: 5mC detection 20 | #### Step 1. datasets 21 | Please download Nanopore sequencing data for *con1*, *con2* and *CG* motif with SSSl, and untar them into separate sub-folders under the *data* directory, and assumed their sub-folder names are *Control_lib1*, *Control_lib3* and *meth10_lib3* respectively. 22 | 23 | #### Step 2.
24 | ``` 25 | mkdir ecoli_pred/ 26 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/meth10_lib3/ --Ref ref/Ecoli_k12_mg1655.fasta --FileID Cgsss --modfile DeepMod/train_mod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 27 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/Control_lib1/ --Ref ref/Ecoli_k12_mg1655.fasta --FileID con1 --modfile DeepMod/train_mod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 28 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/Control_lib3/ --Ref ref/Ecoli_k12_mg1655.fasta --FileID con2 --modfile DeepMod/train_mod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 29 | ``` 30 | The three commands might take ~41 minutes, ~84 minutes and ~120 minutes to be done. After that, you will found the results under the directory of *ecoli_pred* with *--FileID* as the sub-folder name. The results are in the *bed* format with the file names as *mod_pos.NC_000913.3-.C.bed* and *mod_pos.NC_000913.3+.C.bed*. The detail of the methylation prediction for each long read is also provided. 31 | 32 | One can run DeepMod on other datasets with the commands below. 33 | ``` 34 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/meth8_lib2/ --Ref ref/Ecoli_k12_mg1655.fasta --FileID gCgc --modfile DeepMod/train_mod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 35 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/meth9_lib2/ --Ref ref/Ecoli_k12_mg1655.fasta --FileID Cgmpe --modfile DeepMod/train_mod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 36 | ``` 37 | 38 | #### Step 3. 39 | The following commands then can be used to calculate average precision and AUC values of DeepMod. 
40 | ``` 41 | python DeepMod/tools/cal_EcoliDetPerf.py ecoli_pred/Cgmpe ref/Ecoli_k12_mg1655.fasta Cg 0 '' -1 -1 ecoli_pred/Cgmpe/ ecoli_pred/con1,ecoli_pred/con2 42 | python DeepMod/tools/cal_EcoliDetPerf.py ecoli_pred/Cgsss ref/Ecoli_k12_mg1655.fasta Cg 0 '' -1 -1 ecoli_pred/Cgsss/ ecoli_pred/con1,ecoli_pred/con2 43 | python DeepMod/tools/cal_EcoliDetPerf.py ecoli_pred/gCgc ref/Ecoli_k12_mg1655.fasta gCgc 1 '' -1 -1 ecoli_pred/gCgc/ ecoli_pred/con1,ecoli_pred/con2 44 | ``` 45 | The commands above will generate AP plots and AUC plots under the directory of *ecoli_pred/Cgmpe/*, *ecoli_pred/Cgsss/*, *ecoli_pred/gCgc/* which are performance as shown in Figure 2 (a), (c) and (d). 46 | 47 | ### Example 2: 6mA detection 48 | #### Step 1. datasets 49 | You need contact the original authors1 to get the downloading URL. 50 | 51 | Please download Nanopore sequencing data for *con1*, *con2* and *gAtc* motif, *tcgA* and *gaAttc*, and untar them into seperate sub-folder under the *data* directory, and assumed their sub-folder names are *Control_lib1*, *Control_lib3*, *meth11_lib3*, *meth1_lib1* and *meth4_lib1* respectively. If you have run Example 1, you might already have the dataset for *con1* and *con2*. 52 | 53 | #### Step 2. 
54 | ``` 55 | mkdir ecoli_pred/ (if not exist) 56 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/meth11_lib3/ --Ref ref/Ecoli_k12_mg1655.fasta --Base A --FileID gAtc --modfile DeepMod/train_mod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 57 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/meth1_lib1/ --Ref ref/Ecoli_k12_mg1655.fasta --Base A --FileID tcgA --modfile DeepMod/train_mod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 58 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/meth4_lib1/ --Ref ref/Ecoli_k12_mg1655.fasta --Base A --FileID gaAttc --modfile DeepMod/train_mod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 59 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/Control_lib1/ --Ref ref/Ecoli_k12_mg1655.fasta --Base A --FileID con1a --modfile DeepMod/train_mod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 60 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/Control_lib3/ --Ref ref/Ecoli_k12_mg1655.fasta --Base A --FileID con2a --modfile DeepMod/train_mod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0 --threads 15 --outFolder ecoli_pred/ 61 | ``` 62 | #### Step 3. 63 | The following commands then can be used to calculate average precision and AUC values of DeepMod. 
64 | ``` 65 | python DeepMod/tools/cal_EcoliDetPerf.py ecoli_pred/gAtc ref/Ecoli_k12_mg1655.fasta gAtc 1 '' 1000000 2000000 ecoli_pred/gAtc/ ecoli_pred/con1a,ecoli_pred/con2a 66 | python DeepMod/tools/cal_EcoliDetPerf.py ecoli_pred/tcgA ref/Ecoli_k12_mg1655.fasta tcgA 3 '' 1000000 2000000 ecoli_pred/tcgA/ ecoli_pred/con1a,ecoli_pred/con2a 67 | python DeepMod/tools/cal_EcoliDetPerf.py ecoli_pred/gaAttc ref/Ecoli_k12_mg1655.fasta gaAttc 2 '' 1000000 2000000 ecoli_pred/gaAttc/ ecoli_pred/con1a,ecoli_pred/con2a 68 | ``` 69 | The commands above will generate AP plots and AUC plots under the directory of *ecoli_pred/gAtc/*, *ecoli_pred/tcgA/*, *ecoli_pred/gaAttc/* which are performance as shown in Figure 3 (a) and the supplementary Figure 6. 70 | 71 | ## Example 3: Detect 5mC on Na12878 72 | ### Step 1. datasets 73 | You might need to [Na12878 Nanopore sequencing data](https://github.com/nanopore-wgs-consortium/NA12878/blob/master/nanopore-human-genome/rel_3_4.md) to download fast5 files. Please note that the whole dataset is ~30TB. 74 | 75 | ### Step 2. 76 | Since it is very large for NA12878 Nanopore sequencing data, users can run each of tar files (each chromomsome has 1 to 9 tar files) separately to speed up the detection process. An example of running DeepMod on a template tar file is given below: 77 | ``` 78 | mkdir na12878_pred 79 | time python DeepMod/bin/DeepMod.py detect --wrkBase data/chr1/tar1 --Ref ref/hg38.fasta --FileID chr1_tar1 --modfile DeepMod/train_mod/rnn_conmodC_P100wd21_f7ne1u0_4/mod_train_conmodC_P100wd21_f3ne1u0 --threads 15 --outFolder na12878_pred/ 80 | ``` 81 | 82 | ### Step 3. 83 | Then, the following command can be used to merge all results in Step 2. 84 | ``` 85 | python DeepMod/tools/sum_chr_mod.py na12878_pred/ C na12878_C 86 | ``` 87 | Then, the results will be under the directory of *na12878_pred/* and the result file names start with *na12878_C* and end with '.bed' in a bed format. The results are grouped by chromosomes. 
88 | 89 | ### Step 4 (optional) 90 | This step is to consider the cluster effect of 5mC in human genome. To do that, a CpG index in a human genome will be generated. 91 | ``` 92 | python DeepMod/tools/generate_motif_pos.py ref/hg38.fa genome_motif/C C CG 0 93 | ``` 94 | The results are under the directory of *genome_motif/C*. 95 | 96 | After that, a second deep learning process will be used to consider cluster effect. 97 | ``` 98 | python DeepMod/tools/hm_cluster_predict.py na12878_pred/na12878_C genome_motif/C 99 | ``` 100 | The script will take all *na12878_pred/na12878_C.chr[12....].C.bed* as input, and output the bed files with the file name format of *na12878_pred/na12878_C_clusterCpG.chr[12....].C.bed* 101 | 102 | ### Step 5 103 | To evaluate DeepMod's performance on NA12878, users might use bisulfite sequencing results from https://www.encodeproject.org/experiments/ENCSR890UQO/. Due to the heterogeneity of sequenced samples, completely methylated and completely un-methylated bases could be used for the evaluation: a genomic position of a base was considered to be completely methylated if its methylation percentage >=90% in both replicates of bisulfite sequencing with coverage>=c (c could be 1, 5 or 10), and to be completely un-methylated if its methylation percentage is 0% in both replicates. 104 | 105 | The modification detection on HX1 can be run in a similar way to that on NA12878 but with a different `--modfile`: *DeepMod/train_mod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10*. 106 | 107 | ## Reference 108 | 1. Stoiber MH, et al. De novo Identification of DNA Modifications Enabled by Genome-Guided Nanopore Signal Processing. bioRxiv 10.1101/094672, (2017). 109 | -------------------------------------------------------------------------------- /docs/Results_explanation.md: -------------------------------------------------------------------------------- 1 | This document describes the format of the output of DeepMod. 2 | 3 | # 1.
Check whether the running of DeepMod is successful. 4 | After finishing the running of DeepMod, usually you will find a "\*.done" file under the output folder. If you cannot see that file, some errors might happen and the running of DeepMod is not successful. One common cause behind this is the memory issue, that is, DeepMod needs more memory than what you requested or have. Increasing memory generally solve the issue. 5 | 6 | # 2 Format of output 7 | The output is in a BED format like below. The first six columns are `Chr`, `Start pos`, `End pos`, `Base`, `Capped coverage`, and `Strand`, and the last three columns are `Real coverage`, `Mehylation percentage` and `Methylation coverage`. 8 | 9 | ``` 10 | chr6 148655 148656 C 10 - 148655 148656 0,0,0 10 10 1 11 | chr6 148657 148658 C 12 + 148657 148658 0,0,0 12 8 1 12 | chr6 148674 148675 C 14 - 148674 148675 0,0,0 14 7 1 13 | chr6 148675 148676 C 15 - 148675 148676 0,0,0 15 6 1 14 | chr6 148676 148677 C 14 - 148676 148677 0,0,0 14 7 1 15 | chr6 148684 148685 C 12 - 148684 148685 0,0,0 12 25 3 16 | chr6 148685 148686 C 16 - 148685 148686 0,0,0 16 6 1 17 | chr6 148689 148690 C 11 + 148689 148690 0,0,0 11 72 8 18 | chr6 148691 148692 C 10 + 148691 148692 0,0,0 10 50 5 19 | chr6 148693 148694 C 8 + 148693 148694 0,0,0 8 100 8 20 | chr6 148694 148695 C 11 - 148694 148695 0,0,0 11 54 6 21 | chr6 148695 148696 C 10 + 148695 148696 0,0,0 10 90 9 22 | chr6 148697 148698 C 12 + 148697 148698 0,0,0 12 50 6 23 | chr6 148699 148700 C 9 + 148699 148700 0,0,0 9 22 2 24 | chr6 148701 148702 C 13 - 148701 148702 0,0,0 13 7 1 25 | chr6 148703 148704 C 13 - 148703 148704 0,0,0 13 15 2 26 | chr6 148706 148707 C 9 - 148706 148707 0,0,0 9 22 2 27 | ``` 28 | -------------------------------------------------------------------------------- /docs/Usage.md: -------------------------------------------------------------------------------- 1 | **This is an explanation of how to use DeepMod without examples. 
If you want to run some examples, please refer to [demo](https://github.com/WGLab/DeepMod/blob/master/docs/Reproducibility.md).** 2 | 3 | 4 | The inputs of DeepMod are a group of FAST5 files and a reference genome. FAST5 files need to be basecalled already, and `Events` data must be available in FAST5 files. 5 | 6 | DeepMod has a functional module called "detect" which will detect a specific modification in a single run. However, if the dataset and genome size is very large or one wants to have the results soon, it would be better to run "detect" in separate processes simultaneously and then merge them together. For some special cases, if there is cluster effect between modifications (such as 5mC in CpG cluster), a third process would be used for additional prediction after "detect". How to use the three functional modules is described below. 7 | 8 | # 1. How to detect modifications from FAST5 files. 9 | The command for modification detection is to run `python DeepMod.py detect`. Without any other parameters, the help document will be shown. An example of how to use it is given below. 10 | 11 | ``` 12 | python DeepMod/bin/DeepMod.py detect --wrkBase FAST5-Folder --Ref Ref_genome_path --outFolder out_folder --Base C --modfile train_mod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10 --FileID User_Uniq_name --threads 4 13 | ``` 14 | where users need to provide where the FAST5 files are (`--wrkBase`), where the reference genome is (`--Ref`), where the output folder is (`--outFolder`), and also the base of interest and the mod file. Users can optionally specify a unique string for the results file names (`--FileID`) and how many threads are used (`--threads`). 15 | 16 | If you want to make the prediction for base `A`, the following command could be used.
17 | ``` 18 | python DeepMod/bin/DeepMod.py detect --wrkBase FAST5-Folder --Ref Ref_genome_path --outFolder out_folder --Base A --modfile train_mod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0 --FileID User_Uniq_name --threads 4 19 | ``` 20 | 21 | 22 | # 2. How to merge different runs of modification detection 23 | Some projects might generate very large Nanopore sequencing data. For example, [NA12878 Nanopore sequencing data](https://github.com/nanopore-wgs-consortium/NA12878/blob/master/nanopore-human-genome/rel_3_4.md) was ~30TB. To speed up the detection of modification, users can run DeepMod with different `--FileID` and folders where fast5 files are (`--wrkBase`) but the same output folder (`--outFolder`). Then, the following script can be used to merge modification detection grouped by chromosomes for human genome. 24 | ``` 25 | python DeepMod/tools/sum_chr_mod.py outFolder base-of-interest res-unique-filename chromosomes 26 | ``` 27 | The last parameter is optional if running on human genome; otherwise, the chromosomes should be provided by a string where chromosome names are seperated by ','. `res-unique-filename` is a unique prefix of the summary file under `outFolder`. `outFolder` is the output folder of `DeepMod` and the prediction of `DeepMod` must be in the sub-folders under `outFolder`. 28 | 29 | # 3. How to consider modification cluster effect. 30 | 5mC in CpG motifs has cluster effect in human genome. To consider cluster effect, a second deep learning process was designed to improve the 5mC detection performance. To do that, additional commands below are used 31 | 32 | ## Output C in CpG motifs in a genome 33 | ``` 34 | python DeepMod/tools/generate_motif_pos.py ref-genome result-folder C CG 0 35 | ``` 36 | The result files were generated under the directory of *result-folder*. 37 | 38 | ### Generated clustered results. 
39 | ``` 40 | python DeepMod/tools/hm_cluster_predict.py prefix-merged-bed-files genome_motif_folder-in-last-step 41 | ``` 42 | The output files will be under the same directory of *prefix-merged-bed-files* but with the prefix of *prefix-merged-bed-files* by appending "_clusterCpG". 43 | 44 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os,sys 4 | import setuptools 5 | 6 | 7 | with open("README.md", "r") as fh: 8 | long_description = fh.read() 9 | 10 | 11 | setuptools.setup( 12 | name="DeepMod", # Replace with your own username 13 | version="0.1.3", 14 | author="Qian Liu", 15 | author_email="", 16 | description="A deep-learning tool to detect DNA modifications using Nanopore long-read sequencing data", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | url="https://github.com/WGLab/DeepMod", 20 | #packages=setuptools.find_packages(), 21 | packages=['DeepMod_scripts'], 22 | package_dir={'DeepMod_scripts': 'bin/DeepMod_scripts'}, 23 | scripts=['bin/DeepMod.py', 'DeepMod_tools/cal_EcoliDetPerf.py', 'DeepMod_tools/generate_motif_pos.py', 'DeepMod_tools/hm_cluster_predict.py', 'DeepMod_tools/sum_chr_mod.py'], 24 | package_data={'train_deepmod': ['train_deepmod/*/*']}, 25 | #data_files=[('train_deepmod', ['train_deepmod/*'])], 26 | classifiers=[ 27 | "Programming Language :: Python", 28 | 'Intended Audience :: Science/Research', 29 | "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)", 30 | "Operating System :: OS Independent", 31 | ], 32 | ) 33 | -------------------------------------------------------------------------------- /train_deepmod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/Cg.cov5.nb25.data-00000-of-00001: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/Cg.cov5.nb25.data-00000-of-00001 -------------------------------------------------------------------------------- /train_deepmod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/Cg.cov5.nb25.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/Cg.cov5.nb25.index -------------------------------------------------------------------------------- /train_deepmod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/Cg.cov5.nb25.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/Cg.cov5.nb25.meta -------------------------------------------------------------------------------- /train_deepmod/na12878_cluster_train_mod-keep_prob0.7-nb25-chr1/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "Cg.cov5.nb25" 2 | all_model_checkpoint_paths: "Cg.cov5.nb25" 3 | -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_E1m2wd21_f7ne1u0_4/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "mod_train_conmodA_E1m2wd21_f3ne1u0" 2 | all_model_checkpoint_paths: "../2/mod_train_conmodA_E1m2wd21_f3ne1u0" 3 | all_model_checkpoint_paths: "../2.50/mod_train_conmodA_E1m2wd21_f3ne1u0" 4 | all_model_checkpoint_paths: "../3/mod_train_conmodA_E1m2wd21_f3ne1u0" 5 | all_model_checkpoint_paths: "../3.50/mod_train_conmodA_E1m2wd21_f3ne1u0" 6 | all_model_checkpoint_paths: 
"mod_train_conmodA_E1m2wd21_f3ne1u0" 7 | -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_E1m2wd21_f7ne1u0_4/mod_train_conmodA_E1m2wd21_f3ne1u0.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodA_E1m2wd21_f7ne1u0_4/mod_train_conmodA_E1m2wd21_f3ne1u0.data-00000-of-00001 -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_E1m2wd21_f7ne1u0_4/mod_train_conmodA_E1m2wd21_f3ne1u0.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodA_E1m2wd21_f7ne1u0_4/mod_train_conmodA_E1m2wd21_f3ne1u0.index -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_E1m2wd21_f7ne1u0_4/mod_train_conmodA_E1m2wd21_f3ne1u0.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodA_E1m2wd21_f7ne1u0_4/mod_train_conmodA_E1m2wd21_f3ne1u0.meta -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_P100wd21_f7ne1u0_4/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "mod_train_conmodA_P100wd21_f3ne1u0" 2 | all_model_checkpoint_paths: "../2/mod_train_conmodA_P100wd21_f3ne1u0" 3 | all_model_checkpoint_paths: "../2.50/mod_train_conmodA_P100wd21_f3ne1u0" 4 | all_model_checkpoint_paths: "../3/mod_train_conmodA_P100wd21_f3ne1u0" 5 | all_model_checkpoint_paths: "../3.50/mod_train_conmodA_P100wd21_f3ne1u0" 6 | all_model_checkpoint_paths: "mod_train_conmodA_P100wd21_f3ne1u0" 7 
| -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0.data-00000-of-00001 -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0.index -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodA_P100wd21_f7ne1u0_4/mod_train_conmodA_P100wd21_f3ne1u0.meta -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodC_P100wd21_f7ne1u0_4/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "mod_train_conmodC_P100wd21_f3ne1u0" 2 | all_model_checkpoint_paths: "../2/mod_train_conmodC_P100wd21_f3ne1u0" 3 | all_model_checkpoint_paths: "../2.50/mod_train_conmodC_P100wd21_f3ne1u0" 4 | all_model_checkpoint_paths: "../3/mod_train_conmodC_P100wd21_f3ne1u0" 5 | all_model_checkpoint_paths: "../3.50/mod_train_conmodC_P100wd21_f3ne1u0" 6 | all_model_checkpoint_paths: "mod_train_conmodC_P100wd21_f3ne1u0" 7 | 
-------------------------------------------------------------------------------- /train_deepmod/rnn_conmodC_P100wd21_f7ne1u0_4/mod_train_conmodC_P100wd21_f3ne1u0.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodC_P100wd21_f7ne1u0_4/mod_train_conmodC_P100wd21_f3ne1u0.data-00000-of-00001 -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodC_P100wd21_f7ne1u0_4/mod_train_conmodC_P100wd21_f3ne1u0.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodC_P100wd21_f7ne1u0_4/mod_train_conmodC_P100wd21_f3ne1u0.index -------------------------------------------------------------------------------- /train_deepmod/rnn_conmodC_P100wd21_f7ne1u0_4/mod_train_conmodC_P100wd21_f3ne1u0.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_conmodC_P100wd21_f7ne1u0_4/mod_train_conmodC_P100wd21_f3ne1u0.meta -------------------------------------------------------------------------------- /train_deepmod/rnn_f7_wd21_chr1to10_4/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "mod_train_f7_wd21_chr1to10" 2 | all_model_checkpoint_paths: "../30.6/mod_train_f7_wd21_chr1to10" 3 | all_model_checkpoint_paths: "../30.7/mod_train_f7_wd21_chr1to10" 4 | all_model_checkpoint_paths: "../30.8/mod_train_f7_wd21_chr1to10" 5 | all_model_checkpoint_paths: "../30.9/mod_train_f7_wd21_chr1to10" 6 | all_model_checkpoint_paths: "mod_train_f7_wd21_chr1to10" 7 | -------------------------------------------------------------------------------- 
/train_deepmod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10.data-00000-of-00001 -------------------------------------------------------------------------------- /train_deepmod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10.index -------------------------------------------------------------------------------- /train_deepmod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_f7_wd21_chr1to10_4/mod_train_f7_wd21_chr1to10.meta -------------------------------------------------------------------------------- /train_deepmod/rnn_sinmodC_P100wd21_f7ne1u0_4/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "mod_train_sinmodC_P100wd21_f3ne1u0" 2 | all_model_checkpoint_paths: "../2/mod_train_sinmodC_P100wd21_f3ne1u0" 3 | all_model_checkpoint_paths: "../2.50/mod_train_sinmodC_P100wd21_f3ne1u0" 4 | all_model_checkpoint_paths: "../3/mod_train_sinmodC_P100wd21_f3ne1u0" 5 | all_model_checkpoint_paths: "../3.50/mod_train_sinmodC_P100wd21_f3ne1u0" 6 | all_model_checkpoint_paths: "mod_train_sinmodC_P100wd21_f3ne1u0" 7 | -------------------------------------------------------------------------------- /train_deepmod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0.data-00000-of-00001: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0.data-00000-of-00001 -------------------------------------------------------------------------------- /train_deepmod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0.index -------------------------------------------------------------------------------- /train_deepmod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WGLab/DeepMod/488b0c4d57b31613a31a8ed15ad206761aefce87/train_deepmod/rnn_sinmodC_P100wd21_f7ne1u0_4/mod_train_sinmodC_P100wd21_f3ne1u0.meta --------------------------------------------------------------------------------