├── deepgmap
│   ├── misc
│   │   ├── WORKSPACE
│   │   ├── __init__.py
│   │   ├── Constants.py
│   │   ├── Constants.pyc
│   │   ├── __init__.pyc
│   │   ├── small_tools.pyc
│   │   ├── .gitignore
│   │   ├── small_tools.py
│   │   ├── pickup_pos_seq_region.py
│   │   ├── spearman_r.py
│   │   ├── dataset_checker.py
│   │   ├── edit_labeled_file.py
│   │   ├── fasta_file_from_labeled.py
│   │   ├── dataset_checker_multiple_label.py
│   │   ├── TSS_bedfile.py
│   │   ├── reduce_negatives.py
│   │   ├── optimal_frip_cutoff.py
│   │   ├── compare_deepsea_data.py
│   │   ├── fix_fasta.py
│   │   ├── randomdna2.py
│   │   ├── deepsea_anal.py
│   │   ├── box_plot.py
│   │   ├── randomdna.py
│   │   ├── bed_file_compare.py
│   │   ├── motif_logo_creator.py
│   │   ├── gff_to_colored_bed.py
│   │   ├── bed_file_compare2.py
│   │   ├── kernel_distribution_analizer.py
│   │   └── igv_session.xml
│   ├── train
│   │   ├── __init__.py
│   │   ├── .gitignore
│   │   ├── __init__.pyc
│   │   └── deepshark_local_oop_1d.pyc
│   ├── post_train_tools
│   │   ├── __init__.py
│   │   ├── test.pdf
│   │   ├── test.png
│   │   ├── __init__.pyc
│   │   ├── unpooling.pyc
│   │   ├── cython_util.so
│   │   ├── sequence_visualizer2.pyc
│   │   ├── inputfileGeneratorForGenomeScan_p.pyc
│   │   ├── inputfileGeneratorForGenomeScan_p2.pyc
│   │   ├── cython_util.cpython-36m-x86_64-linux-gnu.so
│   │   ├── randomize_labels.py
│   │   ├── PCA.py
│   │   ├── compare_narrowPeak_scores.py
│   │   ├── liftover_indiv_genome_to_hg38.py
│   │   ├── merge_bigwig.py
│   │   ├── motif_compare2.py
│   │   ├── sequence_visualizer.py
│   │   ├── Clustering_analizer.py
│   │   ├── ROC_space_plotter.py
│   │   ├── inputfileGeneratorForGenomeScan_p2.py
│   │   ├── ROC_space_plotter3.py
│   │   ├── motif_compare.py
│   │   ├── inputfileGeneratorForGenomeScan_gwas.py
│   │   ├── inputfileGeneratorForGenomeScan_p.py
│   │   ├── unpooling.py
│   │   ├── precision_recall_handmade.py
│   │   ├── fimo_to_numpy_array.py
│   │   ├── trained_deepshark_local_multiple_label.py
│   │   ├── cython_util.pyx
│   │   ├── ROC_space_plotter2.py
│   │   ├── sequence_visualizer2.py
│   │   ├── kernel_visualizer2.py
│   │   ├── deconvolution_to_signal.py
│   │   └── deconv_deepshark_local_extend.py
│   ├── __init__.py
│   ├── network_constructors
│   │   ├── __init__.py
│   │   ├── conv4.pyc
│   │   ├── danq.pyc
│   │   ├── danq2.pyc
│   │   ├── danq3.pyc
│   │   ├── danq4.pyc
│   │   ├── __init__.pyc
│   │   ├── auc_calc.pyc
│   │   ├── basset.pyc
│   │   ├── danqfrss.pyc
│   │   ├── deepsea.pyc
│   │   ├── conv3frss.pyc
│   │   ├── conv4frss.pyc
│   │   ├── danqblock.pyc
│   │   ├── conv4frssplus.pyc
│   │   ├── conv4frssplus2.pyc
│   │   ├── template_model.pyc
│   │   ├── __pycache__
│   │   │   └── auc_calc.cpython-36.pyc
│   │   ├── .gitignore
│   │   ├── auc_calc.py
│   │   └── template_model.py
│   ├── data_preprocessing_tools
│   │   ├── __init__.py
│   │   ├── queue.c
│   │   ├── __init__.pyc
│   │   ├── genome_divider.pyc
│   │   ├── seq_to_binary.pyc
│   │   ├── seq_to_binary2.so
│   │   ├── genome_labeling2.pyc
│   │   ├── input_generator_from_narrowPeaks2.pyc
│   │   ├── inputfileGenerator_multiple_label3.pyc
│   │   ├── build
│   │   │   └── temp.linux-x86_64-3.6
│   │   │       └── seq_to_binary2.o
│   │   ├── seq_to_binary2.cpython-36m-x86_64-linux-gnu.so
│   │   ├── setup.py
│   │   ├── deepgmap
│   │   │   └── data_preprocessing_tools
│   │   │       └── seq_to_binary2.cpython-36m-x86_64-linux-gnu.so
│   │   ├── remove_chr.py
│   │   ├── bed_file_500_add_seq.py
│   │   ├── cqueue.pxd
│   │   ├── pick_one_chromosome.py
│   │   ├── bed_file_500_add_seq2.py
│   │   ├── remove_excess_negatives.py
│   │   ├── genome_file_maker.py
│   │   ├── remove_variant_annotations.py
│   │   ├── seq_to_binary.py
│   │   ├── bed_file_500.py
│   │   ├── queue.pyx
│   │   ├── inputGenerator_from_deepsea.py
│   │   ├── genome_labeling.py
│   │   ├── genome_labeling_compare.py
│   │   ├── genome_divider.py
│   │   └── genome_labeling2.py
│   └── __init__.pyc
├── .gitignore
├── requirements.txt
├── Dockerfile
├── setup.py
└── INSTALL.rst

/deepgmap/misc/WORKSPACE:
--------------------------------------------------------------------------------
1 | 
-------------------------------------------------------------------------------- /deepgmap/misc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepgmap/train/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepgmap/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.0" -------------------------------------------------------------------------------- /deepgmap/network_constructors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepgmap/train/.gitignore: -------------------------------------------------------------------------------- 1 | /send_email.py 2 | -------------------------------------------------------------------------------- /deepgmap/misc/Constants.py: -------------------------------------------------------------------------------- 1 | DeepGMAP_VERSION = "dev3" 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .project 2 | .pydevproject 3 | /data/ 4 | /build/ 5 | /_tmp.bw 6 | -------------------------------------------------------------------------------- /deepgmap/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/__init__.pyc -------------------------------------------------------------------------------- /deepgmap/misc/Constants.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/misc/Constants.pyc -------------------------------------------------------------------------------- /deepgmap/misc/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/misc/__init__.pyc -------------------------------------------------------------------------------- /deepgmap/train/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/train/__init__.pyc -------------------------------------------------------------------------------- /deepgmap/misc/small_tools.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/misc/small_tools.pyc -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/queue.c: -------------------------------------------------------------------------------- 1 | #error Do not use this file, it is the result of a 
failed Cython compilation.
2 | 
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/test.pdf
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/test.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow>=1.15.2
2 | numpy
3 | matplotlib
4 | scikit-learn
5 | cairocffi
6 | cython
7 | tornado
8 | pyBigWig
--------------------------------------------------------------------------------
/deepgmap/network_constructors/conv4.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/conv4.pyc
--------------------------------------------------------------------------------
/deepgmap/network_constructors/danq.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/danq.pyc
--------------------------------------------------------------------------------
/deepgmap/network_constructors/danq2.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/danq2.pyc
--------------------------------------------------------------------------------
/deepgmap/network_constructors/danq3.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/danq3.pyc
--------------------------------------------------------------------------------
/deepgmap/network_constructors/danq4.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/danq4.pyc
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/__init__.pyc
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/unpooling.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/unpooling.pyc
--------------------------------------------------------------------------------
/deepgmap/network_constructors/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/__init__.pyc
--------------------------------------------------------------------------------
/deepgmap/network_constructors/auc_calc.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/auc_calc.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/basset.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/basset.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/danqfrss.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/danqfrss.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/deepsea.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/deepsea.pyc -------------------------------------------------------------------------------- /deepgmap/post_train_tools/cython_util.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/cython_util.so -------------------------------------------------------------------------------- /deepgmap/train/deepshark_local_oop_1d.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/train/deepshark_local_oop_1d.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/conv3frss.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/conv3frss.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/conv4frss.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/conv4frss.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/danqblock.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/danqblock.pyc -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/__init__.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/conv4frssplus.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/conv4frssplus.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/conv4frssplus2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/conv4frssplus2.pyc 
-------------------------------------------------------------------------------- /deepgmap/network_constructors/template_model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/template_model.pyc -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/genome_divider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/genome_divider.pyc -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/seq_to_binary.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/seq_to_binary.pyc -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/seq_to_binary2.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/seq_to_binary2.so -------------------------------------------------------------------------------- /deepgmap/post_train_tools/sequence_visualizer2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/sequence_visualizer2.pyc -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/genome_labeling2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/genome_labeling2.pyc -------------------------------------------------------------------------------- /deepgmap/post_train_tools/inputfileGeneratorForGenomeScan_p.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/inputfileGeneratorForGenomeScan_p.pyc -------------------------------------------------------------------------------- /deepgmap/post_train_tools/inputfileGeneratorForGenomeScan_p2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/inputfileGeneratorForGenomeScan_p2.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/__pycache__/auc_calc.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/__pycache__/auc_calc.cpython-36.pyc -------------------------------------------------------------------------------- /deepgmap/post_train_tools/cython_util.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/cython_util.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/input_generator_from_narrowPeaks2.pyc: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/input_generator_from_narrowPeaks2.pyc
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/inputfileGenerator_multiple_label3.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/inputfileGenerator_multiple_label3.pyc
--------------------------------------------------------------------------------
/deepgmap/misc/.gitignore:
--------------------------------------------------------------------------------
1 | /intersectAB.bed
2 | /intersectABC.bed
3 | /intersectABC_.bed
4 | /intersectAB_.bed
5 | /intersectAC.bed
6 | /intersectAC_.bed
7 | /intersectBC.bed
8 | /intersectBC_.bed
9 | 
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/build/temp.linux-x86_64-3.6/seq_to_binary2.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/build/temp.linux-x86_64-3.6/seq_to_binary2.o
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/seq_to_binary2.cpython-36m-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/seq_to_binary2.cpython-36m-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | from Cython.Build import cythonize
3 | 
4 | setup(
5 |     name = "seq_to_binary",
6 |     ext_modules = cythonize('seq_to_binary.pyx'), # accepts a glob pattern
7 | )
8 | 
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/deepgmap/data_preprocessing_tools/seq_to_binary2.cpython-36m-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/deepgmap/data_preprocessing_tools/seq_to_binary2.cpython-36m-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/deepgmap/misc/small_tools.py:
--------------------------------------------------------------------------------
1 | def is_number(s):
2 |     try:
3 |         float(s)
4 |         return True
5 |     except ValueError:
6 |         return False
7 | 
8 | def div_roundup(x, y):
9 |     # ceiling of y/x; floor division keeps the result an int under Python 3
10 |     if y%x==0:
11 |         return y//x
12 |     else:
13 |         return y//x+1
14 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM tensorflow/tensorflow:1.9.0-rc2-gpu-py3
2 | RUN apt-get update && apt-get install -y --no-install-recommends bedtools git
3 | RUN pip3 install --no-cache-dir setuptools matplotlib pyBigWig
4 | RUN git clone -b dev3 https://github.com/koonimaru/DeepGMAP.git && \
5 |     cd DeepGMAP && git checkout && \
6 |     python3 setup.py install
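A quick note on small_tools.py above: is_number() and div_roundup() are shared helpers (box_plot.py further down imports is_number from this module). A minimal usage sketch; the batch-sizing scenario is illustrative, not a call site taken from this tree:

    from deepgmap.misc.small_tools import div_roundup, is_number

    # div_roundup(x, y) is the ceiling of y/x, e.g. for sizing a batch loop
    assert div_roundup(64, 1000) == 16   # 1000 examples at batch size 64
    assert div_roundup(50, 1000) == 20   # exact division needs no rounding up
    # is_number() accepts anything float() can parse
    assert is_number("3.14") and not is_number("chr1")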
-------------------------------------------------------------------------------- /deepgmap/misc/pickup_pos_seq_region.py: -------------------------------------------------------------------------------- 1 | 2 | labeled_file="/home/fast/onimaru/data/CTCF/mm10_CTCF_narrowPeak_mapq/picard_mm10_1000.bed.labeled" 3 | 4 | with open(labeled_file, "r") as fin, open(labeled_file.split('.')[0]+"_positive_region.bed", 'w') as fo: 5 | for line in fin: 6 | if not line.startswith("#"): 7 | line1=line.split() 8 | a=map(int, line1[3:]) 9 | if sum(a) >0: 10 | fo.write(line) 11 | 12 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/remove_chr.py: -------------------------------------------------------------------------------- 1 | file_name="/home/slow/onimaru/1000genome/HG00119.fa.ed" 2 | file_out="/home/slow/onimaru/1000genome/HG00119_ed_chr.fa" 3 | with open(file_name, "r") as fin, open(file_out,"w") as fout: 4 | for line in fin: 5 | if line.startswith('>'): 6 | line=line.split() 7 | chromo=line[0].strip('>') 8 | line=">chr"+str(chromo)+"\n" 9 | fout.write(line) 10 | else: 11 | fout.write(line) 12 | -------------------------------------------------------------------------------- /deepgmap/misc/spearman_r.py: -------------------------------------------------------------------------------- 1 | import scipy.stats as sc 2 | 3 | f="/home/fast2/onimaru/DeepGMAP-dev/data/misc/cfrip_mm10_ctcf.txt" 4 | 5 | peaks=[] 6 | frips=[] 7 | cfrips=[] 8 | 9 | with open(f, "r") as fin: 10 | for line in fin: 11 | line=line.split() 12 | if not line[0]=="ID": 13 | peaks.append(float(line[4])) 14 | frips.append(float(line[2])) 15 | cfrips.append(float(line[5])) 16 | 17 | #print sc.spearmanr(frips, peaks) 18 | #print sc.spearmanr(cfrips, peaks) -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/bed_file_500_add_seq.py: -------------------------------------------------------------------------------- 1 | 2 | infile="/home/fast/onimaru/data/CTCF/hg38_200_no_hiPS_CTCF.bed" 3 | outfile="/home/fast/onimaru/data/CTCF/hg38_200_no_hiPS_CTCF_pm400.bed" 4 | with open(infile, 'r') as fin, open(outfile, 'w') as fout: 5 | 6 | for line in fin: 7 | line=line.split() 8 | chrom=line[0] 9 | start=int(line[1]) 10 | end=int(line[2]) 11 | new_start=start-400 12 | new_end=end+400 13 | fout.write(str(chrom)+"\t"+str(new_start)+"\t"+str(new_end)+"\n") 14 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/cqueue.pxd: -------------------------------------------------------------------------------- 1 | # file: cqueue.pxd 2 | 3 | cdef extern from "libcalg/queue.h": 4 | ctypedef struct Queue: 5 | pass 6 | ctypedef void* QueueValue 7 | 8 | Queue* queue_new() 9 | void queue_free(Queue* queue) 10 | 11 | int queue_push_head(Queue* queue, QueueValue data) 12 | QueueValue queue_pop_head(Queue* queue) 13 | QueueValue queue_peek_head(Queue* queue) 14 | 15 | int queue_push_tail(Queue* queue, QueueValue data) 16 | QueueValue queue_pop_tail(Queue* queue) 17 | QueueValue queue_peek_tail(Queue* queue) 18 | 19 | bint queue_is_empty(Queue* queue) -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/pick_one_chromosome.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | chromosome_name="chr1" 4 | file_name="/home/slow/onimaru/1000genome/HG00119_1000.fa" 5 | with 
open(file_name, "r") as fin, open(file_name+"_"+chromosome_name, "w") as fout: 6 | WRITE=False 7 | for line in fin: 8 | if line.startswith('>'): 9 | #line1=line.split() 10 | a=line.strip('>\n') 11 | #print a 12 | if a.startswith(chromosome_name): 13 | fout.write(line) 14 | WRITE=True 15 | else: 16 | WRITE=False 17 | 18 | elif WRITE: 19 | fout.write(line) 20 | -------------------------------------------------------------------------------- /deepgmap/misc/dataset_checker.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | def dataset_checker(input, output): 5 | with open(input, 'r') as f_in, open(output, 'w') as f_out: 6 | for line in f_in: 7 | if '>' in line: 8 | position=line 9 | else: 10 | sequence=line 11 | N_percent=float(sequence.count('N'))/len(sequence) 12 | if N_percent<0.90 and len(sequence)>100: 13 | f_out.write(str(position)+str(sequence)) 14 | 15 | if __name__ == '__main__': 16 | dataset_checker('/home/fast/onimaru/data/CTCF/mm10_no_CTCF.fa', '/home/fast/onimaru/data/CTCF/mm10_no_CTCF_noN.fa') -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/bed_file_500_add_seq2.py: -------------------------------------------------------------------------------- 1 | 2 | infile="/home/fast/onimaru/encode/mm10_dnase-seq_subset/*_summits.bed" 3 | outfile="/home/fast/onimaru/data/CTCF/hiPS_CTCF_peaks.narrowPeak_600.bed" 4 | with open(infile, 'r') as fin, open(outfile, 'w') as fout: 5 | 6 | for line in fin: 7 | 8 | line=line.split() 9 | chrom=line[0] 10 | if not chrom.startswith('chrM') and not '_' in chrom: 11 | start=int(line[1]) 12 | end=int(line[2]) 13 | mid_p=(start+end)/2 14 | new_start=mid_p-300 15 | new_end=mid_p+300 16 | fout.write(str(chrom)+"\t"+str(new_start)+"\t"+str(new_end)+"\n") 17 | -------------------------------------------------------------------------------- /deepgmap/network_constructors/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.pyc 2 | /basset.pyc 3 | /conv4frss.pyc 4 | /conv4frssplus2.pyc 5 | /danq.pyc 6 | /danq2.pyc 7 | /danq3.pyc 8 | /danq4.pyc 9 | /danqblock.pyc 10 | /deepsea.pyc 11 | /deepshark2.pyc 12 | /deepshark4.pyc 13 | /deepshark5.pyc 14 | /deepsharkcheck.pyc 15 | /deepsharkcheck2.pyc 16 | /deepsharktest3.pyc 17 | /network_constructor_basset.pyc 18 | /network_constructor_danq_1d3.pyc 19 | /network_constructor_deepsea_1d2.pyc 20 | /network_constructor_deepsea_1d4.pyc 21 | /network_constructor_deepsea_1d5.pyc 22 | /network_constructor_deepsea_1d6.pyc 23 | /template_model.pyc 24 | /conv4frss2.pyc 25 | /conv4frss3.pyc 26 | /conv4frssplus3.pyc 27 | /conv4frssplus4.pyc 28 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/remove_excess_negatives.py: -------------------------------------------------------------------------------- 1 | import random 2 | labeled_file="/home/fast/onimaru/data/Chip-seq/three_tfs_hg38_500_rand_250_5times_srt.bed.labeled" 3 | labeled_file_out="/home/fast/onimaru/data/Chip-seq/three_tfs_hg38_500_rand_250_5times_srt_reduced.bed.labeled" 4 | with open(labeled_file, 'r') as fin, open(labeled_file_out ,'w') as fout: 5 | for line in fin: 6 | if line.startswith("#"): 7 | fout.write(line) 8 | else: 9 | r=random.random() 10 | line1=line.split() 11 | #print line1[3:] 12 | label_num=sum(map(int, line1[3:])) 13 | if label_num==0 and r<=0.800: 14 | continue 15 | else: 16 | fout.write(line) 17 | 
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/genome_file_maker.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import math
3 | length_list=[]
4 | 
5 | 
6 | with open(sys.argv[1], 'r') as fin, open('./'+sys.argv[2], 'w') as fout:
7 |     seq=0
8 |     chrom_name=''
9 |     for line in fin:
10 | 
11 |         if '>' in line:
12 | 
13 |             if not seq==0:
14 |                 length_list.append(seq)
15 |                 #if not "_" in chrom_name and not "M" in chrom_name:
16 |                 fout.write(str(chrom_name)+'\t'+str(seq)+'\n')
17 |             line=line.split()
18 |             chrom_name=line[0].strip('>')
19 |             seq=0
20 |         else:
21 |             line1=line.strip("\n")
22 |             seq+=len(line1)
23 |     #if len(chrom_name)==3 and not "M" in chrom_name:
24 |     fout.write(str(chrom_name)+'\t'+str(seq)+'\n')
--------------------------------------------------------------------------------
/deepgmap/misc/edit_labeled_file.py:
--------------------------------------------------------------------------------
1 | 
2 | lf="/home/fast/onimaru/encode/mm10_dnase-seq_subset/deepsea_type_wondow_mm10_s200.bed.labeled"
3 | gf="/home/fast/onimaru/data/genome_fasta/mm10.genome"
4 | 
5 | chrm_dict={}
6 | 
7 | with open(gf, 'r') as fin:
8 |     for line in fin:
9 |         line=line.split()
10 |         chrm_dict[line[0]]=int(line[1])
11 | import os
12 | h, t=os.path.split(lf)
13 | elf=h+"/edited_"+t
14 | with open(lf,'r') as fin, open(elf,'w') as fo:
15 |     for line in fin:
16 |         if line.startswith("#"):
17 |             fo.write(line)
18 |         else:
19 |             line=line.split()
20 |             start=int(line[1])-400
21 |             end=int(line[2])+400
22 |             if start>=0 and end<=chrm_dict[line[0]]:
23 |                 fo.write('\t'.join([line[0],str(start),str(end)])+"\t"+" ".join(line[3:])+"\n")
--------------------------------------------------------------------------------
/deepgmap/misc/fasta_file_from_labeled.py:
--------------------------------------------------------------------------------
1 | 
2 | position_set=set()
3 | 
4 | with open("/home/fast/onimaru/data/mm10_1000_limb_altwindow_75co_non.labeled", 'r') as f1:
5 |     for line in f1:
6 |         line=line.split()
7 |         if int(line[3])==1:
8 |             position_set.add(line[0]+":"+line[1]+"-"+line[2])
9 | 
10 | # the fasta must stay open while it is written out: iterating over a handle
11 | # after its "with" block has closed it raises "I/O operation on closed file"
12 | with open("/home/fast/onimaru/data/mm10_1000_altwindow_non.fa", 'r') as f2, open("/home/fast/onimaru/data/mm10_1000_limb_altwindow_75co_non.labeled.fa", 'w') as fo:
13 |     WRITE=False
14 |     for line in f2:
15 |         if line.startswith('>'):
16 |             current_position=line.strip('>\n')
17 |             if current_position in position_set:
18 |                 fo.write(line)
19 |                 WRITE=True
20 |             else:
21 |                 WRITE=False
22 |         elif WRITE:
23 |             fo.write(line)
--------------------------------------------------------------------------------
/deepgmap/misc/dataset_checker_multiple_label.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | def dataset_checker(input1,input2, output1,output2):
5 |     with open(input1, 'r') as fin1,open(input2, 'r') as fin2, open(output1, 'w') as fout1,open(output2, 'w') as fout2:
6 | 
7 |         for line in fin1:
8 |             if '>' in line:
9 |                 position=line
10 |                 fin2_line=fin2.readline()
11 |             else:
12 |                 sequence=line
13 |                 N_percent=float(sequence.count('N'))/len(sequence)
14 |                 if N_percent<=0.80:
15 |                     fout1.write(str(position)+str(sequence))
16 |                     fout2.write(fin2_line)
17 | if __name__ == '__main__':
18 |     dataset_checker('/home/fast/onimaru/data/mm10_1000_mrg_srt.fa',
19 |                     '/home/fast/onimaru/data/mm10_1000_mrg_srt.bedadipo_mrg.labeled',
20 |                     '/home/fast/onimaru/data/mm10_1000_mrg_srt_non.fa',
21 |                     '/home/fast/onimaru/data/mm10_1000_mrg_srt.bedadipo_mrg_noN.labeled')
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/randomize_labels.py:
--------------------------------------------------------------------------------
1 | 
2 | file_name="/home/fast/onimaru/data/CTCF/mm10_CTCF_qc2_mm10_1000.bed.labeled"
3 | from random import shuffle
4 | 
5 | a=file_name.split('.')[0]+"_shuffled.bed.labeled"
6 | 
7 | with open(file_name, "r") as fin, open(a, "w") as fo:
8 |     i=0
9 |     for line in fin:
10 |         if line.startswith("#"):
11 |             line=line.split()
12 |             list_of_label=line[1:]
13 | 
14 |             x = [i for i in range(len(list_of_label))]
15 |             shuffle(x)
16 |             list_of_label_shuf=[]
17 |             for e in x:
18 |                 list_of_label_shuf.append(list_of_label[e])
19 |             fo.write(line[0]+" "+" ".join(list_of_label_shuf)+"\n")
20 |         else:
21 |             b=line.split()
22 |             pos="\t".join(b[:3])
23 |             tmp=b[3:]
24 |             tmp_shuf=[]
25 |             #print tmp
26 |             tmp2=map(int, tmp)
27 |             if sum(tmp2)>0:
28 | 
29 |                 for e in x:
30 |                     tmp_shuf.append(tmp[e])
31 |             else:
32 |                 tmp_shuf=tmp
33 |             label=" ".join(tmp_shuf)
34 |             fo.write(pos+"\t"+label+"\n")
35 | 
36 | 
--------------------------------------------------------------------------------
/deepgmap/misc/TSS_bedfile.py:
--------------------------------------------------------------------------------
1 | 
2 | gene_list=[]
3 | with open ('/media/koh/HD-PCFU3/mouse/mouse_UCSC_wholegenes.bed', 'r') as fin:
4 |     with open ('/media/koh/HD-PCFU3/mouse/mouse_TSS.bed', 'w') as fout:
5 |         for line in fin:
6 |             if not line=='' and not line=='\n':  # skip blank lines
7 |                 line=line.split()
8 |                 chromosome=line[0]
9 |                 left=int(line[1])
10 |                 right=int(line[2])
11 |                 direction=line[5]
12 |                 if direction=='+':
13 |                     gene=line[0]+':'+line[1]
14 |                 elif direction=='-':
15 |                     gene=line[0]+':'+line[2]
16 | 
17 |                 if not gene in gene_list:
18 |                     gene_list.append(gene)
19 |                     if direction=='+':
20 |                         start=left-1000
21 |                         end=left+1000
22 |                         fout.write(str(chromosome)+'\t'+str(start)+'\t'+str(end)+'\n')
23 |                     if direction=='-':
24 |                         start=right-1000
25 |                         end=right+1000
26 |                         fout.write(str(chromosome)+'\t'+str(start)+'\t'+str(end)+'\n')
27 | 
--------------------------------------------------------------------------------
/deepgmap/misc/reduce_negatives.py:
--------------------------------------------------------------------------------
1 | import random
2 | import os
3 | labeledf="/home/fast/onimaru/data/Chip-seq/narrowPeaks/three_3times.labeled"
4 | 
5 | h, t =os.path.split(labeledf)
6 | 
7 | pos_num=0
8 | neg_num=0
9 | 
10 | with open(labeledf, 'r') as fin:
11 |     for line in fin:
12 |         if line.startswith("#"):
13 |             continue
14 |         line=line.split()
15 |         i=sum(map(int, line[3:]))
16 |         if i >0:
17 |             pos_num+=1
18 |         else:
19 |             neg_num+=1
20 | 
21 | 
22 | #print pos_num, neg_num
23 | 
24 | r=float(pos_num)/(0.75*neg_num)
25 | 
26 | with open(labeledf, 'r') as fin, open(h+"/down_sampled_"+str(round(r,4))+"_"+t, "w") as fl, open(h+"/down_sampled_"+str(round(r,4))+"_"+t+".bed", "w") as fb:
27 |     for line in fin:
28 |         if line.startswith("#"):
29 |             fl.write(line)
30 |             continue
31 |         line_=line.split()
32 |         i=sum(map(int, line_[3:]))
33 |         rand=random.random()
34 | 
35 |         if i > 0:
36 |             fl.write(line)
37 |             fb.write("\t".join(line_[:3])+"\n")
38 |         elif rand<r:
39 |             fl.write(line)
40 |             fb.write("\t".join(line_[:3])+"\n")
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/remove_variant_annotations.py:
--------------------------------------------------------------------------------
12 |         if line.startswith('>') and "dna:chromosome" in line:
13 |             line=line.split()[0]
14 |             a=line.strip('>')
15 |             a='chr'+str(a)
16 |             seq=[]
17 |             chromosome_list[a]=seq
18 |             seq_list.append(a)
19 |             print(a)
20 |             WRITE=True
21 | 
22 | 
23 |         elif line.startswith('>') and "GL" in line:
24 |             WRITE=False
25 | 
26 |         elif WRITE:
27 | 
line1=re.sub(r'\<.*?\>', '', line) 28 | line1=re.sub(r'\<.*?\n', '', line1) 29 | line1=re.sub(r'.*?\>', '', line1) 30 | #line1=line1.strip("\n") 31 | chromosome_list[a].append(line1.strip('\n')) 32 | 33 | with open(file_name+'.ed','w') as fout: 34 | for k in seq_list: 35 | #print k 36 | fout.write(">"+str(k)+"\n") 37 | for i in chromosome_list[k]: 38 | fout.write(str(i)) 39 | fout.write("\n") -------------------------------------------------------------------------------- /deepgmap/misc/optimal_frip_cutoff.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from scipy.interpolate import BSpline 4 | 5 | fname="/home/fast2/onimaru/DeepGMAP-dev/data/misc/mm10_ctcf_auprcs_frips.txt" 6 | 7 | auprc=[] 8 | frip=[] 9 | """ 10 | with open(fname, 'r') as fin: 11 | for line in fin: 12 | line=line.split() 13 | if len(line)>0: 14 | if line[0]=="AUPRC": 15 | auprc=map(float, line[1:]) 16 | elif line[0]=="correctedFRiP": 17 | frip=map(float, line[1:])""" 18 | 19 | with open(fname, 'r') as fin: 20 | for line in fin: 21 | line=line.split() 22 | if not len(line)==0 and not line[0]=="ID": 23 | auprc.append(float(line[1])) 24 | frip.append(float(line[5])) 25 | frip, auprc=zip(*sorted(zip(frip, auprc))) 26 | auprc_av=[] 27 | for i in range(len(auprc)): 28 | auprc_av.append(np.average(auprc[i:])) 29 | 30 | plt.figure(1, figsize=(4,4)) 31 | ax1=plt.subplot() 32 | ax1.plot(frip,auprc_av) 33 | ax1.grid(b=True, which='major', color='black', linestyle='-') 34 | plt.xticks(np.arange(0, max(frip), 0.02)) 35 | 36 | ax1.grid(b=True, which='minor', color='gray', linestyle='--') 37 | plt.minorticks_on() 38 | plt.show() -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/seq_to_binary.py: -------------------------------------------------------------------------------- 1 | #return a single one hot vector of DNA 2 | 3 | def AGCTtoArray(Nuc): 4 | onehot=[] 5 | if Nuc=="A" or Nuc=="a": 6 | onehot=(1, 0, 0, 0) 7 | return onehot 8 | elif Nuc=="G" or Nuc=="g": 9 | onehot=(0, 1, 0, 0) 10 | return onehot 11 | elif Nuc=="C" or Nuc=="c": 12 | onehot=(0, 0, 1, 0) 13 | return onehot 14 | elif Nuc=="T" or Nuc=="t": 15 | onehot=(0, 0, 0, 1) 16 | return onehot 17 | elif Nuc=="N" or Nuc=="n": 18 | onehot=(0, 0, 0, 0) 19 | return onehot 20 | else: 21 | pass 22 | 23 | #a function to convert AGCTN to 4d array 24 | def AGCTtoArray2(Seq): 25 | onehot=[] 26 | for Nuc in Seq: 27 | if Nuc=="A" or Nuc=="a": 28 | onehot.append((1, 0, 0, 0)) 29 | 30 | elif Nuc=="G" or Nuc=="g": 31 | onehot.append((0, 1, 0, 0)) 32 | elif Nuc=="C" or Nuc=="c": 33 | onehot.append((0, 0, 1, 0)) 34 | elif Nuc=="T" or Nuc=="t": 35 | onehot.append((0, 0, 0, 1)) 36 | elif Nuc=="N" or Nuc=="n": 37 | onehot.append((0, 0, 0, 0)) 38 | else: 39 | pass 40 | 41 | return onehot -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/bed_file_500.py: -------------------------------------------------------------------------------- 1 | WINDOW_SIZE=200 2 | genome_file="/home/fast/onimaru/data/genome_fasta/mm10.genome" 3 | #with open(genome_file, 'r') as fin, open('/home/fast/onimaru/data/genome_fasta/hg38_1000_altwindow.bed', 'w') as fout1, open('/home/fast/onimaru/data/genome_fasta/hg38_1000_.bed', 'w') as fout2: 4 | with open(genome_file, 'r') as fin, open('/home/fast/onimaru/data/genome_fasta/mm10_200_single_'+str(WINDOW_SIZE)+'.bed', 'w') as fout1: 5 | 6 | for line 
in fin:
7 |         line=line.split()
8 |         chrom=line[0]
9 |         chrom_size=int(line[1])
10 |         divide_num=chrom_size//WINDOW_SIZE  # floor division: range() needs an int in Python 3
11 |         #divide_num=chrom_size/WINDOW_SIZE-4
12 |         for i in range(divide_num):
13 | 
14 |             #if i>=2:
15 | 
16 |             if i*WINDOW_SIZE+WINDOW_SIZE<=chrom_size:
17 |                 fout1.write(str(chrom)+'\t'+str(i*WINDOW_SIZE)+'\t'+str(i*WINDOW_SIZE+WINDOW_SIZE)+'\n')
18 |             else:
19 |                 break
20 |             #if i*WINDOW_SIZE+WINDOW_SIZE+WINDOW_SIZE/2<=chrom_size:
21 |                 #fout1.write(str(chrom)+'\t'+str(i*WINDOW_SIZE+WINDOW_SIZE/2)+'\t'+str(i*WINDOW_SIZE+WINDOW_SIZE+WINDOW_SIZE/2)+'\n')
22 |             #else:
23 |                 #break
24 | 
25 | 
--------------------------------------------------------------------------------
/deepgmap/network_constructors/auc_calc.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | def auc_pr(true, prob, threshold):
4 | 
5 |     pred = tf.where(prob > threshold, tf.ones_like(prob), tf.zeros_like(prob))
6 |     tp = tf.logical_and(tf.cast(pred, tf.bool), tf.cast(true, tf.bool))
7 |     fp = tf.logical_and(tf.cast(pred, tf.bool), tf.logical_not(tf.cast(true, tf.bool)))
8 |     fn = tf.logical_and(tf.logical_not(tf.cast(pred, tf.bool)), tf.cast(true, tf.bool))
9 |     tn = tf.logical_and(tf.logical_not(tf.cast(pred, tf.bool)), tf.logical_not(tf.cast(true, tf.bool)))
10 |     FPR = tf.truediv(tf.reduce_sum(tf.cast(fp, tf.int32)),
11 |                      tf.reduce_sum(tf.cast(tf.logical_or(tn, fp), tf.int32)))
12 |     TPR = tf.truediv(tf.reduce_sum(tf.cast(tp, tf.int32)),
13 |                      tf.reduce_sum(tf.cast(tf.logical_or(tp, fn), tf.int32)))
14 |     PPV = tf.truediv(tf.reduce_sum(tf.cast(tp, tf.int32)),
15 |                      tf.reduce_sum(tf.cast(tf.logical_or(tp, fp), tf.int32)))
16 | 
17 |     return FPR, TPR, PPV
18 | 
19 | 
20 | def auc_pr2(true, prob, threshold):
21 |     # note: unlike auc_pr, the tf.metrics counters below return raw counts at the
22 |     # threshold (precision_at_thresholds is the only rate); TPR needs true positives
23 |     FPR, _ = tf.metrics.false_positives_at_thresholds(true, prob, [threshold])
24 |     TPR, _ = tf.metrics.true_positives_at_thresholds(true, prob, [threshold])
25 |     PPV, _ = tf.metrics.precision_at_thresholds(true, prob, [threshold])
26 | 
27 |     return FPR, TPR, PPV
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/PCA.py:
--------------------------------------------------------------------------------
1 | # Authors: Kyle Kastner
2 | # License: BSD 3 clause
3 | 
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | 
7 | from sklearn.datasets import load_iris
8 | from sklearn.decomposition import PCA, IncrementalPCA
9 | 
10 | iris = load_iris()
11 | X = iris.data
12 | y = iris.target
13 | 
14 | #print X.shape
15 | 
16 | n_components = 2
17 | ipca = IncrementalPCA(n_components=n_components, batch_size=10)
18 | X_ipca = ipca.fit_transform(X)
19 | 
20 | #print X_ipca.shape
21 | 
22 | pca = PCA(n_components=n_components)
23 | X_pca = pca.fit_transform(X)
24 | 
25 | colors = ['navy', 'turquoise', 'darkorange']
26 | 
27 | for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]:
28 |     plt.figure(figsize=(8, 8))
29 |     for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
30 |         plt.scatter(X_transformed[y == i, 0], X_transformed[y == i, 1],
31 |                     color=color, lw=2, label=target_name)
32 | 
33 |     if "Incremental" in title:
34 |         err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean()
35 |         plt.title(title + " of iris dataset\nMean absolute unsigned error "
36 |                   "%.6f" % err)
37 |     else:
38 |         plt.title(title + " of iris dataset")
39 |     plt.legend(loc="best", shadow=False, scatterpoints=1)
40 |     plt.axis([-4, 4, -1.5, 1.5])
41 | 
42 | plt.show()
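Since auc_pr() in auc_calc.py above is built from plain TensorFlow ops (no tf.metrics local variables to initialize), it can be smoke-tested on toy tensors. A minimal TF1-style sketch; the toy values are made up, and the import path simply follows the package layout:

    import tensorflow as tf
    from deepgmap.network_constructors.auc_calc import auc_pr

    true = tf.constant([1., 0., 1., 0.])
    prob = tf.constant([0.9, 0.8, 0.3, 0.1])
    fpr, tpr, ppv = auc_pr(true, prob, 0.5)   # predictions become [1, 1, 0, 0]
    with tf.Session() as sess:
        # one tp, fp, fn and tn each -> [0.5, 0.5, 0.5]
        print(sess.run([fpr, tpr, ppv]))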
-------------------------------------------------------------------------------- /deepgmap/post_train_tools/compare_narrowPeak_scores.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | """ 4 | chr1 0 1000 . 3.64092565 . 0.00364093 -1 -1 -1 5 | chr1 500 1500 . 3.64092565 . 0.00364093 -1 -1 -1 6 | chr1 1000 2000 . 3.64092565 . 0.00364093 -1 -1 -1 7 | chr1 1500 2500 . 3.64092565 . 0.00364093 -1 -1 -1 8 | """ 9 | 10 | 11 | ref_data="/home/fast/onimaru/data/prediction/CTCF/network_constructor_deepsea_1d3_Wed_Oct_11_074555_2017.ckpt-13019.narrowPeak" 12 | ind_data="/home/fast/onimaru/data/prediction/CTCF/HG00119_network_constructor_deepsea_1d3_Wed_Oct_11_074555_2017.ckpt-13019.narrowPeak.hg38.narrowPeak" 13 | 14 | ref_data_dict={} 15 | ind_data_dicts={} 16 | 17 | with open(ref_data,'r') as fin: 18 | for line in fin: 19 | line=line.split() 20 | position=str(line[0])+'\t'+str(line[1])+'\t'+str(line[2]) 21 | score=float(line[4]) 22 | ref_data_dict[position]=score 23 | 24 | with open(ind_data,'r') as fin: 25 | for line in fin: 26 | line_=line.split() 27 | position=str(line_[0])+'\t'+str(line_[1])+'\t'+str(line_[2]) 28 | score=float(line_[4]) 29 | if position in ref_data_dict: 30 | score_of_ref=ref_data_dict[position] 31 | abs_diff=math.fabs(score-score_of_ref) 32 | 33 | ref_data_dict[position]=score -------------------------------------------------------------------------------- /deepgmap/post_train_tools/liftover_indiv_genome_to_hg38.py: -------------------------------------------------------------------------------- 1 | import re 2 | map_file="/home/slow/onimaru/1000genome/hg38_HG00119_1000.outfmt" 3 | 4 | 5 | with open(map_file, 'r') as fin: 6 | has_seen=set() 7 | map_dict={} 8 | for line in fin: 9 | line=line.split() 10 | hg38_coo, indiv_coo=line[1], line[0] 11 | if not hg38_coo in has_seen: 12 | has_seen.add(hg38_coo) 13 | map_dict[indiv_coo]=hg38_coo 14 | 15 | narrowPeak_prediction="/home/fast/onimaru/data/prediction/HG00119/HG00119_network_constructor_deepsea_1d3_Wed_Oct_11_074555_2017.ckpt-13019.narrowPeak" 16 | 17 | with open(narrowPeak_prediction, 'r') as fin, open(narrowPeak_prediction+".hg38.narrowPeak", 'w') as fout: 18 | for line in fin: 19 | a=line.split() 20 | b=str(a[0])+":"+str(a[1])+"-"+str(a[2]) 21 | if b in map_dict: 22 | new_coo=map_dict[b] 23 | new_coo=re.findall(r"[\w']+", new_coo) 24 | fout.write(str(new_coo[0])+"\t" 25 | +str(new_coo[1])+"\t" 26 | +str(new_coo[2])+"\t" 27 | +str(a[3])+"\t" 28 | +str(a[4])+"\t" 29 | +str(a[5])+"\t" 30 | +str(a[6])+"\t" 31 | +str(a[7])+"\t" 32 | +str(a[8])+"\t" 33 | +str(a[9])+"\n") 34 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/merge_bigwig.py: -------------------------------------------------------------------------------- 1 | import pyBigWig as pbw 2 | import math 3 | import glob as gl 4 | def merge_biwig(bigwig_file_list, out_file): 5 | bigwig_list=[] 6 | _chromosome_list=[] 7 | _value_list=[] 8 | _header_list=[] 9 | start_list=[] 10 | end_list=[] 11 | for f in bigwig_file_list: 12 | _tmp_wig=pbw.open(f) 13 | chroms=_tmp_wig.chroms() 14 | print(chroms) 15 | for chrom_name, chrom_length in chroms.items(): 16 | _header_list.append((chrom_name, chrom_length)) 17 | j=0 18 | for s,e, v in _tmp_wig.intervals(chrom_name, 0, chrom_length): 19 | j+=1 20 | start_list.append(s) 21 | end_list.append(e) 22 | _value_list.append(v) 23 | 24 | _chromosome_list.append([chrom_name]*j) 25 | print(j) 26 | _tmp_wig.close() 27 | 
out_bigwig=pbw.open(out_file, "w")  # pyBigWig needs mode "w" to create a file for writing
28 |     out_bigwig.addHeader(_header_list)
29 |     # flatten the per-chromosome name lists so all argument lists have equal length
30 |     out_bigwig.addEntries([c for chroms in _chromosome_list for c in chroms], start_list, ends=end_list, values=_value_list)
31 |     out_bigwig.close()
32 | 
33 | def main():
34 |     file_list=gl.glob("/home/onimaru/fast2/1000genome/tmp/GRCh38_edited_ctcf_test_class_mm10_CTCF_intestine_0days_ENCFF464ZPC_rep2_chr*.bw")
35 |     ofile="/home/onimaru/fast2/1000genome/tmp/tmp.bw"
36 |     merge_biwig(file_list, ofile)
37 | if __name__ == "__main__":
38 |     main()
--------------------------------------------------------------------------------
/deepgmap/misc/compare_deepsea_data.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | 
4 | deepsea="/home/fast/onimaru/encode/deepsea/deepsea_pred.txt"
5 | 
6 | deepshark="/home/fast/onimaru/encode/deepsea/deepshark_Tue_Apr_17_183529_2018.ckpt-57883_prediction.log"
7 | 
8 | deepsea_dict={}
9 | 
10 | with open(deepsea, 'r') as fin:
11 |     for line in fin:
12 |         if not line.startswith("Cell Type"):
13 |             #print line
14 |             line=line.split()
15 |             if len(line)==0:
16 |                 continue
17 |             print(line)
18 |             if line[4]=="NA":
19 |                 continue
20 |             sname=line[3].split('.')[0]
21 |             AUPRC=float(line[5])
22 |             deepsea_dict[sname]=AUPRC
23 | 
24 | sample_list=[]
25 | deepsea_list=[]
26 | deepshark_list=[]
27 | with open(deepshark, 'r') as fin:
28 |     go=False
29 |     for line in fin:
30 |         if line.startswith("sample"):
31 |             go=True
32 |             continue
33 |         elif go:
34 |             line=line.split()
35 |             sname=line[0].split("_")[0]
36 |             if "Dnase" in sname and sname in deepsea_dict:
37 |                 sample_list.append(sname)
38 |                 deepsea_list.append(deepsea_dict[sname])
39 |                 deepshark_list.append(float(line[2]))
40 |                 print(sname, deepsea_dict[sname], float(line[2]))
41 | 
42 | deepsea_list=np.array(deepsea_list)
43 | deepshark_list=np.array(deepshark_list)
44 | 
45 | log_fold=np.log2(deepshark_list/deepsea_list)
46 | log_fold_neg=log_fold[log_fold<0.00]
47 | print("total num: "+str(len(log_fold))+"\nless performed num:"+str(len(log_fold_neg))+" ("+str(len(log_fold_neg)/float(len(log_fold))*100.0)+"%)")
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/queue.pyx:
--------------------------------------------------------------------------------
1 | cimport cqueue
2 | 
3 | cdef class Queue:
4 |     """A queue class for C integer values.
5 | 
6 |     >>> q = Queue()
7 |     >>> q.append(5)
8 |     >>> q.peek()
9 |     5
10 |     >>> q.pop()
11 |     5
12 |     """
13 |     cdef cqueue.Queue* _c_queue
14 |     def __cinit__(self):
15 |         self._c_queue = cqueue.queue_new()
16 |         if self._c_queue is NULL:
17 |             raise MemoryError()
18 | 
19 |     def __dealloc__(self):
20 |         if self._c_queue is not NULL:
21 |             cqueue.queue_free(self._c_queue)
22 | 
23 |     cpdef append(self, int value):
24 |         if not cqueue.queue_push_tail(self._c_queue,
25 |                                       <void*>value):
26 |             raise MemoryError()
27 | 
28 |     cdef extend(self, int* values, size_t count):
29 |         cdef size_t i
30 |         for i in xrange(count):
31 |             if not cqueue.queue_push_tail(
32 |                 self._c_queue, <void*>values[i]):
33 |                 raise MemoryError()
34 | 
35 |     cpdef int peek(self) except? -1:
36 |         cdef int value = \
37 |             <Py_ssize_t>cqueue.queue_peek_head(self._c_queue)
38 |         if value == 0:
39 |             # this may mean that the queue is empty,
40 |             # or that it happens to contain a 0 value
41 |             if cqueue.queue_is_empty(self._c_queue):
42 |                 raise IndexError("Queue is empty")
43 |         return value
44 | 
45 |     cpdef int pop(self) except? -1:
46 |         if cqueue.queue_is_empty(self._c_queue):
47 |             raise IndexError("Queue is empty")
48 |         return <Py_ssize_t>cqueue.queue_pop_head(self._c_queue)
49 | 
50 |     def __bool__(self):
51 |         return not cqueue.queue_is_empty(self._c_queue)
--------------------------------------------------------------------------------
/deepgmap/misc/fix_fasta.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | import os
4 | infile="/home/fast2/onimaru/1000genome/all_HG00119_H1.fa"
5 | outfile="/home/fast2/onimaru/1000genome/all_HG00119_H1_edited.fa"
6 | 
7 | 
8 | with open(infile, "r") as fin, open(os.path.split(infile)[0]+"/_tmp.fa", "w") as fo:
9 |     i=0
10 |     for line in fin:
11 |         if line.startswith(">") and len(line.split())>1:
12 |             line=line.split()[0]
13 |             if i ==0:
14 |                 line=">chr"+line.strip(">")+"\n"
15 |                 i+=1
16 |             else:
17 |                 line="\n"+">chr"+line.strip(">")+"\n"
18 |             #print line
19 |             fo.write(line)
20 |         else:
21 |             fo.write(line.strip("\n"))
22 | 
23 | #import numpy as np
24 | dna=set(["A","G","C","T","N", "\n"])
25 | with open(os.path.split(infile)[0]+"/_tmp.fa", "r") as fin, open(outfile, "w") as fo:
26 | 
27 | 
28 |     for line in fin:
29 |         if line.startswith(">"):
30 |             fo.write(line)
31 |             #print line
32 |         else:
33 |             i=0
34 |             lline=(line)
35 |             line=iter(line)
36 |             seq=[]
37 |             #i+=len(line)
38 |             while True:
39 |                 try:
40 |                     l=next(line)  # Python 3: iterators have no .next() method
41 |                 except StopIteration:
42 |                     break
43 |                 #print l
44 |                 if l=="<":
45 |                     l2=next(line)
46 |                     while l2!=">":
47 |                         l2=next(line)
48 | 
49 |                     seq.append("N")
50 |                     i+=1
51 |                 else:
52 |                     seq.append(l)
53 |                     i+=1
54 |                     if i%200==0:
55 |                         seq.append("\n")
56 |             fo.write("".join(seq))
57 |             #if not any(l==dna):
58 | 
59 | 
60 | 
--------------------------------------------------------------------------------
/deepgmap/misc/randomdna2.py:
--------------------------------------------------------------------------------
1 | import random
2 | 
3 | 
4 | 
5 | input_file='/home/fast/onimaru/data/various_dnase_data/mm10_1000_dnase_region_75co.fa'
6 | temp_file='/home/fast/onimaru/data/random_seq/random_shuffle2_for_mm10_1000_dnase_region_75co.fa'
7 | output_file='/media/koh/HD-PCFU3/mouse/random_seq/random_shuffle2_for_multidnase_no_chr1_2_noN.fa'
8 | with open(input_file, 'r') as f1, open(temp_file, 'w') as f2:
9 |     for line in f1:
10 |         if '>' in line:
11 |             f2.write(str(line))
12 |         elif not line=='' and not line=='\n':  # skip blank lines
13 |             randomized=''
14 |             select_data=random.randint(1,100)
15 |             shuffling_module=2
16 |             #if select_data<=50:
17 |                 #shuffling_module=3
18 |             line=line.strip('\n')
19 |             index_new=(len(line))//shuffling_module  # floor division: range() and sample() need an int
20 |             a = range(index_new)
21 |             index_random = random.sample(a, len(a))
22 |             index_random_iter=iter(index_random)
23 |             for i in range(index_new):
24 |                 for k in range(shuffling_module):
25 |                     randomized+=line[index_random[i]*shuffling_module+k]
26 |             f2.write(str(randomized)+'\n')
27 | for i in range(2):
28 |     with open(input_file, 'r') as f1, open(temp_file, 'a') as f2:
29 |         for line in f1:
30 |             if '>' in line:
31 |                 f2.write(str(line))
32 |             elif not line=='' and not line=='\n':  # skip blank lines
33 |                 randomized=''
34 |                 select_data=random.randint(1,100)
35 |                 shuffling_module=2
36 |                 #if select_data<=50:
37 |                     #shuffling_module=3
38 |                 line=line.strip('\n')
39 |                 index_new=(len(line))//shuffling_module
40 |                 a = range(index_new)
41 |                 index_random = random.sample(a, len(a))
42 |                 index_random_iter=iter(index_random)
43 |                 for i in range(index_new):
44 |                     for k in range(shuffling_module):
45 |                         randomized+=line[index_random[i]*shuffling_module+k]
46 |                 f2.write(str(randomized)+'\n')
47 | 
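randomdna2.py above shuffles each sequence in blocks of shuffling_module characters (dinucleotides here), which preserves local base composition better than a per-base shuffle. The core idea as a standalone re-implementation for illustration (not the script itself):

    import random

    def block_shuffle(seq, block=2):
        # cut into non-overlapping blocks, shuffle the block order, rejoin
        blocks = [seq[i:i + block] for i in range(0, len(seq) // block * block, block)]
        random.shuffle(blocks)
        return "".join(blocks)

    print(block_shuffle("AAGGCCTT"))  # e.g. CCAATTGG: dinucleotides stay intact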
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/motif_compare2.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys
3 | #from curses.ascii import isdigit
4 | from scipy.spatial.distance import cdist
5 | import deepgmap.post_train_tools.cython_util as cutil
6 | mc=cutil.motif_compare
7 | from matplotlib import pyplot as plt
8 | import os
9 | def _is_number(s):
10 |     try:
11 |         complex(s) # for int, long, float and complex
12 |     except ValueError:
13 |         return False
14 | 
15 |     return True
16 | 
17 | def motif_reader(motif_data_dir):
18 |     h,t=os.path.split(motif_data_dir)
19 |     foutname=h+"/"+os.path.splitext(t)[0]+"tmp.meme"
20 |     with open(foutname, "w") as fo, open(motif_data_dir, 'r') as fin:
21 | 
22 |         fo.write("MEME version 4\n\nALPHABET= ACGT\n\nstrands: + -\n\nBackground letter frequencies (from uniform background):\nA 0.2500 C 0.2500 G 0.2500 T 0.2500\n\n")
23 |         lines=fin.readlines()
24 |         for i, line in enumerate(lines):
25 |             if line.startswith("letter-probability"):
26 |                 start_line=i+1
27 |                 break
28 |         for i in range(100-2):
29 |             fo.write("MOTIF tmp_"+str(i*10)+"-"+str(i*10+30)+"\n\nletter-probability matrix: alength= 4 w= 30 nsites= 30 E= 0\n")
30 |             for l in lines[start_line+i*10:start_line+i*10+30]:
31 |                 fo.write(l)
32 |             fo.write("\n\n")
33 | 
34 |     return foutname
35 | 
36 | 
37 | def main():
38 |     motif_data_dir="/home/fast/onimaru/data/meme/merged.meme"
39 |     #long_motif_dir="/home/fast/onimaru/deepgmap/data/reconstructions/conv4frss_Fri_May_11_075425_2018.ckpt-16747Tue_May_15_112518_2018_all_.pdf.meme"
40 |     long_motif_dir="/home/fast2/onimaru/DeepGMAP-dev/data/activation_max/conv4frss_Fri_Sep_28_160038_2018.ckpt-28907Thu_Dec_20_131413_2018_ese14_re.pdf.meme"
41 |     #fout=os.path.splitext(long_motif_dir)[0]+".matches"
42 |     #fout="/home/fast/onimaru/data/output/network_constructor_deepsea_1d3_Fri_Oct_13_133809_2017.ckpt-15899Mon_Oct_16_105338_2017.npz.matches"
43 | 
44 |     fname=motif_reader(long_motif_dir)
45 |     #print fname
46 | if __name__== '__main__':
47 |     main()
48 | 
--------------------------------------------------------------------------------
/deepgmap/misc/deepsea_anal.py:
--------------------------------------------------------------------------------
1 | def is_number(s):
2 |     try:
3 |         float(s)
4 |         return True
5 |     except ValueError:
6 |         return False
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 | f="/home/fast/onimaru/deepgmap/data/misc/deepsea_S3.txt"
10 | 
11 | datadict={}
12 | 
13 | with open(f, 'r') as fin:
14 |     for line in fin:
15 | 
16 |         line=line.split("\t")
17 |         if len(line)==7:
18 |             #print line[1], line[5]
19 |             if is_number(line[5]):
20 |                 if line[1] in datadict:  # dict.has_key() no longer exists in Python 3
21 |                     datadict[line[1]].append(float(line[5]))
22 |                 else:
23 |                     datadict[line[1]]=[]
24 |                     datadict[line[1]].append(float(line[5]))
25 | 
26 | data_list=[]
27 | label_list=[]
28 | 
29 | for k, v in datadict.items():
30 |     if len(v)>3:
31 |         label_list.append(k)
32 |         data_list.append(v)
33 | 
34 | median_list=[]
35 | 
36 | for i in data_list:
37 |     median_list.append(np.median(i))
38 | 
39 | index_=range(len(label_list))
40 | 
41 | median_list, index_=zip(*sorted(zip(median_list, index_), reverse=True))
42 | 
43 | label_list[:] = [label_list[i] for i in index_]
44 | data_list[:] = [data_list[i] for i in index_]
45 | 
46 | 
47 | fig, ax = plt.subplots()
48 | font = {'family' : 'Sans',
49 |         'weight' : 'normal',
50 |         'size' : 6}
51 | plt.rc('font', **font)
52 | 
bp_dict=ax.boxplot(data_list, labels=label_list, bootstrap=1000, sym='.') 53 | #ticks=np.linspace(0, 11, 22, endpoint=False) 54 | #ax.set_yticks(ticks) 55 | plt.xticks(rotation='vertical') 56 | ax.grid(True) 57 | """k=0 58 | for i in [data1, data2]: 59 | y=i 60 | x = np.random.normal(k+1, 0.04, len(y)) 61 | plt.plot(x, y,marker="o",linestyle="None") 62 | k+=1""" 63 | """ 64 | for line in bp_dict['medians']: 65 | # get position data for median line 66 | print line.get_xydata() 67 | x, y = line.get_xydata()[1] # top of median line 68 | # overlay median value 69 | plt.text(x+0.15, y-0.1, round(y,2), 70 | horizontalalignment='center') # draw above, centered""" 71 | """ 72 | import scipy.stats as stats 73 | 74 | test=stats.ttest_ind(data1,data2) 75 | test2=stats.ttest_ind(data1,data3) 76 | print test, test2 77 | """ 78 | plt.show() -------------------------------------------------------------------------------- /deepgmap/misc/box_plot.py: -------------------------------------------------------------------------------- 1 | from deepgmap.misc.small_tools import is_number 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | f="/home/fast2/onimaru/DeepGMAP-dev/data/misc/AUPRC_ctcf_boxplot_14jun2018.txt" 6 | #f="/home/fast2/onimaru/DeepGMAP-dev/data/misc/AUPRC_dnase_boxplot_31may2018.txt" 7 | data_list=[] 8 | sample_list=[] 9 | 10 | with open(f, 'r') as fin: 11 | for line in fin: 12 | data_tmp=[] 13 | line=line.split() 14 | if len(line)==0: 15 | break 16 | sample_list.append(line[0]) 17 | 18 | for l in line[1:]: 19 | if is_number(l) and not l=="nan": 20 | data_tmp.append(float(l)) 21 | data_list.append(data_tmp) 22 | 23 | #print sample_list 24 | #print data_list[-1] 25 | fig, ax = plt.subplots() 26 | font = {'family' : 'Sans', 27 | 'weight' : 'normal', 28 | 'size' : 12} 29 | plt.rc('font', **font) 30 | ax.boxplot(data_list, labels=sample_list, bootstrap=1000, sym='.') 31 | plt.xticks(rotation='vertical') 32 | ax.grid(True) 33 | """ 34 | ig, ax = plt.subplots() 35 | font = {'family' : 'Sans', 36 | 'weight' : 'normal', 37 | 'size' : 12} 38 | plt.rc('font', **font) 39 | ax.boxplot(data_list[3:6], labels=sample_list[3:6], bootstrap=1000, sym='.') 40 | """ 41 | #ticks=np.linspace(0, 11, 22, endpoint=False) 42 | #ax.set_yticks(ticks) 43 | #ax.set_x 44 | #locs, labels=plt.xticks() 45 | #plt.xticks(np.arange(len(sample_list)), sample_list) 46 | plt.xticks(rotation='vertical') 47 | ax.grid(True) 48 | """k=0 49 | for i in [data1, data2]: 50 | y=i 51 | x = np.random.normal(k+1, 0.04, len(y)) 52 | plt.plot(x, y,marker="o",linestyle="None") 53 | k+=1""" 54 | """ 55 | for line in bp_dict['medians']: 56 | # get position data for median line 57 | print line.get_xydata() 58 | x, y = line.get_xydata()[1] # top of median line 59 | # overlay median value 60 | plt.text(x+0.15, y-0.1, round(y,2), 61 | horizontalalignment='center') # draw above, centered""" 62 | 63 | import scipy.stats as stats 64 | sub_data_list1=data_list 65 | i=0 66 | pair_set=set() 67 | test_dict={} 68 | for i in range(len(sub_data_list1)): 69 | for j in range(len(sub_data_list1)): 70 | 71 | if not i==j and not str(i)+"-"+str(j) in pair_set: 72 | #test=stats.ttest_ind(sub_data_list1[i],sub_data_list1[j]) 73 | test=stats.mannwhitneyu(sub_data_list1[i],sub_data_list1[j],alternative="two-sided") 74 | test_dict[str(i)+"-"+str(j)]=test 75 | pair_set.add(str(i)+"-"+str(j)) 76 | pair_set.add(str(j)+"-"+str(i)) 77 | #print test_dict 78 | 79 | 80 | plt.show() -------------------------------------------------------------------------------- 
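box_plot.py above compares every pair of AUPRC distributions with a two-sided Mann-Whitney U test. The call pattern in isolation, on made-up toy data:

    import scipy.stats as stats

    a = [0.61, 0.58, 0.64, 0.60]
    b = [0.52, 0.55, 0.49, 0.51]
    stat, p = stats.mannwhitneyu(a, b, alternative="two-sided")
    print(stat, p)  # U statistic and two-sided p-value
--------------------------------------------------------------------------------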
/deepgmap/misc/randomdna.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | def DNA(length, percentA, percentG, percentC, percentT, percentN): 5 | a = int(percentA*100) 6 | g = int(percentG*100) 7 | c = int(percentC*100) 8 | t=int(percentT*100) 9 | n=100-(a+g+c+t) 10 | dnachoice='' 11 | i=0 12 | for i in range(a): 13 | dnachoice+='A' 14 | for i in range(g): 15 | dnachoice+='G' 16 | i=0 17 | for i in range(c): 18 | dnachoice+='C' 19 | i=0 20 | for i in range(t): 21 | dnachoice+='T' 22 | i=0 23 | for i in range(n): 24 | dnachoice+='N' 25 | 26 | 27 | return ''.join(random.choice(str(dnachoice)) for _ in range(length)) 28 | 29 | def statistics(file): 30 | lengthdist=[] 31 | for line in file: 32 | line=line.split() 33 | lengthdist.append(int(line[2])-int(line[1])) 34 | 35 | return lengthdist 36 | 37 | def AGCTcontent(file2): 38 | #input_file = open('NC_005213.ffn', 'r') 39 | #output_file = open('nucleotide_counts.tsv','w') 40 | #output_file.write('Gene\tA\tC\tG\tT\tLength\tCG%\n') 41 | A_count, C_count, G_count, T_count,N_count, length=0,0,0,0,0,0 42 | from Bio import SeqIO 43 | 44 | for cur_record in SeqIO.parse(file2, "fasta") : 45 | #count nucleotides in this record... 46 | gene_name = cur_record.name 47 | A_count += (cur_record.seq.count('A') +cur_record.seq.count('a')) 48 | C_count += (cur_record.seq.count('C') +cur_record.seq.count('c')) 49 | G_count += (cur_record.seq.count('G') +cur_record.seq.count('g')) 50 | T_count += (cur_record.seq.count('T') +cur_record.seq.count('t')) 51 | N_count += (cur_record.seq.count('N') +cur_record.seq.count('n')) 52 | length += len(cur_record.seq) 53 | A_percent=float(A_count)/float(length) 54 | G_percent=float(G_count)/float(length) 55 | C_percent=float(C_count)/float(length) 56 | T_percent=float(T_count)/float(length) 57 | N_percent=float(N_count)/float(length) 58 | #print A_percent, G_percent, C_percent, T_percent, N_percent 59 | return A_percent, G_percent, C_percent, T_percent, N_percent 60 | 61 | with open('/media/koh/HD-PCFU3/mouse/various_dnase_data/all_peak_75cutoff_sorted_merge.bed', 'r') as f1, open('/media/koh/HD-PCFU3/mouse/various_dnase_data/all_peak_75cutoff_sorted_merge.fa', 'r') as f2: 62 | seq_distribution=statistics(f1) 63 | percentA, percentG, percentC, percentT, percentN=AGCTcontent(f2) 64 | output=open('/media/koh/HD-PCFU3/mouse/random_seq/random_for_multidnase.fa', 'w') 65 | i=0 66 | for i in range(len(seq_distribution)): 67 | output.write('>random'+str(i)+'\n'+DNA(seq_distribution[i], percentA, percentG, percentC, percentT, percentN)+'\n') 68 | output.close() 69 | 70 | -------------------------------------------------------------------------------- /deepgmap/misc/bed_file_compare.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import subprocess as sp 4 | import networkx as nx 5 | import os 6 | from itertools import combinations 7 | import glob 8 | 9 | file_list=glob.glob(sys.argv[1]) 10 | 11 | file_combination=[] 12 | node_list={} 13 | peak_counts={} 14 | path_sep=os.path.sep 15 | G=nx.MultiDiGraph() 16 | G.peak_count={} 17 | peak_count_dict={} 18 | for i in file_list: 19 | with open(i, 'r') as j: 20 | peak_count=len(j.readlines()) 21 | 22 | file_name=i.split(path_sep) 23 | file_name=file_name[-1].split('.') 24 | node1=file_name[0] 25 | G.add_node(node1) 26 | G.peak_count[node1]=peak_count 27 | peak_count_dict[node1]=str(node1)+'\n('+ str(peak_count)+')' 28 | node_list[i]=node1 29 | 30 | for 
i in combinations(file_list, 2): 31 | file_combination.append(i) 32 | 33 | edgelabels={} 34 | 35 | fout=open('/media/koh/HD-PCFU3/mouse/various_dnase_data/bedfiles/testfiles/test.log', 'w') 36 | fout.write("#combination, intersection, overlapping, distance\n") 37 | 38 | 39 | for i in file_combination: 40 | 41 | intersect1_=sp.check_output(["bedtools", "intersect","-u", "-a", str(i[0]), "-b", str(i[1])]) 42 | intersect1=len(intersect1_.split('\n')) 43 | intersect2_=sp.check_output(["bedtools", "intersect","-u", "-b", str(i[0]), "-a", str(i[1])]) 44 | intersect2=len(intersect2_.split('\n')) 45 | #distance=sp.check_output(["bedtools", "jaccard", "-a", str(i[0]), "-b", str(i[1])]) 46 | #distance=distance.split('\n') 47 | #distance=distance[1].split() 48 | #distance=distance[2] 49 | overlap1=G.peak_count[node_list[i[0]]]-intersect1 50 | overlap2=G.peak_count[node_list[i[1]]]-intersect2 51 | 52 | proportion1=overlap1/float(G.peak_count[node_list[i[0]]]) 53 | proportion2=overlap2/float(G.peak_count[node_list[i[1]]]) 54 | 55 | fout.write(str(node_list[i[0]])+'/'+str(node_list[i[1]])+', '+str(intersect1)+'/'+str(intersect2)+', '+str(overlap1)+'/'+str(overlap2)+', '+str(proportion1)+'/'+str(proportion2)+'\n') 56 | G.add_edge(node_list[i[0]], node_list[i[1]], edge_width=float(proportion1)) 57 | G.add_edge(node_list[i[1]], node_list[i[0]], edge_width=float(proportion2)) 58 | edgelabels[node_list[i[0]], node_list[i[1]]]=str(overlap1)+'/'+str(overlap2) 59 | fout.close() 60 | import matplotlib.pyplot as plt 61 | edgewidth=[] 62 | for (u,v,d) in G.edges(data=True): 63 | edgewidth.append(d['edge_width']) 64 | 65 | plt.figure(figsize=(8,8)) 66 | # with nodes colored by degree sized by population 67 | pos=nx.spectral_layout(G) 68 | nx.draw_networkx_edges(G,pos,alpha=0.3,width=20, edge_color=edgewidth) 69 | nodesize=[G.peak_count[v]/100 for v in G] 70 | nx.draw_networkx_nodes(G,pos,node_size=nodesize,node_color='w',alpha=1.0,label=nodesize) 71 | nx.draw_networkx_labels(G,pos,labels=peak_count_dict, fontsize=14) 72 | nx.draw_networkx_edge_labels(G,pos,edge_labels=edgelabels, fontsize=12, alpha=0.1, bbox=dict(facecolor='none', edgecolor='none')) 73 | plt.savefig("chess_masters.png",dpi=75) 74 | plt.show() 75 | -------------------------------------------------------------------------------- /deepgmap/misc/motif_logo_creator.py: -------------------------------------------------------------------------------- 1 | from cairocffi import cairo 2 | import gzip 3 | import pickle 4 | import numpy as np 5 | 6 | def select_color(cr, DNA): 7 | if DNA=="A": 8 | cr.set_source_rgb(1, 0, 0) 9 | elif DNA=="G": 10 | cr.set_source_rgb(0, 0, 0) 11 | elif DNA=="C": 12 | cr.set_source_rgb(0, 0, 1) 13 | elif DNA=="T": 14 | cr.set_source_rgb(0, 1, 0) 15 | 16 | def main(): 17 | with gzip.open('/media/koh/HD-PCFU3/mouse/variables_999_Sun_Oct_30_120751_2016.cpickle.gz', 'r') as f: 18 | variables=pickle.load(f) 19 | filter1=variables[0] 20 | 21 | 22 | 23 | i=0 24 | j=0 25 | k=0 26 | l=0 27 | 28 | 29 | filter_shape=filter1.shape 30 | width=filter_shape[0]*30+100 31 | hight=512 32 | y_center=hight/2 33 | for i in range(filter_shape[3]): 34 | ims = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, hight) 35 | 36 | 37 | cr = cairo.Context(ims) 38 | cr.move_to(50, y_center) 39 | cr.line_to(filter_shape[0]*30+50, y_center) 40 | cr.move_to(50, 100) 41 | cr.line_to(50, 412) 42 | cr.set_line_width(2) 43 | cr.stroke() 44 | for k in range(filter_shape[0]): 45 | 46 | AGCT={} 47 | values=[] 48 | A=["A", filter1[k][0][0][i]*1000.0] 49 | 
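                # Each kernel column is split into per-base weights below; the
                # row order (A, G, C, T) matches the color map in select_color(),
                # and the *1000.0 scaling turns raw filter weights into drawable
                # letter heights.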
G=["G",filter1[k][1][0][i]*1000.0] 50 | C=["C",filter1[k][2][0][i]*1000.0] 51 | T=["T", filter1[k][3][0][i]*1000.0] 52 | values=[A,G,C,T] 53 | pos=filter(lambda x:x[1]>=0,values) 54 | neg=filter(lambda x:x[1]<0,values) 55 | pos.sort(key=lambda x:x[1]) 56 | neg.sort(key=lambda x:x[1], reverse=True) 57 | Nucpos=0 58 | Nucneg=0 59 | for l in range(len(pos)): 60 | Nuc=pos[l][0] 61 | 62 | Nucsize=abs(pos[l][1]) 63 | 64 | cr.select_font_face("Sans", cairo.FONT_SLANT_NORMAL,cairo.FONT_WEIGHT_NORMAL) 65 | select_color(cr, Nuc) 66 | font_mat=cairo.Matrix(xx=40.0,yx=0.0,xy=0.0,yy=Nucsize,x0=0.0,y0=0.0) 67 | cr.set_font_matrix(font_mat) 68 | cr.move_to(50+k*40*0.75, y_center-Nucpos*0.75) 69 | cr.show_text(str(Nuc)) 70 | Nucpos+=abs(pos[l][1]) 71 | l=0 72 | for l in range(len(neg)): 73 | Nuc=neg[l][0] 74 | Nucsize=abs(neg[l][1]) 75 | 76 | cr.select_font_face("Sans", cairo.FONT_SLANT_NORMAL,cairo.FONT_WEIGHT_NORMAL) 77 | select_color(cr, Nuc) 78 | font_mat=cairo.Matrix(xx=40.0,yx=0.0,xy=0.0,yy=-Nucsize,x0=0.0,y0=0.0) 79 | cr.set_font_matrix(font_mat) 80 | cr.move_to(50+k*40*0.75, y_center+(Nucneg)*0.75) 81 | cr.show_text(str(Nuc)) 82 | Nucneg+=abs(neg[l][1]) 83 | 84 | 85 | 86 | 87 | #cr.set_font_size(40) 88 | 89 | ims.write_to_png("motif_"+str(i)+".png") 90 | 91 | 92 | if __name__ == "__main__": 93 | main() -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #from distutils.core import setup 2 | from setuptools import setup, find_packages 3 | from distutils.extension import Extension 4 | import re 5 | import os 6 | import codecs 7 | here = os.path.abspath(os.path.dirname(__file__)) 8 | 9 | 10 | def read(*parts): 11 | # intentionally *not* adding an encoding option to open, See: 12 | # https://github.com/pypa/virtualenv/issues/201#issuecomment-3145690 13 | with codecs.open(os.path.join(here, *parts), 'r') as fp: 14 | return fp.read() 15 | 16 | 17 | def find_version(*file_paths): 18 | version_file = read(*file_paths) 19 | version_match = re.search( 20 | r"^__version__ = ['\"]([^'\"]*)['\"]", 21 | version_file, 22 | re.M, 23 | ) 24 | if version_match: 25 | return version_match.group(1) 26 | 27 | raise RuntimeError("Unable to find version string.") 28 | 29 | try: 30 | from Cython.Distutils import build_ext 31 | except ImportError: 32 | use_cython = False 33 | else: 34 | use_cython = True 35 | 36 | cmdclass = { } 37 | ext_modules = [ ] 38 | 39 | if use_cython: 40 | ext_modules += [ 41 | Extension("deepgmap.data_preprocessing_tools.seq_to_binary2", [ "deepgmap/data_preprocessing_tools/seq_to_binary2.pyx" ]), 42 | #Extension("data_preprocessing_tools.queue", [ "deepgmap/data_preprocessing_tools/queue.pyx" ],libraries=["calg"]), 43 | 44 | Extension("deepgmap.post_train_tools.cython_util", [ "deepgmap/post_train_tools/cython_util.pyx" ]), 45 | ] 46 | cmdclass.update({ 'build_ext': build_ext }) 47 | else: 48 | ext_modules += [ 49 | Extension("deepgmap.data_preprocessing_tools.seq_to_binary2", [ "deepgmap/data_preprocessing_tools/seq_to_binary2.c" ]), 50 | Extension("deepgmap.post_train_tools.cython_util", [ "deepgmap/post_train_tools/cython_util.c" ]), 51 | ] 52 | #print(find_version("deepgmap", "__init__.py")) 53 | setup( 54 | name='DeepGMAP', 55 | #version=VERSION, 56 | version=find_version("deepgmap", "__init__.py"), 57 | description='Learning and predicting gene regulatory sequences in genomes', 58 | author='Koh Onimaru', 59 | author_email='koh.onimaru@gmail.com', 60 | url='', 61 | 
packages=['deepgmap','deepgmap.train','deepgmap.network_constructors','deepgmap.post_train_tools','deepgmap.data_preprocessing_tools','deepgmap.misc'], 62 | #packages=find_packages('deepgmap'), 63 | #packages=['deepgmap.'], 64 | package_dir={'DeepGMAP':'deepgmap'}, 65 | #package_data = { 66 | # '': ['enhancer_prediction/*', '*.pyx', '*.pxd', '*.c', '*.h'], 67 | #}, 68 | scripts=['bin/deepgmap', 69 | ], 70 | #packages=find_packages(), 71 | cmdclass = cmdclass, 72 | ext_modules=ext_modules, 73 | classifiers=[ 74 | 'Development Status :: 3 - Alpha', 75 | 'Environment :: Console', 76 | 'Intended Audience :: Developers', 77 | 'Programming Language :: Python :: 3.6', 78 | 'License :: OSI Approved :: Apache Software License', 79 | 'Operating System :: POSIX :: Linux', 80 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 81 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 82 | 83 | 84 | ], 85 | install_requires=['tensorflow>=1.15', 'numpy', 'matplotlib', 'scikit-learn', 'tornado', 'natsort', 'psutil', 'pyBigWig'], # 'scikit-learn' is the PyPI distribution name; the old 'sklearn' stub is deprecated 86 | long_description=open('README.rst').read(), 87 | ) 88 | 89 | -------------------------------------------------------------------------------- /deepgmap/network_constructors/template_model.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import tensorflow as tf 3 | 4 | import importlib as il 5 | _ac=il.import_module("deepgmap.network_constructors.auc_calc") 6 | ac=_ac.auc_pr 7 | #the code design came from https://gist.github.com/danijar/8663d3bbfd586bffecf6a0094cd116f2 8 | 9 | def doublewrap(function): 10 | @functools.wraps(function) 11 | def decorator(*args, **kwargs): 12 | if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): 13 | return function(args[0]) 14 | else: 15 | return lambda wrapee: function(wrapee, *args, **kwargs) 16 | return decorator 17 | 18 | 19 | @doublewrap 20 | def define_scope(function, scope=None, *args, **kwargs): 21 | """ 22 | A decorator for functions that define TensorFlow operations. The wrapped 23 | function will only be executed once. Subsequent calls to it will directly 24 | return the result so that operations are added to the graph only once. 25 | The operations added by the function live within a tf.variable_scope(). If 26 | this decorator is used with arguments, they will be forwarded to the 27 | variable scope. The scope name defaults to the name of the wrapped 28 | function. 
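    Note that the wrapped function is invoked as function(self) on first
    access, so it must not take extra arguments; any inputs have to be stored
    on self beforehand (as template_model.__init__ below does). A minimal
    usage sketch:

        class Model(object):
            def __init__(self, data):
                self.data = data
                self.mean  # touching the property builds the op once

            @define_scope
            def mean(self):
                return tf.reduce_mean(self.data)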
29 | """ 30 | attribute = '_cache_' + function.__name__ 31 | name = scope or function.__name__ 32 | @property 33 | @functools.wraps(function) 34 | def decorator(self): 35 | if not hasattr(self, attribute): 36 | with tf.variable_scope(name, *args, **kwargs): 37 | setattr(self, attribute, function(self)) 38 | return getattr(self, attribute) 39 | return decorator 40 | 41 | 42 | class template_model(object): 43 | 44 | 45 | def __init__(self, label, prediction, max_to_keep, train_speed, GPUID): 46 | #self.label=label 47 | #self.prediction=prediction 48 | self.max_to_keep=max_to_keep 49 | self.train_speed=train_speed 50 | self.optimize 51 | self.error(label, prediction) 52 | self.saver 53 | self.cost(prediction) 54 | self.GPUID=GPUID 55 | 56 | @define_scope 57 | def saver(self): 58 | return tf.train.Saver(max_to_keep=self.max_to_keep) 59 | 60 | @define_scope 61 | def cost(self, prediction): 62 | with tf.device('/device:GPU:'+self.GPUID): 63 | nll=tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(targets=self.label, logits=prediction[0],pos_weight=1.0)) 64 | l2_norm=tf.reduce_sum(prediction[4]) 65 | l1_norm=tf.reduce_sum(tf.abs(prediction[1])) 66 | return tf.add_n([nll,tf.multiply((5*10**-7), l2_norm),tf.multiply((1*10**-8),l1_norm)]) 67 | 68 | @define_scope 69 | def optimize(self): 70 | with tf.device('/device:GPU:'+self.GPUID): 71 | optimizer = tf.train.AdamOptimizer(self.train_speed) 72 | return optimizer.minimize(self.cost) 73 | 74 | @define_scope 75 | def error(self,label, prediction): 76 | with tf.device('/device:GPU:'+self.GPUID): 77 | class_n=label.shape[1] 78 | FPR_list=[] 79 | TPR_list=[] 80 | PPV_list=[] 81 | for i in range(class_n): 82 | 83 | true=label[:,i] 84 | prob=prediction[1][:,i] 85 | FPR, TPR, PPV=ac(true,prob,0.5) 86 | FPR_list.append(FPR) 87 | TPR_list.append(TPR) 88 | PPV_list.append(PPV) 89 | 90 | return FPR_list, TPR_list, PPV_list 91 | -------------------------------------------------------------------------------- /deepgmap/misc/gff_to_colored_bed.py: -------------------------------------------------------------------------------- 1 | """ 2 | ##gff-version 3 3 | chr2 fimo nucleotide_motif 5714959 5714967 56.5 + . Name=kernel_0_chr2+;Alias=;ID=kernel_0-1-chr2;pvalue=2.23e-06;qvalue= 1;sequence=CGCCTTCGC; 4 | chr2 fimo nucleotide_motif 10439990 10439998 56.5 + . Name=kernel_0_chr2+;Alias=;ID=kernel_0-2-chr2;pvalue=2.23e-06;qvalue= 1;sequence=CGCCTTCGC; 5 | chr2 fimo nucleotide_motif 13793526 13793534 56.5 + . Name=kernel_0_chr2+;Alias=;ID=kernel_0-3-chr2;pvalue=2.23e-06;qvalue= 1;sequence=cgccttcgc; 6 | chr2 fimo nucleotide_motif 17940241 17940249 56.5 + . Name=kernel_0_chr2+;Alias=;ID=kernel_0-4-chr2;pvalue=2.23e-06;qvalue= 1;sequence=CGCCTTCGC; 7 | chr2 fimo nucleotide_motif 18672533 18672541 56.5 + . Name=kernel_0_chr2+;Alias=;ID=kernel_0-5-chr2;pvalue=2.23e-06;qvalue= 1;sequence=CGCCTTCGC; 8 | chr2 fimo nucleotide_motif 21064760 21064768 56.5 + . Name=kernel_0_chr2+;Alias=;ID=kernel_0-6-chr2;pvalue=2.23e-06;qvalue= 1;sequence=CGCCTTCGC; 9 | chr2 fimo nucleotide_motif 28545836 28545844 56.5 + . 
Name=kernel_0_chr2+;Alias=;ID=kernel_0-7-chr2;pvalue=2.23e-06;qvalue= 1;sequence=CGCCTTCGC; 10 | """ 11 | 12 | """ 13 | browser position 14 | track name="kernels" description="kernel distribution visualization" visibility=2 itemRgb="On" 15 | chr7 127471196 127472363 Pos1 0 + 127471196 127472363 255,0,0 16 | chr7 127472363 127473530 Pos2 0 + 127472363 127473530 255,0,0 17 | chr7 127473530 127474697 Pos3 0 + 127473530 127474697 255,0,0 18 | chr7 127474697 127475864 Pos4 0 + 127474697 127475864 255,0,0 19 | chr7 127475864 127477031 Neg1 0 - 127475864 127477031 0,0,255 20 | chr7 127477031 127478198 Neg2 0 - 127477031 127478198 0,0,255 21 | chr7 127478198 127479365 Neg3 0 - 127478198 127479365 0,0,255 22 | chr7 127479365 127480532 Pos5 0 + 127479365 127480532 255,0,0 23 | chr7 127480532 127481699 Neg4 0 - 127480532 127481699 0,0,255 24 | """ 25 | 26 | from matplotlib import pyplot as plt 27 | import numpy as np 28 | import os 29 | 30 | 31 | cmap = plt.get_cmap('nipy_spectral') 32 | colors = np.array([cmap(i) for i in np.linspace(0, 1, 320)]) 33 | 34 | colors=(255*colors).astype(int) 35 | 36 | gff="/home/fast/onimaru/deepgmap/data/outputs/conv4frss_trained_variables_Fri_May_11_075425_2018_kernels/fimo_out/fimo.gff" 37 | gff="/home/fast2/onimaru/DeepGMAP-dev/data/outputs/conv4frss_Mon_Feb_25_092345_2019_trained_variables_kernels/fimo_out/fimo.gff" 38 | bed=os.path.splitext(gff)[0]+".bed" 39 | 40 | with open (gff, 'r') as fin, open(bed, 'w') as fout: 41 | fout.write('track name="kernels" description="kernel distribution visualization" visibility=2 itemRgb="On"\n') 42 | for line in fin: 43 | line=line.split("\t") 44 | if len(line)==9: 45 | chr=line[0] 46 | start=line[3] 47 | end=line[4] 48 | #score=line[5] 49 | orientation=line[6] 50 | subline=line[-1].split(';') 51 | for subs in subline: 52 | if subs.startswith("Name"): 53 | subs=subs.split("=")[1].split("_") 54 | name=subs[0]+"_"+subs[1] 55 | name_num=int(subs[1]) 56 | elif subs.startswith("pvalue"): 57 | subs=-np.log10(float(subs.split("=")[1]))*100 58 | score=str(subs) 59 | _color=",".join(map(str, colors[name_num][:3])) 60 | fout.write("\t".join([chr, start,end,name,score,orientation,start,end, _color])+"\n") 61 | 62 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/inputGenerator_from_deepsea.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import random 4 | import multiprocessing 5 | import os 6 | 7 | def dicttoarray(binaryDNAdict, label_list): 8 | 9 | num_seq=len(binaryDNAdict) 10 | x=0 11 | y=0 12 | 13 | shuf=range(num_seq) 14 | 15 | random.shuffle(shuf) 16 | binaryDNAdict_shuf=[] 17 | binaryDNAdict_shuf_append=binaryDNAdict_shuf.append 18 | label_list_shuf=[] 19 | label_list_shuf_append=label_list_shuf.append 20 | k=0 21 | for i in shuf: 22 | 23 | d=binaryDNAdict[i] 24 | l=label_list[i] 25 | #dp=position[i] 26 | #lp=label_position[i] 27 | r=random.random() 28 | 29 | #print r, sum(l), reduce_genome 30 | #print dp, lp 31 | #assert dp==lp 32 | binaryDNAdict_shuf_append(d) 33 | label_list_shuf_append(l) 34 | if sum(l)==0: 35 | x+=1 36 | else: 37 | y+=1 38 | prog=100.0*float(k+y+x)/num_seq 39 | if prog%10.0==0.0: 40 | print(str(prog)+" of data are shuffled.") 41 | z=float(x)/float(y+x) 42 | print(str(k)+" of negative sequences are skipped\n"+"negative/total="+str(z)) 43 | return binaryDNAdict_shuf, label_list_shuf 44 | 45 | 46 | def array_saver(index_list, binaryDNAdict_shuf,label_list_shuf, 
sample_num,out_dir): 47 | #print "binaryDNAdict_shuf length under array_saver: "+str(len(binaryDNAdict_shuf)) 48 | 49 | for i in range(len(index_list)): 50 | data_array=np.array(binaryDNAdict_shuf[i*sample_num:(i*sample_num+sample_num)], np.int32) 51 | #print np.sum(data_array) 52 | labels=np.array(label_list_shuf[i*sample_num:(i*sample_num+sample_num)], np.int32) 53 | #print np.shape(labels) 54 | 55 | filename = out_dir+"batch_"+str(index_list[i])+".npz" 56 | #print "saving "+str(filename) 57 | try: 58 | with open(filename, "wb") as output_file: 59 | np.savez_compressed(output_file,labels=labels, data_array=data_array) 60 | except IOError as e: 61 | print("I/O error({0}): {1}".format(e.errno, e.strerror)) 62 | except ValueError: 63 | print("Could not convert data") 64 | except: 65 | print("Unexpected error:", sys.exc_info()[0]) 66 | raise 67 | 68 | fname="/home/fast/onimaru/encode/deepsea/deepsea_train/train.npz" 69 | output_dir=os.path.split(fname)[0]+"/train_data_for_my_program/" 70 | os.makedirs(output_dir) 71 | fload=np.load(fname) 72 | data=fload["data_array"] 73 | labels=fload["labels"] 74 | 75 | binaryDNAdict_shuf, label_list_shuf=dicttoarray(data, labels,) 76 | 77 | dna_dict_length=len(binaryDNAdict_shuf) 78 | 79 | if dna_dict_length%16==0: 80 | batch=dna_dict_length/16 81 | else: 82 | batch=dna_dict_length/16+1 83 | 84 | if dna_dict_length%100==0: 85 | total_num=dna_dict_length/(100*16) 86 | else: 87 | total_num=dna_dict_length/(100*16)+1 88 | 89 | jobs = [] 90 | for i in range(16): 91 | #print str(len(binaryDNAdict_shuf[i*batch:(i+1)*batch]))+" are passed" 92 | jobs.append(multiprocessing.Process(target=array_saver, 93 | args=(range(i*total_num,(i+1)*total_num), 94 | binaryDNAdict_shuf[i*batch:(i+1)*batch], 95 | label_list_shuf[i*batch:(i+1)*batch], 96 | 100, output_dir,))) 97 | print("saving data set with "+str(16)+" threads") 98 | for j in jobs: 99 | j.start() 100 | 101 | for j in jobs: 102 | j.join() 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/genome_labeling.py: -------------------------------------------------------------------------------- 1 | 2 | import glob as glb 3 | import sys 4 | import numpy as np 5 | 6 | narrow_peak_list=glb.glob(sys.argv[1]) 7 | label_array=[] 8 | for i in range(len(narrow_peak_list)): 9 | label_array.append(0) 10 | #print label_array 11 | 12 | genome_file=sys.argv[2] 13 | genome_segments=[] 14 | 15 | with open(genome_file, 'r') as fin: 16 | for line in fin: 17 | #label_array.append(1) 18 | genome_segments.append(label_array) 19 | genome_segments=np.array(genome_segments) 20 | 21 | genome_file2=sys.argv[3] 22 | genome_size=[] 23 | chrm_num={} 24 | chrm_list=set() 25 | with open(genome_file2,'r') as fin: 26 | i=0 27 | for line in fin: 28 | line=line.split() 29 | genome_size.append(int(line[1])/1000) 30 | chrm_num[line[0]]=i 31 | chrm_list.add(line[0]) 32 | i+=1 33 | 34 | 35 | peak_dict={} 36 | 37 | i=0 38 | j=0 39 | for f in narrow_peak_list: 40 | with open(f, 'r') as fin: 41 | for line in fin: 42 | 43 | line=line.split() 44 | chrm=line[0] 45 | if len(line)==4: 46 | score=int(line[3]) 47 | elif len(line)==10: 48 | score=int(line[4]) 49 | else: 50 | #print f 51 | break 52 | 53 | if chrm in chrm_list: 54 | if score>=75: 55 | 56 | start=int(line[1]) 57 | end=int(line[2]) 58 | length=end-start 59 | right_bin1=(start/1000+1)*1000 60 | left_bin1=(end/1000)*1000 61 | 
point_1000=(start+end)/(2*1000) 62 | if end<=right_bin1: 63 | 64 | 65 | genome_location=sum(genome_size[:chrm_num[chrm]])+point_1000 66 | if genome_segments[genome_location][i]==0: 67 | genome_segments[genome_location][i]+=1 68 | j+=1 69 | #print j 70 | else: 71 | 72 | 73 | if right_bin1-start>=100: 74 | left_point=start/1000 75 | else: 76 | left_point=start/1000+1 77 | if end-left_bin1>=100: 78 | right_point=end/1000 79 | else: 80 | right_point=end/1000-1 81 | k=left_point 82 | while left_point<=k<=right_point: 83 | genome_location=sum(genome_size[:chrm_num[chrm]])+k 84 | 85 | if genome_segments[genome_location][i]==0: 86 | genome_segments[genome_location][i]+=1 87 | j+=1 88 | #print j 89 | #print "longer than 1000 "+ str(genome_location) 90 | k+=1 91 | 92 | #if genome_segments[genome_location][-1]==1: 93 | # genome_segments[genome_location][-1]=0 94 | i+=1 95 | 96 | with open(genome_file,'r') as fin: 97 | with open(genome_file+'_limb_75co_OL100.labeled','w') as fout, open(genome_file+'_limb_75co_OL100.bed','w') as fout2: 98 | i=0 99 | for line in fin: 100 | fout.write(line.strip('\n')+'\t'+'\t'.join(map(str, list(genome_segments[i])))+'\n') 101 | if genome_segments[i]==1: 102 | fout2.write(line) 103 | i+=1 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/genome_labeling_compare.py: -------------------------------------------------------------------------------- 1 | 2 | import glob as glb 3 | import sys 4 | import numpy as np 5 | from sklearn.decomposition import KernelPCA as pca_f 6 | import os 7 | import matplotlib.pyplot as plt 8 | from scipy.spatial.distance import pdist 9 | import scipy.cluster.hierarchy as sch 10 | from MulticoreTSNE import MulticoreTSNE as TSNE 11 | 12 | def genome_label(bed_file_list, genome_1000): 13 | file_num=len(bed_file_list) 14 | peak_set_list=[] 15 | i=0 16 | for f in bed_file_list: 17 | peak_set=set() 18 | with open(f, 'r') as fin: 19 | for line in fin: 20 | if i==0: 21 | _,a,b=line.split() 22 | check_length=int(b)-int(a) 23 | 24 | peak_set.add(line) 25 | peak_set_list.append(peak_set) 26 | 27 | i+=1 28 | label_array_list=[] 29 | with open(genome_1000,'r') as fin: 30 | i=0 31 | for line in fin: 32 | k=0 33 | label_array=[0 for h in range(file_num)] 34 | 35 | for s in peak_set_list: 36 | if i==0: 37 | _,a,b=line.split() 38 | assert check_length==int(b)-int(a), "mismatches in sequence lengths" 39 | if line in s: 40 | label_array[k]=1 41 | k+=1 42 | if sum(label_array)>0: 43 | #print sum(label_array) 44 | label_array_list.append(label_array) 45 | i+=1 46 | return np.array(label_array_list) 47 | 48 | def main(): 49 | bed_file_dir, genome_1000=sys.argv[1:] 50 | bed_file_list=[] 51 | if not "*" in bed_file_dir and bed_file_dir.endswith('.bed'): 52 | bed_file_list.append(bed_file_dir) 53 | elif not '*' in bed_file_dir: 54 | bed_file_dir=bed_file_dir+"*.bed" 55 | 56 | bed_file_list=glb.glob(bed_file_dir) 57 | #print bed_file_list 58 | if len(bed_file_list)==0: 59 | # print("no files in "+str(bed_file_dir)) 60 | sys.exit() 61 | label_array_list=genome_label(bed_file_list, genome_1000) 62 | #print label_array_list[0] 63 | label_array_list=label_array_list[np.random.randint(label_array_list.shape[0], size=5000), :] 64 | 65 | 66 | label_array_list_=np.transpose(label_array_list) 67 | #print sum(label_array_list_[0]) 68 | lshape=label_array_list.shape 69 | C=[] 70 | for i in range(lshape[0]): 71 | 72 | C.append([np.sum(label_array_list[i])/float(lshape[1]),0.0,0.0]) 
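# Embed the sampled 1-kb label vectors with t-SNE; the red channel set in the
# loop above encodes the fraction of input bed files in which each bin is
# positive.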
73 | tsne = TSNE(n_jobs=18,perplexity = 5.000000) 74 | label_array_list=np.array(label_array_list, np.float64) 75 | #X_pca2=np.array(X_pca2, np.float64) 76 | X_tsne = tsne.fit_transform(label_array_list) 77 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], 78 | c=C, lw=2, s=0.5) 79 | 80 | pca = pca_f(n_components=2, kernel="rbf") 81 | X_pca=pca.fit_transform(label_array_list) 82 | dist1=pdist(label_array_list_, 'cosine') 83 | _, ax1=plt.subplots() 84 | 85 | Y = sch.linkage(dist1, method='ward') 86 | Z1 = sch.dendrogram(Y) 87 | idx1 = Z1['leaves'] 88 | 89 | new_sample_list=[] 90 | 91 | for i in idx1: 92 | txt=bed_file_list[i].split("/")[-1] 93 | new_sample_list.append(txt) 94 | ax1.set_xticklabels(new_sample_list , rotation=90) 95 | 96 | 97 | #print X_pca.shape 98 | _, ax2=plt.subplots() 99 | ax2.scatter(X_pca[:,0], X_pca[:,1],c=C) 100 | """for i, txt in enumerate(bed_file_list): 101 | txt=txt.split("/")[-1] 102 | ax2.annotate(txt, (X_pca[i,0],X_pca[i,1]))""" 103 | 104 | plt.show() 105 | if __name__ == '__main__': 106 | main() 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /deepgmap/misc/bed_file_compare2.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import subprocess as sp 4 | import networkx as nx 5 | import os 6 | from itertools import combinations 7 | import glob 8 | from matplotlib import pyplot as plt 9 | import numpy as np 10 | from matplotlib_venn import venn3, venn3_circles 11 | 12 | file_list=sorted(glob.glob('/home/fast2/onimaru/DeepGMAP-dev/data/predictions/quick_benchmark/bed_comp_50_es/*')) 13 | 14 | file_combination=[] 15 | node_list=[] 16 | peak_counts={} 17 | path_sep=os.path.sep 18 | 19 | peak_count={} 20 | peak_count_dict={} 21 | for i in file_list: 22 | with open(i, 'r') as j: 23 | peak_count=len(j.readlines()) 24 | 25 | file_name=i.split(path_sep) 26 | file_name=file_name[-1].split('.') 27 | node1=file_name[0] 28 | peak_counts[node1]=peak_count 29 | node_list.append(node1) 30 | 31 | ABout=open('./intersectAB.bed', 'w') 32 | sp.check_call(["bedtools", "intersect","-u","-F","1.0","-f","1.0", "-a", str(file_list[0]), "-b", str(file_list[1])], stdout=ABout) 33 | ABout.close() 34 | fAB=open('./intersectAB.bed', 'r') 35 | AB=len(fAB.readlines()) 36 | fAB.close() 37 | #print AB, peak_counts[node_list[0]] 38 | 39 | ABout_=open('./intersectAB_.bed', 'w') 40 | sp.check_call(["bedtools", "intersect","-u","-F","1.0","-f","1.0", "-a", str(file_list[1]), "-b", str(file_list[0])], stdout=ABout_) 41 | ABout_.close() 42 | fAB_=open('./intersectAB_.bed', 'r') 43 | AB_=len(fAB_.readlines()) 44 | fAB_.close() 45 | #print AB_, peak_counts[node_list[1]] 46 | 47 | if AB>AB_: 48 | AB=AB_ 49 | 50 | ACout=open('./intersectAC.bed', 'w') 51 | sp.check_call(["bedtools", "intersect","-u","-F","1.0","-f","1.0", "-a", str(file_list[0]), "-b", str(file_list[2])], stdout=ACout) 52 | ACout.close() 53 | fAC=open('intersectAC.bed', 'r') 54 | AC=len(fAC.readlines()) 55 | fAC.close() 56 | #print AC, peak_counts[node_list[2]] 57 | 58 | ACout_=open('./intersectAC_.bed', 'w') 59 | sp.check_call(["bedtools", "intersect","-u","-F","1.0","-f","1.0", "-a", str(file_list[2]), "-b", str(file_list[0])], stdout=ACout_) 60 | ACout_.close() 61 | fAC_=open('intersectAC_.bed', 'r') 62 | AC_=len(fAC_.readlines()) 63 | fAC_.close() 64 | #print AC_ 65 | 66 | if AC>AC_: 67 | AC=AC_ 68 | 69 | BCout=open('./intersectBC.bed', 'w') 70 | sp.check_call(["bedtools", 
"intersect","-u","-F","1.0","-f","1.0", "-a", str(file_list[2]), "-b", str(file_list[1])], stdout=BCout) 71 | BCout.close() 72 | fBC=open('intersectBC.bed', 'r') 73 | BC=len(fBC.readlines()) 74 | fBC.close() 75 | #print BC 76 | 77 | BCout_=open('./intersectBC_.bed', 'w') 78 | sp.check_call(["bedtools", "intersect","-u","-F","1.0","-f","1.0", "-a", str(file_list[1]), "-b", str(file_list[2])], stdout=BCout_) 79 | BCout_.close() 80 | fBC_=open('intersectBC_.bed', 'r') 81 | BC_=len(fBC_.readlines()) 82 | fBC_.close() 83 | #print BC_ 84 | 85 | if BC>BC_: 86 | BC=BC_ 87 | 88 | ABCout=open('./intersectABC.bed', 'w') 89 | sp.check_call(["bedtools", "intersect","-u","-F","1.0","-f","1.0", "-a", 'intersectAB.bed', "-b", str(file_list[2])],stdout=ABCout) 90 | ABCout.close() 91 | fABC=open('intersectABC.bed', 'r') 92 | ABC=len(fABC.readlines()) 93 | fABC.close() 94 | 95 | ABCout_=open('./intersectABC_.bed', 'w') 96 | sp.check_call(["bedtools", "intersect","-u","-F","1.0","-f","1.0", "-b", 'intersectAB.bed', "-a", str(file_list[2])],stdout=ABCout_) 97 | ABCout_.close() 98 | fABC_=open('intersectABC_.bed', 'r') 99 | ABC_=len(fABC_.readlines()) 100 | fABC_.close() 101 | 102 | if ABC>ABC_: 103 | ABC=ABC_ 104 | 105 | Abc=peak_counts[node_list[0]]-AB-AC+ABC 106 | ABc=AB-ABC 107 | AbC=AC-ABC 108 | 109 | aBc=peak_counts[node_list[1]]-AB-BC+ABC 110 | aBC=BC-ABC 111 | 112 | abC=peak_counts[node_list[2]]-AC-BC+ABC 113 | 114 | plt.figure(figsize=(4,4)) 115 | v = venn3(subsets=(Abc, aBc, ABc, abC, AbC, aBC, ABC), set_labels = (node_list[0], node_list[1], node_list[2])) 116 | v.get_patch_by_id('100').set_alpha(1.0) 117 | plt.title("Venn diagram") 118 | plt.show() 119 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/sequence_visualizer.py: -------------------------------------------------------------------------------- 1 | 2 | import cairocffi as cairo 3 | import gzip 4 | import cPickle 5 | import numpy as np 6 | import glob as gl 7 | 8 | def select_color(cr, DNA): 9 | if DNA=="A": 10 | cr.set_source_rgb(1, 0, 0) 11 | elif DNA=="G": 12 | cr.set_source_rgb(0, 0, 0) 13 | elif DNA=="C": 14 | cr.set_source_rgb(0, 0, 1) 15 | elif DNA=="T": 16 | cr.set_source_rgb(0, 1, 0) 17 | else: 18 | cr.set_source_rgb(1, 1, 1) 19 | def main(): 20 | f=gl.glob("/home/fast/onimaru/data/reconstruction/network_constructor_deepsea_1d3_Sat_Jul__1_145520_2017.ckpt-6293_transpose_*.npz") 21 | for npz in f: 22 | with np.load(npz, 'r') as f: 23 | 24 | reconstruct=f["conv2"] 25 | original_seq=f["original"] 26 | i=0 27 | j=0 28 | k=0 29 | l=0 30 | reconstruct=np.reshape(reconstruct, (1000, 4)) 31 | original_seq=np.reshape(original_seq, (1000, 4)) 32 | 33 | line_num=10 34 | DNA_len=1000 35 | width=DNA_len*30/line_num+200 36 | hight=1024*2*3 37 | y_center=300 38 | ims1 = cairo.PDFSurface(npz+".pdf", width, hight) 39 | cr = cairo.Context(ims1) 40 | cr.move_to(100, y_center) 41 | cr.line_to(DNA_len/line_num*30+100, y_center) 42 | #cr.move_to(50, 100) 43 | #cr.line_to(50, 412) 44 | cr.set_line_width(2) 45 | cr.stroke() 46 | max_value=reconstruct.max() 47 | SCALE=300/max_value 48 | for k in range(1000): 49 | if not k==0 and k%(DNA_len/line_num)==0: 50 | cr.set_source_rgba(0.0,0.0,0,1.0) 51 | y_center+=400 52 | cr.move_to(100, y_center) 53 | cr.line_to(DNA_len//line_num*30+100, y_center) 54 | cr.stroke() 55 | print(y_center) 56 | max_value=np.amax(reconstruct[k]) 57 | sum_value=np.sum(reconstruct[k]) 58 | max_value2=np.amax(original_seq[k]) 59 | print(max_value) 60 | if max_value>0.0: 61 | 
max_index=np.argmax(reconstruct[k]) 62 | 63 | if max_index==0: 64 | Nuc="A" 65 | elif max_index==1: 66 | Nuc="G" 67 | elif max_index==2: 68 | Nuc="C" 69 | elif max_index==3: 70 | Nuc="T" 71 | else: 72 | Nuc="N" 73 | 74 | if max_value2>0.0: 75 | max_index2=np.argmax(original_seq[k]) 76 | 77 | if max_index2==0: 78 | Nuc2="A" 79 | elif max_index2==1: 80 | Nuc2="G" 81 | elif max_index2==2: 82 | Nuc2="C" 83 | elif max_index2==3: 84 | Nuc2="T" 85 | else: 86 | Nuc2="N" 87 | 88 | 89 | Nucpos=0 90 | Nucneg=0 91 | Nucsize=max_value*SCALE 92 | Nucsize2=sum_value*SCALE 93 | x_pos=k%(DNA_len/line_num) 94 | #cr.move_to(50+x_pos*40*0.75, y_center) 95 | #cr.select_font_face("Sans", cairo.FONT_SLANT_NORMAL,cairo.FONT_WEIGHT_NORMAL) 96 | #select_color(cr, Nuc) 97 | #font_mat=cairo.Matrix(xx=40.0,yx=0.0,xy=0.0,yy=Nucsize+0.1,x0=0.0,y0=0.0) 98 | #cr.set_font_matrix(font_mat) 99 | #print Nuc 100 | #cr.show_text(str(Nuc)) 101 | cr.move_to(100+x_pos*40*0.75, y_center) 102 | cr.select_font_face("Sans", cairo.FONT_SLANT_NORMAL,cairo.FONT_WEIGHT_NORMAL) 103 | select_color(cr, Nuc2) 104 | font_mat=cairo.Matrix(xx=40.0,yx=0.0,xy=0.0,yy=Nucsize+20.0,x0=0.0,y0=0.0) 105 | cr.set_font_matrix(font_mat) 106 | cr.show_text(str(Nuc2)) 107 | #cr.set_font_size(40) 108 | cr.show_page() 109 | 110 | if __name__ == "__main__": 111 | main() -------------------------------------------------------------------------------- /deepgmap/post_train_tools/Clustering_analizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | #import cPickle 3 | #import gzip 4 | #from sklearn.cluster import DBSCAN 5 | #from sklearn import metrics 6 | import matplotlib.pyplot as plt 7 | #import scipy 8 | import pylab 9 | import scipy.cluster.hierarchy as sch 10 | import scipy.spatial.distance as spd 11 | #from sklearn.decomposition import PCA, IncrementalPCA 12 | from MulticoreTSNE import MulticoreTSNE as TSNE 13 | import os 14 | fname='/home/fast/onimaru/deepgmap/data/outputs/conv4frss_trained_variables_Fri_May_11_075425_2018.npz' 15 | variables=np.load(fname) 16 | filter1=variables['prediction/W_conv1:0'] 17 | filter1_shape=filter1.shape 18 | filter1_flattened_array=[] 19 | for i in range(filter1_shape[3]): 20 | tmp_filter=filter1[:,:,:,i] 21 | tmp_filter=tmp_filter.reshape(filter1_shape[0], filter1_shape[1]) 22 | tmp_filter=tmp_filter.flatten() 23 | #filter1_flattened_array.append(tmp_filter/np.amax([np.amax(tmp_filter), np.absolute(np.amin(tmp_filter))])) 24 | #filter1_flattened_array.append(np.exp(tmp_filter)/np.sum(np.exp(tmp_filter))) 25 | filter1_flattened_array.append(tmp_filter) 26 | X = np.array(filter1_flattened_array, np.float64) 27 | D = spd.pdist(X, 'cosine') 28 | # Compute and plot first dendrogram. 29 | fig = pylab.figure(figsize=(8,8)) 30 | ax1 = fig.add_axes([0.09,0.1,0.2,0.6]) 31 | Y = sch.linkage(D, method='ward') 32 | Z1 = sch.dendrogram(Y, orientation='left') 33 | ax1.set_xticks([]) 34 | ax1.set_yticks([]) 35 | 36 | # Plot distance matrix. 37 | axmatrix = fig.add_axes([0.3,0.1,0.6,0.6]) 38 | idx1 = Z1['leaves'] 39 | #idx2 = Z2['leaves'] 40 | X2 = X[idx1] 41 | im = axmatrix.matshow(X2, aspect='auto', origin='lower', cmap=pylab.get_cmap('YlGnBu')) 42 | axmatrix.set_xticks([]) 43 | axmatrix.set_yticks([]) 44 | 45 | # Plot colorbar. 
46 | axcolor = fig.add_axes([0.91,0.1,0.02,0.6]) 47 | pylab.colorbar(im, cax=axcolor) 48 | #fig.savefig('/media/koh/HD-PCFU3/mouse/filter_1_clustering.png') 49 | 50 | saving_dir_prefix=os.path.splitext(fname)[0] 51 | plt.savefig(saving_dir_prefix+'_heat_map.pdf', format='pdf') 52 | 53 | tsne = TSNE(n_jobs=18,perplexity = 50.000000, n_iter=5000) 54 | #X_pca2=np.array(X_pca2, np.float64) 55 | X_tsne = tsne.fit_transform(X) 56 | 57 | fig2 = pylab.figure(figsize=(8,8)) 58 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], 59 | lw=2,s=0.5) 60 | plt.savefig(saving_dir_prefix+'_tSNE.pdf', format='pdf') 61 | 62 | 63 | plt.show() 64 | 65 | """ 66 | import matplotlib.pyplot as mplt 67 | 68 | 69 | db = DBSCAN(eps=0.3,min_samples=3, algorithm='auto').fit(X) 70 | core_samples_mask = np.zeros_like(db.labels_, dtype=bool) 71 | core_samples_mask[db.core_sample_indices_] = True 72 | labels = db.labels_ 73 | 74 | # Number of clusters in labels, ignoring noise if present. 75 | n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) 76 | unique_labels = set(labels) 77 | colors = mplt.get_cmap('Spectral')(np.linspace(0, 1, len(unique_labels))) 78 | for k, col in zip(unique_labels, colors): 79 | if k == -1: 80 | # Black used for noise. 81 | col = 'k' 82 | 83 | class_member_mask = (labels == k) 84 | 85 | xy = X[class_member_mask & core_samples_mask] 86 | mplt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, 87 | markeredgecolor='k', markersize=14) 88 | 89 | xy = X[class_member_mask & ~core_samples_mask] 90 | mplt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, 91 | markeredgecolor='k', markersize=6) 92 | 93 | mplt.title('Estimated number of clusters: %d' % n_clusters_) 94 | mplt.show() 95 | 96 | n_components = 2 97 | ipca = IncrementalPCA(n_components=n_components, batch_size=512, whiten=True) 98 | X_ipca = ipca.fit_transform(X) 99 | 100 | pca = PCA(n_components=n_components) 101 | X_pca = pca.fit_transform(X) 102 | for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]: 103 | plt.figure(figsize=(8, 8)) 104 | 105 | plt.scatter(X_transformed[0], X_transformed[1], lw=2) 106 | 107 | if "Incremental" in title: 108 | err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean() 109 | plt.title(title + " of feature map\nMean absolute unsigned error " 110 | "%.6f" % err) 111 | else: 112 | plt.title(title + " of feature map") 113 | plt.legend(loc="best", shadow=False, scatterpoints=1) 114 | 115 | mplt.show() """ -------------------------------------------------------------------------------- /deepgmap/misc/kernel_distribution_analizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | #import cPickle 3 | #import gzip 4 | #from sklearn.cluster import DBSCAN 5 | #from sklearn import metrics 6 | import matplotlib.pyplot as plt 7 | #import scipy 8 | import pylab 9 | import scipy.cluster.hierarchy as sch 10 | import scipy.spatial.distance as spd 11 | from sklearn.decomposition import PCA, IncrementalPCA 12 | from MulticoreTSNE import MulticoreTSNE as TSNE 13 | import os 14 | from mpl_toolkits.mplot3d import Axes3D 15 | import glob as gl 16 | 17 | fname='/home/fast/onimaru/deepgmap/data/outputs/conv4frss_trained_variables_Fri_May_11_075425_2018_kernels/fimo_out/kernels_*_summits_1000.bed' 18 | 19 | flist=gl.glob(fname) 20 | 21 | pos_dict_dict={} 22 | data_array=[] 23 | mycolors=[] 24 | for f in flist: 25 | pos_dict={} 26 | h,t=os.path.split(f) 27 | t=t.split('_') 28 | t=t[2] 29 | print(t) 30 | with open(f, 'r') as fin: 31 | for line in fin: 32 | 
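            # Each summit line is split below; column 8 (index 7) is treated as
            # the FIMO score, only matches scoring >= 500 are counted, and the
            # kernel index is parsed from the name field in column 7 (index 6),
            # e.g. "kernel_12_...".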
line=line.split() 33 | if float(line[7])>=500: 34 | pos="\t".join(line[:3]) 35 | kernel_num=int(line[6].split("_")[1]) 36 | if not pos in pos_dict: 37 | pos_dict[pos]=np.zeros([320], np.float64) 38 | pos_dict[pos][kernel_num]+=1.0 39 | 40 | pos_dict_dict[t]=pos_dict 41 | 42 | sample_class=[] 43 | i=0 44 | for k, v in pos_dict_dict.items(): 45 | sample_class.append(k) 46 | rgb=np.zeros([3], np.float64) 47 | #rgb[3]=0.5 48 | if not k=="common": 49 | rgb[i]=1.0 50 | i+=1 51 | for _k,_v in v.items(): 52 | data_array.append(_v) 53 | mycolors.append(rgb) 54 | 55 | print(sample_class) 56 | X = np.array(data_array, np.float64) 57 | saving_dir_prefix=fname.split('*')[0] 58 | 59 | D = spd.pdist(X, 'cosine') 60 | # Compute and plot first dendrogram. 61 | fig = pylab.figure(figsize=(8,8)) 62 | ax1 = fig.add_axes([0.09,0.1,0.2,0.6]) 63 | Y = sch.linkage(D, method='ward') 64 | Z1 = sch.dendrogram(Y, orientation='left') 65 | ax1.set_xticks([]) 66 | ax1.set_yticks([]) 67 | 68 | # Plot distance matrix. 69 | axmatrix = fig.add_axes([0.3,0.1,0.6,0.6]) 70 | idx1 = Z1['leaves'] 71 | #idx2 = Z2['leaves'] 72 | X2 = X[idx1] 73 | im = axmatrix.matshow(X2, aspect='auto', origin='lower', cmap=pylab.get_cmap('YlGnBu')) 74 | axmatrix.set_xticks([]) 75 | axmatrix.set_yticks([]) 76 | 77 | # Plot colorbar. 78 | axcolor = fig.add_axes([0.91,0.1,0.02,0.6]) 79 | pylab.colorbar(im, cax=axcolor) 80 | #fig.savefig('/media/koh/HD-PCFU3/mouse/filter_1_clustering.png') 81 | 82 | 83 | plt.savefig(saving_dir_prefix+'_heat_map.pdf', format='pdf') 84 | """ 85 | tsne = TSNE(n_jobs=16,perplexity = 20.000000, n_iter=10000) 86 | #X_pca2=np.array(X_pca2, np.float64) 87 | X_tsne = tsne.fit_transform(X) 88 | 89 | fig2 = pylab.figure(figsize=(8,8)) 90 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], 91 | lw=2,s=0.5, c=mycolors) 92 | plt.savefig(saving_dir_prefix+'_tSNE.pdf', format='pdf') 93 | 94 | 95 | #plt.show() 96 | import pandas as pd 97 | import seaborn as sns 98 | sns.set_style("white") 99 | #df = sns.load_dataset('iris') 100 | 101 | #my_dpi=96 102 | #plt.figure(figsize=(480/my_dpi, 480/my_dpi), dpi=my_dpi) 103 | 104 | # Keep the 'specie' column appart + make it numeric for coloring 105 | #df['species']=pd.Categorical(df['species']) 106 | #my_color=df['species'].cat.codes 107 | #df = df.drop('species', 1) 108 | 109 | # Run The PCA 110 | pca = PCA(n_components=3) 111 | pca.fit(X) 112 | 113 | # Store results of PCA in a data frame 114 | result=pd.DataFrame(pca.transform(X), columns=['PCA%i' % i for i in range(3)]) 115 | 116 | # Plot initialisation 117 | fig = plt.figure() 118 | ax = fig.add_subplot(111, projection='3d') 119 | ax.scatter(result['PCA0'], result['PCA1'], result['PCA2'], c=mycolors, s=10) 120 | 121 | # make simple, bare axis lines through space: 122 | xAxisLine = ((min(result['PCA0']), max(result['PCA0'])), (0, 0), (0,0)) 123 | ax.plot(xAxisLine[0], xAxisLine[1], xAxisLine[2], 'r') 124 | yAxisLine = ((0, 0), (min(result['PCA1']), max(result['PCA1'])), (0,0)) 125 | ax.plot(yAxisLine[0], yAxisLine[1], yAxisLine[2], 'r') 126 | zAxisLine = ((0, 0), (0,0), (min(result['PCA2']), max(result['PCA2']))) 127 | ax.plot(zAxisLine[0], zAxisLine[1], zAxisLine[2], 'r') 128 | 129 | # label the axes 130 | ax.set_xlabel("PC1") 131 | ax.set_ylabel("PC2") 132 | ax.set_zlabel("PC3") 133 | ax.set_title("PCA on the iris data set") 134 | #plt.show() 135 | 136 | plt.show()""" -------------------------------------------------------------------------------- /INSTALL.rst: -------------------------------------------------------------------------------- 1 | 
========================== 2 | INSTALL Guide For DeepGMAP 3 | ========================== 4 | 5 | 6 | Install with docker 7 | =================== 8 | 9 | Prerequisites 10 | ~~~~~~~~~~~~~ 11 | nvidia-driver >=396. 12 | 13 | nvidia-docker 2.0.3. 14 | 15 | docker 18.06. 16 | 17 | 18 | Pull an existing docker image 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | First, pull a docker image of deepgmap from the docker repository:: 21 | 22 | $ docker pull koonimaru/deepgmap:dev3 23 | 24 | Build a new docker image 25 | ~~~~~~~~~~~~~~~~~~~~~~~~ 26 | Alternatively, you can build a new image with the Dockerfile, which may give you the latest version of deepgmap or let you change the tensorflow version. In this case, please use the Dockerfile in this package:: 27 | 28 | $ mkdir deepgmap-docker 29 | $ cp DeepGMAP/Dockerfile ./deepgmap-docker/ 30 | $ cd deepgmap-docker 31 | $ docker build --no-cache -t koonimaru/deepgmap . 32 | 33 | Next, download the data for a test run:: 34 | 35 | $ wget https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/15268919/DeepGMAPdatalight.tar.lzma 36 | $ lzma -d DeepGMAPdatalight.tar.lzma && tar -xvf DeepGMAPdatalight.tar && rm DeepGMAPdatalight.tar 37 | 38 | If you want, move the "data" directory inside the DeepGMAP-data-light folder to your working directory for deepgmap. Otherwise, you are ready; please see README.rst for how to run deepgmap. 39 | 40 | 41 | 42 | Install manually 43 | ================ 44 | 45 | Prerequisites 46 | ~~~~~~~~~~~~~ 47 | 48 | DeepGMAP is verified to work on Linux (Ubuntu 16.10). Using a GPU is also highly recommended. 49 | 50 | Python version 3.6. 51 | 52 | Numpy_ (>=1.6). 53 | 54 | Cython_ (>=0.18) is an optional requirement to recompile ``.pyx`` files. 55 | 56 | Tensorflow_ (>=1.8). Note that Tensorflow requires the cuDNN and CUDA libraries. 57 | 58 | Scikitlearn_ (>=0.19.1) 59 | 60 | matplotlib_ 61 | 62 | bedtools_ (>=2.25) 63 | 64 | .. _Numpy: http://www.scipy.org/Download 65 | .. _Cython: http://cython.org/ 66 | .. _Tensorflow: https://www.tensorflow.org/ 67 | .. _Scikitlearn: http://scikit-learn.org/ 68 | .. _matplotlib: https://matplotlib.org/ 69 | .. _bedtools: http://bedtools.readthedocs.io/ 70 | 71 | Installing tensorflow-gpu 72 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 73 | To accelerate computing, users need `cuda-enabled GPUs`_. tensorflow-gpu itself can be easily 74 | installed by typing "sudo pip install tensorflow-gpu" or "sudo pip install -r requirements.txt". But, to make 75 | tensorflow-gpu work, you need the right versions of the cuDNN and CUDA toolkit libraries (please 76 | check the `tensorflow web site`_). If you do not want to deal with these libraries yourself, please consider using docker. 77 | 78 | .. _cuda-enabled GPUs: https://developer.nvidia.com/cuda-gpus 79 | .. _tensorflow web site: https://www.tensorflow.org/install/install_linux 80 | 81 | Download source and data 82 | ~~~~~~~~~~~~~~~~~~~~~~~~ 83 | To download the source code from our github repository:: 84 | 85 | $ git clone https://github.com/koonimaru/DeepGMAP.git 86 | 87 | To download a trial data set:: 88 | 89 | $ wget https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/15268919/DeepGMAPdatalight.tar.lzma 90 | $ lzma -d DeepGMAPdatalight.tar.lzma && tar -xvf DeepGMAPdatalight.tar && rm DeepGMAPdatalight.tar 91 | 92 | Place the folder named "data" under the DeepGMAP directory.
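To check that the package can be found before configuring anything, the following minimal test should run without an ImportError (assuming the repository was cloned into your home directory)::

    $ PYTHONPATH=$HOME/DeepGMAP python3 -c "import deepgmap"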
93 | 94 | Local installation by configuring environment variables 95 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 96 | 97 | You need to add the downloaded location (in this example, the home directory $HOME) to your ``PYTHONPATH`` and ``PATH`` environment variables. 98 | 99 | PYTHONPATH 100 | ~~~~~~~~~~ 101 | 102 | You need to include the new value in your ``PYTHONPATH`` by 103 | adding this line to your ``~/.bashrc``:: 104 | 105 | $ export PYTHONPATH=$HOME/DeepGMAP/:$PYTHONPATH 106 | 107 | Then, type:: 108 | 109 | $ source ~/.bashrc 110 | 111 | Or re-login to your account. 112 | 113 | PATH 114 | ~~~~ 115 | 116 | You'll also want to add a new value to your 117 | PATH environment variable so that you can use the deepgmap command line 118 | directly:: 119 | 120 | $ export PATH=$HOME/DeepGMAP/bin/:$PATH 121 | 122 | 123 | Installation system-wide 124 | ~~~~~~~~~~~~~~~~~~~~~~~~ 125 | Using pip:: 126 | 127 | $ sudo pip install deepgmap 128 | 129 | Alternatively, go to the DeepGMAP directory, and type:: 130 | 131 | $ sudo python3 setup.py install 132 | 133 | 134 | These commands work only if CUDA and cuDNN are already installed and you have root privileges. 135 | 136 | 137 | -- 138 | Koh Onimaru 139 | 140 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/ROC_space_plotter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | from sklearn import svm, datasets 5 | from sklearn.metrics import roc_curve, auc 6 | from sklearn.preprocessing import label_binarize 7 | from sklearn.multiclass import OneVsRestClassifier 8 | from scipy import interp 9 | import getopt 10 | from glob import glob 11 | from natsort import natsorted 12 | from sklearn.metrics import precision_recall_curve 13 | from sklearn.metrics import average_precision_score 14 | import matplotlib as mpl 15 | 16 | def roc_space_calc(label,pred): 17 | 18 | # Compute ROC curve and ROC area for each class 19 | 20 | fpr, tpr, _ = roc_curve(label, pred) 21 | roc_auc = auc(fpr, tpr) 22 | 23 | return fpr, tpr, roc_auc 24 | 25 | 26 | 27 | def roc_space_plotter(label, predictions, name_list,outfile_name): 28 | predictions_list=[] 29 | label_array=label 30 | for pred in predictions: 31 | print(pred["prediction"].shape) 32 | predictions_list.append(pred["prediction"]) 33 | 34 | fpr_list=[] 35 | tpr_list=[] 36 | roc_auc_list=[] 37 | precision_list=[] 38 | recall_list=[] 39 | average_precision_list=[] 40 | for i in predictions_list: 41 | fpr, tpr, roc_auc=roc_space_calc(label_array, i) 42 | fpr_list.append(fpr) 43 | tpr_list.append(tpr) 44 | roc_auc_list.append(roc_auc) 45 | recall, precision,_ =precision_recall_curve(label_array, i) # sklearn returns (precision, recall, thresholds); the swapped names here are swapped back again in the plotting loop below, so the final axes are labeled correctly 46 | precision_list.append(precision) 47 | recall_list.append(recall) 48 | average_precision = average_precision_score(label_array, i) 49 | average_precision_list.append(average_precision) 50 | 51 | 52 | 53 | colormap = plt.cm.get_cmap('gnuplot') 54 | #C = [colormap(i) for i in np.linspace(0,0.9,len(name_list))] 55 | 56 | plt.figure(1, figsize=(5,10)) 57 | ax1=plt.subplot(211) 58 | 59 | 60 | C=['darkorange','green','blue'] 61 | i=0 62 | for fpr, tpr, roc_auc,name in zip(fpr_list,tpr_list,roc_auc_list,name_list): 63 | plt.plot(fpr, tpr, color=C[i], 64 | label=str(name)+' (area = %0.2f)' % roc_auc) 65 | i+=1 66 | plt.plot([0, 1], [0, 1.0], color='navy', linestyle='--') 67 | plt.axis('equal') 68 | ax1.set_xlim([0.0, 1.0]) 69 | ax1.set_ylim([0.0, 1.0]) 70 | 
plt.xlabel('False Positive Rate') 71 | plt.ylabel('True Positive Rate') 72 | 73 | plt.title('Receiver operating characteristic curve') 74 | plt.legend(loc="lower right") 75 | 76 | ax2=plt.subplot(212) 77 | i=0 78 | for prec, rec, avr_pr,name in zip(precision_list,recall_list,average_precision_list,name_list): 79 | 80 | plt.plot(prec, rec, lw=2, color=C[i],label=str(name)+' (area = %0.2f)' % avr_pr) 81 | i+=1 82 | plt.axis('equal') 83 | plt.xlabel('Recall') 84 | plt.ylabel('Precision') 85 | ax2.set_ylim([0.0, 1.00]) 86 | ax2.set_xlim([0.0, 1.0]) 87 | 88 | plt.title('Precision-Recall curve') 89 | plt.legend(loc="lower left") 90 | 91 | plt.savefig(outfile_name, format='pdf') 92 | 93 | plt.show() 94 | 95 | def main(): 96 | outfile_name="/home/fast/onimaru/data/prediction/ROC_space_curve_comp_limb_brain.pdf" 97 | npload_list=[] 98 | label_array=[] 99 | chromosome="chr2" 100 | #name_list=["DeepSEA", "Bidirectional","Conv_plus","Conv+Bidirectional"] 101 | name_list=["DeepSEA", "DanQ","Conv+Bidirectional"] 102 | file_list=['/home/fast/onimaru/data/prediction/network_constructor_danq_1d_Sat_Nov_18_151721_2017.ckpt-12123_label_prediction.npz', 103 | #'/home/fast/onimaru/data/prediction/network_constructor_deepsea_1d4_Fri_Oct__6_183716_2017.ckpt-11467_label_prediction.npz', 104 | "/home/fast/onimaru/data/prediction/network_constructor_danq_1d_Sat_Nov_18_151721_2017.ckpt-12123_label_prediction.npz", 105 | "/home/fast/onimaru/data/prediction/network_constructor_deepsea_1d3_Fri_Nov_17_170434_2017.ckpt-12123_label_prediction.npz"] 106 | label_file='' 107 | 108 | with open(label_file, 'r') as fin: 109 | for line in fin: 110 | if line.startswith(chromosome): 111 | label_array.append(map(int, line[3:])) 112 | label_array=np.array(label_array) 113 | 114 | for f in file_list: 115 | npload_list.append(np.load(f)) 116 | 117 | 118 | roc_space_plotter(label_array, npload_list, name_list,outfile_name) 119 | 120 | 121 | if __name__== '__main__': 122 | main() 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/inputfileGeneratorForGenomeScan_p2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os.path 3 | import multiprocessing 4 | import sys 5 | import deepgmap.data_preprocessing_tools.seq_to_binary2 as sb2 6 | import psutil 7 | import getopt 8 | import time 9 | 10 | PATH_SEP=os.path.sep 11 | def div_roundup(x, y): 12 | if y%x==0: 13 | return y//x 14 | else: 15 | return y//x+1 16 | 17 | 18 | def DNA_to_array_converter(input_file_read,seq_num,target_chr): 19 | seq_list=[] 20 | seq_list_append=seq_list.append 21 | position_list=[] 22 | position_list_append=position_list.append 23 | b1=0.0 24 | i=0 25 | 26 | data_width=len(input_file_read[1].strip("\n")) 27 | print(data_width) 28 | SEQ=False 29 | #print seq_list 30 | for l, line in enumerate(input_file_read): 31 | if line.startswith('>'): 32 | #if not "_" in line and not line.startswith('>chrM'): 33 | if not line.startswith('>chrM'): 34 | #print line, 35 | position_list_append(line.strip('\n')) 36 | SEQ=True 37 | else: 38 | SEQ=False 39 | if i%100000==0: 40 | print(line) 41 | elif SEQ: 42 | line=line.strip('\n') 43 | 44 | #a1=time.time() 45 | seq_list_append(sb2.AGCTtoArray4(line.encode('utf-8'),data_width)) 46 | 47 | #b1+=time.time()-a1 48 | i+=1 49 | #if i%100000==0: 50 | #print b1 51 | #sys.exit() 52 | 53 | return position_list, seq_list 54 | 55 | 56 | def 
array_saver(outfile,positions,sequences): 57 | print('saving '+outfile) 58 | np.savez_compressed(outfile,positions=positions,sequences=sequences) 59 | 60 | def run(args): 61 | 62 | main(args) 63 | 64 | def main(args=None): 65 | 66 | input_file=args.input_genome 67 | target_chr=args.chromosome 68 | output_file=args.out_directory 69 | threads=args.thread_number 70 | chunck_data=args.chunck_data 71 | print(args) 72 | 73 | if threads==0: 74 | threads=multiprocessing.cpu_count()//2 75 | 76 | if not input_file.endswith(".fa") and not input_file.endswith(".fasta"): 77 | input_file+=PATH_SEP+"genome.fa" 78 | if not os.path.isfile(input_file): 79 | print("input file must be a dirctory containing genome.fa or a fasta file.") 80 | 81 | file_size=os.path.getsize(input_file) 82 | print(file_size) 83 | 84 | loop_to_reduce_ram=div_roundup(1000000000, file_size) 85 | try: 86 | with open(input_file, "r") as fin: 87 | input_file_read=fin.readlines() 88 | except IOError: 89 | print('cannot open', input_file) 90 | output_file+="_all" 91 | os.makedirs(output_file) 92 | line_num=len(input_file_read) 93 | #print line_num 94 | seq_num=line_num//2 95 | 96 | sub_seq_num=div_roundup(loop_to_reduce_ram, seq_num) 97 | DIVIDES_NUM=div_roundup(120000, sub_seq_num) 98 | 99 | for l1 in range(loop_to_reduce_ram): 100 | 101 | position_list, seq_list=DNA_to_array_converter(input_file_read[2*l1*sub_seq_num:2*(l1+1)*sub_seq_num],sub_seq_num,target_chr) 102 | 103 | print(position_list[0], input_file_read[2*l1*sub_seq_num]) 104 | 105 | 106 | outerloop=div_roundup(threads, DIVIDES_NUM) 107 | chunk_num=div_roundup(DIVIDES_NUM, sub_seq_num) 108 | 109 | if DIVIDES_NUM>=threads: 110 | job_num=threads 111 | else: 112 | job_num=DIVIDES_NUM 113 | 114 | print(DIVIDES_NUM, threads, outerloop, job_num) 115 | 116 | 117 | for l in range(outerloop): 118 | jobs = [] 119 | for i in range(job_num): 120 | if i*chunk_num+l*job_num*chunk_num>sub_seq_num: 121 | break 122 | jobs.append(multiprocessing.Process(target=array_saver, 123 | args=(str(output_file)+PATH_SEP+str(l1)+"_"+str(i+l*job_num), 124 | position_list[i*chunk_num+l*job_num*chunk_num:(i+1)*chunk_num+l*job_num*chunk_num], 125 | seq_list[i*chunk_num+l*job_num*chunk_num:(i+1)*chunk_num+l*job_num*chunk_num]))) 126 | for j in jobs: 127 | j.start() 128 | 129 | for j in jobs: 130 | j.join() 131 | 132 | 133 | 134 | if __name__== '__main__': 135 | main() 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/ROC_space_plotter3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | from sklearn import svm, datasets 5 | from sklearn.metrics import roc_curve, auc 6 | from sklearn.preprocessing import label_binarize 7 | from sklearn.multiclass import OneVsRestClassifier 8 | from scipy import interp 9 | import getopt 10 | from glob import glob 11 | from natsort import natsorted 12 | from sklearn.metrics import precision_recall_curve 13 | from sklearn.metrics import average_precision_score 14 | import matplotlib as mpl 15 | 16 | def roc_space_calc(label,pred): 17 | 18 | # Compute ROC curve and ROC area for each class 19 | 20 | fpr, tpr, _ = roc_curve(label, pred) 21 | roc_auc = auc(fpr, tpr) 22 | 23 | return fpr, tpr, roc_auc 24 | 25 | 26 | 27 | def roc_space_plotter(label, predictions, name_list,outfile_name): 28 | predictions_list=[] 29 | label_array=label 30 | for pred in predictions: 31 | 
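        # Each loaded npz is expected to hold a "prediction" array of shape
        # (n_windows, n_classes), matching the label matrix built in main()
        # below.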
#print pred["prediction"].shape 32 | predictions_list.append(pred["prediction"]) 33 | 34 | fpr_list=[] 35 | tpr_list=[] 36 | roc_auc_list=[] 37 | precision_list=[] 38 | recall_list=[] 39 | average_precision_list=[] 40 | label_array_shape= label_array.shape 41 | for i in predictions_list: 42 | for j in range(label_array_shape[1]): 43 | a, b=label_array[:,j], i[:,j] 44 | fpr, tpr, roc_auc=roc_space_calc(a, b) 45 | fpr_list.append(fpr) 46 | tpr_list.append(tpr) 47 | roc_auc_list.append(roc_auc) 48 | precision, recall, _ =precision_recall_curve(a, b) 49 | precision_list.append(precision) 50 | recall_list.append(recall) 51 | average_precision = average_precision_score(a, b) 52 | average_precision_list.append(average_precision) 53 | b+=0.15*np.random.randn(label_array_shape[0]) 54 | b=np.clip(b, 0, 1) 55 | fpr, tpr, roc_auc=roc_space_calc(a, b) 56 | fpr_list.append(fpr) 57 | tpr_list.append(tpr) 58 | roc_auc_list.append(roc_auc) 59 | precision, recall, _ =precision_recall_curve(a, b) 60 | precision_list.append(precision) 61 | recall_list.append(recall) 62 | average_precision = average_precision_score(a, b) 63 | average_precision_list.append(average_precision) 64 | 65 | colormap = plt.cm.get_cmap('gnuplot') 66 | C = [colormap(i) for i in np.linspace(0,0.9,label_array_shape[1]*2)] 67 | 68 | plt.figure(1, figsize=(5,10)) 69 | ax1=plt.subplot(211) 70 | 71 | 72 | #C=['darkorange','green','blue'] 73 | i=0 74 | for fpr, tpr, roc_auc in zip(fpr_list,tpr_list,roc_auc_list): 75 | plt.plot(fpr, tpr, color=C[i], 76 | label=' (area = %0.2f)' % roc_auc) 77 | i+=1 78 | plt.plot([0, 1], [0, 1.0], color='navy', linestyle='--') 79 | plt.axis('equal') 80 | ax1.set_xlim([0.0, 1.0]) 81 | ax1.set_ylim([0.0, 1.0]) 82 | plt.xlabel('False Positive Rate') 83 | plt.ylabel('True Positive Rate') 84 | 85 | plt.title('Receiver operating characteristic curve') 86 | plt.legend(loc="lower right") 87 | 88 | ax2=plt.subplot(212) 89 | i=0 90 | for prec, rec, avr_pr in zip(precision_list,recall_list,average_precision_list): 91 | 92 | plt.plot(rec, prec, lw=2, color=C[i],label=' (area = %0.2f)' % avr_pr) 93 | i+=1 94 | plt.axis('equal') 95 | plt.xlabel('Recall') 96 | plt.ylabel('Precision') 97 | ax2.set_ylim([0.0, 1.00]) 98 | ax2.set_xlim([0.0, 1.0]) 99 | 100 | plt.title('Precision-Recall curve') 101 | plt.legend(loc="lower left") 102 | 103 | #plt.savefig(outfile_name, format='pdf') 104 | 105 | plt.show() 106 | 107 | def main(): 108 | outfile_name="/home/fast/onimaru/data/prediction/ROC_space_curve_comp_limb_brain.pdf" 109 | npload_list=[] 110 | label_array=[] 111 | chromosome="chr2" 112 | #name_list=["DeepSEA", "Bidirectional","Conv_plus","Conv+Bidirectional"] 113 | name_list=["conv4-FRSS"] 114 | file_list=["/home/fast2/onimaru/DeepGMAP-dev/data/predictions/conv4frss_Fri_Jun__8_101931_2018.ckpt-16747_prediction.npz"] 115 | label_file='/home/fast2/onimaru/DeepGMAP-dev/data/inputs/mm10_dnase_subset/dnase_subset_mm10_window1000_stride500.bed.labeled' 116 | 117 | with open(label_file, 'r') as fin: 118 | for line in fin: 119 | if line.startswith(chromosome): 120 | 121 | label_array.append(list(map(int, line.split()[3:]))) 122 | label_array=np.array(label_array) 123 | 124 | for f in file_list: 125 | npload_list.append(np.load(f)) 126 | 127 | 128 | roc_space_plotter(label_array, npload_list, name_list,outfile_name) 129 | 130 | 131 | if __name__== '__main__': 132 | main() 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/motif_compare.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | #from curses.ascii import isdigit 4 | from scipy.spatial.distance import cdist 5 | import deepgmap.post_train_tools.cython_util as cutil 6 | mc=cutil.motif_compare 7 | from matplotlib import pyplot as plt 8 | import os 9 | def _is_number(s): 10 | try: 11 | complex(s) # for int, long, float and complex 12 | except ValueError: 13 | return False 14 | 15 | return True 16 | 17 | def motif_reader(motif_data_dir): 18 | motif_name="" 19 | motif_dict={} 20 | motif_list=[] 21 | with open(motif_data_dir, 'r') as fin: 22 | MOTIF=False 23 | i=0 24 | for line in fin: 25 | i+=1 26 | line=line.split() 27 | if len(line)==0: 28 | MOTIF=False 29 | continue 30 | elif line[0]=="MOTIF": 31 | if len(motif_name)>0: 32 | motif_dict[motif_name]=np.array(motif_list) 33 | 34 | motif_list=[] 35 | motif_name="" 36 | if len(line)>2: 37 | motif_name="_".join(line[1:]) 38 | else: 39 | motif_name=line[1] 40 | 41 | elif line[0]=="letter-probability": 42 | if line[4]=="w=": 43 | motif_length=int(line[5]) 44 | else: 45 | print("malformed letter-probability header at line "+str(i)) 46 | sys.exit() 47 | MOTIF=True 48 | elif MOTIF: 49 | #print _is_number(line[0]) 50 | if not _is_number(line[0]): 51 | MOTIF=False 52 | continue 53 | else: 54 | motif_list.append(list(map(float, line))) 55 | 56 | motif_dict[motif_name]=np.array(motif_list) 57 | return motif_dict 58 | 59 | 60 | 61 | 62 | def motif_compare(motif_data_dict, long_motif_dict, fout, THRESHOLD=-5.0): 63 | with open(fout, "w") as f: 64 | f.write("Motif name\tStart\tEnd\tdistance\n") 65 | for k1, v1 in long_motif_dict.items(): 66 | 67 | v1shape=v1.shape 68 | #print v1 69 | j=0 70 | for k2, v2 in motif_data_dict.items(): 71 | if "secondary" in k2: 72 | continue 73 | #print k2 74 | #j+=1 75 | #print j 76 | v2shape=v2.shape 77 | RAND_DIST=[] 78 | for i in range(12): 79 | rand=np.random.rand(v2shape[0],v2shape[1]) 80 | for k in range(v2shape[0]): 81 | rand[k]=rand[k]/np.sum(rand[k]) 82 | RAND_DIST.append(np.mean(np.diagonal(cdist(v2, rand,metric='cosine')))) 83 | RAND_MEAN=np.mean(RAND_DIST) 84 | RAND_DEV=np.std(RAND_DIST) 85 | #print RAND_MEAN, RAND_DEV 86 | #print("random_dist: "+str(RAND_DIST)) 87 | 88 | 89 | 90 | for i in range(v1shape[0]-v2shape[0]): 91 | partial_motif=v1[i:(i+v2shape[0])] 92 | #print v2shape, partial_motif.shape 93 | """M=0.5*(partial_motif+v2)+0.00001 94 | JSD=0.5*(np.sum(-v2*np.log(M/(v2+0.00001)))+np.sum(-partial_motif*np.log(M/(partial_motif+0.00001))))/v2shape[0] 95 | print JSD""" 96 | DIST=np.mean(np.diagonal(cdist(v2, partial_motif,metric='cosine'))) 97 | Z_SCORE=(DIST-RAND_MEAN)/RAND_DEV 98 | #print Z_SCORE 99 | if Z_SCORE<=THRESHOLD: 100 | f.write(str(k2)+"\t"+str(i)+"\t"+str(i+v2shape[0])+"\t"+str(Z_SCORE)+"\n") 101 | 102 | def main(): 103 | motif_data_dir="/home/fast/onimaru/data/meme/merged.meme" 104 | #long_motif_dir="/home/fast/onimaru/deepgmap/data/reconstructions/conv4frss_Fri_May_11_075425_2018.ckpt-16747Tue_May_15_112518_2018_all_.pdf.meme" 105 | long_motif_dir="/home/fast/onimaru/deepgmap/data/reconstructions/conv4frss_Fri_May_11_075425_2018.ckpt-16747Tue_May_15_104419_2018_es_e14_.pdf.meme" 106 | fout=os.path.splitext(long_motif_dir)[0]+".matches" 107 | #fout="/home/fast/onimaru/data/output/network_constructor_deepsea_1d3_Fri_Oct_13_133809_2017.ckpt-15899Mon_Oct_16_105338_2017.npz.matches" 108 | motif_data_dict=motif_reader(motif_data_dir) 109 | #print len(motif_data_dict) 110 | long_motif_dict=motif_reader(long_motif_dir) 111 | 
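# mc (cython_util.motif_compare) slides every motif from motif_data_dict along the long reconstructed motif, scores each window by Jensen-Shannon divergence, turns that distance into a z-score against a random-PWM baseline, and records windows whose z-score is at or below THRESHOLD in the .matches file.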
#print len(long_motif_dict) 112 | #motif_compare(motif_data_dict, long_motif_dict, fout) 113 | Z_SCORE_list=mc(motif_data_dict, long_motif_dict, fout, THRESHOLD=-5) 114 | plt.hist(Z_SCORE_list, 1000) 115 | plt.xticks(np.arange(min(Z_SCORE_list), max(Z_SCORE_list)+1, 1.0)) 116 | plt.show() 117 | 118 | if __name__== '__main__': 119 | main() 120 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/inputfileGeneratorForGenomeScan_gwas.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os.path 3 | import multiprocessing 4 | import sys 5 | import deepgmap.data_preprocessing_tools.seq_to_binary2 as sb2 6 | import psutil 7 | import getopt 8 | import time 9 | 10 | 11 | def div_roundup(x, y): 12 | if y%x==0: 13 | return y//x 14 | else: 15 | return y//x+1 16 | 17 | 18 | def DNA_to_array_converter(input_file_read,seq_num,target_chr): 19 | seq_list=[] 20 | seq_list_append=seq_list.append 21 | position_list=[] 22 | position_list_append=position_list.append 23 | b1=0.0 24 | i=0 25 | 26 | data_width=len(input_file_read[1].strip("\n")) 27 | print(data_width) 28 | SEQ=False 29 | #print seq_list 30 | for l, line in enumerate(input_file_read): 31 | if line.startswith('>'): 32 | #if not "_" in line and not line.startswith('>chrM'): 33 | if not line.startswith('>chrM'): 34 | #print line, 35 | position_list_append(line.strip('\n')) 36 | SEQ=True 37 | else: 38 | SEQ=False 39 | if i%100000==0: 40 | print(line) 41 | elif SEQ: 42 | line=line.strip('\n') 43 | 44 | #a1=time.time() 45 | seq_list_append(sb2.AGCTtoArray4(line.encode('utf-8'),data_width)) 46 | 47 | #b1+=time.time()-a1 48 | i+=1 49 | #if i%100000==0: 50 | #print b1 51 | #sys.exit() 52 | 53 | return position_list, seq_list 54 | 55 | 56 | def array_saver(outfile,positions,sequences): 57 | print('saving '+outfile) 58 | np.savez_compressed(outfile,positions=positions,sequences=sequences) 59 | 60 | def run(args): 61 | 62 | main(args) 63 | 64 | def main(args=None): 65 | if args is not None: 66 | input_file=args.input_genome 67 | target_chr=args.chromosome 68 | output_file=args.out_directory 69 | threads=args.thread_number 70 | chunck_data=args.chunck_data 71 | print(args) 72 | else: 73 | try: 74 | options, args =getopt.getopt(sys.argv[1:], 'i:t:o:p:', ['input_dir=','target_chr=', 'output_dir=','process=']) 75 | except getopt.GetoptError as err: 76 | print(str(err)) 77 | sys.exit(2) 78 | if len(options)<3: 79 | print('too few argument') 80 | sys.exit(0) 81 | 82 | threads=psutil.cpu_count() 83 | 84 | for opt, arg in options: 85 | if opt in ('-i', '--input_dir'): 86 | input_file=arg 87 | elif opt in ('-t', '--target_chr'): 88 | target_chr=arg 89 | elif opt in ('-o', '--output_dir'): 90 | output_file=arg 91 | elif opt in ('-p', '--process'): 92 | threads=int(arg) 93 | 94 | print(options) 95 | file_size=os.path.getsize(input_file) 96 | print(file_size) 97 | 98 | loop_to_reduce_ram=div_roundup(1000000000, file_size) 99 | try: 100 | with open(input_file, "r") as fin: 101 | input_file_read=fin.readlines() 102 | except IOError: 103 | print('cannot open', input_file) 104 | sys.exit(1) 105 | line_num=len(input_file_read) 106 | #print line_num 107 | seq_num=line_num//2 108 | 109 | sub_seq_num=div_roundup(loop_to_reduce_ram, seq_num) 110 | DIVIDES_NUM=div_roundup(120000, sub_seq_num) 111 | 112 | for l1 in range(loop_to_reduce_ram): 113 | 114 | position_list, seq_list=DNA_to_array_converter(input_file_read[2*l1*sub_seq_num:2*(l1+1)*sub_seq_num],sub_seq_num,target_chr) 115 | 116 | 
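# The FASTA is processed in roughly 1 GB slices (loop_to_reduce_ram) to bound memory use; each slice is further split into chunks of at most 120000 sequences (DIVIDES_NUM), which the worker processes below compress and save in parallel.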
print(position_list[0], input_file_read[2*l1*sub_seq_num]) 117 | 118 | 119 | outerloop=div_roundup(threads, DIVIDES_NUM) 120 | chunk_num=div_roundup(DIVIDES_NUM, sub_seq_num) 121 | 122 | if DIVIDES_NUM>=threads: 123 | job_num=threads 124 | else: 125 | job_num=DIVIDES_NUM 126 | 127 | print(DIVIDES_NUM, threads, outerloop, job_num) 128 | 129 | 130 | for l in range(outerloop): 131 | jobs = [] 132 | for i in range(job_num): 133 | if i*chunk_num+l*job_num*chunk_num>sub_seq_num: 134 | break 135 | jobs.append(multiprocessing.Process(target=array_saver, 136 | args=(str(output_file)+"_"+str(l1)+"_"+str(i+l*job_num), 137 | position_list[i*chunk_num+l*job_num*chunk_num:(i+1)*chunk_num+l*job_num*chunk_num], 138 | seq_list[i*chunk_num+l*job_num*chunk_num:(i+1)*chunk_num+l*job_num*chunk_num]))) 139 | for j in jobs: 140 | j.start() 141 | 142 | for j in jobs: 143 | j.join() 144 | 145 | 146 | 147 | if __name__== '__main__': 148 | main() 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/inputfileGeneratorForGenomeScan_p.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | import gzip 4 | #output_handle=open("/home/koh/MLData/test.txt", 'w') 5 | import time 6 | import gzip 7 | import math 8 | import os.path 9 | import multiprocessing 10 | import sys 11 | import glob 12 | import deepgmap.data_preprocessing_tools.seq_to_binary2 as sb2 13 | import time 14 | import psutil 15 | import getopt 16 | #from __future__ import print_function 17 | PATH_SEP=os.path.sep 18 | def DNA_to_array_converter(input_file,target_chr): 19 | 20 | 21 | if "," in target_chr: 22 | target_chr=set(target_chr.split(',')) 23 | else: 24 | target_chr=set([target_chr]) 25 | #print target_chr 26 | seq_list=[] 27 | position_list=[] 28 | b1=0.0 29 | i=0 30 | with open(input_file, 'r') as fin: 31 | 32 | SEQ=False 33 | for line in fin: 34 | if line.startswith('>'): 35 | _line=line.strip('>').split(':')[0] 36 | if _line in target_chr: 37 | print(line) 38 | position_list.append(line.strip('\n')) 39 | SEQ=True 40 | 41 | else: 42 | SEQ=False 43 | elif SEQ: 44 | line=line.strip('\n') 45 | data_width=len(line) 46 | #sequence=np.zeros([1,1000,4,1], np.int16) 47 | 48 | seq_list.append(sb2.AGCTtoArray4(line.encode('utf-8'),data_width)) 49 | #seq_list.append(sb2.ACGTtoaltArray(line,data_width)) 50 | return position_list, seq_list 51 | 52 | 53 | def array_saver(outfile,positions,sequences): 54 | print('saving '+outfile) 55 | np.savez_compressed(outfile,positions=positions,sequences=sequences) 56 | 57 | def run(args): 58 | 59 | main(args) 60 | 61 | def main(args=None): 62 | 63 | """ 64 | argparser_generate_test = subparsers.add_parser( "generate_test", 65 | help = "Generate a data set for a test or an application of a trained model." ) 66 | argparser_generate_test.add_argument( "-i", "--in_file", dest = "input_genome" , type = str, required = True, 67 | help = "A multiple fasta file containing genome DNA sequences. REQUIRED" ) 68 | argparser_generate_test.add_argument("-C", "--chromosome", dest = "chromosome", type = str, default = "chr2", 69 | help = "Set a target chromosome or a contig for prediction. 
Default: chr2" ) 70 | argparser_generate_test.add_argument( "-o", "--out_dir", dest = "out_directory", type = str, required = True, 71 | help = "") 72 | argparser_generate_test.add_argument( "-t", "--threads", dest = "thread_number", type = int, 73 | help = "The number of threads. Multithreading is performed only when saving output numpy arrays. Default: 1", default = 1 ) 74 | """ 75 | input_file=args.input_genome 76 | if not input_file.endswith(".fa") and not input_file.endswith(".fasta"): 77 | input_file+=PATH_SEP+"genome.fa" 78 | if not os.path.isfile(input_file): 79 | print("input file must be a dirctory containing genome.fa or a fasta file.") 80 | target_chr=args.chromosome 81 | output_file=args.out_directory+"_"+target_chr 82 | threads=args.thread_number 83 | if threads==0: 84 | threads=multiprocessing.cpu_count()//2 85 | print(args) 86 | 87 | 88 | os.makedirs(output_file) 89 | output_file+=PATH_SEP 90 | position_list, seq_list=DNA_to_array_converter(input_file,target_chr) 91 | seq_num=len(position_list) 92 | print(seq_num) 93 | 94 | if seq_num%120000==0: 95 | DIVIDES_NUM=seq_num//120000 96 | else: 97 | DIVIDES_NUM=seq_num//120000+1 98 | 99 | if DIVIDES_NUM%threads==0: 100 | outerloop=DIVIDES_NUM//threads 101 | else: 102 | outerloop=DIVIDES_NUM//threads+1 103 | 104 | 105 | 106 | 107 | if seq_num%DIVIDES_NUM==0: 108 | chunk_num=seq_num//DIVIDES_NUM 109 | else: 110 | chunk_num=seq_num//DIVIDES_NUM+1 111 | if DIVIDES_NUM>=threads: 112 | job_num=threads 113 | else: 114 | job_num=DIVIDES_NUM 115 | 116 | print(DIVIDES_NUM, threads, outerloop, job_num) 117 | 118 | 119 | for l in range(outerloop): 120 | jobs = [] 121 | for i in range(job_num): 122 | if i*chunk_num+l*threads>seq_num: 123 | break 124 | jobs.append(multiprocessing.Process(target=array_saver, 125 | args=(str(output_file)+str(i+l*threads), 126 | position_list[i*chunk_num+l*threads*chunk_num:(i+1)*chunk_num+l*threads*chunk_num], 127 | seq_list[i*chunk_num+l*threads*chunk_num:(i+1)*chunk_num+l*threads*chunk_num]))) 128 | for j in jobs: 129 | j.start() 130 | 131 | for j in jobs: 132 | j.join() 133 | 134 | 135 | 136 | if __name__== '__main__': 137 | main() 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /deepgmap/misc/igv_session.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/unpooling.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | 5 | """def unpool(updates, mask, ksize=[1, 2, 1, 1]): 6 | input_shape = updates.get_shape().as_list() 7 | # calculation new shape 8 | output_shape = (input_shape[0], input_shape[1] * ksize[1], input_shape[2] * ksize[2], input_shape[3]) 9 | # calculation indices for batch, height, width and feature maps 10 | one_like_mask = tf.ones_like(mask) 11 | batch_range = tf.reshape(tf.range(output_shape[0], dtype=tf.int64), shape=[input_shape[0], 1, 1, 1]) 12 | b = one_like_mask * batch_range 13 | y = mask // (output_shape[2] * output_shape[3]) 14 | x = mask % (output_shape[2] * output_shape[3]) // output_shape[3] 15 | feature_range = tf.range(output_shape[3], dtype=tf.int64) 16 | f = one_like_mask * feature_range 17 
| # transpose indices & reshape update values to one dimension 18 | updates_size = tf.size(updates) 19 | indices = tf.transpose(tf.reshape(tf.stack([b, y, x, f]), [4, updates_size])) 20 | values = tf.reshape(updates, [updates_size]) 21 | ret = tf.scatter_nd(indices, values, output_shape) 22 | return ret""" 23 | 24 | 25 | def unpool(updates, mask, ksize=[1, 2, 1, 1], output_shape=None, name=''): 26 | with tf.variable_scope(name): 27 | mask = tf.cast(mask, tf.int32) 28 | input_shape = tf.shape(updates, out_type=tf.int32) 29 | # calculation new shape 30 | if output_shape is None: 31 | output_shape = (input_shape[0], input_shape[1] * ksize[1], input_shape[2] * ksize[2], input_shape[3]) 32 | 33 | # calculation indices for batch, height, width and feature maps 34 | one_like_mask = tf.ones_like(mask, dtype=tf.int32) 35 | batch_shape = tf.concat([[input_shape[0]], [1], [1], [1]], 0) 36 | batch_range = tf.reshape(tf.range(output_shape[0], dtype=tf.int32), shape=batch_shape) 37 | b = one_like_mask * batch_range 38 | y = mask // (output_shape[2] * output_shape[3]) 39 | x = (mask // output_shape[3]) % output_shape[2] #mask % (output_shape[2] * output_shape[3]) // output_shape[3] 40 | feature_range = tf.range(output_shape[3], dtype=tf.int32) 41 | f = one_like_mask * feature_range 42 | 43 | # transpose indices & reshape update values to one dimension 44 | updates_size = tf.size(updates) 45 | indices = tf.transpose(tf.reshape(tf.stack([b, y, x, f]), [4, updates_size])) 46 | values = tf.reshape(updates, [updates_size]) 47 | ret = tf.scatter_nd(indices, values, output_shape) 48 | print(ret) 49 | return ret 50 | 51 | 52 | def unpool2(pool, ind, ksize=[1, 2, 1, 1], scope='unpool'): 53 | """ 54 | Unpooling layer after max_pool_with_argmax. 55 | Args: 56 | updates: max pooled output tensor 57 | mask: argmax indices 58 | ksize: ksize is the same as for the pool 59 | Return: 60 | unpool: unpooling tensor 61 | """ 62 | with tf.variable_scope(scope): 63 | input_shape = pool.get_shape().as_list() 64 | output_shape = (input_shape[0], input_shape[1] * ksize[1], input_shape[2] * ksize[2], input_shape[3]) 65 | pool_ = tf.reshape(pool, [input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]]) 66 | batch_range = tf.reshape(tf.range(output_shape[0], dtype=ind.dtype), shape=[input_shape[0], 1, 1, 1]) 67 | b = tf.ones_like(ind) * batch_range 68 | b = tf.reshape(b, [input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3], 1]) 69 | ind_ = tf.reshape(ind, [input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3], 1]) 70 | ind_ = tf.concat([b, ind_],1) 71 | ref = tf.Variable(tf.zeros([output_shape[0], output_shape[1] * output_shape[2] * output_shape[3]])) 72 | ret = tf.scatter_nd_update(ref, ind_, pool_) 73 | ret = tf.reshape(ret, [output_shape[0], output_shape[1], output_shape[2], output_shape[3]]) 74 | return ret 75 | 76 | 77 | def unpool3(pool, ind, ksize=[1, 2, 1, 1], scope='unpool3'): 78 | """ 79 | Unpooling layer after max_pool_with_argmax. 
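The dense output shape is computed from tf.shape at run time, so unlike unpool2 above this variant also works when the batch size is unknown at graph-construction time; set_shape restores the static shape information afterwards.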
80 | Args: 81 | pool: max pooled output tensor 82 | ind: argmax indices 83 | ksize: ksize is the same as for the pool 84 | Return: 85 | unpool: unpooling tensor 86 | """ 87 | with tf.variable_scope(scope): 88 | input_shape = tf.shape(pool) 89 | output_shape = [input_shape[0], input_shape[1] * ksize[1], input_shape[2] * ksize[2], input_shape[3]] 90 | 91 | flat_input_size = tf.reduce_prod(input_shape) 92 | flat_output_shape = [output_shape[0], output_shape[1] * output_shape[2] * output_shape[3]] 93 | 94 | pool_ = tf.reshape(pool, [flat_input_size]) 95 | batch_range = tf.reshape(tf.range(tf.cast(output_shape[0], tf.int64), dtype=ind.dtype), 96 | shape=[input_shape[0], 1, 1, 1]) 97 | b = tf.ones_like(ind) * batch_range 98 | b1 = tf.reshape(b, [flat_input_size, 1]) 99 | ind_ = tf.reshape(ind, [flat_input_size, 1]) 100 | ind_ = tf.concat([b1, ind_], 1) 101 | 102 | ret = tf.scatter_nd(ind_, pool_, shape=tf.cast(flat_output_shape, tf.int64)) 103 | ret = tf.reshape(ret, output_shape) 104 | 105 | set_input_shape = pool.get_shape() 106 | set_output_shape = [set_input_shape[0], set_input_shape[1] * ksize[1], set_input_shape[2] * ksize[2], set_input_shape[3]] 107 | ret.set_shape(set_output_shape) 108 | return ret 109 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/precision_recall_handmade.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from scipy.sparse import csr_matrix 4 | from scipy.sparse import csc_matrix 5 | def pr_curve_writer(label, pred): 6 | 7 | a=len(label) 8 | 9 | b=380000 10 | curve_resolution=10000 11 | linspace=np.linspace(0.0000, 1.0, curve_resolution,endpoint=True, dtype=np.float64) 12 | TPR_array=np.zeros([curve_resolution], dtype=np.float64) 13 | FPR_array=np.zeros([curve_resolution], dtype=np.float64) 14 | PPV_array=np.zeros([curve_resolution], dtype=np.float64) 15 | if a>=b: 16 | label1=csr_matrix(label) 17 | label2=csr_matrix(1*np.logical_not(label)) 18 | 19 | print('calculating the first ROC space') 20 | 21 | for i in range(curve_resolution): 22 | print("creating binary array") 23 | pred_ = np.where(pred >= linspace[i], np.ones_like(pred), np.zeros_like(pred)) 24 | pred2=1*np.logical_not(pred_) 25 | #pred_=csc_matrix(pred_) 26 | #print pred_ 27 | #print "calc logical and" 28 | tp = label1.dot(pred_) 29 | 30 | #print sum(tp) 31 | 32 | fp = label2.dot(pred_) 33 | #print fp 34 | fn = label1.dot(pred2) 35 | #print fn 36 | tn = label2.dot(pred2) 37 | #print tn 38 | 39 | FPR_array[i] += np.true_divide(fp,tn+fp) 40 | TPR_array[i] += np.true_divide(tp,tp+fn) 41 | if tp+fp==0.0: 42 | PPV_array[i]+=0.0 43 | else: 44 | PPV_array[i] += np.true_divide(tp,tp+fp) 45 | #print i 46 | #if i>=curve_resolution-3: 47 | #print TPR_array[i],PPV_array[i] 48 | 49 | else: 50 | for i in range(curve_resolution): 51 | pred_ = np.where(pred >= linspace[i], np.ones_like(pred), np.zeros_like(pred)) 52 | #print pred_ 53 | tp = np.logical_and(pred_, label) 54 | fp = np.logical_and(pred_, np.logical_not(label)) 55 | fn = np.logical_and(np.logical_not(pred_), label) 56 | tn = np.logical_and(np.logical_not(pred_), np.logical_not(label)) 57 | FPR_array[i] = np.true_divide(np.nansum(fp), 58 | np.nansum(np.logical_or(tn, fp))) 59 | TPR_array[i] = np.true_divide(np.nansum(tp), 60 | np.nansum(np.logical_or(tp, fn))) 61 | if np.nansum(np.logical_or(tp, fp))==0.0: 62 | PPV_array[i]=0.0 63 | else: 64 | PPV_array[i] = np.true_divide(np.nansum(tp), 65 | 
np.nansum(np.logical_or(tp, fp))) 66 | 67 | #if i>=curve_resolution-3: 68 | #print TPR_array[i],PPV_array[i] 69 | #rint i 70 | area=0.0 71 | k=curve_resolution-1 72 | for i in range(curve_resolution): 73 | area+=0.500*(PPV_array[k]+PPV_array[k-1])*(TPR_array[k-1]-TPR_array[k]) 74 | #print area 75 | k-=1 76 | if k==0: 77 | break 78 | 79 | 80 | return FPR_array, TPR_array, PPV_array, area 81 | 82 | array_file='/home/fast/onimaru/data/prediction/network_constructor_deepsea_1d3_Tue_Sep_19_150851_2017.ckpt-10734_label_prediction.npz' 83 | #genome_bed='' 84 | np_in=np.load(array_file) 85 | pred=np_in["prediction"] 86 | #print len(pred) 87 | label_array=np_in["label_array"] 88 | #print pred[:,0] 89 | if len(label_array.shape)==1: 90 | num_label=1 91 | else: 92 | num_label=label_array.shape[1] 93 | 94 | fpr_list=[] 95 | tpr_list=[] 96 | roc_auc_list=[] 97 | precision_list=[] 98 | recall_list=[] 99 | average_precision_list=[] 100 | if num_label>1: 101 | for i in range(num_label): 102 | 103 | 104 | fpr, tpr, ppv, area=pr_curve_writer(label_array[:,i], pred[:,i]) 105 | precision_list.append(ppv) 106 | #tpr_list.append(tpr) 107 | recall_list.append(tpr) 108 | average_precision_list.append(area) 109 | else: 110 | fpr, tpr, ppv, area=pr_curve_writer(label_array, pred) 111 | 112 | precision_list.append(ppv) 113 | recall_list.append(tpr) 114 | average_precision = area 115 | average_precision_list.append(average_precision) 116 | plt.figure(1, figsize=(8,8)) 117 | """ax1=plt.subplot(211) 118 | i=0 119 | for i in range(num_label): 120 | f,t,r=fpr_list[i],tpr_list[i],roc_auc_list[i] 121 | plt.plot(f, t, color='darkorange', 122 | label='ROC curve ('+str(i)+') (area = %0.2f)' % r) 123 | i+=1 124 | plt.plot([0, 1], [0, 1], color='navy', linestyle='--') 125 | plt.axis('equal') 126 | plt.xlim([0.0, 1.0]) 127 | plt.ylim([0.0, 1.0]) 128 | plt.xlabel('False Positive Rate') 129 | plt.ylabel('True Positive Rate') 130 | 131 | plt.title('Receiver operating characteristic curve ('+str(model_name)+')') 132 | plt.legend(loc="lower right")""" 133 | 134 | #ax2=plt.subplot(212) 135 | i=0 136 | for i in range(num_label): 137 | r,p,a =recall_list[i],precision_list[i], average_precision_list[i] 138 | plt.plot(r, p, lw=2, color='navy',label='Precision-Recall curve ('+str(i)+') (area = %0.2f)' % a) 139 | i+=1 140 | plt.axis('equal') 141 | plt.xlabel('Recall') 142 | plt.ylabel('Precision') 143 | plt.ylim([0.0, 1.00]) 144 | plt.xlim([0.0, 1.0]) 145 | 146 | #plt.title('Precision-Recall curve ('+str(model_name)+')') 147 | plt.legend(loc="lower left") 148 | 149 | #plt.savefig(out_dir+"ROC_space_curve_"+str(model_name)+".pdf", format='pdf') 150 | 151 | 152 | plt.show() 153 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/fimo_to_numpy_array.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib import mlab 4 | import subprocess as sp 5 | 6 | """ 7 | fimo_format 8 | # motif_id motif_alt_id sequence_name start stop strand score p-value q-value matched_sequence 9 | MA0139.1 CTCF chr2 231612721 231612739 + 27.1967 4.53e-12 0.00802 TGGCCACCAGGGGGCGCCG 10 | MA0139.1 CTCF chr16 84970710 84970728 - 27.1311 6.58e-12 0.00802 CGGCCACCAGGGGGCGCCA 11 | MA0139.1 CTCF chr3 98023412 98023430 - 27.1311 6.58e-12 0.00802 CGGCCACCAGGGGGCGCCA 12 | MA0139.1 CTCF chr5 137499397 137499415 - 27.1311 6.58e-12 0.00802 CGGCCACCAGGGGGCGCCA 13 | """ 14 | 15 | 16 | 17 | x=[] 18 | 19 | y=[] 20 | 21 | 
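# Overall flow of this script: filter fimo.txt hits by -log10(q-value) into a narrowPeak file, intersect them with fixed 1 kb genome windows using bedtools, keep the best q-value per window, and save the normalized per-window scores as a "prediction" array that can be compared against the network predictions.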
fimo_file="/home/fast/onimaru/mouse/fimo_out/fimo.txt" 22 | narrowpeak_file="/home/fast/onimaru/mouse/fimo_out/fimo_logq33.narrowPeak" 23 | intersect_file='/home/fast/onimaru/mouse/fimo_out/fimo_1000_logq33.narrowPeak' 24 | bed_file='/home/fast/onimaru/mouse/fimo_out/fimo_1000_logq33.bed' 25 | genome_1000='/home/slow/onimaru/data/genome_fasta/mm10_1000.bed' 26 | prediction_array="/home/fast/onimaru/mouse/fimo_out/fimo_prediction_all" 27 | target="all" 28 | logq_threshold=0.33 29 | 30 | with open(fimo_file, 'r') as fin: 31 | with open(narrowpeak_file, 'w') as fout: 32 | i=0 33 | for line in fin: 34 | if not line[0]=="#": 35 | a=line.split() 36 | chromo=a[2] 37 | start=int(a[3]) 38 | end=int(a[4]) 39 | name='fimo_'+str(a[0])+'_'+str(a[1]) 40 | orientation=a[5] 41 | score=float(a[6]) 42 | logp=-np.log10(float(a[7])) 43 | logq=-np.log10(float(a[8])) 44 | if logq>=logq_threshold: 45 | fout.write(str(chromo)+"\t"+ 46 | str(start)+"\t"+ 47 | str(end)+"\t"+ 48 | str(name)+"\t"+ 49 | str(logq*400)+"\t"+ 50 | str(orientation)+"\t"+ 51 | str(score)+"\t"+ 52 | str(logp)+"\t"+ 53 | str(logq)+"\t"+ 54 | "-1\n" 55 | ) 56 | 57 | 58 | i+=1 59 | if i%10000==0: 60 | print("reading "+str(i) + "th line of fimo file") 61 | 62 | print("converting narrowPeak to 1000 binned peaks") 63 | intersectout=open(intersect_file, 'w') 64 | sp.check_call(["bedtools", "intersect","-F","0.4","-wo", "-a", str(genome_1000), "-b", str(narrowpeak_file)], stdout=intersectout) 65 | intersectout.close() 66 | print("conversion is done") 67 | """ 68 | chr1 10500 11500 chr1 11223 11241 fimo_MA0139.1_CTCF 675.298455578 - 24.4754 8.87289520164 1.68824613894 -1 18 69 | chr1 10500 11500 chr1 11281 11299 fimo_MA0139.1_CTCF 566.267510253 - 22.7377 7.99567862622 1.41566877563 -1 18 70 | chr1 11000 12000 chr1 11223 11241 fimo_MA0139.1_CTCF 675.298455578 - 24.4754 8.87289520164 1.68824613894 -1 18 71 | chr1 11000 12000 chr1 11281 11299 fimo_MA0139.1_CTCF 566.267510253 - 22.7377 7.99567862622 1.41566877563 -1 18 72 | """ 73 | 74 | 75 | 76 | 77 | #intersect_file='/home/fast/onimaru/human/fimo_out_1e3/fimo_cutoff_0p33_logq.narrowPeak_test.bed' 78 | fimo_peak_dict={} 79 | 80 | if target=="all": 81 | startswith_str="chr" 82 | else: 83 | startswith_str=str(target)+"\t" 84 | 85 | 86 | with open(intersect_file,"r") as fin: 87 | for line in fin: 88 | if line.startswith(startswith_str): 89 | a=line.split() 90 | position=str(a[0])+"\t"+str(a[1])+"\t"+str(a[2]) 91 | logq=float(a[11]) 92 | if not position in fimo_peak_dict: 93 | fimo_peak_dict[position]=logq 94 | elif logq>fimo_peak_dict[position]: 95 | fimo_peak_dict[position]=logq 96 | 97 | 98 | #genome_1000='/home/slow/onimaru/data/genome_fasta/hg38_1000.bed' 99 | qvalue_list=[] 100 | with open(genome_1000, "r") as fin, open(bed_file,'w') as fout: 101 | for line in fin: 102 | if line.startswith(startswith_str): 103 | a=line.strip('\n') 104 | if a in fimo_peak_dict: 105 | fout.write(a+"\n") 106 | qvalue_list.append(fimo_peak_dict[a]) 107 | print(line) 108 | else: 109 | qvalue_list.append(0.00) 110 | 111 | 112 | qvalue_array=np.array(qvalue_list)/np.max(qvalue_list) 113 | 114 | np.savez_compressed(prediction_array, prediction=qvalue_array) 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | """ 125 | x=np.array(x)/np.max(x) 126 | 127 | np.savez_compressed("/home/fast/onimaru/human/fimo_out_1e3/fimo_prediction", prediction=x) 128 | 129 | # the histogram of the data 130 | 131 | plt.subplot(211) 132 | n, bins, patches = plt.hist(x, 50, facecolor='green', alpha=0.5) 133 | plt.yscale('log', nonposy='clip') 
134 | #plt.hist(x, 50, facecolor='red', alpha=0.5, cumulative=True) 135 | # add a 'best fit' line 136 | 137 | #l = plt.plot(bins, y, 'r--', linewidth=1) 138 | 139 | plt.xlabel('-Log10(p value)') 140 | #plt.ylabel('Scores') 141 | plt.title('fimo_prediction_dist') 142 | #plt.axis([40, 160, 0, 0.03]) 143 | plt.subplot(212) 144 | plt.hist(y, 50, facecolor='blue', alpha=0.5) 145 | plt.grid(True) 146 | 147 | plt.show()""" -------------------------------------------------------------------------------- /deepgmap/post_train_tools/trained_deepshark_local_multiple_label.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import sys 3 | import gzip 4 | import pickle 5 | import tensorflow as tf 6 | import numpy as np 7 | import time 8 | import math 9 | import os 10 | from natsort import natsorted, ns 11 | import network_constructor_multiple_label as nc 12 | import subprocess as sp 13 | start=time.time() 14 | #dimension1_2=16 15 | 16 | #with gzip.open('/media/koh/HD-PCFU3/mouse/filter1_999_Tue_Oct_25_122720_2016.cpickle.gz', 'r') as f: 17 | # saved_variables=pickle.load(f) 18 | # W_conv1, W_conv2, W_conv3, b_conv1, b_conv2, b_conv3, W_fc1, W_fc2, W_fc3, W_fc4, b_fc1, b_fc2, b_fc3, b_fc4=saved_variables 19 | 20 | import glob 21 | def genome_scan(filename): 22 | #/media/koh/HD-PCFU3/mouse/test_genome/genome_chr1_06_250plus.cpickle.gz 23 | with open(filename, 'rb') as f1: 24 | file_name=f1.name 25 | path_sep=os.path.sep 26 | file_name1=file_name.split(path_sep) 27 | file_name2=file_name1[-1].split('_') 28 | chromosome=file_name2[2] 29 | a=file_name2[3] 30 | b=a.split('.') 31 | chr_position=int(b[0]) 32 | #window_id=(file_name2[3])[:3] 33 | genome_seq=np.load(f1) 34 | shape_of_genome=genome_seq['genome'].shape 35 | genome_seq_re=np.reshape(genome_seq['genome'], (shape_of_genome[0], shape_of_genome[1], 4, 1)) 36 | genome_seq_re_list=np.array_split(genome_seq_re, 100) 37 | return genome_seq_re_list, chromosome, chr_position #, window_id 38 | 39 | def process(f, out_dir): 40 | sess = tf.Session() 41 | x_image = tf.placeholder(tf.float32, shape=[None, 1000, 4, 1]) 42 | y_ = tf.placeholder(tf.float32, shape=[None, 19]) 43 | keep_prob = tf.placeholder(tf.float32) 44 | keep_prob2 = tf.placeholder(tf.float32) 45 | keep_prob3 = tf.placeholder(tf.float32) 46 | phase=tf.placeholder(tf.bool) 47 | data_length=1000 48 | if 'ckpt' in sys.argv[1].rsplit('.', 1)[1]: 49 | input_dir=sys.argv[1] 50 | elif 'meta' in sys.argv[1].rsplit('.', 1)[1] or 'index' in sys.argv[1].rsplit('.', 1)[1]: 51 | input_dir=sys.argv[1].rsplit('.', 1)[0] 52 | else: 53 | print("the input file should be a ckpt file") 54 | sys.exit(1) 55 | 56 | model = nc.Model(image=x_image, label_dim=19, label=y_, phase=phase, output_dir=None, start_at=None, keep_prob=keep_prob, keep_prob2=keep_prob2, keep_prob3=keep_prob3, data_length=data_length) 57 | sess.run(tf.global_variables_initializer()) 58 | saver=model.saver 59 | try: 60 | saver.restore(sess, input_dir) 61 | except: 62 | print("can't open "+str(input_dir)) 63 | sys.exit(0) 64 | for seq in f: 65 | 66 | try: 67 | genome_seq_re_list, chromosome, chr_position=genome_scan(seq) 68 | except: 69 | print("can't open "+str(seq)) 70 | sys.exit(0) 71 | y_prediction1=[] 72 | i=0 73 | for i in range(len(genome_seq_re_list)): 74 | scanning=genome_seq_re_list[i] 75 | y_prediction2 =np.array(sess.run(model.prediction[0], feed_dict={x_image: scanning, keep_prob: 1.0, keep_prob2: 1.0, keep_prob3: 1.0,phase:False}), np.float64) 76 | y_prediction1.append(1.0 / (1.0 + 
np.exp(-y_prediction2))) 77 | if i%10==0: 78 | print('scanning '+str(chromosome)+'_'+str(chr_position)+', '+str(100*i//len(genome_seq_re_list))+' %') 79 | filename_1=str(out_dir)+str(chromosome)+'.bed' 80 | print('writing '+filename_1) 81 | if os.path.isfile(filename_1): 82 | output_handle=open(filename_1, 'a') 83 | else: 84 | output_handle=open(filename_1, 'w') 85 | i=0 86 | j=0 87 | y_len=len(y_prediction1) 88 | for j in range(y_len): 89 | y_len_j=len(y_prediction1[j]) 90 | for i in range(y_len_j): 91 | value=np.max(y_prediction1[j][i][:-1])-y_prediction1[j][i][-1] 92 | if value>0.0: 93 | if int(sys.argv[4])==500: 94 | start_pos=int(chr_position)*int(1e7)+500*i+200*500*j 95 | end_pos=start_pos+499 96 | elif int(sys.argv[4])==1000: 97 | start_pos=int(chr_position)*int(1e7)+1000*i+100*1000*j 98 | end_pos=start_pos+999 99 | 100 | output_handle.write(str(chromosome)+'\t'+str(start_pos)+'\t'+str(end_pos)+'\t'+str(value)+'\n') 101 | if i%10==0: 102 | print(str(str(chromosome)+'\t'+str(start_pos)+'\t'+str(end_pos)+'\t'+str(value))) 103 | output_handle.close() 104 | print('finished writing '+filename_1) 105 | sess.close() 106 | out=open(str(out_dir)+str(chromosome)+"_srt.bed", 'w') 107 | sp.check_call(["bedtools", "sort","-i", str(filename_1)], stdout=out) 108 | out.close() 109 | 110 | import multiprocessing 111 | def main(): 112 | input_dir=sys.argv[1].rsplit('.', 1)[0] 113 | 114 | 115 | path_sep=os.sep 116 | file_name=input_dir.split(path_sep) 117 | a=time.asctime() 118 | b=a.replace(':', '') 119 | start_at=b.replace(' ', '_') 120 | out_dir=sys.argv[2]+file_name[-1] 121 | 122 | if not os.path.exists(os.path.dirname(out_dir)): 123 | try: 124 | os.makedirs(os.path.dirname(out_dir)) 125 | except OSError as exc: # Guard against race condition 126 | if exc.errno != errno.EEXIST: 127 | raise 128 | 129 | start=time.time() 130 | s=0 131 | try: 132 | f = glob.glob(sys.argv[3]) 133 | process(f, out_dir) 134 | #x=p.apply_async(process, (t_,)) 135 | #x.get() 136 | except : 137 | print("Unexpected error:", sys.exc_info()[0]) 138 | raise 139 | 140 | #for i in f: 141 | # process(i, out_dir) 142 | 143 | 144 | 145 | print(time.time()-start) 146 | 147 | 148 | if __name__== '__main__': 149 | main() 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/genome_divider.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import math 3 | import os 4 | import subprocess as sp 5 | 6 | 7 | def genome_divider(genome_fasta, genome_file, WINDOW_SIZE, outname): 8 | OUTDIR=os.path.splitext(genome_file)[0]+'_'+str(WINDOW_SIZE) 9 | try: 10 | os.makedirs(OUTDIR) 11 | except OSError as err: 12 | print("OS error: {0}".format(err)) 13 | outbed=OUTDIR+'/genome.bed' 14 | outfasta=OUTDIR+'/genome.fa' 15 | #WINDOW_SIZE=1000 16 | #genome_file="/home/fast/onimaru/lamprey/LetJap1.0.1.genome" 17 | #with open(genome_file, 'r') as fin, open('/home/fast/onimaru/data/genome_fasta/hg38_1000_altwindow.bed', 'w') as fout1, open('/home/fast/onimaru/data/genome_fasta/hg38_1000_.bed', 'w') as fout2: 18 | with open(genome_file, 'r') as fin, open(outbed, 'w') as fout1: 19 | 20 | for line in fin: 21 | line=line.split() 22 | chrom=line[0] 23 | chrom_size=int(line[1]) 24 | divide_num=chrom_size//WINDOW_SIZE 25 | #divide_num=chrom_size/WINDOW_SIZE-4 26 | for i in range(divide_num): 27 | 28 | #if i>=2: 29 | 30 | if i*WINDOW_SIZE+WINDOW_SIZE<=chrom_size: 
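# two interleaved tilings are written: windows starting at multiples of WINDOW_SIZE here, and a half-window-shifted set below, so that peaks falling on window borders are still covered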
fout1.write(str(chrom)+'\t'+str(i*WINDOW_SIZE)+'\t'+str(i*WINDOW_SIZE+WINDOW_SIZE)+'\n') 32 | else: 33 | break 34 | if i*WINDOW_SIZE+WINDOW_SIZE+WINDOW_SIZE//2<=chrom_size: 35 | fout1.write(str(chrom)+'\t'+str(i*WINDOW_SIZE+WINDOW_SIZE//2)+'\t'+str(i*WINDOW_SIZE+WINDOW_SIZE+WINDOW_SIZE//2)+'\n') 36 | else: 37 | break 38 | try: 39 | sp.call(["bedtools", "getfasta","-fi",genome_fasta,"-bed",outbed, "-fo", outfasta]) 40 | except OSError as e: 41 | print(e) 42 | sys.exit(1) 43 | 44 | print(outbed+" and "+outfasta+' were successfully generated.') 45 | 46 | def genome_divider2(genome_fasta, genome_file, WINDOW_SIZE, outname, stride=None): 47 | 48 | 49 | if outname is not None: 50 | OUTDIR=outname 51 | elif stride is not None: 52 | OUTDIR=os.path.splitext(genome_file)[0]+'_window'+str(WINDOW_SIZE)+'_stride'+str(stride) 53 | else: 54 | OUTDIR=os.path.splitext(genome_file)[0]+'_'+str(WINDOW_SIZE) 55 | try: 56 | os.makedirs(OUTDIR) 57 | except OSError as err: 58 | #print("OS error: {0}".format(err)) 59 | sys.exit(err) 60 | 61 | outbed=OUTDIR+'/genome.bed' 62 | outfasta=OUTDIR+'/genome.fa' 63 | """ 64 | if outname is not None: 65 | outbed=outname+'.bed' 66 | outfasta=outname+'.fa' 67 | elif stride is not None: 68 | outbed=os.path.splitext(genome_file)[0]+'_window'+str(WINDOW_SIZE)+'_stride'+str(stride)+'.bed' 69 | outfasta=os.path.splitext(genome_file)[0]+'_window'+str(WINDOW_SIZE)+'_stride'+str(stride)+'.fa' 70 | else: 71 | outbed=os.path.splitext(genome_file)[0]+'_'+str(WINDOW_SIZE)+'.bed' 72 | outfasta=os.path.splitext(genome_file)[0]+'_'+str(WINDOW_SIZE)+'.fa' 73 | """ 74 | #if stride==None: 75 | #stride=WINDOW_SIZE/2 76 | #adding=WINDOW_SIZE/stride 77 | 78 | #WINDOW_SIZE=1000 79 | #genome_file="/home/fast/onimaru/lamprey/LetJap1.0.1.genome" 80 | #with open(genome_file, 'r') as fin, open('/home/fast/onimaru/data/genome_fasta/hg38_1000_altwindow.bed', 'w') as fout1, open('/home/fast/onimaru/data/genome_fasta/hg38_1000_.bed', 'w') as fout2: 81 | with open(genome_file, 'r') as fin, open(outbed, 'w') as fout1: 82 | 83 | for line in fin: 84 | line=line.split() 85 | chrom=line[0] 86 | chrom_size=int(line[1]) 87 | #divide_num=chrom_size/WINDOW_SIZE 88 | 89 | #divide_num=chrom_size/WINDOW_SIZE-4 90 | i=0 91 | while WINDOW_SIZE+stride*i<=chrom_size: 92 | fout1.write(str(chrom)+'\t'+str(stride*i)+'\t'+str(WINDOW_SIZE+stride*i)+'\n') 93 | i+=1 94 | 95 | 96 | try: 97 | stdout_file=open(outbed+"_tmp", "w") 98 | sp.check_call(["bedtools", "sort", "-i",outbed], stdout=stdout_file) 99 | stdout_file.close() 100 | 101 | except sp.CalledProcessError as e: 102 | print("bedtools sort failed") 103 | sys.exit(e) 104 | if os.path.exists(outbed) and os.path.exists(outbed+"_tmp"): 105 | os.remove(outbed) 106 | os.rename(outbed+"_tmp", outbed) 107 | else: 108 | sys.exit("bed file was not created.") 109 | 110 | try: 111 | sp.check_call(["bedtools", "getfasta","-fi",genome_fasta,"-bed",outbed, "-fo", outfasta]) 112 | except sp.CalledProcessError as e: 113 | print("bedtools getfasta failed") 114 | sys.exit(e) 115 | 116 | print(outbed+" and "+outfasta+' were successfully generated.') 117 | 118 | def genome_file_maker(genome_fasta, genome_file): 119 | 120 | length_list=[] 121 | 122 | with open(genome_fasta, 'r') as fin, open(genome_file, 'w') as fout: 123 | seq=0 124 | chrom_name='' 125 | for line in fin: 126 | 127 | if '>' in line: 128 | 129 | if not seq==0: 130 | length_list.append(seq) 131 | #if not "_" in chrom_name and not "M" in chrom_name: 132 | fout.write(str(chrom_name)+'\t'+str(seq)+'\n') 
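# a '>' header ends the previous record, so the accumulated length is flushed above before the new chromosome name is parsed below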
line=line.split() 134 | chrom_name=line[0].strip('>') 135 | seq=0 136 | else: 137 | line1=line.strip("\n") 138 | seq+=len(line1) 139 | #if len(chrom_name)==3 and not "M" in chrom_name: 140 | fout.write(str(chrom_name)+'\t'+str(seq)+'\n') 141 | 142 | def run(args): 143 | genome_fasta=args.genome_fasta 144 | windowsize=args.windowsize 145 | genome_file=os.path.splitext(genome_fasta)[0]+'.genome' 146 | outname=args.outname 147 | stride=args.stride 148 | if not os.path.isfile(genome_file): 149 | print("generating genome file.") 150 | genome_file_maker(genome_fasta,genome_file) 151 | else: 152 | print("using a pre-existing genome file: "+genome_file) 153 | genome_divider2(genome_fasta, genome_file, windowsize, outname, stride=stride) 154 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/cython_util.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | import sys 3 | import numpy as np 4 | from scipy.spatial.distance import cdist 5 | 6 | def narrowPeak_writer(str out_dir,list y_prediction2,list position_list): 7 | cdef str filename_1 8 | filename_1=out_dir+'.narrowPeak' 9 | print('writing '+filename_1) 10 | output_handle=open(filename_1, 'w') 11 | cdef int k=0 12 | cdef float value 13 | cdef str chrom, start_, end_ 14 | for i in range(len(y_prediction2)): 15 | 16 | a=position_list[i].strip('>') 17 | #print(str(a)+'\t'+str(y_prediction2[i])) 18 | k+=1 19 | a=a.split(':') 20 | chrom=a[0] 21 | b=a[1].split('-') 22 | start_=b[0] 23 | end_=b[1] 24 | value=y_prediction2[i] 25 | output_handle.write(str(chrom)+'\t' 26 | +str(start_)+'\t' 27 | +str(end_)+'\t.\t' 28 | +str(value*1000).strip('[]')+'\t.\t' 29 | +str(value).strip('[]')+"\t-1\t-1\t-1\n") 30 | 31 | print("prediction num: "+str(k)) 32 | output_handle.close() 33 | print('finished writing '+filename_1) 34 | 35 | 36 | def motif_compare(motif_data_dict, long_motif_dict, fout, THRESHOLD=-5.0): 37 | cdef int i,k, j, l=0 38 | cdef str k1, k2 39 | cdef double RAND_MEAN, RAND_DEV,DIST,Z_SCORE, ic 40 | cdef list comp_result, comp_result2, cpr, Z_SCORE_list=[] 41 | cdef int[2] v2shape,v1shape 42 | #cdef double[4] pm1 43 | with open(fout, "w") as f: 44 | comp_result2=[] 45 | f.write("Motif name\tStart\tEnd\tDistance\n") 46 | for k1, v1 in long_motif_dict.items(): 47 | 48 | v1shape=v1.shape 49 | #print v1shape 50 | for k2, v2 in motif_data_dict.items(): 51 | ic1=0 52 | if "secondary" in k2: 53 | continue 54 | #print k2 55 | #j+=1 56 | #print j 57 | v2shape=v2.shape 58 | #print v2shape 59 | """for i in range(v2shape[0]): 60 | ic=np.nansum(v2[i]*np.log2(v2[i]*4+0.000001)) 61 | v2[i]=v2[i]""" 62 | 63 | RAND_DIST=np.zeros([500], np.float32) 64 | for i in range(500): 65 | rand=np.random.rand(v2shape[0],v2shape[1]) 66 | for k in range(v2shape[0]): 67 | rand[k]=rand[k]/np.sum(rand[k]) 68 | #rand[k]=pm1*(np.sum(pm1*np.log2(pm1*4+0.00001))) 69 | #RAND_DIST.append(np.mean(np.diagonal(cdist(v2, rand,metric='euclidean')))) 70 | M=0.5*(rand+v2)+0.00001 71 | DIST=0.5*(np.sum(-v2*np.log(M/(v2+0.00001)))+np.sum(-rand*np.log(M/(rand+0.00001))))/float(v2shape[0]) 72 | #DIST=-np.sum(v2*np.log(rand+0.00001)+(1.0-v2)*np.log(1.0-rand+0.00001))/float(v2shape[0]) 73 | RAND_DIST[i]+=DIST 74 | 75 | RAND_MEAN=np.mean(RAND_DIST) 76 | RAND_DEV=np.std(RAND_DIST) 77 | #print RAND_MEAN, RAND_DEV 78 | #print("randome_dist: "+str(RAND_DIST)) 79 | comp_result=[] 80 | for i in range(v1shape[0]-v2shape[0]): 81 | #partial_motif=[] 82 | #for j in range(v2shape[0]): 83 | # 
pm1=v1[i+j] 84 | #ic=np.sum(pm1*np.log2(pm1*4+0.000001)) 85 | partial_motif_=v1[i:i+v2shape[0]] 86 | #partial_motif_=np.array(partial_motif) 87 | #partial_motif=v1[i:(i+v2shape[0])] 88 | #print v2shape, np.shape(partial_motif) 89 | M=0.5*(partial_motif_+v2)+0.00001 90 | DIST=0.5*(np.sum(-v2*np.log(M/(v2+0.00001)))+np.sum(-partial_motif_*np.log(M/(partial_motif_+0.00001))))/float(v2shape[0]) 91 | #print JSD 92 | v2_comp=np.flip(np.flip(v2,0),1) 93 | M_comp=0.5*(partial_motif_+v2_comp)+0.00001 94 | DIST_comp=0.5*(np.sum(-v2_comp*np.log(M_comp/(v2_comp+0.00001)))+np.sum(-partial_motif_*np.log(M_comp/(partial_motif_+0.00001))))/float(v2shape[0]) 95 | #DIST=np.mean(np.diagonal(cdist(v2, partial_motif_,metric='euclidean'))) 96 | #DIST=np.mean(np.diagonal(cdist(v2, partial_motif_,metric='euclidean'))) 97 | #DIST_comp=np.mean(np.diagonal(cdist(v2_comp, partial_motif_,metric='euclidean'))) 98 | 99 | #DIST=-np.sum(v2*np.log(partial_motif_+0.00001)+(1.0-v2)*np.log(1.0-partial_motif_+0.00001))/float(v2shape[0]) 100 | #DIST_comp=-np.sum(v2_comp*np.log(partial_motif_+0.00001)+(1.0-v2_comp)*np.log(1.0-partial_motif_+0.00001))/float(v2shape[0]) 101 | ori="+" 102 | if DIST_comp<DIST: 103 | DIST=DIST_comp 104 | ori="-" 105 | Z_SCORE=(DIST-RAND_MEAN)/RAND_DEV 106 | #print Z_SCORE 107 | if Z_SCORE<=THRESHOLD: 108 | l+=1 109 | Z_SCORE_list.append(Z_SCORE) 110 | comp_result.append([k2, i, i+v2shape[0], Z_SCORE, ori]) 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | if len(comp_result)>=10: 119 | comp_result.sort(key = lambda x: x[3]) 120 | for cpr in comp_result[-10:]: 121 | comp_result2.append(cpr) 122 | 123 | comp_result2.sort(key = lambda x: x[1]) 124 | for cpr in comp_result2: 125 | f.write("\t".join([str(cpr[0]),str(cpr[1]),str(cpr[2]),str(cpr[3]),str(cpr[4])])+"\n") 126 | 127 | 128 | print("the number of motif matches: "+str(l)) 129 | return Z_SCORE_list 130 | 131 | 132 | 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/genome_labeling2.py: -------------------------------------------------------------------------------- 1 | 2 | import glob as glb 3 | import sys 4 | import numpy as np 5 | from sklearn.decomposition import KernelPCA as pca_f 6 | import os 7 | import matplotlib as mpl 8 | mpl.use("WebAgg") 9 | import matplotlib.pyplot as plt 10 | from scipy.spatial.distance import pdist 11 | import scipy.cluster.hierarchy as sch 12 | import time 13 | import copy 14 | 15 | def genome_label(bed_file_list, genome_1000,out_dir): 16 | 17 | file_num=len(bed_file_list) 18 | 19 | #print file_num 20 | peak_set_list=[] 21 | peak_set_list_append=peak_set_list.append 22 | #start=time.time() 23 | i=0 24 | for f in bed_file_list: 25 | peak_set=set() 26 | peak_set_add=peak_set.add 27 | with open(f, 'r') as fin: 28 | 29 | 30 | for line in fin: 31 | if i==0: 32 | _,a,b=line.split() 33 | check_length=int(b)-int(a) 34 | 35 | peak_set_add(line) 36 | peak_set_list_append(peak_set) 37 | 38 | i+=1 39 | 40 | fo_name=out_dir 41 | label_array_list=[] 42 | label_array_list_append=label_array_list.append 43 | with open(genome_1000,'r') as fin: 44 | with open(fo_name,'w') as fout: 45 | fout.write("#sample_list: "+"\t".join(bed_file_list)+"\n") 46 | i=0 47 | 48 | for line in fin: 49 | k=0 50 | label_array=["0" for h in range(file_num)] 51 | 52 | for s in peak_set_list: 53 | if i==0: 54 | _,a,b=line.split() 55 | assert check_length==int(b)-int(a), "mismatches in sequence lengths" 56 | if line in s: 57 | label_array[k]="1" 58 | k+=1 59 | fout.write(line.strip('\n')+'\t'+' '.join(label_array)+'\n') 60 | #if sum(label_array)>0: 61 | #label_array_list_append(label_array) 62 | i+=1 63 | if i%200000==0: 64 | 65 | sys.stdout.write("\rwriting labeled file "+ line.strip("\n")) 66 | sys.stdout.flush() 67 | #print time.time()-start 68 | #sys.exit() 69 | 
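# genome_label checks every peak set for every window (O(windows x samples)); genome_label2 below inverts this with a single dict keyed by the peak line, which is why main() calls genome_label2.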
print("\n"+fo_name+" has been saved. This file is going to be used when testing a trained model too.") 70 | #return label_array_list 71 | 72 | 73 | def genome_label2(bed_file_list, genome_1000,out_dir): 74 | 75 | file_num=len(bed_file_list) 76 | 77 | #print file_num 78 | peak_set_dict={} 79 | #peak_set_list_append=peak_set_list.append 80 | start=time.time() 81 | i=0 82 | #zero=["0" for h in range(file_num)] 83 | for f in bed_file_list: 84 | with open(f, 'r') as fin: 85 | for line in fin: 86 | if i==0: 87 | _,a,b=line.split() 88 | check_length=int(b)-int(a) 89 | 90 | if not line in peak_set_dict: 91 | peak_set_dict[line]=["0" for h in range(file_num)] 92 | #peak_set_dict[line]=copy.deepcopy(zero) 93 | peak_set_dict[line][i]="1" 94 | i+=1 95 | print(time.time()-start) 96 | fo_name=out_dir 97 | label_array_list=[] 98 | label_array_list_append=label_array_list.append 99 | zero=' '.join(["0" for h in range(file_num)]) 100 | with open(genome_1000,'r') as fin: 101 | with open(fo_name,'w') as fout: 102 | fout.write("#sample_list: "+"\t".join(bed_file_list)+"\n") 103 | #start=time.time() 104 | i=0 105 | for line in fin: 106 | if i==0: 107 | _,a,b=line.split() 108 | assert check_length==int(b)-int(a), "mismatches in sequence lengths" 109 | if line in peak_set_dict: 110 | fout.write(line.strip('\n')+'\t'+' '.join(peak_set_dict[line])+'\n') 111 | else: 112 | fout.write(line.strip('\n')+'\t'+zero+'\n') 113 | #if sum(label_array)>0: 114 | #label_array_list_append(label_array) 115 | i+=1 116 | if i%200000==0: 117 | 118 | sys.stdout.write("\rwriting labeled file "+ line.strip("\n")) 119 | sys.stdout.flush() 120 | print("genome_labeling2 "+str(time.time()-start)) 121 | #sys.exit() 122 | print("\n"+fo_name+" has been saved. This file is going to be used when testing a trained model too.") 123 | #return label_array_list 124 | 125 | 126 | def main(): 127 | #bed_file_dir, genome_1000, out_dir=sys.argv[1:] 128 | bed_file_dir="/home/fast/onimaru/deepgmap/data/inputs/hg38_dnase/peaks_10k/test_hg38_window1000_stride300.bed_list/*" 129 | genome_1000="/home/fast/onimaru/deepgmap/data/genomes/hg38_window1000_stride300.bed" 130 | out_dir="/home/fast/onimaru/deepgmap/data/inputs/hg38_dnase/peaks_10k/test.labeled" 131 | bed_file_list=[] 132 | if not "*" in bed_file_dir and bed_file_dir.endswith('.bed'): 133 | bed_file_list.append(bed_file_dir) 134 | elif not '*' in bed_file_dir: 135 | bed_file_dir=bed_file_dir+"*.bed" 136 | 137 | bed_file_list=sorted(glb.glob(bed_file_dir)) 138 | print(bed_file_list) 139 | if len(bed_file_list)==0: 140 | print("no files in "+str(bed_file_dir)) 141 | sys.exit() 142 | label_array_list=genome_label2(bed_file_list, genome_1000,out_dir) 143 | print(label_array_list[0]) 144 | label_array_list_=np.transpose(label_array_list) 145 | print(sum(label_array_list_[0])) 146 | pca = pca_f(n_components=2, kernel="rbf") 147 | X_pca=pca.fit_transform(label_array_list_) 148 | dist1=pdist(label_array_list_, 'cosine') 149 | _, ax1=plt.subplots() 150 | 151 | Y = sch.linkage(dist1, method='ward') 152 | Z1 = sch.dendrogram(Y) 153 | idx1 = Z1['leaves'] 154 | 155 | new_sample_list=[] 156 | 157 | for i in idx1: 158 | txt=bed_file_list[i].split("/")[-1] 159 | new_sample_list.append(txt) 160 | ax1.set_xticklabels(new_sample_list , rotation=90) 161 | 162 | 163 | print(X_pca.shape) 164 | _, ax2=plt.subplots() 165 | ax2.scatter(X_pca[:,0], X_pca[:,1]) 166 | for i, txt in enumerate(bed_file_list): 167 | txt=txt.split("/")[-1] 168 | ax2.annotate(txt, (X_pca[i,0],X_pca[i,1])) 169 | 170 | #plt.show() 171 | if __name__ == 
'__main__': 172 | main() 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/ROC_space_plotter2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | from sklearn import svm, datasets 5 | from sklearn.metrics import roc_curve, auc 6 | from sklearn.preprocessing import label_binarize 7 | from sklearn.multiclass import OneVsRestClassifier 8 | from scipy import interp 9 | from scipy import stats 10 | import getopt 11 | from glob import glob 12 | from natsort import natsorted 13 | from sklearn.metrics import precision_recall_curve 14 | from sklearn.metrics import average_precision_score 15 | from sklearn.metrics import f1_score 16 | import matplotlib as mpl 17 | import os 18 | from decimal import Decimal 19 | import pandas as pd 20 | 21 | def roc_space_calc(label,pred): 22 | 23 | # Compute ROC curve and ROC area for each class 24 | 25 | fpr, tpr, _ = roc_curve(label, pred) 26 | roc_auc = auc(fpr, tpr) 27 | 28 | return fpr, tpr, roc_auc 29 | 30 | 31 | 32 | def roc_space_plotter(label, predictions1,outfile_name): 33 | ind_list=['es', 'brain', 'limb'] 34 | pos=[0,1] 35 | width=0.25 36 | predictions_list=[] 37 | label_array=np.array(label) 38 | label_array_shape=label_array.shape 39 | for pred in predictions1: 40 | print(pred["prediction"].shape) 41 | predictions_list.append(pred["prediction"]) 42 | """df_rearanged = pd.DataFrame({ 43 | ind_list[0] : [[], []], 44 | ind_list[1] : [[], []], 45 | ind_list[2] : [[], []], 46 | },index = ["deepsea", "conv4-frss"])""" 47 | 48 | data_dict={} 49 | for n, i in enumerate(predictions_list): 50 | if n<3: 51 | _key='deepsea' 52 | 53 | else: 54 | _key='conv4-frss' 55 | 56 | if not _key in data_dict: 57 | data_dict[_key]={} 58 | 59 | for j in range(label_array_shape[1]): 60 | 61 | _tmp_pred=np.where(i[:,j]>=0.5, 1,0) 62 | _tmp_label=label_array[:,j] 63 | #true_pos=((_tmp_pred+_tmp_label) ==2).sum() 64 | false_pos=((_tmp_label-_tmp_pred) <0).sum() 65 | #false_neg=((_tmp_label-_tmp_pred) ==1).sum() 66 | if not ind_list[j] in data_dict[_key]: 67 | data_dict[_key][ind_list[j]]=[] 68 | data_dict[_key][ind_list[j]].append(float(false_pos)) 69 | 70 | for k in ind_list: 71 | a=data_dict['deepsea'][k] 72 | b=data_dict['conv4-frss'][k] 73 | s,p=stats.ttest_ind(a,b) 74 | print(p, k) 75 | 76 | 77 | 78 | df=pd.DataFrame(columns=["class1", "class2", "mean","stdv"]) 79 | 80 | """class1=[] 81 | class2=[] 82 | name_of_class=["model","cell-type"] 83 | data_dict2={}""" 84 | for k, v in data_dict.items(): 85 | #print k, v 86 | for k1,v1 in v.items(): 87 | """for e in v1: 88 | class1.append(k) 89 | class2.append(k1) 90 | if not k in data_dict2: 91 | data_dict2[k]=[] 92 | data_dict2[k].append(e)""" 93 | 94 | df=df.append({"class1":k, "class2":k1, "mean":np.mean(v1),"stdv":np.std(v1)}, ignore_index=True) 95 | 96 | 97 | """print data_dict2 98 | print class1 99 | print class2 100 | ix3 = pd.MultiIndex.from_arrays([class1, class2], names=name_of_class) 101 | df3 = pd.DataFrame(data_dict2, index=ix3) 102 | gp3 = df3.groupby(level=name_of_class) 103 | means = gp3.mean() 104 | errors = gp3.std() 105 | fig, ax = plt.subplots() 106 | means.plot.bar(yerr=errors, ax=ax) 107 | """ 108 | #print df 109 | yerr=df.pivot(index='class2',columns='class1',values='stdv') 110 | #print np.shape(yerr) 111 | #print df.pivot(index='class2',columns='class1',values='mean') 112 | 
df.pivot(index='class2',columns='class1',values='mean').plot(kind='bar', yerr=yerr) 113 | 114 | #df.pivot(index='class1',columns='class2',values='mean').plot(kind='bar', yerr=df.std.reshape((2,3))) 115 | #print df.pivot(index='class1',columns='class2',values='std').values 116 | #df.pivot(index='class1',columns='class2',values='mean').plot(kind='bar') 117 | plt.grid(b=True, which='major', color='gray', linestyle='-',axis= 'y') 118 | plt.grid(b=True, which='minor', color='gray', linestyle='--',axis= 'y') 119 | plt.minorticks_on() 120 | #plt.grid(True) 121 | 122 | #plt.minorticks_on() 123 | plt.show() 124 | #print false_pos 125 | #print round(false_pos/np.float(false_pos+true_pos), 4) 126 | #print f1_score(label_array[:,j], ) 127 | 128 | 129 | def main(): 130 | outfile_name="/home/fast/onimaru/data/prediction/ROC_space_curve_comp_limb_brain.pdf" 131 | npload_list1=[] 132 | npload_list2=[] 133 | label_array=[] 134 | label_array_append=label_array.append 135 | chromosome="chr2" 136 | #name_list=["DeepSEA", "Bidirectional","Conv_plus","Conv+Bidirectional"] 137 | name_list=["conv4frss", "deepsea"] 138 | file_list1=[ 139 | "/home/fast2/onimaru/DeepGMAP-dev/data/predictions/quick_benchmark/deepsea_Fri_Apr_20_140717_2018.ckpt-16747_prediction.npz", 140 | "/home/fast2/onimaru/DeepGMAP-dev/data/predictions/deepsea_Thu_Jun__7_072332_2018.ckpt-16747_prediction.npz", 141 | "/home/fast2/onimaru/DeepGMAP-dev/data/predictions/quick_benchmark/deepsea_Thu_Apr_26_115030_2018.ckpt-16747_prediction.npz", 142 | '/home/fast2/onimaru/DeepGMAP-dev/data/predictions/conv4frss_Fri_Jun__8_101931_2018.ckpt-16747_prediction.npz', 143 | '/home/fast2/onimaru/DeepGMAP-dev/data/predictions/conv4frss_Fri_Jun__8_122816_2018.ckpt-16747_prediction.npz', 144 | "/home/fast2/onimaru/DeepGMAP-dev/data/predictions/quick_benchmark/deepsharktest_Thu_Apr_19_191806_2018.ckpt-16747_prediction.npz", 145 | ] 146 | label_file_array="/home/fast/onimaru/deepgmap/data/inputs/mm10_dnase_subset/dnase_summits_subset_mm10_1000_chr2_testlabels.npz" 147 | if not os.path.isfile(label_file_array): 148 | label_file='/home/fast/onimaru/deepgmap/data/inputs/mm10_dnase_subset/dnase_summits_subset_mm10_1000.bed.labeled' 149 | with open(label_file, 'r') as fin: 150 | for line in fin: 151 | if line.startswith(chromosome): 152 | line=line.split() 153 | #print line 154 | label_array_append(list(map(int, line[3:]))) 155 | label_array=np.array(label_array) 156 | np.savez_compressed( label_file_array, labels=label_array,) 157 | else: 158 | label_array=np.load(label_file_array)["labels"] 159 | for f in file_list1: 160 | npload_list1.append(np.load(f)) 161 | 162 | roc_space_plotter(label_array, npload_list1,outfile_name) 163 | 164 | 165 | if __name__== '__main__': 166 | main() 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/sequence_visualizer2.py: -------------------------------------------------------------------------------- 1 | 2 | import cairocffi as cairo 3 | import numpy as np 4 | 5 | def _select_color(cr, DNA): 6 | if DNA=="A": 7 | cr.set_source_rgb(1, 0, 0) 8 | elif DNA=="G": 9 | cr.set_source_rgb(0.8, 0.8, 0) 10 | elif DNA=="C": 11 | cr.set_source_rgb(0, 0, 1) 12 | elif DNA=="T": 13 | cr.set_source_rgb(0, 1, 0) 14 | else: 15 | cr.set_source_rgb(0.8, 0.8, 0.8) 16 | def seuquence_visualizer2(npz_file, output_file): 17 | 18 | if type(npz_file)==str: 19 | with np.load(npz_file) as f: 20 | reconstruct=f["recon"] 21 | else: 22 | reconstruct=npz_file 23 | 24 
    line_num = 10
    DNA_len = 1000

    reconstruct = np.reshape(reconstruct, (DNA_len, 4))
    # Rescale so that the largest weight maps to a drawable letter height.
    max_value = np.max(reconstruct)
    reconstruct = 80 * reconstruct / max_value

    width = DNA_len * 30 // line_num + 200
    height = 1024 * 2 * 3
    y_center = 300
    ims1 = cairo.PDFSurface(output_file, width, height)
    cr = cairo.Context(ims1)
    cr.move_to(100, y_center)
    cr.line_to(DNA_len // line_num * 30 + 100, y_center)
    cr.set_line_width(2)
    cr.stroke()

    meme_fileout = open(output_file + '.meme', 'w')
    meme_fileout.write("MEME version 4\n\nALPHABET= ACGT\n\nstrands: + -\n\n"
                       "Background letter frequencies (from uniform background):\n"
                       "A 0.2500 C 0.2500 G 0.2500 T 0.2500\n\n"
                       "MOTIF LONG_MOTIF\n\n"
                       "letter-probability matrix: alength= 4 w= 1000 nsites= 20 E= 0\n")
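    # Each position k below is processed in three steps: (1) the four channel
    # weights are softmaxed into a probability vector, (2) that vector is
    # written out as one row of the MEME letter-probability matrix, and
    # (3) each letter is drawn with a height proportional to its probability
    # times the information content IC = sum_i p_i * log2(4 * p_i) bits.
    # For example, p = (0.97, 0.01, 0.01, 0.01) gives IC of about 1.76 bits,
    # while the uniform p = (0.25, 0.25, 0.25, 0.25) gives IC = 0 (up to the
    # small epsilon), so uninformative positions collapse to nothing.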
    for k in range(DNA_len):
        if k != 0 and k % (DNA_len // line_num) == 0:
            # Start a new row of the logo every DNA_len/line_num bases.
            cr.set_source_rgba(0.0, 0.0, 0, 1.0)
            y_center += 300
            cr.move_to(100, y_center)
            cr.line_to(DNA_len // line_num * 30 + 100, y_center)
            cr.stroke()

        probability = np.round(np.true_divide(np.exp(reconstruct[k]), np.nansum(np.exp(reconstruct[k]))), 6)
        probability /= np.nansum(probability)
        for i in range(4):
            if np.isnan(probability[i]):
                probability[i] = 0.0

        # MEME expects columns in A C G T order, while the channel order here
        # is A, G, C, T, hence the 0, 2, 1, 3 indexing.
        to_print = str(probability[0]) + " " + str(probability[2]) + " " + str(probability[1]) + " " + str(probability[3]) + "\n"
        meme_fileout.write(to_print)

        ic = np.nansum(probability * np.log2(probability * 4 + 0.0001)) * 120
        A = ["A", probability[0] * ic]
        G = ["G", probability[1] * ic]
        C = ["C", probability[2] * ic]
        T = ["T", probability[3] * ic]
        values = [A, G, C, T]
        # filter() returns an iterator under Python 3, so build a list before
        # sorting; letters are drawn smallest first, stacking upwards.
        pos = [v for v in values if v[1] >= 0]
        pos.sort(key=lambda x: x[1])
        Nucpos = 0
        x_pos = k % (DNA_len // line_num)

        for l in range(len(pos)):
            Nuc = pos[l][0]
            Nucsize = abs(pos[l][1]) + 0.1
            cr.move_to(100 + x_pos * 40 * 0.75, y_center - Nucpos * 0.75)
            cr.select_font_face("Sans", cairo.FONT_SLANT_NORMAL, cairo.FONT_WEIGHT_NORMAL)
            _select_color(cr, Nuc)
            font_mat = cairo.Matrix(xx=40.0, yx=0.0, xy=0.0, yy=Nucsize, x0=0.0, y0=0.0)
            cr.set_font_matrix(font_mat)
            cr.show_text(str(Nuc))
            Nucpos += abs(pos[l][1])

    meme_fileout.close()
    cr.show_page()


def main():
    npz_file = '/home/fast2/onimaru/DeepGMAP-dev/data/reconstructions/conv4frss_Fri_May_11_075425_2018.ckpt-16747Tue_May_15_112518_2018_all_.npz'
    output_file = npz_file + '.pdf'
    sequence_visualizer2(npz_file, output_file)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/kernel_visualizer2.py:
--------------------------------------------------------------------------------
import sys
import os
import math

import cairocffi as cairo
import numpy as np
from PIL import Image
# PNG bytes must go through BytesIO; StringIO only handles text on Python 3.
from io import BytesIO


def _select_color(cr, DNA):
    if DNA == "A":
        cr.set_source_rgb(1, 0, 0)
    elif DNA == "G":
        cr.set_source_rgb(0.8, 0.8, 0)
    elif DNA == "C":
        cr.set_source_rgb(0, 0, 1)
    elif DNA == "T":
        cr.set_source_rgb(0, 1, 0)
    else:
        cr.set_source_rgb(0.8, 0.8, 0.8)


def rectangle(x, y, w, h, lw, context):
    context.set_line_width(lw)
    context.move_to(x, y)
    context.rel_line_to(w, 0)
    context.rel_line_to(0, h)
    context.rel_line_to(-w, 0)
    context.close_path()


def sequence_visualizer(npz_file):
    png_list = []
    with np.load(npz_file) as f:
        kernels = f["prediction/W_conv1:0"]

    kernel_shape = kernels.shape
    # Drop the singleton input-channel axis: (filter_len, 4, 1, n_kernels)
    # becomes e.g. (9, 4, 320).
    kernels = np.reshape(kernels, (kernel_shape[0], kernel_shape[1], kernel_shape[3]))
    kernel_shape = kernels.shape
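    # Each of the kernel_shape[2] first-layer kernels is rendered as a small
    # sequence logo: the 9x4 weight matrix is sharpened into a pseudo-PWM
    # with a temperature-scaled softmax (np.exp(w * 100.0), normalized per
    # position), then drawn with cairo and also written in MEME format.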
    width = kernel_shape[0] * 40 + 10
    height = 150
    y_center = height * 0.8

    prefix = os.path.splitext(npz_file)[0] + "_kernels/"
    if not os.path.isdir(prefix):
        try:
            os.mkdir(prefix)
        except OSError:
            sys.exit("could not create " + prefix)

    meme_fileout = open(prefix + 'motifs.meme', 'w')
    meme_def = ("MEME version 4\n\nALPHABET= ACGT\n\nstrands: + -\n\n"
                "Background letter frequencies (from uniform background):\n"
                "A 0.2500 C 0.2500 G 0.2500 T 0.2500\n\n")
    meme_fileout.write(meme_def)
    kernel_shape_ic_list = []
    for k in range(kernel_shape[2]):
        meme_def = "MOTIF kernel_" + str(k) + "\n\nletter-probability matrix: alength= 4 w= 9 nsites= 9 E= 0\n"
        meme_fileout.write(meme_def)
        ims1 = cairo.PDFSurface(None, width, height)
        cr = cairo.Context(ims1)
        cr.set_source_rgb(0.0, 0.0, 0)
        cr.move_to(width * 0.1, y_center)
        cr.line_to(width * 0.9, y_center)
        cr.set_line_width(2)
        cr.stroke()
        cr.move_to(width * 0.1, y_center)
        cr.line_to(width * 0.1, y_center - 120)
        cr.set_line_width(2)
        cr.stroke()
        cr.move_to(width * 0.1, y_center - 60)
        cr.line_to(width * 0.08, y_center - 60)
        cr.set_line_width(2)
        cr.stroke()
        cr.move_to(width * 0.075, y_center - 60 + 4 * 10)
        cr.rotate(-90 * math.pi / 180.0)
        cr.select_font_face("Sans", cairo.FONT_SLANT_NORMAL, cairo.FONT_WEIGHT_NORMAL)
        font_mat = cairo.Matrix(xx=32.0, yx=0.0, xy=0.0, yy=32, x0=0.0, y0=0.0)
        cr.set_font_matrix(font_mat)
        cr.show_text("2 bit")
        cr.rotate(90 * math.pi / 180.0)
        font_mat = cairo.Matrix(xx=12.0, yx=0.0, xy=0.0, yy=12, x0=0.0, y0=0.0)
        cr.move_to(width * 0.5, height)
        cr.show_text("k" + str(k))

        xkernel = kernels[:, :, k]
        # Sharpen the weights into a pseudo-probability matrix: exponentiate
        # and normalize each position across the four channels.
        xkernel = np.exp(xkernel * 100.0)
        probability = xkernel / np.nansum(xkernel, axis=1)[:, None]
        # MEME columns are in A C G T order; channels here are A, G, C, T.
        for p in probability:
            to_print = str(p[0]) + " " + str(p[2]) + " " + str(p[1]) + " " + str(p[3]) + "\n"
            meme_fileout.write(to_print)
        meme_fileout.write("\n\n")
        ic_sum = 0.0
        for pind, p in enumerate(probability):
            ic = np.nansum(p * np.log2(p * 4 + 0.0001)) * 80
            ic_sum += ic
            A = ["A", p[0] * ic]
            G = ["G", p[1] * ic]
            C = ["C", p[2] * ic]
            T = ["T", p[3] * ic]
            values = [A, G, C, T]
            # filter() returns an iterator under Python 3, so build a list
            # before sorting.
            pos = [v for v in values if v[1] >= 0]
            pos.sort(key=lambda x: x[1])
            Nucpos = 0.01
            x_pos = width * 0.1 + pind * 30

            for l in range(len(pos)):
                Nuc = pos[l][0]
                Nucsize = pos[l][1] + 0.01
                cr.move_to(x_pos, y_center - Nucpos * 0.75)
                cr.select_font_face("Sans", cairo.FONT_SLANT_NORMAL, cairo.FONT_WEIGHT_NORMAL)
                _select_color(cr, Nuc)
                font_mat = cairo.Matrix(xx=40.0, yx=0.0, xy=0.0, yy=Nucsize, x0=0.0, y0=0.0)
                cr.set_font_matrix(font_mat)
                cr.show_text(str(Nuc))
                Nucpos += abs(pos[l][1])
        ims1.write_to_png(prefix + "kernel_" + str(k) + '.png')
        png_list.append(prefix + "kernel_" + str(k) + '.png')
        kernel_shape_ic_list.append(ic_sum)
        cr.show_page()
    meme_fileout.close()

    return png_list, kernel_shape_ic_list
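
# A minimal usage sketch (hypothetical path; the real entry point is main()
# below). The npz is assumed to store the first convolutional layer under
# the key "prediction/W_conv1:0", as read above:
#
#   png_list, ic_list = sequence_visualizer("conv4frss_trained_variables.npz")
#   kernel_connector(png_list)   # tiles all logos onto a single PDF page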

def kernel_connector(png_list):
    # Tile the per-kernel PNGs onto a single A4 PDF page, ten logos per row.
    pt_per_mm = 72 / 25.4
    width, height = 210 * pt_per_mm, 297 * pt_per_mm
    upper_lim = height * 0.1
    lateral_lim = width * 0.1

    out_dir = os.path.split(png_list[0])[0] + "/kernels.pdf"
    ims1 = cairo.PDFSurface(out_dir, width, height)
    cr = cairo.Context(ims1)
    im = Image.open(png_list[0])
    xwidth = int(width * 0.8 / 10.0) + 5
    ywidth = int(im.size[1] * xwidth / float(im.size[0]))
    for k, i in enumerate(png_list):
        im = Image.open(i)
        im = im.resize([xwidth, ywidth], Image.ANTIALIAS)
        _buffer = BytesIO()
        im.save(_buffer, format="PNG", quality=100)
        _buffer.seek(0)
        png_image = cairo.ImageSurface.create_from_png(_buffer)
        cr.save()
        # Integer division keeps ten logos per row.
        cr.set_source_surface(png_image, lateral_lim + (xwidth - 5) * (k % 10), upper_lim + ywidth * (k // 10))
        cr.paint()
        cr.restore()

    cr.show_page()


def main():
    if len(sys.argv) > 1:
        npz_file = sys.argv[1]
    else:
        npz_file = '/home/fast2/onimaru/DeepGMAP-dev/data/outputs/conv4frss_Mon_Feb_25_092345_2019_trained_variables.npz'

    png_list, kernel_shape_ic_list = sequence_visualizer(npz_file)
    kernel_connector(png_list)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/deconvolution_to_signal.py:
--------------------------------------------------------------------------------
import sys
import time
import math
import os
import getopt
import importlib as il
from glob import glob

import tensorflow as tf
import numpy as np
from natsort import natsorted
from deepgmap.post_train_tools import unpooling


def test_batch(test_batch_file):
    with np.load(test_batch_file) as f:
        dnase_data_labels1 = f['labels'], f['data_array']
    images = np.reshape(dnase_data_labels1[1], (batch_size, data_length, 4, 1))
    labels = dnase_data_labels1[0]
    return images, labels


def genome_scan(filename):
    with open(filename, 'r') as f1:
        file_name = f1.name
        path_sep = os.path.sep
        file_name1 = file_name.split(path_sep)
        file_name2 = file_name1[-1].split('_')
        chromosome = file_name2[2]
        a = file_name2[3]
        b = a.split('.')
        chr_position = int(b[0])
        genome_seq = np.load(f1)
        shape_of_genome = genome_seq['genome'].shape
        genome_seq_re = np.reshape(genome_seq['genome'], (shape_of_genome[0], shape_of_genome[1], 4, 1))
        genome_seq_re_list = np.array_split(genome_seq_re, 100)
        return genome_seq_re_list, chromosome, chr_position


BATCH_SIZE = 1000
start = time.time()
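# Command-line interface (parsed with getopt below):
#   -m/--model                checkpoint of the trained model to restore
#   -t/--test_genome          glob pattern of .npz genome chunks holding
#                             "positions" and "sequences" arrays
#   -n/--network_constructor  module name under deepgmap.network_constructors
#   -o/--output_dir           directory for the outputs
# A hypothetical invocation:
#   python deconvolution_to_signal.py -m conv4frss_xxx.ckpt-16747 \
#       -n conv4frss -t "mm10_chunk_*.npz" -o /path/to/out/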
try:
    options, args = getopt.getopt(sys.argv[1:], 'm:t:n:o:', ['model=', 'test_genome=', 'network_constructor=', 'output_dir='])
except getopt.GetoptError as err:
    print(str(err))
    sys.exit(2)
if len(options) < 3:
    print('too few arguments')
    sys.exit(0)
for opt, arg in options:
    if opt in ('-m', '--model'):
        trained_model = arg
    elif opt in ('-t', '--test_genome'):
        test_genome = arg
    elif opt in ('-n', '--network_constructor'):
        network_constructor = arg
    elif opt in ('-o', '--output_dir'):
        output_dir = arg

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

keep_prob = tf.placeholder(tf.float32)
keep_prob2 = tf.placeholder(tf.float32)
keep_prob3 = tf.placeholder(tf.float32)

x_image = tf.placeholder(tf.float32, shape=[None, 1000, 4, 1])
y_ = tf.placeholder(tf.float32, shape=[None, 3])
phase = tf.placeholder(tf.bool)
dropout_1 = 0.95
dropout_2 = 0.9
dropout_3 = 0.85
batch_size = 100
data_length = 1000
input_dir = trained_model
nc = il.import_module("deepgmap.network_constructors." + str(network_constructor))
train_speed = 0.00005
a = time.asctime()
b = a.replace(':', '')
start_at = b.replace(' ', '_')

model = nc.Model(image=x_image, label=y_,
                 output_dir=output_dir,
                 phase=phase,
                 start_at=start_at,
                 keep_prob=keep_prob,
                 keep_prob2=keep_prob2,
                 keep_prob3=keep_prob3,
                 data_length=data_length,
                 max_to_keep=2,
                 GPUID="1")

sess.run(tf.global_variables_initializer())
saver = model.saver
saver.restore(sess, input_dir)

test_genome_list = natsorted(glob(test_genome))
if len(test_genome_list) == 0:
    sys.exit(test_genome + " does not exist.")


def conv2d_tp(x, W, output_shape):
    return tf.nn.conv2d_transpose(x, W, output_shape, strides=[1, 1, 1, 1], padding='VALID')


def conv2d_tp2(x, W, output_shape):
    return tf.nn.conv2d_transpose(x, W, output_shape, strides=[1, 2, 1, 1], padding='VALID')


def conv2d_tp4(x, W, output_shape):
    return tf.nn.conv2d_transpose(x, W, output_shape, strides=[1, 4, 1, 1], padding='VALID')


def max_pool_2x1(x):
    return tf.nn.max_pool_with_argmax(x, ksize=[1, 2, 1, 1], strides=[1, 2, 1, 1], padding='SAME')


def max_pool_4x1(x):
    return tf.nn.max_pool_with_argmax(x, ksize=[1, 4, 1, 1], strides=[1, 4, 1, 1], padding='SAME')


position_list = []
y_prediction2 = []
for test_genome_ in test_genome_list:
    print(test_genome_)
    genome_data = np.load(test_genome_)
    position_list_, seq_list = genome_data['positions'], genome_data['sequences']
    if len(position_list) == 0:
        position_list = position_list_
    else:
        position_list = np.concatenate([position_list, position_list_])
    seq_list = np.array(seq_list, np.int16).reshape(-1, data_length, 4, 1)
    seq_length = seq_list.shape[0]
    print(seq_length)

    loop = int(math.ceil(float(seq_length) / BATCH_SIZE))
    for i in range(loop):
        if i * BATCH_SIZE > seq_length:
            break
        scanning = seq_list[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]
        if len(y_prediction2) == 0:
            _, y_prediction2, variable_dict, neurons_dict, _2 = sess.run(model.prediction, feed_dict={x_image: scanning, keep_prob: 1.0, keep_prob2: 1.0, keep_prob3: 1.0, phase: False})
        else:
            _, y_prediction1, variable_dict, neurons_dict, _2 = sess.run(model.prediction, feed_dict={x_image: scanning, keep_prob: 1.0, keep_prob2: 1.0, keep_prob3: 1.0, phase: False})
            y_prediction2 = np.concatenate([y_prediction2, y_prediction1], axis=0)

h_conv11_ = neurons_dict["h_conv11"]
h_conv12_ = neurons_dict["h_conv12"]
h_conv2_ = neurons_dict["h_conv2"]
h_conv21_ = neurons_dict["h_conv21"]
h_conv22_ = neurons_dict["h_conv22"]
h_pool1_ = neurons_dict["h_pool1"]
h_pool1_rc_ = neurons_dict["h_pool1_rc"]
h_pool2_ = neurons_dict["h_pool2"]
h_pool21_ = neurons_dict["h_pool21"]
h_pool22_ = neurons_dict["h_pool22"]
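# The reconstruction below runs the network backwards: each forward
# max-pooling step is undone by unpooling guided by the argmax masks of the
# recorded activations, and each forward convolution is undone with
# tf.nn.conv2d_transpose using the same (or reverse-complemented) weights,
# until the signal is projected back onto the 1000x4 input space.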
sess2 = tf.Session()
h_pool21_shape = list(h_pool21_.shape)
h_pool21_t4 = conv2d_tp(h_conv22_, variable_dict["W_conv22"], h_pool21_shape)
_, mask21 = max_pool_2x1(h_conv21_)
h_unpool21_t4 = unpooling.unpool2(h_pool21_t4, mask21)

h_pool2_shape = list(h_pool2_.shape)
h_pool2_t4 = conv2d_tp(h_unpool21_t4, variable_dict["W_conv21"], h_pool2_shape)
_, mask2 = max_pool_2x1(h_conv2_)
h_unpool2_t4 = unpooling.unpool2(h_pool2_t4, mask2)

h_pool1_shape = list(h_pool1_.shape)
h_pool1_t4 = conv2d_tp(h_unpool2_t4, variable_dict["W_conv2"], h_pool1_shape)
_, mask1 = max_pool_2x1(h_conv11_)
h_unpool1_t4 = unpooling.unpool2(h_pool1_t4, mask1)

# The reverse-complement branch reuses W_conv2/W_conv1 flipped along the
# length and channel axes.
h_pool1_rc_t4 = conv2d_tp(h_unpool2_t4, tf.reverse(variable_dict["W_conv2"], [0, 1]), h_pool1_shape)
_, mask1rc = max_pool_2x1(h_conv12_)
h_unpool1_rc_t4 = unpooling.unpool2(h_pool1_rc_t4, mask1rc)

reconstruction_shape = scanning.shape
reconstruction_conv22 = conv2d_tp(h_unpool1_t4, variable_dict["W_conv1"], reconstruction_shape) + \
    conv2d_tp(h_unpool1_rc_t4, tf.reverse(variable_dict["W_conv1"], [0, 1]), reconstruction_shape)

sess2.run(tf.global_variables_initializer())
units_conv22 = sess2.run(reconstruction_conv22)
reshaped_conv22 = np.reshape(units_conv22, (data_length, 4))

sess2.close()
sess.close()
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/deconv_deepshark_local_extend.py:
--------------------------------------------------------------------------------
import sys
import time
import math
import os
import getopt
import importlib as il

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pylab
from deepgmap.post_train_tools import unpooling


def test_batch(test_batch_file):
    with np.load(test_batch_file) as f:
        dnase_data_labels1 = f['labels'], f['data_array']
    images = np.reshape(dnase_data_labels1[1], (batch_size, data_length, 4, 1))
    labels = dnase_data_labels1[0]
    return images, labels


start = time.time()
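# Command-line interface (parsed with getopt below):
#   -m/--model                checkpoint of the trained model to restore
#   -t/--test_batch           .npz batch holding "labels" and "data_array"
#   -n/--network_constructor  module name under deepgmap.network_constructors
#   -o/--output_dir           directory for the reconstruction images
#   -d/--deconv               accepted and stored, but not used below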
try:
    options, args = getopt.getopt(sys.argv[1:], 'm:t:n:o:d:', ['model=', 'test_batch=', 'network_constructor=', 'output_dir=', 'deconv='])
except getopt.GetoptError as err:
    print(str(err))
    sys.exit(2)
if len(options) < 3:
    print('too few arguments')
    sys.exit(0)
for opt, arg in options:
    if opt in ('-m', '--model'):
        trained_model = arg
    elif opt in ('-t', '--test_batch'):
        test_batch_file = arg
    elif opt in ('-n', '--network_constructor'):
        network_constructor = arg
    elif opt in ('-o', '--output_dir'):
        output_dir = arg
    elif opt in ('-d', '--deconv'):
        deconv = arg

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

keep_prob = tf.placeholder(tf.float32)
keep_prob2 = tf.placeholder(tf.float32)
keep_prob3 = tf.placeholder(tf.float32)

x_image = tf.placeholder(tf.float32, shape=[None, 1000, 4, 1])
y_ = tf.placeholder(tf.float32, shape=[None, 20])
phase = tf.placeholder(tf.bool)
dropout_1 = 0.95
dropout_2 = 0.9
dropout_3 = 0.85
batch_size = 100
data_length = 1000
input_dir = trained_model
nc = il.import_module("deepgmap.network_constructors." + str(network_constructor))
train_speed = 0.00005
a = time.asctime()
b = a.replace(':', '')
start_at = b.replace(' ', '_')

model = nc.Model(image=x_image, label=y_,
                 output_dir=output_dir,
                 phase=phase,
                 start_at=start_at,
                 keep_prob=keep_prob,
                 keep_prob2=keep_prob2,
                 keep_prob3=keep_prob3,
                 data_length=data_length)

sess.run(tf.global_variables_initializer())
saver = model.saver
saver.restore(sess, input_dir)

batch = test_batch(test_batch_file)
# phase is fed here as well, for consistency with the inference runs below.
test_accuracy1, y_label1, y_prediction1 = sess.run([model.error, y_, model.prediction[1]], feed_dict={x_image: batch[0], y_: batch[1], keep_prob: 1.0, keep_prob2: 1.0, keep_prob3: 1.0, phase: False})


def conv2d_tp(x, W, output_shape):
    return tf.nn.conv2d_transpose(x, W, output_shape, strides=[1, 1, 1, 1], padding='VALID')


def conv2d_tp2(x, W, output_shape):
    return tf.nn.conv2d_transpose(x, W, output_shape, strides=[1, 2, 1, 1], padding='VALID')


def conv2d_tp4(x, W, output_shape):
    return tf.nn.conv2d_transpose(x, W, output_shape, strides=[1, 4, 1, 1], padding='VALID')


def max_pool_2x1(x):
    return tf.nn.max_pool_with_argmax(x, ksize=[1, 2, 1, 1], strides=[1, 2, 1, 1], padding='SAME')


def max_pool_4x1(x):
    return tf.nn.max_pool_with_argmax(x, ksize=[1, 4, 1, 1], strides=[1, 4, 1, 1], padding='SAME')


# Collect the indices of windows that carry at least one positive label;
# only those are reconstructed below.
index_of_image = 0
positive_image = []
for y in batch[1]:
    if np.sum(y) > 0:
        positive_image.append(index_of_image)
    index_of_image += 1

for k in range(len(positive_image)):
    images4 = np.reshape(batch[0][positive_image[k]], (1, data_length, 4, 1))

    _2, _1, variable_dict, neurons_dict, _3 = sess.run(model.prediction, feed_dict={x_image: images4, keep_prob: 1.0, keep_prob2: 1.0, keep_prob3: 1.0, phase: False})

    h_conv11_ = neurons_dict["h_conv11"]
    h_conv12_ = neurons_dict["h_conv12"]
    h_conv2_ = neurons_dict["h_conv2"]
    h_conv21_ = neurons_dict["h_conv21"]
    h_conv22_ = neurons_dict["h_conv22"]
    h_pool1_ = neurons_dict["h_pool1"]
    h_pool1_rc_ = neurons_dict["h_pool1_rc"]
    h_pool2_ = neurons_dict["h_pool2"]
    h_pool21_ = neurons_dict["h_pool21"]
    h_pool22_ = neurons_dict["h_pool22"]
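    # unpooling.unpool2 is assumed to implement standard argmax unpooling:
    # each pooled value is scattered back to the position recorded in the
    # corresponding max_pool_with_argmax mask, e.g. a pooled value of 5 whose
    # mask index points at offset 3 yields [0, 0, 0, 5] along the pooled axis.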
neurons_dict["h_pool2"],\ 141 | neurons_dict["h_pool21"],\ 142 | neurons_dict["h_pool22"] 143 | 144 | 145 | sess2 = tf.Session() 146 | #print h_pool21_ 147 | h_pool21_shape=list(h_pool21_.shape) 148 | h_pool21_t4= conv2d_tp(h_conv22_, variavl_dict["W_conv22"], h_pool21_shape) 149 | _, mask21=max_pool_2x1(h_conv21_) 150 | #h_unpool21_t4=unpooling.unpool(h_pool21_t4, mask21,output_shape=h_conv21_.shape) 151 | h_unpool21_t4=unpooling.unpool2(h_pool21_t4, mask21) 152 | 153 | h_pool2_shape=list(h_pool2_.shape) 154 | h_pool2_t4= conv2d_tp(h_unpool21_t4, variavl_dict["W_conv21"], h_pool2_shape) 155 | _, mask2=max_pool_2x1(h_conv2_) 156 | #h_unpool2_t4=unpooling.unpool(h_pool2_t4,mask2,output_shape=h_conv2_.shape) 157 | h_unpool2_t4=unpooling.unpool2(h_pool2_t4,mask2) 158 | 159 | h_pool1_shape=list(h_pool1_.shape) 160 | h_pool1_t4= conv2d_tp(h_unpool2_t4, variavl_dict["W_conv2"], h_pool1_shape) 161 | _,mask1=max_pool_2x1(h_conv11_) 162 | #h_unpool1_t4=unpooling.unpool(h_pool1_t4,mask1,output_shape=h_conv11_.shape) 163 | h_unpool1_t4=unpooling.unpool2(h_pool1_t4,mask1) 164 | 165 | h_pool1_rc_t4=conv2d_tp(h_unpool2_t4, tf.reverse(variavl_dict["W_conv2"], [0,1]), h_pool1_shape) 166 | _,mask1rc=max_pool_2x1(h_conv12_) 167 | #h_unpool1_rc_t4=unpooling.unpool(h_pool1_rc_t4,mask1rc,output_shape=h_conv12_.shape) 168 | h_unpool1_rc_t4=unpooling.unpool2(h_pool1_rc_t4,mask1rc) 169 | 170 | reconstruction_shape=images4.shape 171 | #print reconstruction_shape 172 | reconstruction_conv22=conv2d_tp(h_unpool1_t4, variavl_dict["W_conv1"], reconstruction_shape)+\ 173 | conv2d_tp(h_unpool1_rc_t4, tf.reverse(variavl_dict["W_conv1"], [0,1]), reconstruction_shape) 174 | 175 | 176 | sess2.run(tf.global_variables_initializer()) 177 | units_conv22 = sess2.run(reconstruction_conv22) 178 | 179 | 180 | reshaped_conv22=np.reshape(units_conv22, (data_length, 4)) 181 | 182 | # Compute and plot first dendrogram. 183 | fig = plt.figure(figsize=(12,8)) 184 | 185 | # Plot distance matrix. 186 | 187 | 188 | axmatrix_conv22 = fig.add_axes([0.05,0.05,0.1,0.9]) 189 | im_conv22 = axmatrix_conv22.matshow(reshaped_conv22, aspect='auto', origin='lower', cmap=plt.get_cmap('YlGnBu')) 190 | axmatrix_conv22.set_xticks([]) 191 | axmatrix_conv22.set_yticks([]) 192 | axcolor = fig.add_axes([0.16,0.05,0.02,0.9]) 193 | pylab.colorbar(im_conv22, cax=axcolor) 194 | 195 | 196 | reshaped2=np.reshape(images4, (data_length, 4)) 197 | axmatrix3 = fig.add_axes([0.85,0.05,0.1,0.9]) 198 | im3 = axmatrix3.matshow(reshaped2, aspect='auto', origin='lower', cmap=plt.get_cmap('YlGnBu')) 199 | axmatrix3.set_xticks([]) 200 | axmatrix3.set_yticks([]) 201 | axcolor = fig.add_axes([0.96,0.05,0.02,0.9]) 202 | pylab.colorbar(im3, cax=axcolor) 203 | 204 | np.savez_compressed(str(output_dir)+str(trained_model.split('/')[-1])+"_transpose_"+str(k), 205 | conv22=reshaped_conv22, 206 | original=np.reshape(images4,(data_length, 4))) 207 | 208 | fig.savefig(str(output_dir)+str(trained_model.split('/')[-1])+'_reconstruction_'+str(k)+'.png') 209 | #plt.show() 210 | sess2.close() 211 | sess.close() 212 | --------------------------------------------------------------------------------