├── deepgmap
│   ├── misc
│   │   ├── WORKSPACE
│   │   ├── __init__.py
│   │   ├── Constants.py
│   │   ├── Constants.pyc
│   │   ├── __init__.pyc
│   │   ├── small_tools.pyc
│   │   ├── .gitignore
│   │   ├── small_tools.py
│   │   ├── pickup_pos_seq_region.py
│   │   ├── spearman_r.py
│   │   ├── dataset_checker.py
│   │   ├── edit_labeled_file.py
│   │   ├── fasta_file_from_labeled.py
│   │   ├── dataset_checker_multiple_label.py
│   │   ├── TSS_bedfile.py
│   │   ├── reduce_negatives.py
│   │   ├── optimal_frip_cutoff.py
│   │   ├── compare_deepsea_data.py
│   │   ├── fix_fasta.py
│   │   ├── randomdna2.py
│   │   ├── deepsea_anal.py
│   │   ├── box_plot.py
│   │   ├── randomdna.py
│   │   ├── bed_file_compare.py
│   │   ├── motif_logo_creator.py
│   │   ├── gff_to_colored_bed.py
│   │   ├── bed_file_compare2.py
│   │   ├── kernel_distribution_analizer.py
│   │   └── igv_session.xml
│   ├── train
│   │   ├── __init__.py
│   │   ├── .gitignore
│   │   ├── __init__.pyc
│   │   └── deepshark_local_oop_1d.pyc
│   ├── post_train_tools
│   │   ├── __init__.py
│   │   ├── test.pdf
│   │   ├── test.png
│   │   ├── __init__.pyc
│   │   ├── unpooling.pyc
│   │   ├── cython_util.so
│   │   ├── sequence_visualizer2.pyc
│   │   ├── inputfileGeneratorForGenomeScan_p.pyc
│   │   ├── inputfileGeneratorForGenomeScan_p2.pyc
│   │   ├── cython_util.cpython-36m-x86_64-linux-gnu.so
│   │   ├── randomize_labels.py
│   │   ├── PCA.py
│   │   ├── compare_narrowPeak_scores.py
│   │   ├── liftover_indiv_genome_to_hg38.py
│   │   ├── merge_bigwig.py
│   │   ├── motif_compare2.py
│   │   ├── sequence_visualizer.py
│   │   ├── Clustering_analizer.py
│   │   ├── ROC_space_plotter.py
│   │   ├── inputfileGeneratorForGenomeScan_p2.py
│   │   ├── ROC_space_plotter3.py
│   │   ├── motif_compare.py
│   │   ├── inputfileGeneratorForGenomeScan_gwas.py
│   │   ├── inputfileGeneratorForGenomeScan_p.py
│   │   ├── unpooling.py
│   │   ├── precision_recall_handmade.py
│   │   ├── fimo_to_numpy_array.py
│   │   ├── trained_deepshark_local_multiple_label.py
│   │   ├── cython_util.pyx
│   │   ├── ROC_space_plotter2.py
│   │   ├── sequence_visualizer2.py
│   │   ├── kernel_visualizer2.py
│   │   ├── deconvolution_to_signal.py
│   │   └── deconv_deepshark_local_extend.py
│   ├── __init__.py
│   ├── network_constructors
│   │   ├── __init__.py
│   │   ├── conv4.pyc
│   │   ├── danq.pyc
│   │   ├── danq2.pyc
│   │   ├── danq3.pyc
│   │   ├── danq4.pyc
│   │   ├── __init__.pyc
│   │   ├── auc_calc.pyc
│   │   ├── basset.pyc
│   │   ├── danqfrss.pyc
│   │   ├── deepsea.pyc
│   │   ├── conv3frss.pyc
│   │   ├── conv4frss.pyc
│   │   ├── danqblock.pyc
│   │   ├── conv4frssplus.pyc
│   │   ├── conv4frssplus2.pyc
│   │   ├── template_model.pyc
│   │   ├── __pycache__
│   │   │   └── auc_calc.cpython-36.pyc
│   │   ├── .gitignore
│   │   ├── auc_calc.py
│   │   └── template_model.py
│   ├── data_preprocessing_tools
│   │   ├── __init__.py
│   │   ├── queue.c
│   │   ├── __init__.pyc
│   │   ├── genome_divider.pyc
│   │   ├── seq_to_binary.pyc
│   │   ├── seq_to_binary2.so
│   │   ├── genome_labeling2.pyc
│   │   ├── input_generator_from_narrowPeaks2.pyc
│   │   ├── inputfileGenerator_multiple_label3.pyc
│   │   ├── build
│   │   │   └── temp.linux-x86_64-3.6
│   │   │       └── seq_to_binary2.o
│   │   ├── seq_to_binary2.cpython-36m-x86_64-linux-gnu.so
│   │   ├── setup.py
│   │   ├── deepgmap
│   │   │   └── data_preprocessing_tools
│   │   │       └── seq_to_binary2.cpython-36m-x86_64-linux-gnu.so
│   │   ├── remove_chr.py
│   │   ├── bed_file_500_add_seq.py
│   │   ├── cqueue.pxd
│   │   ├── pick_one_chromosome.py
│   │   ├── bed_file_500_add_seq2.py
│   │   ├── remove_excess_negatives.py
│   │   ├── genome_file_maker.py
│   │   ├── remove_variant_annotations.py
│   │   ├── seq_to_binary.py
│   │   ├── bed_file_500.py
│   │   ├── queue.pyx
│   │   ├── inputGenerator_from_deepsea.py
│   │   ├── genome_labeling.py
│   │   ├── genome_labeling_compare.py
│   │   ├── genome_divider.py
│   │   └── genome_labeling2.py
│   └── __init__.pyc
├── .gitignore
├── requirements.txt
├── Dockerfile
├── setup.py
└── INSTALL.rst

/deepgmap/misc/WORKSPACE:
--------------------------------------------------------------------------------
1 | 
-------------------------------------------------------------------------------- /deepgmap/misc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepgmap/train/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepgmap/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.0" -------------------------------------------------------------------------------- /deepgmap/network_constructors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deepgmap/train/.gitignore: -------------------------------------------------------------------------------- 1 | /send_email.py 2 | -------------------------------------------------------------------------------- /deepgmap/misc/Constants.py: -------------------------------------------------------------------------------- 1 | DeepGMAP_VERSION = "dev3" 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .project 2 | .pydevproject 3 | /data/ 4 | /build/ 5 | /_tmp.bw 6 | -------------------------------------------------------------------------------- /deepgmap/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/__init__.pyc -------------------------------------------------------------------------------- /deepgmap/misc/Constants.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/misc/Constants.pyc -------------------------------------------------------------------------------- /deepgmap/misc/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/misc/__init__.pyc -------------------------------------------------------------------------------- /deepgmap/train/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/train/__init__.pyc -------------------------------------------------------------------------------- /deepgmap/misc/small_tools.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/misc/small_tools.pyc -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/queue.c: -------------------------------------------------------------------------------- 1 | #error Do not use this file, it is the result of a 
failed Cython compilation.
2 | 
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/test.pdf
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/test.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow>=1.15.2
2 | numpy
3 | matplotlib
4 | scikit-learn
5 | cairocffi
6 | cython
7 | tornado
8 | pyBigWig
--------------------------------------------------------------------------------
/deepgmap/network_constructors/conv4.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/conv4.pyc
--------------------------------------------------------------------------------
/deepgmap/network_constructors/danq.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/danq.pyc
--------------------------------------------------------------------------------
/deepgmap/network_constructors/danq2.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/danq2.pyc
--------------------------------------------------------------------------------
/deepgmap/network_constructors/danq3.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/danq3.pyc
--------------------------------------------------------------------------------
/deepgmap/network_constructors/danq4.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/danq4.pyc
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/__init__.pyc
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/unpooling.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/unpooling.pyc
--------------------------------------------------------------------------------
/deepgmap/network_constructors/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/__init__.pyc
--------------------------------------------------------------------------------
/deepgmap/network_constructors/auc_calc.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/auc_calc.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/basset.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/basset.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/danqfrss.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/danqfrss.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/deepsea.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/deepsea.pyc -------------------------------------------------------------------------------- /deepgmap/post_train_tools/cython_util.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/cython_util.so -------------------------------------------------------------------------------- /deepgmap/train/deepshark_local_oop_1d.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/train/deepshark_local_oop_1d.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/conv3frss.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/conv3frss.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/conv4frss.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/conv4frss.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/danqblock.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/danqblock.pyc -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/__init__.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/conv4frssplus.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/conv4frssplus.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/conv4frssplus2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/conv4frssplus2.pyc 
-------------------------------------------------------------------------------- /deepgmap/network_constructors/template_model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/template_model.pyc -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/genome_divider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/genome_divider.pyc -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/seq_to_binary.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/seq_to_binary.pyc -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/seq_to_binary2.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/seq_to_binary2.so -------------------------------------------------------------------------------- /deepgmap/post_train_tools/sequence_visualizer2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/sequence_visualizer2.pyc -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/genome_labeling2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/genome_labeling2.pyc -------------------------------------------------------------------------------- /deepgmap/post_train_tools/inputfileGeneratorForGenomeScan_p.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/inputfileGeneratorForGenomeScan_p.pyc -------------------------------------------------------------------------------- /deepgmap/post_train_tools/inputfileGeneratorForGenomeScan_p2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/inputfileGeneratorForGenomeScan_p2.pyc -------------------------------------------------------------------------------- /deepgmap/network_constructors/__pycache__/auc_calc.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/network_constructors/__pycache__/auc_calc.cpython-36.pyc -------------------------------------------------------------------------------- /deepgmap/post_train_tools/cython_util.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/post_train_tools/cython_util.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/input_generator_from_narrowPeaks2.pyc: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/input_generator_from_narrowPeaks2.pyc
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/inputfileGenerator_multiple_label3.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/inputfileGenerator_multiple_label3.pyc
--------------------------------------------------------------------------------
/deepgmap/misc/.gitignore:
--------------------------------------------------------------------------------
1 | /intersectAB.bed
2 | /intersectABC.bed
3 | /intersectABC_.bed
4 | /intersectAB_.bed
5 | /intersectAC.bed
6 | /intersectAC_.bed
7 | /intersectBC.bed
8 | /intersectBC_.bed
9 | 
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/build/temp.linux-x86_64-3.6/seq_to_binary2.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/build/temp.linux-x86_64-3.6/seq_to_binary2.o
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/seq_to_binary2.cpython-36m-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/seq_to_binary2.cpython-36m-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | from Cython.Build import cythonize
3 | 
4 | setup(
5 |     name = "seq_to_binary",
6 |     ext_modules = cythonize('seq_to_binary.pyx'), # accepts a glob pattern
7 | )
8 | 
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/deepgmap/data_preprocessing_tools/seq_to_binary2.cpython-36m-x86_64-linux-gnu.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koonimaru/DeepGMAP/HEAD/deepgmap/data_preprocessing_tools/deepgmap/data_preprocessing_tools/seq_to_binary2.cpython-36m-x86_64-linux-gnu.so
--------------------------------------------------------------------------------
/deepgmap/misc/small_tools.py:
--------------------------------------------------------------------------------
1 | def is_number(s):
2 |     try:
3 |         float(s)
4 |         return True
5 |     except ValueError:
6 |         return False
7 | 
8 | def div_roundup(x, y):
9 |     # ceiling of y/x; floor division keeps the result an int under Python 3
10 |     if y%x==0:
11 |         return y//x
12 |     else:
13 |         return y//x+1
14 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM tensorflow/tensorflow:1.9.0-rc2-gpu-py3
2 | RUN apt-get update && apt-get install -y --no-install-recommends bedtools git
3 | RUN pip3 install --no-cache-dir setuptools matplotlib pyBigWig
4 | RUN git clone -b dev3 https://github.com/koonimaru/DeepGMAP.git && \
5 |     cd DeepGMAP && git checkout && \
6 |     python3 setup.py install
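A quick note on small_tools.py above: is_number() and div_roundup() are shared helpers (box_plot.py further down imports is_number from this module). A minimal usage sketch; the batch-sizing scenario is illustrative, not a call site taken from this tree:

    from deepgmap.misc.small_tools import div_roundup, is_number

    # div_roundup(x, y) is the ceiling of y/x, e.g. for sizing a batch loop
    assert div_roundup(64, 1000) == 16   # 1000 examples at batch size 64
    assert div_roundup(50, 1000) == 20   # exact division needs no rounding up
    # is_number() accepts anything float() can parse
    assert is_number("3.14") and not is_number("chr1")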
-------------------------------------------------------------------------------- /deepgmap/misc/pickup_pos_seq_region.py: -------------------------------------------------------------------------------- 1 | 2 | labeled_file="/home/fast/onimaru/data/CTCF/mm10_CTCF_narrowPeak_mapq/picard_mm10_1000.bed.labeled" 3 | 4 | with open(labeled_file, "r") as fin, open(labeled_file.split('.')[0]+"_positive_region.bed", 'w') as fo: 5 | for line in fin: 6 | if not line.startswith("#"): 7 | line1=line.split() 8 | a=map(int, line1[3:]) 9 | if sum(a) >0: 10 | fo.write(line) 11 | 12 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/remove_chr.py: -------------------------------------------------------------------------------- 1 | file_name="/home/slow/onimaru/1000genome/HG00119.fa.ed" 2 | file_out="/home/slow/onimaru/1000genome/HG00119_ed_chr.fa" 3 | with open(file_name, "r") as fin, open(file_out,"w") as fout: 4 | for line in fin: 5 | if line.startswith('>'): 6 | line=line.split() 7 | chromo=line[0].strip('>') 8 | line=">chr"+str(chromo)+"\n" 9 | fout.write(line) 10 | else: 11 | fout.write(line) 12 | -------------------------------------------------------------------------------- /deepgmap/misc/spearman_r.py: -------------------------------------------------------------------------------- 1 | import scipy.stats as sc 2 | 3 | f="/home/fast2/onimaru/DeepGMAP-dev/data/misc/cfrip_mm10_ctcf.txt" 4 | 5 | peaks=[] 6 | frips=[] 7 | cfrips=[] 8 | 9 | with open(f, "r") as fin: 10 | for line in fin: 11 | line=line.split() 12 | if not line[0]=="ID": 13 | peaks.append(float(line[4])) 14 | frips.append(float(line[2])) 15 | cfrips.append(float(line[5])) 16 | 17 | #print sc.spearmanr(frips, peaks) 18 | #print sc.spearmanr(cfrips, peaks) -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/bed_file_500_add_seq.py: -------------------------------------------------------------------------------- 1 | 2 | infile="/home/fast/onimaru/data/CTCF/hg38_200_no_hiPS_CTCF.bed" 3 | outfile="/home/fast/onimaru/data/CTCF/hg38_200_no_hiPS_CTCF_pm400.bed" 4 | with open(infile, 'r') as fin, open(outfile, 'w') as fout: 5 | 6 | for line in fin: 7 | line=line.split() 8 | chrom=line[0] 9 | start=int(line[1]) 10 | end=int(line[2]) 11 | new_start=start-400 12 | new_end=end+400 13 | fout.write(str(chrom)+"\t"+str(new_start)+"\t"+str(new_end)+"\n") 14 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/cqueue.pxd: -------------------------------------------------------------------------------- 1 | # file: cqueue.pxd 2 | 3 | cdef extern from "libcalg/queue.h": 4 | ctypedef struct Queue: 5 | pass 6 | ctypedef void* QueueValue 7 | 8 | Queue* queue_new() 9 | void queue_free(Queue* queue) 10 | 11 | int queue_push_head(Queue* queue, QueueValue data) 12 | QueueValue queue_pop_head(Queue* queue) 13 | QueueValue queue_peek_head(Queue* queue) 14 | 15 | int queue_push_tail(Queue* queue, QueueValue data) 16 | QueueValue queue_pop_tail(Queue* queue) 17 | QueueValue queue_peek_tail(Queue* queue) 18 | 19 | bint queue_is_empty(Queue* queue) -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/pick_one_chromosome.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | chromosome_name="chr1" 4 | file_name="/home/slow/onimaru/1000genome/HG00119_1000.fa" 5 | with 
open(file_name, "r") as fin, open(file_name+"_"+chromosome_name, "w") as fout: 6 | WRITE=False 7 | for line in fin: 8 | if line.startswith('>'): 9 | #line1=line.split() 10 | a=line.strip('>\n') 11 | #print a 12 | if a.startswith(chromosome_name): 13 | fout.write(line) 14 | WRITE=True 15 | else: 16 | WRITE=False 17 | 18 | elif WRITE: 19 | fout.write(line) 20 | -------------------------------------------------------------------------------- /deepgmap/misc/dataset_checker.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | def dataset_checker(input, output): 5 | with open(input, 'r') as f_in, open(output, 'w') as f_out: 6 | for line in f_in: 7 | if '>' in line: 8 | position=line 9 | else: 10 | sequence=line 11 | N_percent=float(sequence.count('N'))/len(sequence) 12 | if N_percent<0.90 and len(sequence)>100: 13 | f_out.write(str(position)+str(sequence)) 14 | 15 | if __name__ == '__main__': 16 | dataset_checker('/home/fast/onimaru/data/CTCF/mm10_no_CTCF.fa', '/home/fast/onimaru/data/CTCF/mm10_no_CTCF_noN.fa') -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/bed_file_500_add_seq2.py: -------------------------------------------------------------------------------- 1 | 2 | infile="/home/fast/onimaru/encode/mm10_dnase-seq_subset/*_summits.bed" 3 | outfile="/home/fast/onimaru/data/CTCF/hiPS_CTCF_peaks.narrowPeak_600.bed" 4 | with open(infile, 'r') as fin, open(outfile, 'w') as fout: 5 | 6 | for line in fin: 7 | 8 | line=line.split() 9 | chrom=line[0] 10 | if not chrom.startswith('chrM') and not '_' in chrom: 11 | start=int(line[1]) 12 | end=int(line[2]) 13 | mid_p=(start+end)/2 14 | new_start=mid_p-300 15 | new_end=mid_p+300 16 | fout.write(str(chrom)+"\t"+str(new_start)+"\t"+str(new_end)+"\n") 17 | -------------------------------------------------------------------------------- /deepgmap/network_constructors/.gitignore: -------------------------------------------------------------------------------- 1 | /__init__.pyc 2 | /basset.pyc 3 | /conv4frss.pyc 4 | /conv4frssplus2.pyc 5 | /danq.pyc 6 | /danq2.pyc 7 | /danq3.pyc 8 | /danq4.pyc 9 | /danqblock.pyc 10 | /deepsea.pyc 11 | /deepshark2.pyc 12 | /deepshark4.pyc 13 | /deepshark5.pyc 14 | /deepsharkcheck.pyc 15 | /deepsharkcheck2.pyc 16 | /deepsharktest3.pyc 17 | /network_constructor_basset.pyc 18 | /network_constructor_danq_1d3.pyc 19 | /network_constructor_deepsea_1d2.pyc 20 | /network_constructor_deepsea_1d4.pyc 21 | /network_constructor_deepsea_1d5.pyc 22 | /network_constructor_deepsea_1d6.pyc 23 | /template_model.pyc 24 | /conv4frss2.pyc 25 | /conv4frss3.pyc 26 | /conv4frssplus3.pyc 27 | /conv4frssplus4.pyc 28 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/remove_excess_negatives.py: -------------------------------------------------------------------------------- 1 | import random 2 | labeled_file="/home/fast/onimaru/data/Chip-seq/three_tfs_hg38_500_rand_250_5times_srt.bed.labeled" 3 | labeled_file_out="/home/fast/onimaru/data/Chip-seq/three_tfs_hg38_500_rand_250_5times_srt_reduced.bed.labeled" 4 | with open(labeled_file, 'r') as fin, open(labeled_file_out ,'w') as fout: 5 | for line in fin: 6 | if line.startswith("#"): 7 | fout.write(line) 8 | else: 9 | r=random.random() 10 | line1=line.split() 11 | #print line1[3:] 12 | label_num=sum(map(int, line1[3:])) 13 | if label_num==0 and r<=0.800: 14 | continue 15 | else: 16 | fout.write(line) 17 | 
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/genome_file_maker.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import math
3 | length_list=[]
4 | 
5 | 
6 | with open(sys.argv[1], 'r') as fin, open('./'+sys.argv[2], 'w') as fout:
7 |     seq=0
8 |     chrom_name=''
9 |     for line in fin:
10 | 
11 |         if '>' in line:
12 | 
13 |             if not seq==0:
14 |                 length_list.append(seq)
15 |                 #if not "_" in chrom_name and not "M" in chrom_name:
16 |                 fout.write(str(chrom_name)+'\t'+str(seq)+'\n')
17 |             line=line.split()
18 |             chrom_name=line[0].strip('>')
19 |             seq=0
20 |         else:
21 |             line1=line.strip("\n")
22 |             seq+=len(line1)
23 |     #if len(chrom_name)==3 and not "M" in chrom_name:
24 |     fout.write(str(chrom_name)+'\t'+str(seq)+'\n')
--------------------------------------------------------------------------------
/deepgmap/misc/edit_labeled_file.py:
--------------------------------------------------------------------------------
1 | 
2 | lf="/home/fast/onimaru/encode/mm10_dnase-seq_subset/deepsea_type_wondow_mm10_s200.bed.labeled"
3 | gf="/home/fast/onimaru/data/genome_fasta/mm10.genome"
4 | 
5 | chrm_dict={}
6 | 
7 | with open(gf, 'r') as fin:
8 |     for line in fin:
9 |         line=line.split()
10 |         chrm_dict[line[0]]=int(line[1])
11 | import os
12 | h, t=os.path.split(lf)
13 | elf=h+"/edited_"+t
14 | with open(lf,'r') as fin, open(elf,'w') as fo:
15 |     for line in fin:
16 |         if line.startswith("#"):
17 |             fo.write(line)
18 |         else:
19 |             line=line.split()
20 |             start=int(line[1])-400
21 |             end=int(line[2])+400
22 |             if start>=0 and end<=chrm_dict[line[0]]:
23 |                 fo.write('\t'.join([line[0],str(start),str(end)])+"\t"+" ".join(line[3:])+"\n")
--------------------------------------------------------------------------------
/deepgmap/misc/fasta_file_from_labeled.py:
--------------------------------------------------------------------------------
1 | 
2 | position_set=set()
3 | 
4 | with open("/home/fast/onimaru/data/mm10_1000_limb_altwindow_75co_non.labeled", 'r') as f1:
5 |     for line in f1:
6 |         line=line.split()
7 |         if int(line[3])==1:
8 |             position_set.add(line[0]+":"+line[1]+"-"+line[2])
9 | 
10 | # the fasta must stay open while it is written out: iterating over a handle
11 | # after its "with" block has closed it raises "I/O operation on closed file"
12 | with open("/home/fast/onimaru/data/mm10_1000_altwindow_non.fa", 'r') as f2, open("/home/fast/onimaru/data/mm10_1000_limb_altwindow_75co_non.labeled.fa", 'w') as fo:
13 |     WRITE=False
14 |     for line in f2:
15 |         if line.startswith('>'):
16 |             current_position=line.strip('>\n')
17 |             if current_position in position_set:
18 |                 fo.write(line)
19 |                 WRITE=True
20 |             else:
21 |                 WRITE=False
22 |         elif WRITE:
23 |             fo.write(line)
--------------------------------------------------------------------------------
/deepgmap/misc/dataset_checker_multiple_label.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | def dataset_checker(input1,input2, output1,output2):
5 |     with open(input1, 'r') as fin1,open(input2, 'r') as fin2, open(output1, 'w') as fout1,open(output2, 'w') as fout2:
6 | 
7 |         for line in fin1:
8 |             if '>' in line:
9 |                 position=line
10 |                 fin2_line=fin2.readline()
11 |             else:
12 |                 sequence=line
13 |                 N_percent=float(sequence.count('N'))/len(sequence)
14 |                 if N_percent<=0.80:
15 |                     fout1.write(str(position)+str(sequence))
16 |                     fout2.write(fin2_line)
17 | if __name__ == '__main__':
18 |     dataset_checker('/home/fast/onimaru/data/mm10_1000_mrg_srt.fa',
19 |                     '/home/fast/onimaru/data/mm10_1000_mrg_srt.bedadipo_mrg.labeled',
20 |                     '/home/fast/onimaru/data/mm10_1000_mrg_srt_non.fa',
21 |                     '/home/fast/onimaru/data/mm10_1000_mrg_srt.bedadipo_mrg_noN.labeled')
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/randomize_labels.py:
--------------------------------------------------------------------------------
1 | 
2 | file_name="/home/fast/onimaru/data/CTCF/mm10_CTCF_qc2_mm10_1000.bed.labeled"
3 | from random import shuffle
4 | 
5 | a=file_name.split('.')[0]+"_shuffled.bed.labeled"
6 | 
7 | with open(file_name, "r") as fin, open(a, "w") as fo:
8 |     i=0
9 |     for line in fin:
10 |         if line.startswith("#"):
11 |             line=line.split()
12 |             list_of_label=line[1:]
13 | 
14 |             x = [i for i in range(len(list_of_label))]
15 |             shuffle(x)
16 |             list_of_label_shuf=[]
17 |             for e in x:
18 |                 list_of_label_shuf.append(list_of_label[e])
19 |             fo.write(line[0]+" "+" ".join(list_of_label_shuf)+"\n")
20 |         else:
21 |             b=line.split()
22 |             pos="\t".join(b[:3])
23 |             tmp=b[3:]
24 |             tmp_shuf=[]
25 |             #print tmp
26 |             tmp2=map(int, tmp)
27 |             if sum(tmp2)>0:
28 | 
29 |                 for e in x:
30 |                     tmp_shuf.append(tmp[e])
31 |             else:
32 |                 tmp_shuf=tmp
33 |             label=" ".join(tmp_shuf)
34 |             fo.write(pos+"\t"+label+"\n")
35 | 
36 | 
--------------------------------------------------------------------------------
/deepgmap/misc/TSS_bedfile.py:
--------------------------------------------------------------------------------
1 | 
2 | gene_list=[]
3 | with open ('/media/koh/HD-PCFU3/mouse/mouse_UCSC_wholegenes.bed', 'r') as fin:
4 |     with open ('/media/koh/HD-PCFU3/mouse/mouse_TSS.bed', 'w') as fout:
5 |         for line in fin:
6 |             if not line=='' and not line=='\n':  # skip blank lines
7 |                 line=line.split()
8 |                 chromosome=line[0]
9 |                 left=int(line[1])
10 |                 right=int(line[2])
11 |                 direction=line[5]
12 |                 if direction=='+':
13 |                     gene=line[0]+':'+line[1]
14 |                 elif direction=='-':
15 |                     gene=line[0]+':'+line[2]
16 | 
17 |                 if not gene in gene_list:
18 |                     gene_list.append(gene)
19 |                     if direction=='+':
20 |                         start=left-1000
21 |                         end=left+1000
22 |                         fout.write(str(chromosome)+'\t'+str(start)+'\t'+str(end)+'\n')
23 |                     if direction=='-':
24 |                         start=right-1000
25 |                         end=right+1000
26 |                         fout.write(str(chromosome)+'\t'+str(start)+'\t'+str(end)+'\n')
27 | 
--------------------------------------------------------------------------------
/deepgmap/misc/reduce_negatives.py:
--------------------------------------------------------------------------------
1 | import random
2 | import os
3 | labeledf="/home/fast/onimaru/data/Chip-seq/narrowPeaks/three_3times.labeled"
4 | 
5 | h, t =os.path.split(labeledf)
6 | 
7 | pos_num=0
8 | neg_num=0
9 | 
10 | with open(labeledf, 'r') as fin:
11 |     for line in fin:
12 |         if line.startswith("#"):
13 |             continue
14 |         line=line.split()
15 |         i=sum(map(int, line[3:]))
16 |         if i >0:
17 |             pos_num+=1
18 |         else:
19 |             neg_num+=1
20 | 
21 | 
22 | #print pos_num, neg_num
23 | 
24 | r=float(pos_num)/(0.75*neg_num)
25 | 
26 | with open(labeledf, 'r') as fin, open(h+"/down_sampled_"+str(round(r,4))+"_"+t, "w") as fl, open(h+"/down_sampled_"+str(round(r,4))+"_"+t+".bed", "w") as fb:
27 |     for line in fin:
28 |         if line.startswith("#"):
29 |             fl.write(line)
30 |             continue
31 |         line_=line.split()
32 |         i=sum(map(int, line_[3:]))
33 |         rand=random.random()
34 | 
35 |         if i > 0:
36 |             fl.write(line)
37 |             fb.write("\t".join(line_[:3])+"\n")
38 |         elif rand<r:
39 |             fl.write(line)
40 |             fb.write("\t".join(line_[:3])+"\n")
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/remove_variant_annotations.py:
--------------------------------------------------------------------------------
12 |         if line.startswith('>') and "dna:chromosome" in line:
13 |             line=line.split()[0]
14 |             a=line.strip('>')
15 |             a='chr'+str(a)
16 |             seq=[]
17 |             chromosome_list[a]=seq
18 |             seq_list.append(a)
19 |             print(a)
20 |             WRITE=True
21 | 
22 | 
23 |         elif line.startswith('>') and "GL" in line:
24 |             WRITE=False
25 | 
26 |         elif WRITE:
27 | 
line1=re.sub(r'\<.*?\>', '', line) 28 | line1=re.sub(r'\<.*?\n', '', line1) 29 | line1=re.sub(r'.*?\>', '', line1) 30 | #line1=line1.strip("\n") 31 | chromosome_list[a].append(line1.strip('\n')) 32 | 33 | with open(file_name+'.ed','w') as fout: 34 | for k in seq_list: 35 | #print k 36 | fout.write(">"+str(k)+"\n") 37 | for i in chromosome_list[k]: 38 | fout.write(str(i)) 39 | fout.write("\n") -------------------------------------------------------------------------------- /deepgmap/misc/optimal_frip_cutoff.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from scipy.interpolate import BSpline 4 | 5 | fname="/home/fast2/onimaru/DeepGMAP-dev/data/misc/mm10_ctcf_auprcs_frips.txt" 6 | 7 | auprc=[] 8 | frip=[] 9 | """ 10 | with open(fname, 'r') as fin: 11 | for line in fin: 12 | line=line.split() 13 | if len(line)>0: 14 | if line[0]=="AUPRC": 15 | auprc=map(float, line[1:]) 16 | elif line[0]=="correctedFRiP": 17 | frip=map(float, line[1:])""" 18 | 19 | with open(fname, 'r') as fin: 20 | for line in fin: 21 | line=line.split() 22 | if not len(line)==0 and not line[0]=="ID": 23 | auprc.append(float(line[1])) 24 | frip.append(float(line[5])) 25 | frip, auprc=zip(*sorted(zip(frip, auprc))) 26 | auprc_av=[] 27 | for i in range(len(auprc)): 28 | auprc_av.append(np.average(auprc[i:])) 29 | 30 | plt.figure(1, figsize=(4,4)) 31 | ax1=plt.subplot() 32 | ax1.plot(frip,auprc_av) 33 | ax1.grid(b=True, which='major', color='black', linestyle='-') 34 | plt.xticks(np.arange(0, max(frip), 0.02)) 35 | 36 | ax1.grid(b=True, which='minor', color='gray', linestyle='--') 37 | plt.minorticks_on() 38 | plt.show() -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/seq_to_binary.py: -------------------------------------------------------------------------------- 1 | #return a single one hot vector of DNA 2 | 3 | def AGCTtoArray(Nuc): 4 | onehot=[] 5 | if Nuc=="A" or Nuc=="a": 6 | onehot=(1, 0, 0, 0) 7 | return onehot 8 | elif Nuc=="G" or Nuc=="g": 9 | onehot=(0, 1, 0, 0) 10 | return onehot 11 | elif Nuc=="C" or Nuc=="c": 12 | onehot=(0, 0, 1, 0) 13 | return onehot 14 | elif Nuc=="T" or Nuc=="t": 15 | onehot=(0, 0, 0, 1) 16 | return onehot 17 | elif Nuc=="N" or Nuc=="n": 18 | onehot=(0, 0, 0, 0) 19 | return onehot 20 | else: 21 | pass 22 | 23 | #a function to convert AGCTN to 4d array 24 | def AGCTtoArray2(Seq): 25 | onehot=[] 26 | for Nuc in Seq: 27 | if Nuc=="A" or Nuc=="a": 28 | onehot.append((1, 0, 0, 0)) 29 | 30 | elif Nuc=="G" or Nuc=="g": 31 | onehot.append((0, 1, 0, 0)) 32 | elif Nuc=="C" or Nuc=="c": 33 | onehot.append((0, 0, 1, 0)) 34 | elif Nuc=="T" or Nuc=="t": 35 | onehot.append((0, 0, 0, 1)) 36 | elif Nuc=="N" or Nuc=="n": 37 | onehot.append((0, 0, 0, 0)) 38 | else: 39 | pass 40 | 41 | return onehot -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/bed_file_500.py: -------------------------------------------------------------------------------- 1 | WINDOW_SIZE=200 2 | genome_file="/home/fast/onimaru/data/genome_fasta/mm10.genome" 3 | #with open(genome_file, 'r') as fin, open('/home/fast/onimaru/data/genome_fasta/hg38_1000_altwindow.bed', 'w') as fout1, open('/home/fast/onimaru/data/genome_fasta/hg38_1000_.bed', 'w') as fout2: 4 | with open(genome_file, 'r') as fin, open('/home/fast/onimaru/data/genome_fasta/mm10_200_single_'+str(WINDOW_SIZE)+'.bed', 'w') as fout1: 5 | 6 | for line 
in fin:
7 |         line=line.split()
8 |         chrom=line[0]
9 |         chrom_size=int(line[1])
10 |         divide_num=chrom_size//WINDOW_SIZE  # floor division: range() needs an int in Python 3
11 |         #divide_num=chrom_size/WINDOW_SIZE-4
12 |         for i in range(divide_num):
13 | 
14 |             #if i>=2:
15 | 
16 |             if i*WINDOW_SIZE+WINDOW_SIZE<=chrom_size:
17 |                 fout1.write(str(chrom)+'\t'+str(i*WINDOW_SIZE)+'\t'+str(i*WINDOW_SIZE+WINDOW_SIZE)+'\n')
18 |             else:
19 |                 break
20 |             #if i*WINDOW_SIZE+WINDOW_SIZE+WINDOW_SIZE/2<=chrom_size:
21 |                 #fout1.write(str(chrom)+'\t'+str(i*WINDOW_SIZE+WINDOW_SIZE/2)+'\t'+str(i*WINDOW_SIZE+WINDOW_SIZE+WINDOW_SIZE/2)+'\n')
22 |             #else:
23 |                 #break
24 | 
25 | 
--------------------------------------------------------------------------------
/deepgmap/network_constructors/auc_calc.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | 
3 | def auc_pr(true, prob, threshold):
4 | 
5 |     pred = tf.where(prob > threshold, tf.ones_like(prob), tf.zeros_like(prob))
6 |     tp = tf.logical_and(tf.cast(pred, tf.bool), tf.cast(true, tf.bool))
7 |     fp = tf.logical_and(tf.cast(pred, tf.bool), tf.logical_not(tf.cast(true, tf.bool)))
8 |     fn = tf.logical_and(tf.logical_not(tf.cast(pred, tf.bool)), tf.cast(true, tf.bool))
9 |     tn = tf.logical_and(tf.logical_not(tf.cast(pred, tf.bool)), tf.logical_not(tf.cast(true, tf.bool)))
10 |     FPR = tf.truediv(tf.reduce_sum(tf.cast(fp, tf.int32)),
11 |                      tf.reduce_sum(tf.cast(tf.logical_or(tn, fp), tf.int32)))
12 |     TPR = tf.truediv(tf.reduce_sum(tf.cast(tp, tf.int32)),
13 |                      tf.reduce_sum(tf.cast(tf.logical_or(tp, fn), tf.int32)))
14 |     PPV = tf.truediv(tf.reduce_sum(tf.cast(tp, tf.int32)),
15 |                      tf.reduce_sum(tf.cast(tf.logical_or(tp, fp), tf.int32)))
16 | 
17 |     return FPR, TPR, PPV
18 | 
19 | 
20 | def auc_pr2(true, prob, threshold):
21 |     # note: unlike auc_pr, the tf.metrics counters below return raw counts at the
22 |     # threshold (precision_at_thresholds is the only rate); TPR needs true positives
23 |     FPR, _ = tf.metrics.false_positives_at_thresholds(true, prob, [threshold])
24 |     TPR, _ = tf.metrics.true_positives_at_thresholds(true, prob, [threshold])
25 |     PPV, _ = tf.metrics.precision_at_thresholds(true, prob, [threshold])
26 | 
27 |     return FPR, TPR, PPV
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/PCA.py:
--------------------------------------------------------------------------------
1 | # Authors: Kyle Kastner
2 | # License: BSD 3 clause
3 | 
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | 
7 | from sklearn.datasets import load_iris
8 | from sklearn.decomposition import PCA, IncrementalPCA
9 | 
10 | iris = load_iris()
11 | X = iris.data
12 | y = iris.target
13 | 
14 | #print X.shape
15 | 
16 | n_components = 2
17 | ipca = IncrementalPCA(n_components=n_components, batch_size=10)
18 | X_ipca = ipca.fit_transform(X)
19 | 
20 | #print X_ipca.shape
21 | 
22 | pca = PCA(n_components=n_components)
23 | X_pca = pca.fit_transform(X)
24 | 
25 | colors = ['navy', 'turquoise', 'darkorange']
26 | 
27 | for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]:
28 |     plt.figure(figsize=(8, 8))
29 |     for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
30 |         plt.scatter(X_transformed[y == i, 0], X_transformed[y == i, 1],
31 |                     color=color, lw=2, label=target_name)
32 | 
33 |     if "Incremental" in title:
34 |         err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean()
35 |         plt.title(title + " of iris dataset\nMean absolute unsigned error "
36 |                   "%.6f" % err)
37 |     else:
38 |         plt.title(title + " of iris dataset")
39 |     plt.legend(loc="best", shadow=False, scatterpoints=1)
40 |     plt.axis([-4, 4, -1.5, 1.5])
41 | 
42 | plt.show()
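Since auc_pr() in auc_calc.py above is built from plain TensorFlow ops (no tf.metrics local variables to initialize), it can be smoke-tested on toy tensors. A minimal TF1-style sketch; the toy values are made up, and the import path simply follows the package layout:

    import tensorflow as tf
    from deepgmap.network_constructors.auc_calc import auc_pr

    true = tf.constant([1., 0., 1., 0.])
    prob = tf.constant([0.9, 0.8, 0.3, 0.1])
    fpr, tpr, ppv = auc_pr(true, prob, 0.5)   # predictions become [1, 1, 0, 0]
    with tf.Session() as sess:
        # one tp, fp, fn and tn each -> [0.5, 0.5, 0.5]
        print(sess.run([fpr, tpr, ppv]))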
-------------------------------------------------------------------------------- /deepgmap/post_train_tools/compare_narrowPeak_scores.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | """ 4 | chr1 0 1000 . 3.64092565 . 0.00364093 -1 -1 -1 5 | chr1 500 1500 . 3.64092565 . 0.00364093 -1 -1 -1 6 | chr1 1000 2000 . 3.64092565 . 0.00364093 -1 -1 -1 7 | chr1 1500 2500 . 3.64092565 . 0.00364093 -1 -1 -1 8 | """ 9 | 10 | 11 | ref_data="/home/fast/onimaru/data/prediction/CTCF/network_constructor_deepsea_1d3_Wed_Oct_11_074555_2017.ckpt-13019.narrowPeak" 12 | ind_data="/home/fast/onimaru/data/prediction/CTCF/HG00119_network_constructor_deepsea_1d3_Wed_Oct_11_074555_2017.ckpt-13019.narrowPeak.hg38.narrowPeak" 13 | 14 | ref_data_dict={} 15 | ind_data_dicts={} 16 | 17 | with open(ref_data,'r') as fin: 18 | for line in fin: 19 | line=line.split() 20 | position=str(line[0])+'\t'+str(line[1])+'\t'+str(line[2]) 21 | score=float(line[4]) 22 | ref_data_dict[position]=score 23 | 24 | with open(ind_data,'r') as fin: 25 | for line in fin: 26 | line_=line.split() 27 | position=str(line_[0])+'\t'+str(line_[1])+'\t'+str(line_[2]) 28 | score=float(line_[4]) 29 | if position in ref_data_dict: 30 | score_of_ref=ref_data_dict[position] 31 | abs_diff=math.fabs(score-score_of_ref) 32 | 33 | ref_data_dict[position]=score -------------------------------------------------------------------------------- /deepgmap/post_train_tools/liftover_indiv_genome_to_hg38.py: -------------------------------------------------------------------------------- 1 | import re 2 | map_file="/home/slow/onimaru/1000genome/hg38_HG00119_1000.outfmt" 3 | 4 | 5 | with open(map_file, 'r') as fin: 6 | has_seen=set() 7 | map_dict={} 8 | for line in fin: 9 | line=line.split() 10 | hg38_coo, indiv_coo=line[1], line[0] 11 | if not hg38_coo in has_seen: 12 | has_seen.add(hg38_coo) 13 | map_dict[indiv_coo]=hg38_coo 14 | 15 | narrowPeak_prediction="/home/fast/onimaru/data/prediction/HG00119/HG00119_network_constructor_deepsea_1d3_Wed_Oct_11_074555_2017.ckpt-13019.narrowPeak" 16 | 17 | with open(narrowPeak_prediction, 'r') as fin, open(narrowPeak_prediction+".hg38.narrowPeak", 'w') as fout: 18 | for line in fin: 19 | a=line.split() 20 | b=str(a[0])+":"+str(a[1])+"-"+str(a[2]) 21 | if b in map_dict: 22 | new_coo=map_dict[b] 23 | new_coo=re.findall(r"[\w']+", new_coo) 24 | fout.write(str(new_coo[0])+"\t" 25 | +str(new_coo[1])+"\t" 26 | +str(new_coo[2])+"\t" 27 | +str(a[3])+"\t" 28 | +str(a[4])+"\t" 29 | +str(a[5])+"\t" 30 | +str(a[6])+"\t" 31 | +str(a[7])+"\t" 32 | +str(a[8])+"\t" 33 | +str(a[9])+"\n") 34 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/merge_bigwig.py: -------------------------------------------------------------------------------- 1 | import pyBigWig as pbw 2 | import math 3 | import glob as gl 4 | def merge_biwig(bigwig_file_list, out_file): 5 | bigwig_list=[] 6 | _chromosome_list=[] 7 | _value_list=[] 8 | _header_list=[] 9 | start_list=[] 10 | end_list=[] 11 | for f in bigwig_file_list: 12 | _tmp_wig=pbw.open(f) 13 | chroms=_tmp_wig.chroms() 14 | print(chroms) 15 | for chrom_name, chrom_length in chroms.items(): 16 | _header_list.append((chrom_name, chrom_length)) 17 | j=0 18 | for s,e, v in _tmp_wig.intervals(chrom_name, 0, chrom_length): 19 | j+=1 20 | start_list.append(s) 21 | end_list.append(e) 22 | _value_list.append(v) 23 | 24 | _chromosome_list.append([chrom_name]*j) 25 | print(j) 26 | _tmp_wig.close() 27 | 
out_bigwig=pbw.open(out_file, "w")  # pyBigWig needs mode "w" to create a file for writing
28 |     out_bigwig.addHeader(_header_list)
29 |     # flatten the per-chromosome name lists so all argument lists have equal length
30 |     out_bigwig.addEntries([c for chroms in _chromosome_list for c in chroms], start_list, ends=end_list, values=_value_list)
31 |     out_bigwig.close()
32 | 
33 | def main():
34 |     file_list=gl.glob("/home/onimaru/fast2/1000genome/tmp/GRCh38_edited_ctcf_test_class_mm10_CTCF_intestine_0days_ENCFF464ZPC_rep2_chr*.bw")
35 |     ofile="/home/onimaru/fast2/1000genome/tmp/tmp.bw"
36 |     merge_biwig(file_list, ofile)
37 | if __name__ == "__main__":
38 |     main()
--------------------------------------------------------------------------------
/deepgmap/misc/compare_deepsea_data.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | 
4 | deepsea="/home/fast/onimaru/encode/deepsea/deepsea_pred.txt"
5 | 
6 | deepshark="/home/fast/onimaru/encode/deepsea/deepshark_Tue_Apr_17_183529_2018.ckpt-57883_prediction.log"
7 | 
8 | deepsea_dict={}
9 | 
10 | with open(deepsea, 'r') as fin:
11 |     for line in fin:
12 |         if not line.startswith("Cell Type"):
13 |             #print line
14 |             line=line.split()
15 |             if len(line)==0:
16 |                 continue
17 |             print(line)
18 |             if line[4]=="NA":
19 |                 continue
20 |             sname=line[3].split('.')[0]
21 |             AUPRC=float(line[5])
22 |             deepsea_dict[sname]=AUPRC
23 | 
24 | sample_list=[]
25 | deepsea_list=[]
26 | deepshark_list=[]
27 | with open(deepshark, 'r') as fin:
28 |     go=False
29 |     for line in fin:
30 |         if line.startswith("sample"):
31 |             go=True
32 |             continue
33 |         elif go:
34 |             line=line.split()
35 |             sname=line[0].split("_")[0]
36 |             if "Dnase" in sname and sname in deepsea_dict:
37 |                 sample_list.append(sname)
38 |                 deepsea_list.append(deepsea_dict[sname])
39 |                 deepshark_list.append(float(line[2]))
40 |                 print(sname, deepsea_dict[sname], float(line[2]))
41 | 
42 | deepsea_list=np.array(deepsea_list)
43 | deepshark_list=np.array(deepshark_list)
44 | 
45 | log_fold=np.log2(deepshark_list/deepsea_list)
46 | log_fold_neg=log_fold[log_fold<0.00]
47 | print("total num: "+str(len(log_fold))+"\nless performed num:"+str(len(log_fold_neg))+" ("+str(len(log_fold_neg)/float(len(log_fold))*100.0)+"%)")
--------------------------------------------------------------------------------
/deepgmap/data_preprocessing_tools/queue.pyx:
--------------------------------------------------------------------------------
1 | cimport cqueue
2 | 
3 | cdef class Queue:
4 |     """A queue class for C integer values.
5 | 
6 |     >>> q = Queue()
7 |     >>> q.append(5)
8 |     >>> q.peek()
9 |     5
10 |     >>> q.pop()
11 |     5
12 |     """
13 |     cdef cqueue.Queue* _c_queue
14 |     def __cinit__(self):
15 |         self._c_queue = cqueue.queue_new()
16 |         if self._c_queue is NULL:
17 |             raise MemoryError()
18 | 
19 |     def __dealloc__(self):
20 |         if self._c_queue is not NULL:
21 |             cqueue.queue_free(self._c_queue)
22 | 
23 |     cpdef append(self, int value):
24 |         if not cqueue.queue_push_tail(self._c_queue,
25 |                                       <void*>value):
26 |             raise MemoryError()
27 | 
28 |     cdef extend(self, int* values, size_t count):
29 |         cdef size_t i
30 |         for i in xrange(count):
31 |             if not cqueue.queue_push_tail(
32 |                 self._c_queue, <void*>values[i]):
33 |                 raise MemoryError()
34 | 
35 |     cpdef int peek(self) except? -1:
36 |         cdef int value = \
37 |             <Py_ssize_t>cqueue.queue_peek_head(self._c_queue)
38 |         if value == 0:
39 |             # this may mean that the queue is empty,
40 |             # or that it happens to contain a 0 value
41 |             if cqueue.queue_is_empty(self._c_queue):
42 |                 raise IndexError("Queue is empty")
43 |         return value
44 | 
45 |     cpdef int pop(self) except? -1:
46 |         if cqueue.queue_is_empty(self._c_queue):
47 |             raise IndexError("Queue is empty")
48 |         return <Py_ssize_t>cqueue.queue_pop_head(self._c_queue)
49 | 
50 |     def __bool__(self):
51 |         return not cqueue.queue_is_empty(self._c_queue)
--------------------------------------------------------------------------------
/deepgmap/misc/fix_fasta.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | import os
4 | infile="/home/fast2/onimaru/1000genome/all_HG00119_H1.fa"
5 | outfile="/home/fast2/onimaru/1000genome/all_HG00119_H1_edited.fa"
6 | 
7 | 
8 | with open(infile, "r") as fin, open(os.path.split(infile)[0]+"/_tmp.fa", "w") as fo:
9 |     i=0
10 |     for line in fin:
11 |         if line.startswith(">") and len(line.split())>1:
12 |             line=line.split()[0]
13 |             if i ==0:
14 |                 line=">chr"+line.strip(">")+"\n"
15 |                 i+=1
16 |             else:
17 |                 line="\n"+">chr"+line.strip(">")+"\n"
18 |             #print line
19 |             fo.write(line)
20 |         else:
21 |             fo.write(line.strip("\n"))
22 | 
23 | #import numpy as np
24 | dna=set(["A","G","C","T","N", "\n"])
25 | with open(os.path.split(infile)[0]+"/_tmp.fa", "r") as fin, open(outfile, "w") as fo:
26 | 
27 | 
28 |     for line in fin:
29 |         if line.startswith(">"):
30 |             fo.write(line)
31 |             #print line
32 |         else:
33 |             i=0
34 |             lline=(line)
35 |             line=iter(line)
36 |             seq=[]
37 |             #i+=len(line)
38 |             while True:
39 |                 try:
40 |                     l=next(line)  # Python 3: iterators have no .next() method
41 |                 except StopIteration:
42 |                     break
43 |                 #print l
44 |                 if l=="<":
45 |                     l2=next(line)
46 |                     while l2!=">":
47 |                         l2=next(line)
48 | 
49 |                     seq.append("N")
50 |                     i+=1
51 |                 else:
52 |                     seq.append(l)
53 |                     i+=1
54 |                     if i%200==0:
55 |                         seq.append("\n")
56 |             fo.write("".join(seq))
57 |             #if not any(l==dna):
58 | 
59 | 
60 | 
--------------------------------------------------------------------------------
/deepgmap/misc/randomdna2.py:
--------------------------------------------------------------------------------
1 | import random
2 | 
3 | 
4 | 
5 | input_file='/home/fast/onimaru/data/various_dnase_data/mm10_1000_dnase_region_75co.fa'
6 | temp_file='/home/fast/onimaru/data/random_seq/random_shuffle2_for_mm10_1000_dnase_region_75co.fa'
7 | output_file='/media/koh/HD-PCFU3/mouse/random_seq/random_shuffle2_for_multidnase_no_chr1_2_noN.fa'
8 | with open(input_file, 'r') as f1, open(temp_file, 'w') as f2:
9 |     for line in f1:
10 |         if '>' in line:
11 |             f2.write(str(line))
12 |         elif not line=='' and not line=='\n':  # skip blank lines
13 |             randomized=''
14 |             select_data=random.randint(1,100)
15 |             shuffling_module=2
16 |             #if select_data<=50:
17 |                 #shuffling_module=3
18 |             line=line.strip('\n')
19 |             index_new=(len(line))//shuffling_module  # floor division: range() and sample() need an int
20 |             a = range(index_new)
21 |             index_random = random.sample(a, len(a))
22 |             index_random_iter=iter(index_random)
23 |             for i in range(index_new):
24 |                 for k in range(shuffling_module):
25 |                     randomized+=line[index_random[i]*shuffling_module+k]
26 |             f2.write(str(randomized)+'\n')
27 | for i in range(2):
28 |     with open(input_file, 'r') as f1, open(temp_file, 'a') as f2:
29 |         for line in f1:
30 |             if '>' in line:
31 |                 f2.write(str(line))
32 |             elif not line=='' and not line=='\n':  # skip blank lines
33 |                 randomized=''
34 |                 select_data=random.randint(1,100)
35 |                 shuffling_module=2
36 |                 #if select_data<=50:
37 |                     #shuffling_module=3
38 |                 line=line.strip('\n')
39 |                 index_new=(len(line))//shuffling_module
40 |                 a = range(index_new)
41 |                 index_random = random.sample(a, len(a))
42 |                 index_random_iter=iter(index_random)
43 |                 for i in range(index_new):
44 |                     for k in range(shuffling_module):
45 |                         randomized+=line[index_random[i]*shuffling_module+k]
46 |                 f2.write(str(randomized)+'\n')
47 | 
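randomdna2.py above shuffles each sequence in blocks of shuffling_module characters (dinucleotides here), which preserves local base composition better than a per-base shuffle. The core idea as a standalone re-implementation for illustration (not the script itself):

    import random

    def block_shuffle(seq, block=2):
        # cut into non-overlapping blocks, shuffle the block order, rejoin
        blocks = [seq[i:i + block] for i in range(0, len(seq) // block * block, block)]
        random.shuffle(blocks)
        return "".join(blocks)

    print(block_shuffle("AAGGCCTT"))  # e.g. CCAATTGG: dinucleotides stay intact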
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/motif_compare2.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys
3 | #from curses.ascii import isdigit
4 | from scipy.spatial.distance import cdist
5 | import deepgmap.post_train_tools.cython_util as cutil
6 | mc=cutil.motif_compare
7 | from matplotlib import pyplot as plt
8 | import os
9 | def _is_number(s):
10 |     try:
11 |         complex(s) # for int, long, float and complex
12 |     except ValueError:
13 |         return False
14 | 
15 |     return True
16 | 
17 | def motif_reader(motif_data_dir):
18 |     h,t=os.path.split(motif_data_dir)
19 |     foutname=h+"/"+os.path.splitext(t)[0]+"tmp.meme"
20 |     with open(foutname, "w") as fo, open(motif_data_dir, 'r') as fin:
21 | 
22 |         fo.write("MEME version 4\n\nALPHABET= ACGT\n\nstrands: + -\n\nBackground letter frequencies (from uniform background):\nA 0.2500 C 0.2500 G 0.2500 T 0.2500\n\n")
23 |         lines=fin.readlines()
24 |         for i, line in enumerate(lines):
25 |             if line.startswith("letter-probability"):
26 |                 start_line=i+1
27 |                 break
28 |         for i in range(100-2):
29 |             fo.write("MOTIF tmp_"+str(i*10)+"-"+str(i*10+30)+"\n\nletter-probability matrix: alength= 4 w= 30 nsites= 30 E= 0\n")
30 |             for l in lines[start_line+i*10:start_line+i*10+30]:
31 |                 fo.write(l)
32 |             fo.write("\n\n")
33 | 
34 |     return foutname
35 | 
36 | 
37 | def main():
38 |     motif_data_dir="/home/fast/onimaru/data/meme/merged.meme"
39 |     #long_motif_dir="/home/fast/onimaru/deepgmap/data/reconstructions/conv4frss_Fri_May_11_075425_2018.ckpt-16747Tue_May_15_112518_2018_all_.pdf.meme"
40 |     long_motif_dir="/home/fast2/onimaru/DeepGMAP-dev/data/activation_max/conv4frss_Fri_Sep_28_160038_2018.ckpt-28907Thu_Dec_20_131413_2018_ese14_re.pdf.meme"
41 |     #fout=os.path.splitext(long_motif_dir)[0]+".matches"
42 |     #fout="/home/fast/onimaru/data/output/network_constructor_deepsea_1d3_Fri_Oct_13_133809_2017.ckpt-15899Mon_Oct_16_105338_2017.npz.matches"
43 | 
44 |     fname=motif_reader(long_motif_dir)
45 |     #print fname
46 | if __name__== '__main__':
47 |     main()
48 | 
--------------------------------------------------------------------------------
/deepgmap/misc/deepsea_anal.py:
--------------------------------------------------------------------------------
1 | def is_number(s):
2 |     try:
3 |         float(s)
4 |         return True
5 |     except ValueError:
6 |         return False
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 | f="/home/fast/onimaru/deepgmap/data/misc/deepsea_S3.txt"
10 | 
11 | datadict={}
12 | 
13 | with open(f, 'r') as fin:
14 |     for line in fin:
15 | 
16 |         line=line.split("\t")
17 |         if len(line)==7:
18 |             #print line[1], line[5]
19 |             if is_number(line[5]):
20 |                 if line[1] in datadict:  # dict.has_key() no longer exists in Python 3
21 |                     datadict[line[1]].append(float(line[5]))
22 |                 else:
23 |                     datadict[line[1]]=[]
24 |                     datadict[line[1]].append(float(line[5]))
25 | 
26 | data_list=[]
27 | label_list=[]
28 | 
29 | for k, v in datadict.items():
30 |     if len(v)>3:
31 |         label_list.append(k)
32 |         data_list.append(v)
33 | 
34 | median_list=[]
35 | 
36 | for i in data_list:
37 |     median_list.append(np.median(i))
38 | 
39 | index_=range(len(label_list))
40 | 
41 | median_list, index_=zip(*sorted(zip(median_list, index_), reverse=True))
42 | 
43 | label_list[:] = [label_list[i] for i in index_]
44 | data_list[:] = [data_list[i] for i in index_]
45 | 
46 | 
47 | fig, ax = plt.subplots()
48 | font = {'family' : 'Sans',
49 |         'weight' : 'normal',
50 |         'size' : 6}
51 | plt.rc('font', **font)
52 | 
bp_dict=ax.boxplot(data_list, labels=label_list, bootstrap=1000, sym='.') 53 | #ticks=np.linspace(0, 11, 22, endpoint=False) 54 | #ax.set_yticks(ticks) 55 | plt.xticks(rotation='vertical') 56 | ax.grid(True) 57 | """k=0 58 | for i in [data1, data2]: 59 | y=i 60 | x = np.random.normal(k+1, 0.04, len(y)) 61 | plt.plot(x, y,marker="o",linestyle="None") 62 | k+=1""" 63 | """ 64 | for line in bp_dict['medians']: 65 | # get position data for median line 66 | print line.get_xydata() 67 | x, y = line.get_xydata()[1] # top of median line 68 | # overlay median value 69 | plt.text(x+0.15, y-0.1, round(y,2), 70 | horizontalalignment='center') # draw above, centered""" 71 | """ 72 | import scipy.stats as stats 73 | 74 | test=stats.ttest_ind(data1,data2) 75 | test2=stats.ttest_ind(data1,data3) 76 | print test, test2 77 | """ 78 | plt.show() -------------------------------------------------------------------------------- /deepgmap/misc/box_plot.py: -------------------------------------------------------------------------------- 1 | from deepgmap.misc.small_tools import is_number 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | f="/home/fast2/onimaru/DeepGMAP-dev/data/misc/AUPRC_ctcf_boxplot_14jun2018.txt" 6 | #f="/home/fast2/onimaru/DeepGMAP-dev/data/misc/AUPRC_dnase_boxplot_31may2018.txt" 7 | data_list=[] 8 | sample_list=[] 9 | 10 | with open(f, 'r') as fin: 11 | for line in fin: 12 | data_tmp=[] 13 | line=line.split() 14 | if len(line)==0: 15 | break 16 | sample_list.append(line[0]) 17 | 18 | for l in line[1:]: 19 | if is_number(l) and not l=="nan": 20 | data_tmp.append(float(l)) 21 | data_list.append(data_tmp) 22 | 23 | #print sample_list 24 | #print data_list[-1] 25 | fig, ax = plt.subplots() 26 | font = {'family' : 'Sans', 27 | 'weight' : 'normal', 28 | 'size' : 12} 29 | plt.rc('font', **font) 30 | ax.boxplot(data_list, labels=sample_list, bootstrap=1000, sym='.') 31 | plt.xticks(rotation='vertical') 32 | ax.grid(True) 33 | """ 34 | ig, ax = plt.subplots() 35 | font = {'family' : 'Sans', 36 | 'weight' : 'normal', 37 | 'size' : 12} 38 | plt.rc('font', **font) 39 | ax.boxplot(data_list[3:6], labels=sample_list[3:6], bootstrap=1000, sym='.') 40 | """ 41 | #ticks=np.linspace(0, 11, 22, endpoint=False) 42 | #ax.set_yticks(ticks) 43 | #ax.set_x 44 | #locs, labels=plt.xticks() 45 | #plt.xticks(np.arange(len(sample_list)), sample_list) 46 | plt.xticks(rotation='vertical') 47 | ax.grid(True) 48 | """k=0 49 | for i in [data1, data2]: 50 | y=i 51 | x = np.random.normal(k+1, 0.04, len(y)) 52 | plt.plot(x, y,marker="o",linestyle="None") 53 | k+=1""" 54 | """ 55 | for line in bp_dict['medians']: 56 | # get position data for median line 57 | print line.get_xydata() 58 | x, y = line.get_xydata()[1] # top of median line 59 | # overlay median value 60 | plt.text(x+0.15, y-0.1, round(y,2), 61 | horizontalalignment='center') # draw above, centered""" 62 | 63 | import scipy.stats as stats 64 | sub_data_list1=data_list 65 | i=0 66 | pair_set=set() 67 | test_dict={} 68 | for i in range(len(sub_data_list1)): 69 | for j in range(len(sub_data_list1)): 70 | 71 | if not i==j and not str(i)+"-"+str(j) in pair_set: 72 | #test=stats.ttest_ind(sub_data_list1[i],sub_data_list1[j]) 73 | test=stats.mannwhitneyu(sub_data_list1[i],sub_data_list1[j],alternative="two-sided") 74 | test_dict[str(i)+"-"+str(j)]=test 75 | pair_set.add(str(i)+"-"+str(j)) 76 | pair_set.add(str(j)+"-"+str(i)) 77 | #print test_dict 78 | 79 | 80 | plt.show() -------------------------------------------------------------------------------- 
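box_plot.py above compares every pair of AUPRC distributions with a two-sided Mann-Whitney U test. The call pattern in isolation, on made-up toy data:

    import scipy.stats as stats

    a = [0.61, 0.58, 0.64, 0.60]
    b = [0.52, 0.55, 0.49, 0.51]
    stat, p = stats.mannwhitneyu(a, b, alternative="two-sided")
    print(stat, p)  # U statistic and two-sided p-value
--------------------------------------------------------------------------------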
/deepgmap/misc/randomdna.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | def DNA(length, percentA, percentG, percentC, percentT, percentN): 5 | a = int(percentA*100) 6 | g = int(percentG*100) 7 | c = int(percentC*100) 8 | t=int(percentT*100) 9 | n=100-(a+g+c+t) 10 | dnachoice='' 11 | i=0 12 | for i in range(a): 13 | dnachoice+='A' 14 | for i in range(g): 15 | dnachoice+='G' 16 | i=0 17 | for i in range(c): 18 | dnachoice+='C' 19 | i=0 20 | for i in range(t): 21 | dnachoice+='T' 22 | i=0 23 | for i in range(n): 24 | dnachoice+='N' 25 | 26 | 27 | return ''.join(random.choice(str(dnachoice)) for _ in range(length)) 28 | 29 | def statistics(file): 30 | lengthdist=[] 31 | for line in file: 32 | line=line.split() 33 | lengthdist.append(int(line[2])-int(line[1])) 34 | 35 | return lengthdist 36 | 37 | def AGCTcontent(file2): 38 | #input_file = open('NC_005213.ffn', 'r') 39 | #output_file = open('nucleotide_counts.tsv','w') 40 | #output_file.write('Gene\tA\tC\tG\tT\tLength\tCG%\n') 41 | A_count, C_count, G_count, T_count,N_count, length=0,0,0,0,0,0 42 | from Bio import SeqIO 43 | 44 | for cur_record in SeqIO.parse(file2, "fasta") : 45 | #count nucleotides in this record... 46 | gene_name = cur_record.name 47 | A_count += (cur_record.seq.count('A') +cur_record.seq.count('a')) 48 | C_count += (cur_record.seq.count('C') +cur_record.seq.count('c')) 49 | G_count += (cur_record.seq.count('G') +cur_record.seq.count('g')) 50 | T_count += (cur_record.seq.count('T') +cur_record.seq.count('t')) 51 | N_count += (cur_record.seq.count('N') +cur_record.seq.count('n')) 52 | length += len(cur_record.seq) 53 | A_percent=float(A_count)/float(length) 54 | G_percent=float(G_count)/float(length) 55 | C_percent=float(C_count)/float(length) 56 | T_percent=float(T_count)/float(length) 57 | N_percent=float(N_count)/float(length) 58 | #print A_percent, G_percent, C_percent, T_percent, N_percent 59 | return A_percent, G_percent, C_percent, T_percent, N_percent 60 | 61 | with open('/media/koh/HD-PCFU3/mouse/various_dnase_data/all_peak_75cutoff_sorted_merge.bed', 'r') as f1, open('/media/koh/HD-PCFU3/mouse/various_dnase_data/all_peak_75cutoff_sorted_merge.fa', 'r') as f2: 62 | seq_distribution=statistics(f1) 63 | percentA, percentG, percentC, percentT, percentN=AGCTcontent(f2) 64 | output=open('/media/koh/HD-PCFU3/mouse/random_seq/random_for_multidnase.fa', 'w') 65 | i=0 66 | for i in range(len(seq_distribution)): 67 | output.write('>random'+str(i)+'\n'+DNA(seq_distribution[i], percentA, percentG, percentC, percentT, percentN)+'\n') 68 | output.close() 69 | 70 | -------------------------------------------------------------------------------- /deepgmap/misc/bed_file_compare.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import subprocess as sp 4 | import networkx as nx 5 | import os 6 | from itertools import combinations 7 | import glob 8 | 9 | file_list=glob.glob(sys.argv[1]) 10 | 11 | file_combination=[] 12 | node_list={} 13 | peak_counts={} 14 | path_sep=os.path.sep 15 | G=nx.MultiDiGraph() 16 | G.peak_count={} 17 | peak_count_dict={} 18 | for i in file_list: 19 | with open(i, 'r') as j: 20 | peak_count=len(j.readlines()) 21 | 22 | file_name=i.split(path_sep) 23 | file_name=file_name[-1].split('.') 24 | node1=file_name[0] 25 | G.add_node(node1) 26 | G.peak_count[node1]=peak_count 27 | peak_count_dict[node1]=str(node1)+'\n('+ str(peak_count)+')' 28 | node_list[i]=node1 29 | 30 | for 
i in combinations(file_list, 2): 31 | file_combination.append(i) 32 | 33 | edgelabels={} 34 | 35 | fout=open('/media/koh/HD-PCFU3/mouse/various_dnase_data/bedfiles/testfiles/test.log', 'w') 36 | fout.write("#combination, intersection, overlapping, distance\n") 37 | 38 | 39 | for i in file_combination: 40 | 41 | intersect1_=sp.check_output(["bedtools", "intersect","-u", "-a", str(i[0]), "-b", str(i[1])]) 42 | intersect1=len(intersect1_.split('\n')) 43 | intersect2_=sp.check_output(["bedtools", "intersect","-u", "-b", str(i[0]), "-a", str(i[1])]) 44 | intersect2=len(intersect2_.split('\n')) 45 | #distance=sp.check_output(["bedtools", "jaccard", "-a", str(i[0]), "-b", str(i[1])]) 46 | #distance=distance.split('\n') 47 | #distance=distance[1].split() 48 | #distance=distance[2] 49 | overlap1=G.peak_count[node_list[i[0]]]-intersect1 50 | overlap2=G.peak_count[node_list[i[1]]]-intersect2 51 | 52 | proportion1=overlap1/float(G.peak_count[node_list[i[0]]]) 53 | proportion2=overlap2/float(G.peak_count[node_list[i[1]]]) 54 | 55 | fout.write(str(node_list[i[0]])+'/'+str(node_list[i[1]])+', '+str(intersect1)+'/'+str(intersect2)+', '+str(overlap1)+'/'+str(overlap2)+', '+str(proportion1)+'/'+str(proportion2)+'\n') 56 | G.add_edge(node_list[i[0]], node_list[i[1]], edge_width=float(proportion1)) 57 | G.add_edge(node_list[i[1]], node_list[i[0]], edge_width=float(proportion2)) 58 | edgelabels[node_list[i[0]], node_list[i[1]]]=str(overlap1)+'/'+str(overlap2) 59 | fout.close() 60 | import matplotlib.pyplot as plt 61 | edgewidth=[] 62 | for (u,v,d) in G.edges(data=True): 63 | edgewidth.append(d['edge_width']) 64 | 65 | plt.figure(figsize=(8,8)) 66 | # with nodes colored by degree sized by population 67 | pos=nx.spectral_layout(G) 68 | nx.draw_networkx_edges(G,pos,alpha=0.3,width=20, edge_color=edgewidth) 69 | nodesize=[G.peak_count[v]/100 for v in G] 70 | nx.draw_networkx_nodes(G,pos,node_size=nodesize,node_color='w',alpha=1.0,label=nodesize) 71 | nx.draw_networkx_labels(G,pos,labels=peak_count_dict, fontsize=14) 72 | nx.draw_networkx_edge_labels(G,pos,edge_labels=edgelabels, fontsize=12, alpha=0.1, bbox=dict(facecolor='none', edgecolor='none')) 73 | plt.savefig("chess_masters.png",dpi=75) 74 | plt.show() 75 | -------------------------------------------------------------------------------- /deepgmap/misc/motif_logo_creator.py: -------------------------------------------------------------------------------- 1 | from cairocffi import cairo 2 | import gzip 3 | import pickle 4 | import numpy as np 5 | 6 | def select_color(cr, DNA): 7 | if DNA=="A": 8 | cr.set_source_rgb(1, 0, 0) 9 | elif DNA=="G": 10 | cr.set_source_rgb(0, 0, 0) 11 | elif DNA=="C": 12 | cr.set_source_rgb(0, 0, 1) 13 | elif DNA=="T": 14 | cr.set_source_rgb(0, 1, 0) 15 | 16 | def main(): 17 | with gzip.open('/media/koh/HD-PCFU3/mouse/variables_999_Sun_Oct_30_120751_2016.cpickle.gz', 'r') as f: 18 | variables=pickle.load(f) 19 | filter1=variables[0] 20 | 21 | 22 | 23 | i=0 24 | j=0 25 | k=0 26 | l=0 27 | 28 | 29 | filter_shape=filter1.shape 30 | width=filter_shape[0]*30+100 31 | hight=512 32 | y_center=hight/2 33 | for i in range(filter_shape[3]): 34 | ims = cairo.ImageSurface(cairo.FORMAT_ARGB32, width, hight) 35 | 36 | 37 | cr = cairo.Context(ims) 38 | cr.move_to(50, y_center) 39 | cr.line_to(filter_shape[0]*30+50, y_center) 40 | cr.move_to(50, 100) 41 | cr.line_to(50, 412) 42 | cr.set_line_width(2) 43 | cr.stroke() 44 | for k in range(filter_shape[0]): 45 | 46 | AGCT={} 47 | values=[] 48 | A=["A", filter1[k][0][0][i]*1000.0] 49 | 
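                # Each kernel column is split into per-base weights below; the
                # row order (A, G, C, T) matches the color map in select_color(),
                # and the *1000.0 scaling turns raw filter weights into drawable
                # letter heights.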
G=["G",filter1[k][1][0][i]*1000.0] 50 | C=["C",filter1[k][2][0][i]*1000.0] 51 | T=["T", filter1[k][3][0][i]*1000.0] 52 | values=[A,G,C,T] 53 | pos=filter(lambda x:x[1]>=0,values) 54 | neg=filter(lambda x:x[1]<0,values) 55 | pos.sort(key=lambda x:x[1]) 56 | neg.sort(key=lambda x:x[1], reverse=True) 57 | Nucpos=0 58 | Nucneg=0 59 | for l in range(len(pos)): 60 | Nuc=pos[l][0] 61 | 62 | Nucsize=abs(pos[l][1]) 63 | 64 | cr.select_font_face("Sans", cairo.FONT_SLANT_NORMAL,cairo.FONT_WEIGHT_NORMAL) 65 | select_color(cr, Nuc) 66 | font_mat=cairo.Matrix(xx=40.0,yx=0.0,xy=0.0,yy=Nucsize,x0=0.0,y0=0.0) 67 | cr.set_font_matrix(font_mat) 68 | cr.move_to(50+k*40*0.75, y_center-Nucpos*0.75) 69 | cr.show_text(str(Nuc)) 70 | Nucpos+=abs(pos[l][1]) 71 | l=0 72 | for l in range(len(neg)): 73 | Nuc=neg[l][0] 74 | Nucsize=abs(neg[l][1]) 75 | 76 | cr.select_font_face("Sans", cairo.FONT_SLANT_NORMAL,cairo.FONT_WEIGHT_NORMAL) 77 | select_color(cr, Nuc) 78 | font_mat=cairo.Matrix(xx=40.0,yx=0.0,xy=0.0,yy=-Nucsize,x0=0.0,y0=0.0) 79 | cr.set_font_matrix(font_mat) 80 | cr.move_to(50+k*40*0.75, y_center+(Nucneg)*0.75) 81 | cr.show_text(str(Nuc)) 82 | Nucneg+=abs(neg[l][1]) 83 | 84 | 85 | 86 | 87 | #cr.set_font_size(40) 88 | 89 | ims.write_to_png("motif_"+str(i)+".png") 90 | 91 | 92 | if __name__ == "__main__": 93 | main() -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #from distutils.core import setup 2 | from setuptools import setup, find_packages 3 | from distutils.extension import Extension 4 | import re 5 | import os 6 | import codecs 7 | here = os.path.abspath(os.path.dirname(__file__)) 8 | 9 | 10 | def read(*parts): 11 | # intentionally *not* adding an encoding option to open, See: 12 | # https://github.com/pypa/virtualenv/issues/201#issuecomment-3145690 13 | with codecs.open(os.path.join(here, *parts), 'r') as fp: 14 | return fp.read() 15 | 16 | 17 | def find_version(*file_paths): 18 | version_file = read(*file_paths) 19 | version_match = re.search( 20 | r"^__version__ = ['\"]([^'\"]*)['\"]", 21 | version_file, 22 | re.M, 23 | ) 24 | if version_match: 25 | return version_match.group(1) 26 | 27 | raise RuntimeError("Unable to find version string.") 28 | 29 | try: 30 | from Cython.Distutils import build_ext 31 | except ImportError: 32 | use_cython = False 33 | else: 34 | use_cython = True 35 | 36 | cmdclass = { } 37 | ext_modules = [ ] 38 | 39 | if use_cython: 40 | ext_modules += [ 41 | Extension("deepgmap.data_preprocessing_tools.seq_to_binary2", [ "deepgmap/data_preprocessing_tools/seq_to_binary2.pyx" ]), 42 | #Extension("data_preprocessing_tools.queue", [ "deepgmap/data_preprocessing_tools/queue.pyx" ],libraries=["calg"]), 43 | 44 | Extension("deepgmap.post_train_tools.cython_util", [ "deepgmap/post_train_tools/cython_util.pyx" ]), 45 | ] 46 | cmdclass.update({ 'build_ext': build_ext }) 47 | else: 48 | ext_modules += [ 49 | Extension("deepgmap.data_preprocessing_tools.seq_to_binary2", [ "deepgmap/data_preprocessing_tools/seq_to_binary2.c" ]), 50 | Extension("deepgmap.post_train_tools.cython_util", [ "deepgmap/post_train_tools/cython_util.c" ]), 51 | ] 52 | #print(find_version("deepgmap", "__init__.py")) 53 | setup( 54 | name='DeepGMAP', 55 | #version=VERSION, 56 | version=find_version("deepgmap", "__init__.py"), 57 | description='Learning and predicting gene regulatory sequences in genomes', 58 | author='Koh Onimaru', 59 | author_email='koh.onimaru@gmail.com', 60 | url='', 61 | 
packages=['deepgmap','deepgmap.train','deepgmap.network_constructors','deepgmap.post_train_tools','deepgmap.data_preprocessing_tools','deepgmap.misc'], 62 | #packages=find_packages('deepgmap'), 63 | #packages=['deepgmap.'], 64 | package_dir={'DeepGMAP':'deepgmap'}, 65 | #package_data = { 66 | # '': ['enhancer_prediction/*', '*.pyx', '*.pxd', '*.c', '*.h'], 67 | #}, 68 | scripts=['bin/deepgmap', 69 | ], 70 | #packages=find_packages(), 71 | cmdclass = cmdclass, 72 | ext_modules=ext_modules, 73 | classifiers=[ 74 | 'Development Status :: 3 - Alpha', 75 | 'Environment :: Console', 76 | 'Intended Audience :: Developers', 77 | 'Programming Language :: Python :: 3.6', 78 | 'License :: OSI Approved :: Apache Software License', 79 | 'Operating System :: POSIX :: Linux', 80 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 81 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 82 | 83 | 84 | ], 85 | install_requires=['tensorflow>=1.15', 'numpy', 'matplotlib', 'scikit-learn', 'tornado', 'natsort', 'psutil', 'pyBigWig'], # 'scikit-learn' is the PyPI distribution name; the old 'sklearn' stub is deprecated 86 | long_description=open('README.rst').read(), 87 | ) 88 | 89 | -------------------------------------------------------------------------------- /deepgmap/network_constructors/template_model.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import tensorflow as tf 3 | 4 | import importlib as il 5 | _ac=il.import_module("deepgmap.network_constructors.auc_calc") 6 | ac=_ac.auc_pr 7 | #the code design came from https://gist.github.com/danijar/8663d3bbfd586bffecf6a0094cd116f2 8 | 9 | def doublewrap(function): 10 | @functools.wraps(function) 11 | def decorator(*args, **kwargs): 12 | if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): 13 | return function(args[0]) 14 | else: 15 | return lambda wrapee: function(wrapee, *args, **kwargs) 16 | return decorator 17 | 18 | 19 | @doublewrap 20 | def define_scope(function, scope=None, *args, **kwargs): 21 | """ 22 | A decorator for functions that define TensorFlow operations. The wrapped 23 | function will only be executed once. Subsequent calls to it will directly 24 | return the result so that operations are added to the graph only once. 25 | The operations added by the function live within a tf.variable_scope(). If 26 | this decorator is used with arguments, they will be forwarded to the 27 | variable scope. The scope name defaults to the name of the wrapped 28 | function. 
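    Note that the wrapped function is invoked as function(self) on first
    access, so it must not take extra arguments; any inputs have to be stored
    on self beforehand (as template_model.__init__ below does). A minimal
    usage sketch:

        class Model(object):
            def __init__(self, data):
                self.data = data
                self.mean  # touching the property builds the op once

            @define_scope
            def mean(self):
                return tf.reduce_mean(self.data)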
29 | """ 30 | attribute = '_cache_' + function.__name__ 31 | name = scope or function.__name__ 32 | @property 33 | @functools.wraps(function) 34 | def decorator(self): 35 | if not hasattr(self, attribute): 36 | with tf.variable_scope(name, *args, **kwargs): 37 | setattr(self, attribute, function(self)) 38 | return getattr(self, attribute) 39 | return decorator 40 | 41 | 42 | class template_model(object): 43 | 44 | 45 | def __init__(self, label, prediction, max_to_keep, train_speed, GPUID): 46 | #self.label=label 47 | #self.prediction=prediction 48 | self.max_to_keep=max_to_keep 49 | self.train_speed=train_speed 50 | self.optimize 51 | self.error(label, prediction) 52 | self.saver 53 | self.cost(prediction) 54 | self.GPUID=GPUID 55 | 56 | @define_scope 57 | def saver(self): 58 | return tf.train.Saver(max_to_keep=self.max_to_keep) 59 | 60 | @define_scope 61 | def cost(self, prediction): 62 | with tf.device('/device:GPU:'+self.GPUID): 63 | nll=tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(targets=self.label, logits=prediction[0],pos_weight=1.0)) 64 | l2_norm=tf.reduce_sum(prediction[4]) 65 | l1_norm=tf.reduce_sum(tf.abs(prediction[1])) 66 | return tf.add_n([nll,tf.multiply((5*10**-7), l2_norm),tf.multiply((1*10**-8),l1_norm)]) 67 | 68 | @define_scope 69 | def optimize(self): 70 | with tf.device('/device:GPU:'+self.GPUID): 71 | optimizer = tf.train.AdamOptimizer(self.train_speed) 72 | return optimizer.minimize(self.cost) 73 | 74 | @define_scope 75 | def error(self,label, prediction): 76 | with tf.device('/device:GPU:'+self.GPUID): 77 | class_n=label.shape[1] 78 | FPR_list=[] 79 | TPR_list=[] 80 | PPV_list=[] 81 | for i in range(class_n): 82 | 83 | true=label[:,i] 84 | prob=prediction[1][:,i] 85 | FPR, TPR, PPV=ac(true,prob,0.5) 86 | FPR_list.append(FPR) 87 | TPR_list.append(TPR) 88 | PPV_list.append(PPV) 89 | 90 | return FPR_list, TPR_list, PPV_list 91 | -------------------------------------------------------------------------------- /deepgmap/misc/gff_to_colored_bed.py: -------------------------------------------------------------------------------- 1 | """ 2 | ##gff-version 3 3 | chr2 fimo nucleotide_motif 5714959 5714967 56.5 + . Name=kernel_0_chr2+;Alias=;ID=kernel_0-1-chr2;pvalue=2.23e-06;qvalue= 1;sequence=CGCCTTCGC; 4 | chr2 fimo nucleotide_motif 10439990 10439998 56.5 + . Name=kernel_0_chr2+;Alias=;ID=kernel_0-2-chr2;pvalue=2.23e-06;qvalue= 1;sequence=CGCCTTCGC; 5 | chr2 fimo nucleotide_motif 13793526 13793534 56.5 + . Name=kernel_0_chr2+;Alias=;ID=kernel_0-3-chr2;pvalue=2.23e-06;qvalue= 1;sequence=cgccttcgc; 6 | chr2 fimo nucleotide_motif 17940241 17940249 56.5 + . Name=kernel_0_chr2+;Alias=;ID=kernel_0-4-chr2;pvalue=2.23e-06;qvalue= 1;sequence=CGCCTTCGC; 7 | chr2 fimo nucleotide_motif 18672533 18672541 56.5 + . Name=kernel_0_chr2+;Alias=;ID=kernel_0-5-chr2;pvalue=2.23e-06;qvalue= 1;sequence=CGCCTTCGC; 8 | chr2 fimo nucleotide_motif 21064760 21064768 56.5 + . Name=kernel_0_chr2+;Alias=;ID=kernel_0-6-chr2;pvalue=2.23e-06;qvalue= 1;sequence=CGCCTTCGC; 9 | chr2 fimo nucleotide_motif 28545836 28545844 56.5 + . 
Name=kernel_0_chr2+;Alias=;ID=kernel_0-7-chr2;pvalue=2.23e-06;qvalue= 1;sequence=CGCCTTCGC; 10 | """ 11 | 12 | """ 13 | browser position 14 | track name="kernels" description="kernel distribution visualization" visibility=2 itemRgb="On" 15 | chr7 127471196 127472363 Pos1 0 + 127471196 127472363 255,0,0 16 | chr7 127472363 127473530 Pos2 0 + 127472363 127473530 255,0,0 17 | chr7 127473530 127474697 Pos3 0 + 127473530 127474697 255,0,0 18 | chr7 127474697 127475864 Pos4 0 + 127474697 127475864 255,0,0 19 | chr7 127475864 127477031 Neg1 0 - 127475864 127477031 0,0,255 20 | chr7 127477031 127478198 Neg2 0 - 127477031 127478198 0,0,255 21 | chr7 127478198 127479365 Neg3 0 - 127478198 127479365 0,0,255 22 | chr7 127479365 127480532 Pos5 0 + 127479365 127480532 255,0,0 23 | chr7 127480532 127481699 Neg4 0 - 127480532 127481699 0,0,255 24 | """ 25 | 26 | from matplotlib import pyplot as plt 27 | import numpy as np 28 | import os 29 | 30 | 31 | cmap = plt.get_cmap('nipy_spectral') 32 | colors = np.array([cmap(i) for i in np.linspace(0, 1, 320)]) 33 | 34 | colors=(255*colors).astype(int) 35 | 36 | gff="/home/fast/onimaru/deepgmap/data/outputs/conv4frss_trained_variables_Fri_May_11_075425_2018_kernels/fimo_out/fimo.gff" 37 | gff="/home/fast2/onimaru/DeepGMAP-dev/data/outputs/conv4frss_Mon_Feb_25_092345_2019_trained_variables_kernels/fimo_out/fimo.gff" 38 | bed=os.path.splitext(gff)[0]+".bed" 39 | 40 | with open (gff, 'r') as fin, open(bed, 'w') as fout: 41 | fout.write('track name="kernels" description="kernel distribution visualization" visibility=2 itemRgb="On"\n') 42 | for line in fin: 43 | line=line.split("\t") 44 | if len(line)==9: 45 | chr=line[0] 46 | start=line[3] 47 | end=line[4] 48 | #score=line[5] 49 | orientation=line[6] 50 | subline=line[-1].split(';') 51 | for subs in subline: 52 | if subs.startswith("Name"): 53 | subs=subs.split("=")[1].split("_") 54 | name=subs[0]+"_"+subs[1] 55 | name_num=int(subs[1]) 56 | elif subs.startswith("pvalue"): 57 | subs=-np.log10(float(subs.split("=")[1]))*100 58 | score=str(subs) 59 | _color=",".join(map(str, colors[name_num][:3])) 60 | fout.write("\t".join([chr, start,end,name,score,orientation,start,end, _color])+"\n") 61 | 62 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/inputGenerator_from_deepsea.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import random 4 | import multiprocessing 5 | import os 6 | 7 | def dicttoarray(binaryDNAdict, label_list): 8 | 9 | num_seq=len(binaryDNAdict) 10 | x=0 11 | y=0 12 | 13 | shuf=range(num_seq) 14 | 15 | random.shuffle(shuf) 16 | binaryDNAdict_shuf=[] 17 | binaryDNAdict_shuf_append=binaryDNAdict_shuf.append 18 | label_list_shuf=[] 19 | label_list_shuf_append=label_list_shuf.append 20 | k=0 21 | for i in shuf: 22 | 23 | d=binaryDNAdict[i] 24 | l=label_list[i] 25 | #dp=position[i] 26 | #lp=label_position[i] 27 | r=random.random() 28 | 29 | #print r, sum(l), reduce_genome 30 | #print dp, lp 31 | #assert dp==lp 32 | binaryDNAdict_shuf_append(d) 33 | label_list_shuf_append(l) 34 | if sum(l)==0: 35 | x+=1 36 | else: 37 | y+=1 38 | prog=100.0*float(k+y+x)/num_seq 39 | if prog%10.0==0.0: 40 | print(str(prog)+" of data are shuffled.") 41 | z=float(x)/float(y+x) 42 | print(str(k)+" of negative sequences are skipped\n"+"negative/total="+str(z)) 43 | return binaryDNAdict_shuf, label_list_shuf 44 | 45 | 46 | def array_saver(index_list, binaryDNAdict_shuf,label_list_shuf, 
sample_num,out_dir): 47 | #print "binaryDNAdict_shuf length under array_saver: "+str(len(binaryDNAdict_shuf)) 48 | 49 | for i in range(len(index_list)): 50 | data_array=np.array(binaryDNAdict_shuf[i*sample_num:(i*sample_num+sample_num)], np.int32) 51 | #print np.sum(data_array) 52 | labels=np.array(label_list_shuf[i*sample_num:(i*sample_num+sample_num)], np.int32) 53 | #print np.shape(labels) 54 | 55 | filename = out_dir+"batch_"+str(index_list[i])+".npz" 56 | #print "saving "+str(filename) 57 | try: 58 | with open(filename, "wb") as output_file: 59 | np.savez_compressed(output_file,labels=labels, data_array=data_array) 60 | except IOError as e: 61 | print("I/O error({0}): {1}".format(e.errno, e.strerror)) 62 | except ValueError: 63 | print("Could not convert data") 64 | except: 65 | print("Unexpected error:", sys.exc_info()[0]) 66 | raise 67 | 68 | fname="/home/fast/onimaru/encode/deepsea/deepsea_train/train.npz" 69 | output_dir=os.path.split(fname)[0]+"/train_data_for_my_program/" 70 | os.makedirs(output_dir) 71 | fload=np.load(fname) 72 | data=fload["data_array"] 73 | labels=fload["labels"] 74 | 75 | binaryDNAdict_shuf, label_list_shuf=dicttoarray(data, labels,) 76 | 77 | dna_dict_length=len(binaryDNAdict_shuf) 78 | 79 | if dna_dict_length%16==0: 80 | batch=dna_dict_length/16 81 | else: 82 | batch=dna_dict_length/16+1 83 | 84 | if dna_dict_length%100==0: 85 | total_num=dna_dict_length/(100*16) 86 | else: 87 | total_num=dna_dict_length/(100*16)+1 88 | 89 | jobs = [] 90 | for i in range(16): 91 | #print str(len(binaryDNAdict_shuf[i*batch:(i+1)*batch]))+" are passed" 92 | jobs.append(multiprocessing.Process(target=array_saver, 93 | args=(range(i*total_num,(i+1)*total_num), 94 | binaryDNAdict_shuf[i*batch:(i+1)*batch], 95 | label_list_shuf[i*batch:(i+1)*batch], 96 | 100, output_dir,))) 97 | print("saving data set with "+str(16)+" threads") 98 | for j in jobs: 99 | j.start() 100 | 101 | for j in jobs: 102 | j.join() 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/genome_labeling.py: -------------------------------------------------------------------------------- 1 | 2 | import glob as glb 3 | import sys 4 | import numpy as np 5 | 6 | narrow_peak_list=glb.glob(sys.argv[1]) 7 | label_array=[] 8 | for i in range(len(narrow_peak_list)): 9 | label_array.append(0) 10 | #print label_array 11 | 12 | genome_file=sys.argv[2] 13 | genome_segments=[] 14 | 15 | with open(genome_file, 'r') as fin: 16 | for line in fin: 17 | #label_array.append(1) 18 | genome_segments.append(label_array) 19 | genome_segments=np.array(genome_segments) 20 | 21 | genome_file2=sys.argv[3] 22 | genome_size=[] 23 | chrm_num={} 24 | chrm_list=set() 25 | with open(genome_file2,'r') as fin: 26 | i=0 27 | for line in fin: 28 | line=line.split() 29 | genome_size.append(int(line[1])/1000) 30 | chrm_num[line[0]]=i 31 | chrm_list.add(line[0]) 32 | i+=1 33 | 34 | 35 | peak_dict={} 36 | 37 | i=0 38 | j=0 39 | for f in narrow_peak_list: 40 | with open(f, 'r') as fin: 41 | for line in fin: 42 | 43 | line=line.split() 44 | chrm=line[0] 45 | if len(line)==4: 46 | score=int(line[3]) 47 | elif len(line)==10: 48 | score=int(line[4]) 49 | else: 50 | #print f 51 | break 52 | 53 | if chrm in chrm_list: 54 | if score>=75: 55 | 56 | start=int(line[1]) 57 | end=int(line[2]) 58 | length=end-start 59 | right_bin1=(start/1000+1)*1000 60 | left_bin1=(end/1000)*1000 61 | 
point_1000=(start+end)/(2*1000) 62 | if end<=right_bin1: 63 | 64 | 65 | genome_location=sum(genome_size[:chrm_num[chrm]])+point_1000 66 | if genome_segments[genome_location][i]==0: 67 | genome_segments[genome_location][i]+=1 68 | j+=1 69 | #print j 70 | else: 71 | 72 | 73 | if right_bin1-start>=100: 74 | left_point=start/1000 75 | else: 76 | left_point=start/1000+1 77 | if end-left_bin1>=100: 78 | right_point=end/1000 79 | else: 80 | right_point=end/1000-1 81 | k=left_point 82 | while left_point<=k<=right_point: 83 | genome_location=sum(genome_size[:chrm_num[chrm]])+k 84 | 85 | if genome_segments[genome_location][i]==0: 86 | genome_segments[genome_location][i]+=1 87 | j+=1 88 | #print j 89 | #print "longer than 1000 "+ str(genome_location) 90 | k+=1 91 | 92 | #if genome_segments[genome_location][-1]==1: 93 | # genome_segments[genome_location][-1]=0 94 | i+=1 95 | 96 | with open(genome_file,'r') as fin: 97 | with open(genome_file+'_limb_75co_OL100.labeled','w') as fout, open(genome_file+'_limb_75co_OL100.bed','w') as fout2: 98 | i=0 99 | for line in fin: 100 | fout.write(line.strip('\n')+'\t'+'\t'.join(map(str, list(genome_segments[i])))+'\n') 101 | if genome_segments[i]==1: 102 | fout2.write(line) 103 | i+=1 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/genome_labeling_compare.py: -------------------------------------------------------------------------------- 1 | 2 | import glob as glb 3 | import sys 4 | import numpy as np 5 | from sklearn.decomposition import KernelPCA as pca_f 6 | import os 7 | import matplotlib.pyplot as plt 8 | from scipy.spatial.distance import pdist 9 | import scipy.cluster.hierarchy as sch 10 | from MulticoreTSNE import MulticoreTSNE as TSNE 11 | 12 | def genome_label(bed_file_list, genome_1000): 13 | file_num=len(bed_file_list) 14 | peak_set_list=[] 15 | i=0 16 | for f in bed_file_list: 17 | peak_set=set() 18 | with open(f, 'r') as fin: 19 | for line in fin: 20 | if i==0: 21 | _,a,b=line.split() 22 | check_length=int(b)-int(a) 23 | 24 | peak_set.add(line) 25 | peak_set_list.append(peak_set) 26 | 27 | i+=1 28 | label_array_list=[] 29 | with open(genome_1000,'r') as fin: 30 | i=0 31 | for line in fin: 32 | k=0 33 | label_array=[0 for h in range(file_num)] 34 | 35 | for s in peak_set_list: 36 | if i==0: 37 | _,a,b=line.split() 38 | assert check_length==int(b)-int(a), "mismatches in sequence lengths" 39 | if line in s: 40 | label_array[k]=1 41 | k+=1 42 | if sum(label_array)>0: 43 | #print sum(label_array) 44 | label_array_list.append(label_array) 45 | i+=1 46 | return np.array(label_array_list) 47 | 48 | def main(): 49 | bed_file_dir, genome_1000=sys.argv[1:] 50 | bed_file_list=[] 51 | if not "*" in bed_file_dir and bed_file_dir.endswith('.bed'): 52 | bed_file_list.append(bed_file_dir) 53 | elif not '*' in bed_file_dir: 54 | bed_file_dir=bed_file_dir+"*.bed" 55 | 56 | bed_file_list=glb.glob(bed_file_dir) 57 | #print bed_file_list 58 | if len(bed_file_list)==0: 59 | # print("no files in "+str(bed_file_dir)) 60 | sys.exit() 61 | label_array_list=genome_label(bed_file_list, genome_1000) 62 | #print label_array_list[0] 63 | label_array_list=label_array_list[np.random.randint(label_array_list.shape[0], size=5000), :] 64 | 65 | 66 | label_array_list_=np.transpose(label_array_list) 67 | #print sum(label_array_list_[0]) 68 | lshape=label_array_list.shape 69 | C=[] 70 | for i in range(lshape[0]): 71 | 72 | C.append([np.sum(label_array_list[i])/float(lshape[1]),0.0,0.0]) 
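# Embed the sampled 1-kb label vectors with t-SNE; the red channel set in the
# loop above encodes the fraction of input bed files in which each bin is
# positive.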
73 | tsne = TSNE(n_jobs=18,perplexity = 5.000000) 74 | label_array_list=np.array(label_array_list, np.float64) 75 | #X_pca2=np.array(X_pca2, np.float64) 76 | X_tsne = tsne.fit_transform(label_array_list) 77 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], 78 | c=C, lw=2, s=0.5) 79 | 80 | pca = pca_f(n_components=2, kernel="rbf") 81 | X_pca=pca.fit_transform(label_array_list) 82 | dist1=pdist(label_array_list_, 'cosine') 83 | _, ax1=plt.subplots() 84 | 85 | Y = sch.linkage(dist1, method='ward') 86 | Z1 = sch.dendrogram(Y) 87 | idx1 = Z1['leaves'] 88 | 89 | new_sample_list=[] 90 | 91 | for i in idx1: 92 | txt=bed_file_list[i].split("/")[-1] 93 | new_sample_list.append(txt) 94 | ax1.set_xticklabels(new_sample_list , rotation=90) 95 | 96 | 97 | #print X_pca.shape 98 | _, ax2=plt.subplots() 99 | ax2.scatter(X_pca[:,0], X_pca[:,1],c=C) 100 | """for i, txt in enumerate(bed_file_list): 101 | txt=txt.split("/")[-1] 102 | ax2.annotate(txt, (X_pca[i,0],X_pca[i,1]))""" 103 | 104 | plt.show() 105 | if __name__ == '__main__': 106 | main() 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /deepgmap/misc/bed_file_compare2.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import subprocess as sp 4 | import networkx as nx 5 | import os 6 | from itertools import combinations 7 | import glob 8 | from matplotlib import pyplot as plt 9 | import numpy as np 10 | from matplotlib_venn import venn3, venn3_circles 11 | 12 | file_list=sorted(glob.glob('/home/fast2/onimaru/DeepGMAP-dev/data/predictions/quick_benchmark/bed_comp_50_es/*')) 13 | 14 | file_combination=[] 15 | node_list=[] 16 | peak_counts={} 17 | path_sep=os.path.sep 18 | 19 | peak_count={} 20 | peak_count_dict={} 21 | for i in file_list: 22 | with open(i, 'r') as j: 23 | peak_count=len(j.readlines()) 24 | 25 | file_name=i.split(path_sep) 26 | file_name=file_name[-1].split('.') 27 | node1=file_name[0] 28 | peak_counts[node1]=peak_count 29 | node_list.append(node1) 30 | 31 | ABout=open('./intersectAB.bed', 'w') 32 | sp.check_call(["bedtools", "intersect","-u","-F","1.0","-f","1.0", "-a", str(file_list[0]), "-b", str(file_list[1])], stdout=ABout) 33 | ABout.close() 34 | fAB=open('./intersectAB.bed', 'r') 35 | AB=len(fAB.readlines()) 36 | fAB.close() 37 | #print AB, peak_counts[node_list[0]] 38 | 39 | ABout_=open('./intersectAB_.bed', 'w') 40 | sp.check_call(["bedtools", "intersect","-u","-F","1.0","-f","1.0", "-a", str(file_list[1]), "-b", str(file_list[0])], stdout=ABout_) 41 | ABout_.close() 42 | fAB_=open('./intersectAB_.bed', 'r') 43 | AB_=len(fAB_.readlines()) 44 | fAB_.close() 45 | #print AB_, peak_counts[node_list[1]] 46 | 47 | if AB>AB_: 48 | AB=AB_ 49 | 50 | ACout=open('./intersectAC.bed', 'w') 51 | sp.check_call(["bedtools", "intersect","-u","-F","1.0","-f","1.0", "-a", str(file_list[0]), "-b", str(file_list[2])], stdout=ACout) 52 | ACout.close() 53 | fAC=open('intersectAC.bed', 'r') 54 | AC=len(fAC.readlines()) 55 | fAC.close() 56 | #print AC, peak_counts[node_list[2]] 57 | 58 | ACout_=open('./intersectAC_.bed', 'w') 59 | sp.check_call(["bedtools", "intersect","-u","-F","1.0","-f","1.0", "-a", str(file_list[2]), "-b", str(file_list[0])], stdout=ACout_) 60 | ACout_.close() 61 | fAC_=open('intersectAC_.bed', 'r') 62 | AC_=len(fAC_.readlines()) 63 | fAC_.close() 64 | #print AC_ 65 | 66 | if AC>AC_: 67 | AC=AC_ 68 | 69 | BCout=open('./intersectBC.bed', 'w') 70 | sp.check_call(["bedtools", 
"intersect","-u","-F","1.0","-f","1.0", "-a", str(file_list[2]), "-b", str(file_list[1])], stdout=BCout) 71 | BCout.close() 72 | fBC=open('intersectBC.bed', 'r') 73 | BC=len(fBC.readlines()) 74 | fBC.close() 75 | #print BC 76 | 77 | BCout_=open('./intersectBC_.bed', 'w') 78 | sp.check_call(["bedtools", "intersect","-u","-F","1.0","-f","1.0", "-a", str(file_list[1]), "-b", str(file_list[2])], stdout=BCout_) 79 | BCout_.close() 80 | fBC_=open('intersectBC_.bed', 'r') 81 | BC_=len(fBC_.readlines()) 82 | fBC_.close() 83 | #print BC_ 84 | 85 | if BC>BC_: 86 | BC=BC_ 87 | 88 | ABCout=open('./intersectABC.bed', 'w') 89 | sp.check_call(["bedtools", "intersect","-u","-F","1.0","-f","1.0", "-a", 'intersectAB.bed', "-b", str(file_list[2])],stdout=ABCout) 90 | ABCout.close() 91 | fABC=open('intersectABC.bed', 'r') 92 | ABC=len(fABC.readlines()) 93 | fABC.close() 94 | 95 | ABCout_=open('./intersectABC_.bed', 'w') 96 | sp.check_call(["bedtools", "intersect","-u","-F","1.0","-f","1.0", "-b", 'intersectAB.bed', "-a", str(file_list[2])],stdout=ABCout_) 97 | ABCout_.close() 98 | fABC_=open('intersectABC_.bed', 'r') 99 | ABC_=len(fABC_.readlines()) 100 | fABC_.close() 101 | 102 | if ABC>ABC_: 103 | ABC=ABC_ 104 | 105 | Abc=peak_counts[node_list[0]]-AB-AC+ABC 106 | ABc=AB-ABC 107 | AbC=AC-ABC 108 | 109 | aBc=peak_counts[node_list[1]]-AB-BC+ABC 110 | aBC=BC-ABC 111 | 112 | abC=peak_counts[node_list[2]]-AC-BC+ABC 113 | 114 | plt.figure(figsize=(4,4)) 115 | v = venn3(subsets=(Abc, aBc, ABc, abC, AbC, aBC, ABC), set_labels = (node_list[0], node_list[1], node_list[2])) 116 | v.get_patch_by_id('100').set_alpha(1.0) 117 | plt.title("Venn diagram") 118 | plt.show() 119 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/sequence_visualizer.py: -------------------------------------------------------------------------------- 1 | 2 | import cairocffi as cairo 3 | import gzip 4 | import cPickle 5 | import numpy as np 6 | import glob as gl 7 | 8 | def select_color(cr, DNA): 9 | if DNA=="A": 10 | cr.set_source_rgb(1, 0, 0) 11 | elif DNA=="G": 12 | cr.set_source_rgb(0, 0, 0) 13 | elif DNA=="C": 14 | cr.set_source_rgb(0, 0, 1) 15 | elif DNA=="T": 16 | cr.set_source_rgb(0, 1, 0) 17 | else: 18 | cr.set_source_rgb(1, 1, 1) 19 | def main(): 20 | f=gl.glob("/home/fast/onimaru/data/reconstruction/network_constructor_deepsea_1d3_Sat_Jul__1_145520_2017.ckpt-6293_transpose_*.npz") 21 | for npz in f: 22 | with np.load(npz, 'r') as f: 23 | 24 | reconstruct=f["conv2"] 25 | original_seq=f["original"] 26 | i=0 27 | j=0 28 | k=0 29 | l=0 30 | reconstruct=np.reshape(reconstruct, (1000, 4)) 31 | original_seq=np.reshape(original_seq, (1000, 4)) 32 | 33 | line_num=10 34 | DNA_len=1000 35 | width=DNA_len*30/line_num+200 36 | hight=1024*2*3 37 | y_center=300 38 | ims1 = cairo.PDFSurface(npz+".pdf", width, hight) 39 | cr = cairo.Context(ims1) 40 | cr.move_to(100, y_center) 41 | cr.line_to(DNA_len/line_num*30+100, y_center) 42 | #cr.move_to(50, 100) 43 | #cr.line_to(50, 412) 44 | cr.set_line_width(2) 45 | cr.stroke() 46 | max_value=reconstruct.max() 47 | SCALE=300/max_value 48 | for k in range(1000): 49 | if not k==0 and k%(DNA_len/line_num)==0: 50 | cr.set_source_rgba(0.0,0.0,0,1.0) 51 | y_center+=400 52 | cr.move_to(100, y_center) 53 | cr.line_to(DNA_len//line_num*30+100, y_center) 54 | cr.stroke() 55 | print(y_center) 56 | max_value=np.amax(reconstruct[k]) 57 | sum_value=np.sum(reconstruct[k]) 58 | max_value2=np.amax(original_seq[k]) 59 | print(max_value) 60 | if max_value>0.0: 61 | 
max_index=np.argmax(reconstruct[k]) 62 | 63 | if max_index==0: 64 | Nuc="A" 65 | elif max_index==1: 66 | Nuc="G" 67 | elif max_index==2: 68 | Nuc="C" 69 | elif max_index==3: 70 | Nuc="T" 71 | else: 72 | Nuc="N" 73 | 74 | if max_value2>0.0: 75 | max_index2=np.argmax(original_seq[k]) 76 | 77 | if max_index2==0: 78 | Nuc2="A" 79 | elif max_index2==1: 80 | Nuc2="G" 81 | elif max_index2==2: 82 | Nuc2="C" 83 | elif max_index2==3: 84 | Nuc2="T" 85 | else: 86 | Nuc2="N" 87 | 88 | 89 | Nucpos=0 90 | Nucneg=0 91 | Nucsize=max_value*SCALE 92 | Nucsize2=sum_value*SCALE 93 | x_pos=k%(DNA_len/line_num) 94 | #cr.move_to(50+x_pos*40*0.75, y_center) 95 | #cr.select_font_face("Sans", cairo.FONT_SLANT_NORMAL,cairo.FONT_WEIGHT_NORMAL) 96 | #select_color(cr, Nuc) 97 | #font_mat=cairo.Matrix(xx=40.0,yx=0.0,xy=0.0,yy=Nucsize+0.1,x0=0.0,y0=0.0) 98 | #cr.set_font_matrix(font_mat) 99 | #print Nuc 100 | #cr.show_text(str(Nuc)) 101 | cr.move_to(100+x_pos*40*0.75, y_center) 102 | cr.select_font_face("Sans", cairo.FONT_SLANT_NORMAL,cairo.FONT_WEIGHT_NORMAL) 103 | select_color(cr, Nuc2) 104 | font_mat=cairo.Matrix(xx=40.0,yx=0.0,xy=0.0,yy=Nucsize+20.0,x0=0.0,y0=0.0) 105 | cr.set_font_matrix(font_mat) 106 | cr.show_text(str(Nuc2)) 107 | #cr.set_font_size(40) 108 | cr.show_page() 109 | 110 | if __name__ == "__main__": 111 | main() -------------------------------------------------------------------------------- /deepgmap/post_train_tools/Clustering_analizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | #import cPickle 3 | #import gzip 4 | #from sklearn.cluster import DBSCAN 5 | #from sklearn import metrics 6 | import matplotlib.pyplot as plt 7 | #import scipy 8 | import pylab 9 | import scipy.cluster.hierarchy as sch 10 | import scipy.spatial.distance as spd 11 | #from sklearn.decomposition import PCA, IncrementalPCA 12 | from MulticoreTSNE import MulticoreTSNE as TSNE 13 | import os 14 | fname='/home/fast/onimaru/deepgmap/data/outputs/conv4frss_trained_variables_Fri_May_11_075425_2018.npz' 15 | variables=np.load(fname) 16 | filter1=variables['prediction/W_conv1:0'] 17 | filter1_shape=filter1.shape 18 | filter1_flattened_array=[] 19 | for i in range(filter1_shape[3]): 20 | tmp_filter=filter1[:,:,:,i] 21 | tmp_filter=tmp_filter.reshape(filter1_shape[0], filter1_shape[1]) 22 | tmp_filter=tmp_filter.flatten() 23 | #filter1_flattened_array.append(tmp_filter/np.amax([np.amax(tmp_filter), np.absolute(np.amin(tmp_filter))])) 24 | #filter1_flattened_array.append(np.exp(tmp_filter)/np.sum(np.exp(tmp_filter))) 25 | filter1_flattened_array.append(tmp_filter) 26 | X = np.array(filter1_flattened_array, np.float64) 27 | D = spd.pdist(X, 'cosine') 28 | # Compute and plot first dendrogram. 29 | fig = pylab.figure(figsize=(8,8)) 30 | ax1 = fig.add_axes([0.09,0.1,0.2,0.6]) 31 | Y = sch.linkage(D, method='ward') 32 | Z1 = sch.dendrogram(Y, orientation='left') 33 | ax1.set_xticks([]) 34 | ax1.set_yticks([]) 35 | 36 | # Plot distance matrix. 37 | axmatrix = fig.add_axes([0.3,0.1,0.6,0.6]) 38 | idx1 = Z1['leaves'] 39 | #idx2 = Z2['leaves'] 40 | X2 = X[idx1] 41 | im = axmatrix.matshow(X2, aspect='auto', origin='lower', cmap=pylab.get_cmap('YlGnBu')) 42 | axmatrix.set_xticks([]) 43 | axmatrix.set_yticks([]) 44 | 45 | # Plot colorbar. 
46 | axcolor = fig.add_axes([0.91,0.1,0.02,0.6]) 47 | pylab.colorbar(im, cax=axcolor) 48 | #fig.savefig('/media/koh/HD-PCFU3/mouse/filter_1_clustering.png') 49 | 50 | saving_dir_prefix=os.path.splitext(fname)[0] 51 | plt.savefig(saving_dir_prefix+'_heat_map.pdf', format='pdf') 52 | 53 | tsne = TSNE(n_jobs=18,perplexity = 50.000000, n_iter=5000) 54 | #X_pca2=np.array(X_pca2, np.float64) 55 | X_tsne = tsne.fit_transform(X) 56 | 57 | fig2 = pylab.figure(figsize=(8,8)) 58 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], 59 | lw=2,s=0.5) 60 | plt.savefig(saving_dir_prefix+'_tSNE.pdf', format='pdf') 61 | 62 | 63 | plt.show() 64 | 65 | """ 66 | import matplotlib.pyplot as mplt 67 | 68 | 69 | db = DBSCAN(eps=0.3,min_samples=3, algorithm='auto').fit(X) 70 | core_samples_mask = np.zeros_like(db.labels_, dtype=bool) 71 | core_samples_mask[db.core_sample_indices_] = True 72 | labels = db.labels_ 73 | 74 | # Number of clusters in labels, ignoring noise if present. 75 | n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) 76 | unique_labels = set(labels) 77 | colors = mplt.get_cmap('Spectral')(np.linspace(0, 1, len(unique_labels))) 78 | for k, col in zip(unique_labels, colors): 79 | if k == -1: 80 | # Black used for noise. 81 | col = 'k' 82 | 83 | class_member_mask = (labels == k) 84 | 85 | xy = X[class_member_mask & core_samples_mask] 86 | mplt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, 87 | markeredgecolor='k', markersize=14) 88 | 89 | xy = X[class_member_mask & ~core_samples_mask] 90 | mplt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, 91 | markeredgecolor='k', markersize=6) 92 | 93 | mplt.title('Estimated number of clusters: %d' % n_clusters_) 94 | mplt.show() 95 | 96 | n_components = 2 97 | ipca = IncrementalPCA(n_components=n_components, batch_size=512, whiten=True) 98 | X_ipca = ipca.fit_transform(X) 99 | 100 | pca = PCA(n_components=n_components) 101 | X_pca = pca.fit_transform(X) 102 | for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]: 103 | plt.figure(figsize=(8, 8)) 104 | 105 | plt.scatter(X_transformed[0], X_transformed[1], lw=2) 106 | 107 | if "Incremental" in title: 108 | err = np.abs(np.abs(X_pca) - np.abs(X_ipca)).mean() 109 | plt.title(title + " of feature map\nMean absolute unsigned error " 110 | "%.6f" % err) 111 | else: 112 | plt.title(title + " of feature map") 113 | plt.legend(loc="best", shadow=False, scatterpoints=1) 114 | 115 | mplt.show() """ -------------------------------------------------------------------------------- /deepgmap/misc/kernel_distribution_analizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | #import cPickle 3 | #import gzip 4 | #from sklearn.cluster import DBSCAN 5 | #from sklearn import metrics 6 | import matplotlib.pyplot as plt 7 | #import scipy 8 | import pylab 9 | import scipy.cluster.hierarchy as sch 10 | import scipy.spatial.distance as spd 11 | from sklearn.decomposition import PCA, IncrementalPCA 12 | from MulticoreTSNE import MulticoreTSNE as TSNE 13 | import os 14 | from mpl_toolkits.mplot3d import Axes3D 15 | import glob as gl 16 | 17 | fname='/home/fast/onimaru/deepgmap/data/outputs/conv4frss_trained_variables_Fri_May_11_075425_2018_kernels/fimo_out/kernels_*_summits_1000.bed' 18 | 19 | flist=gl.glob(fname) 20 | 21 | pos_dict_dict={} 22 | data_array=[] 23 | mycolors=[] 24 | for f in flist: 25 | pos_dict={} 26 | h,t=os.path.split(f) 27 | t=t.split('_') 28 | t=t[2] 29 | print(t) 30 | with open(f, 'r') as fin: 31 | for line in fin: 32 | 
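            # Each summit line is split below; column 8 (index 7) is treated as
            # the FIMO score, only matches scoring >= 500 are counted, and the
            # kernel index is parsed from the name field in column 7 (index 6),
            # e.g. "kernel_12_...".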
line=line.split() 33 | if float(line[7])>=500: 34 | pos="\t".join(line[:3]) 35 | kernel_num=int(line[6].split("_")[1]) 36 | if not pos in pos_dict: 37 | pos_dict[pos]=np.zeros([320], np.float64) 38 | pos_dict[pos][kernel_num]+=1.0 39 | 40 | pos_dict_dict[t]=pos_dict 41 | 42 | sample_class=[] 43 | i=0 44 | for k, v in pos_dict_dict.items(): 45 | sample_class.append(k) 46 | rgb=np.zeros([3], np.float64) 47 | #rgb[3]=0.5 48 | if not k=="common": 49 | rgb[i]=1.0 50 | i+=1 51 | for _k,_v in v.items(): 52 | data_array.append(_v) 53 | mycolors.append(rgb) 54 | 55 | print(sample_class) 56 | X = np.array(data_array, np.float64) 57 | saving_dir_prefix=fname.split('*')[0] 58 | 59 | D = spd.pdist(X, 'cosine') 60 | # Compute and plot first dendrogram. 61 | fig = pylab.figure(figsize=(8,8)) 62 | ax1 = fig.add_axes([0.09,0.1,0.2,0.6]) 63 | Y = sch.linkage(D, method='ward') 64 | Z1 = sch.dendrogram(Y, orientation='left') 65 | ax1.set_xticks([]) 66 | ax1.set_yticks([]) 67 | 68 | # Plot distance matrix. 69 | axmatrix = fig.add_axes([0.3,0.1,0.6,0.6]) 70 | idx1 = Z1['leaves'] 71 | #idx2 = Z2['leaves'] 72 | X2 = X[idx1] 73 | im = axmatrix.matshow(X2, aspect='auto', origin='lower', cmap=pylab.get_cmap('YlGnBu')) 74 | axmatrix.set_xticks([]) 75 | axmatrix.set_yticks([]) 76 | 77 | # Plot colorbar. 78 | axcolor = fig.add_axes([0.91,0.1,0.02,0.6]) 79 | pylab.colorbar(im, cax=axcolor) 80 | #fig.savefig('/media/koh/HD-PCFU3/mouse/filter_1_clustering.png') 81 | 82 | 83 | plt.savefig(saving_dir_prefix+'_heat_map.pdf', format='pdf') 84 | """ 85 | tsne = TSNE(n_jobs=16,perplexity = 20.000000, n_iter=10000) 86 | #X_pca2=np.array(X_pca2, np.float64) 87 | X_tsne = tsne.fit_transform(X) 88 | 89 | fig2 = pylab.figure(figsize=(8,8)) 90 | plt.scatter(X_tsne[:, 0], X_tsne[:, 1], 91 | lw=2,s=0.5, c=mycolors) 92 | plt.savefig(saving_dir_prefix+'_tSNE.pdf', format='pdf') 93 | 94 | 95 | #plt.show() 96 | import pandas as pd 97 | import seaborn as sns 98 | sns.set_style("white") 99 | #df = sns.load_dataset('iris') 100 | 101 | #my_dpi=96 102 | #plt.figure(figsize=(480/my_dpi, 480/my_dpi), dpi=my_dpi) 103 | 104 | # Keep the 'specie' column appart + make it numeric for coloring 105 | #df['species']=pd.Categorical(df['species']) 106 | #my_color=df['species'].cat.codes 107 | #df = df.drop('species', 1) 108 | 109 | # Run The PCA 110 | pca = PCA(n_components=3) 111 | pca.fit(X) 112 | 113 | # Store results of PCA in a data frame 114 | result=pd.DataFrame(pca.transform(X), columns=['PCA%i' % i for i in range(3)]) 115 | 116 | # Plot initialisation 117 | fig = plt.figure() 118 | ax = fig.add_subplot(111, projection='3d') 119 | ax.scatter(result['PCA0'], result['PCA1'], result['PCA2'], c=mycolors, s=10) 120 | 121 | # make simple, bare axis lines through space: 122 | xAxisLine = ((min(result['PCA0']), max(result['PCA0'])), (0, 0), (0,0)) 123 | ax.plot(xAxisLine[0], xAxisLine[1], xAxisLine[2], 'r') 124 | yAxisLine = ((0, 0), (min(result['PCA1']), max(result['PCA1'])), (0,0)) 125 | ax.plot(yAxisLine[0], yAxisLine[1], yAxisLine[2], 'r') 126 | zAxisLine = ((0, 0), (0,0), (min(result['PCA2']), max(result['PCA2']))) 127 | ax.plot(zAxisLine[0], zAxisLine[1], zAxisLine[2], 'r') 128 | 129 | # label the axes 130 | ax.set_xlabel("PC1") 131 | ax.set_ylabel("PC2") 132 | ax.set_zlabel("PC3") 133 | ax.set_title("PCA on the iris data set") 134 | #plt.show() 135 | 136 | plt.show()""" -------------------------------------------------------------------------------- /INSTALL.rst: -------------------------------------------------------------------------------- 1 | 
========================== 2 | INSTALL Guide For DeepGMAP 3 | ========================== 4 | 5 | 6 | Install with docker 7 | =================== 8 | 9 | Prerequisites 10 | ~~~~~~~~~~~~~ 11 | nvidia-driver >=396. 12 | 13 | nvidia-docker 2.0.3. 14 | 15 | docker 18.06. 16 | 17 | 18 | Pull an existing docker image 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | First, pull a docker image of deepgmap from the docker repository:: 21 | 22 | $ docker pull koonimaru/deepgmap:dev3 23 | 24 | Build a new docker image 25 | ~~~~~~~~~~~~~~~~~~~~~~~~ 26 | Alternatively, you can build a new image with the Dockerfile, which may give you the latest version of deepgmap or let you change the tensorflow version. In this case, please use the Dockerfile in this package:: 27 | 28 | $ mkdir deepgmap-docker 29 | $ cp DeepGMAP/Dockerfile ./deepgmap-docker/ 30 | $ cd deepgmap-docker 31 | $ docker build --no-cache -t koonimaru/deepgmap . 32 | 33 | Next, download the data for a test run:: 34 | 35 | $ wget https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/15268919/DeepGMAPdatalight.tar.lzma 36 | $ lzma -d DeepGMAPdatalight.tar.lzma && tar -xvf DeepGMAPdatalight.tar && rm DeepGMAPdatalight.tar 37 | 38 | If you want, move the "data" directory inside the DeepGMAP-data-light folder to your working directory for deepgmap. Otherwise, you are ready; please see README.rst for how to run deepgmap. 39 | 40 | 41 | 42 | Install manually 43 | ================ 44 | 45 | Prerequisites 46 | ~~~~~~~~~~~~~ 47 | 48 | DeepGMAP is verified to work on Linux (Ubuntu 16.10). Using a GPU is also highly recommended. 49 | 50 | Python version 3.6. 51 | 52 | Numpy_ (>=1.6). 53 | 54 | Cython_ (>=0.18) is an optional requirement to recompile ``.pyx`` files. 55 | 56 | Tensorflow_ (>=1.8). Note that Tensorflow requires the cuDNN and CUDA libraries. 57 | 58 | Scikitlearn_ (>=0.19.1) 59 | 60 | matplotlib_ 61 | 62 | bedtools_ (>=2.25) 63 | 64 | .. _Numpy: http://www.scipy.org/Download 65 | .. _Cython: http://cython.org/ 66 | .. _Tensorflow: https://www.tensorflow.org/ 67 | .. _Scikitlearn: http://scikit-learn.org/ 68 | .. _matplotlib: https://matplotlib.org/ 69 | .. _bedtools: http://bedtools.readthedocs.io/ 70 | 71 | Installing tensorflow-gpu 72 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 73 | To accelerate computing, users need `cuda-enabled GPUs`_. tensorflow-gpu itself can be easily 74 | installed by typing "sudo pip install tensorflow-gpu" or "sudo pip install -r requirements.txt". But, to make 75 | tensorflow-gpu work, you need the right versions of the cuDNN and CUDA toolkit libraries (please 76 | check the `tensorflow web site`_). If you do not want to deal with these libraries yourself, please consider using docker. 77 | 78 | .. _cuda-enabled GPUs: https://developer.nvidia.com/cuda-gpus 79 | .. _tensorflow web site: https://www.tensorflow.org/install/install_linux 80 | 81 | Download source and data 82 | ~~~~~~~~~~~~~~~~~~~~~~~~ 83 | To download the source code from our github repository:: 84 | 85 | $ git clone https://github.com/koonimaru/DeepGMAP.git 86 | 87 | To download a trial data set:: 88 | 89 | $ wget https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/15268919/DeepGMAPdatalight.tar.lzma 90 | $ lzma -d DeepGMAPdatalight.tar.lzma && tar -xvf DeepGMAPdatalight.tar && rm DeepGMAPdatalight.tar 91 | 92 | Place the folder named "data" under the DeepGMAP directory.
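To check that the package can be found before configuring anything, the following minimal test should run without an ImportError (assuming the repository was cloned into your home directory)::

    $ PYTHONPATH=$HOME/DeepGMAP python3 -c "import deepgmap"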
93 | 94 | Local installation by configuring environment variables 95 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 96 | 97 | You need to add the downloaded location (in this example, the home directory $HOME) to your ``PYTHONPATH`` and ``PATH`` environment variables. 98 | 99 | PYTHONPATH 100 | ~~~~~~~~~~ 101 | 102 | You need to include the new value in your ``PYTHONPATH`` by 103 | adding this line to your ``~/.bashrc``:: 104 | 105 | $ export PYTHONPATH=$HOME/DeepGMAP/:$PYTHONPATH 106 | 107 | Then, type:: 108 | 109 | $ source ~/.bashrc 110 | 111 | Or re-login to your account. 112 | 113 | PATH 114 | ~~~~ 115 | 116 | You'll also want to add a new value to your 117 | PATH environment variable so that you can use the deepgmap command line 118 | directly:: 119 | 120 | $ export PATH=$HOME/DeepGMAP/bin/:$PATH 121 | 122 | 123 | Installation system-wide 124 | ~~~~~~~~~~~~~~~~~~~~~~~~ 125 | Using pip:: 126 | 127 | $ sudo pip install deepgmap 128 | 129 | Alternatively, go to the DeepGMAP directory, and type:: 130 | 131 | $ sudo python3 setup.py install 132 | 133 | 134 | These commands work only if CUDA and cuDNN are already installed and you have root privileges. 135 | 136 | 137 | -- 138 | Koh Onimaru 139 | 140 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/ROC_space_plotter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | from sklearn import svm, datasets 5 | from sklearn.metrics import roc_curve, auc 6 | from sklearn.preprocessing import label_binarize 7 | from sklearn.multiclass import OneVsRestClassifier 8 | from scipy import interp 9 | import getopt 10 | from glob import glob 11 | from natsort import natsorted 12 | from sklearn.metrics import precision_recall_curve 13 | from sklearn.metrics import average_precision_score 14 | import matplotlib as mpl 15 | 16 | def roc_space_calc(label,pred): 17 | 18 | # Compute ROC curve and ROC area for each class 19 | 20 | fpr, tpr, _ = roc_curve(label, pred) 21 | roc_auc = auc(fpr, tpr) 22 | 23 | return fpr, tpr, roc_auc 24 | 25 | 26 | 27 | def roc_space_plotter(label, predictions, name_list,outfile_name): 28 | predictions_list=[] 29 | label_array=label 30 | for pred in predictions: 31 | print(pred["prediction"].shape) 32 | predictions_list.append(pred["prediction"]) 33 | 34 | fpr_list=[] 35 | tpr_list=[] 36 | roc_auc_list=[] 37 | precision_list=[] 38 | recall_list=[] 39 | average_precision_list=[] 40 | for i in predictions_list: 41 | fpr, tpr, roc_auc=roc_space_calc(label_array, i) 42 | fpr_list.append(fpr) 43 | tpr_list.append(tpr) 44 | roc_auc_list.append(roc_auc) 45 | recall, precision,_ =precision_recall_curve(label_array, i) # sklearn returns (precision, recall, thresholds); the swapped names here are swapped back again in the plotting loop below, so the final axes are labeled correctly 46 | precision_list.append(precision) 47 | recall_list.append(recall) 48 | average_precision = average_precision_score(label_array, i) 49 | average_precision_list.append(average_precision) 50 | 51 | 52 | 53 | colormap = plt.cm.get_cmap('gnuplot') 54 | #C = [colormap(i) for i in np.linspace(0,0.9,len(name_list))] 55 | 56 | plt.figure(1, figsize=(5,10)) 57 | ax1=plt.subplot(211) 58 | 59 | 60 | C=['darkorange','green','blue'] 61 | i=0 62 | for fpr, tpr, roc_auc,name in zip(fpr_list,tpr_list,roc_auc_list,name_list): 63 | plt.plot(fpr, tpr, color=C[i], 64 | label=str(name)+' (area = %0.2f)' % roc_auc) 65 | i+=1 66 | plt.plot([0, 1], [0, 1.0], color='navy', linestyle='--') 67 | plt.axis('equal') 68 | ax1.set_xlim([0.0, 1.0]) 69 | ax1.set_ylim([0.0, 1.0]) 70 | 
plt.xlabel('False Positive Rate') 71 | plt.ylabel('True Positive Rate') 72 | 73 | plt.title('Receiver operating characteristic curve') 74 | plt.legend(loc="lower right") 75 | 76 | ax2=plt.subplot(212) 77 | i=0 78 | for prec, rec, avr_pr,name in zip(precision_list,recall_list,average_precision_list,name_list): 79 | 80 | plt.plot(prec, rec, lw=2, color=C[i],label=str(name)+' (area = %0.2f)' % avr_pr) 81 | i+=1 82 | plt.axis('equal') 83 | plt.xlabel('Recall') 84 | plt.ylabel('Precision') 85 | ax2.set_ylim([0.0, 1.00]) 86 | ax2.set_xlim([0.0, 1.0]) 87 | 88 | plt.title('Precision-Recall curve') 89 | plt.legend(loc="lower left") 90 | 91 | plt.savefig(outfile_name, format='pdf') 92 | 93 | plt.show() 94 | 95 | def main(): 96 | outfile_name="/home/fast/onimaru/data/prediction/ROC_space_curve_comp_limb_brain.pdf" 97 | npload_list=[] 98 | label_array=[] 99 | chromosome="chr2" 100 | #name_list=["DeepSEA", "Bidirectional","Conv_plus","Conv+Bidirectional"] 101 | name_list=["DeepSEA", "DanQ","Conv+Bidirectional"] 102 | file_list=['/home/fast/onimaru/data/prediction/network_constructor_danq_1d_Sat_Nov_18_151721_2017.ckpt-12123_label_prediction.npz', 103 | #'/home/fast/onimaru/data/prediction/network_constructor_deepsea_1d4_Fri_Oct__6_183716_2017.ckpt-11467_label_prediction.npz', 104 | "/home/fast/onimaru/data/prediction/network_constructor_danq_1d_Sat_Nov_18_151721_2017.ckpt-12123_label_prediction.npz", 105 | "/home/fast/onimaru/data/prediction/network_constructor_deepsea_1d3_Fri_Nov_17_170434_2017.ckpt-12123_label_prediction.npz"] 106 | label_file='' 107 | 108 | with open(label_file, 'r') as fin: 109 | for line in fin: 110 | if line.startswith(chromosome): 111 | label_array.append(map(int, line[3:])) 112 | label_array=np.array(label_array) 113 | 114 | for f in file_list: 115 | npload_list.append(np.load(f)) 116 | 117 | 118 | roc_space_plotter(label_array, npload_list, name_list,outfile_name) 119 | 120 | 121 | if __name__== '__main__': 122 | main() 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/inputfileGeneratorForGenomeScan_p2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os.path 3 | import multiprocessing 4 | import sys 5 | import deepgmap.data_preprocessing_tools.seq_to_binary2 as sb2 6 | import psutil 7 | import getopt 8 | import time 9 | 10 | PATH_SEP=os.path.sep 11 | def div_roundup(x, y): 12 | if y%x==0: 13 | return y//x 14 | else: 15 | return y//x+1 16 | 17 | 18 | def DNA_to_array_converter(input_file_read,seq_num,target_chr): 19 | seq_list=[] 20 | seq_list_append=seq_list.append 21 | position_list=[] 22 | position_list_append=position_list.append 23 | b1=0.0 24 | i=0 25 | 26 | data_width=len(input_file_read[1].strip("\n")) 27 | print(data_width) 28 | SEQ=False 29 | #print seq_list 30 | for l, line in enumerate(input_file_read): 31 | if line.startswith('>'): 32 | #if not "_" in line and not line.startswith('>chrM'): 33 | if not line.startswith('>chrM'): 34 | #print line, 35 | position_list_append(line.strip('\n')) 36 | SEQ=True 37 | else: 38 | SEQ=False 39 | if i%100000==0: 40 | print(line) 41 | elif SEQ: 42 | line=line.strip('\n') 43 | 44 | #a1=time.time() 45 | seq_list_append(sb2.AGCTtoArray4(line.encode('utf-8'),data_width)) 46 | 47 | #b1+=time.time()-a1 48 | i+=1 49 | #if i%100000==0: 50 | #print b1 51 | #sys.exit() 52 | 53 | return position_list, seq_list 54 | 55 | 56 | def 
array_saver(outfile,positions,sequences): 57 | print('saving '+outfile) 58 | np.savez_compressed(outfile,positions=positions,sequences=sequences) 59 | 60 | def run(args): 61 | 62 | main(args) 63 | 64 | def main(args=None): 65 | 66 | input_file=args.input_genome 67 | target_chr=args.chromosome 68 | output_file=args.out_directory 69 | threads=args.thread_number 70 | chunck_data=args.chunck_data 71 | print(args) 72 | 73 | if threads==0: 74 | threads=multiprocessing.cpu_count()//2 75 | 76 | if not input_file.endswith(".fa") and not input_file.endswith(".fasta"): 77 | input_file+=PATH_SEP+"genome.fa" 78 | if not os.path.isfile(input_file): 79 | print("input file must be a dirctory containing genome.fa or a fasta file.") 80 | 81 | file_size=os.path.getsize(input_file) 82 | print(file_size) 83 | 84 | loop_to_reduce_ram=div_roundup(1000000000, file_size) 85 | try: 86 | with open(input_file, "r") as fin: 87 | input_file_read=fin.readlines() 88 | except IOError: 89 | print('cannot open', input_file) 90 | output_file+="_all" 91 | os.makedirs(output_file) 92 | line_num=len(input_file_read) 93 | #print line_num 94 | seq_num=line_num//2 95 | 96 | sub_seq_num=div_roundup(loop_to_reduce_ram, seq_num) 97 | DIVIDES_NUM=div_roundup(120000, sub_seq_num) 98 | 99 | for l1 in range(loop_to_reduce_ram): 100 | 101 | position_list, seq_list=DNA_to_array_converter(input_file_read[2*l1*sub_seq_num:2*(l1+1)*sub_seq_num],sub_seq_num,target_chr) 102 | 103 | print(position_list[0], input_file_read[2*l1*sub_seq_num]) 104 | 105 | 106 | outerloop=div_roundup(threads, DIVIDES_NUM) 107 | chunk_num=div_roundup(DIVIDES_NUM, sub_seq_num) 108 | 109 | if DIVIDES_NUM>=threads: 110 | job_num=threads 111 | else: 112 | job_num=DIVIDES_NUM 113 | 114 | print(DIVIDES_NUM, threads, outerloop, job_num) 115 | 116 | 117 | for l in range(outerloop): 118 | jobs = [] 119 | for i in range(job_num): 120 | if i*chunk_num+l*job_num*chunk_num>sub_seq_num: 121 | break 122 | jobs.append(multiprocessing.Process(target=array_saver, 123 | args=(str(output_file)+PATH_SEP+str(l1)+"_"+str(i+l*job_num), 124 | position_list[i*chunk_num+l*job_num*chunk_num:(i+1)*chunk_num+l*job_num*chunk_num], 125 | seq_list[i*chunk_num+l*job_num*chunk_num:(i+1)*chunk_num+l*job_num*chunk_num]))) 126 | for j in jobs: 127 | j.start() 128 | 129 | for j in jobs: 130 | j.join() 131 | 132 | 133 | 134 | if __name__== '__main__': 135 | main() 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/ROC_space_plotter3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | from sklearn import svm, datasets 5 | from sklearn.metrics import roc_curve, auc 6 | from sklearn.preprocessing import label_binarize 7 | from sklearn.multiclass import OneVsRestClassifier 8 | from scipy import interp 9 | import getopt 10 | from glob import glob 11 | from natsort import natsorted 12 | from sklearn.metrics import precision_recall_curve 13 | from sklearn.metrics import average_precision_score 14 | import matplotlib as mpl 15 | 16 | def roc_space_calc(label,pred): 17 | 18 | # Compute ROC curve and ROC area for each class 19 | 20 | fpr, tpr, _ = roc_curve(label, pred) 21 | roc_auc = auc(fpr, tpr) 22 | 23 | return fpr, tpr, roc_auc 24 | 25 | 26 | 27 | def roc_space_plotter(label, predictions, name_list,outfile_name): 28 | predictions_list=[] 29 | label_array=label 30 | for pred in predictions: 31 | 
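        # Each loaded npz is expected to hold a "prediction" array of shape
        # (n_windows, n_classes), matching the label matrix built in main()
        # below.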
#print pred["prediction"].shape 32 | predictions_list.append(pred["prediction"]) 33 | 34 | fpr_list=[] 35 | tpr_list=[] 36 | roc_auc_list=[] 37 | precision_list=[] 38 | recall_list=[] 39 | average_precision_list=[] 40 | label_array_shape= label_array.shape 41 | for i in predictions_list: 42 | for j in range(label_array_shape[1]): 43 | a, b=label_array[:,j], i[:,j] 44 | fpr, tpr, roc_auc=roc_space_calc(a, b) 45 | fpr_list.append(fpr) 46 | tpr_list.append(tpr) 47 | roc_auc_list.append(roc_auc) 48 | precision, recall, _ =precision_recall_curve(a, b) 49 | precision_list.append(precision) 50 | recall_list.append(recall) 51 | average_precision = average_precision_score(a, b) 52 | average_precision_list.append(average_precision) 53 | b+=0.15*np.random.randn(label_array_shape[0]) 54 | b=np.clip(b, 0, 1) 55 | fpr, tpr, roc_auc=roc_space_calc(a, b) 56 | fpr_list.append(fpr) 57 | tpr_list.append(tpr) 58 | roc_auc_list.append(roc_auc) 59 | precision, recall, _ =precision_recall_curve(a, b) 60 | precision_list.append(precision) 61 | recall_list.append(recall) 62 | average_precision = average_precision_score(a, b) 63 | average_precision_list.append(average_precision) 64 | 65 | colormap = plt.cm.get_cmap('gnuplot') 66 | C = [colormap(i) for i in np.linspace(0,0.9,label_array_shape[1]*2)] 67 | 68 | plt.figure(1, figsize=(5,10)) 69 | ax1=plt.subplot(211) 70 | 71 | 72 | #C=['darkorange','green','blue'] 73 | i=0 74 | for fpr, tpr, roc_auc in zip(fpr_list,tpr_list,roc_auc_list): 75 | plt.plot(fpr, tpr, color=C[i], 76 | label=' (area = %0.2f)' % roc_auc) 77 | i+=1 78 | plt.plot([0, 1], [0, 1.0], color='navy', linestyle='--') 79 | plt.axis('equal') 80 | ax1.set_xlim([0.0, 1.0]) 81 | ax1.set_ylim([0.0, 1.0]) 82 | plt.xlabel('False Positive Rate') 83 | plt.ylabel('True Positive Rate') 84 | 85 | plt.title('Receiver operating characteristic curve') 86 | plt.legend(loc="lower right") 87 | 88 | ax2=plt.subplot(212) 89 | i=0 90 | for prec, rec, avr_pr in zip(precision_list,recall_list,average_precision_list): 91 | 92 | plt.plot(rec, prec, lw=2, color=C[i],label=' (area = %0.2f)' % avr_pr) 93 | i+=1 94 | plt.axis('equal') 95 | plt.xlabel('Recall') 96 | plt.ylabel('Precision') 97 | ax2.set_ylim([0.0, 1.00]) 98 | ax2.set_xlim([0.0, 1.0]) 99 | 100 | plt.title('Precision-Recall curve') 101 | plt.legend(loc="lower left") 102 | 103 | #plt.savefig(outfile_name, format='pdf') 104 | 105 | plt.show() 106 | 107 | def main(): 108 | outfile_name="/home/fast/onimaru/data/prediction/ROC_space_curve_comp_limb_brain.pdf" 109 | npload_list=[] 110 | label_array=[] 111 | chromosome="chr2" 112 | #name_list=["DeepSEA", "Bidirectional","Conv_plus","Conv+Bidirectional"] 113 | name_list=["conv4-FRSS"] 114 | file_list=["/home/fast2/onimaru/DeepGMAP-dev/data/predictions/conv4frss_Fri_Jun__8_101931_2018.ckpt-16747_prediction.npz"] 115 | label_file='/home/fast2/onimaru/DeepGMAP-dev/data/inputs/mm10_dnase_subset/dnase_subset_mm10_window1000_stride500.bed.labeled' 116 | 117 | with open(label_file, 'r') as fin: 118 | for line in fin: 119 | if line.startswith(chromosome): 120 | 121 | label_array.append(list(map(int, line.split()[3:]))) 122 | label_array=np.array(label_array) 123 | 124 | for f in file_list: 125 | npload_list.append(np.load(f)) 126 | 127 | 128 | roc_space_plotter(label_array, npload_list, name_list,outfile_name) 129 | 130 | 131 | if __name__== '__main__': 132 | main() 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/motif_compare.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | #from curses.ascii import isdigit 4 | from scipy.spatial.distance import cdist 5 | import deepgmap.post_train_tools.cython_util as cutil 6 | mc=cutil.motif_compare 7 | from matplotlib import pyplot as plt 8 | import os 9 | def _is_number(s): 10 | try: 11 | complex(s) # for int, long, float and complex 12 | except ValueError: 13 | return False 14 | 15 | return True 16 | 17 | def motif_reader(motif_data_dir): 18 | motif_name="" 19 | motif_dict={} 20 | motif_list=[] 21 | with open(motif_data_dir, 'r') as fin: 22 | MOTIF=False 23 | i=0 24 | for line in fin: 25 | i+=1 26 | line=line.split() 27 | if len(line)==0: 28 | MOTIF=False 29 | continue 30 | elif line[0]=="MOTIF": 31 | if len(motif_name)>0: 32 | motif_dict[motif_name]=np.array(motif_list) 33 | 34 | motif_list=[] 35 | motif_name="" 36 | if len(line)>2: 37 | motif_name="_".join(line[1:]) 38 | else: 39 | motif_name=line[1] 40 | 41 | elif line[0]=="letter-probability": 42 | if line[4]=="w=": 43 | motif_length=int(line[5]) 44 | else: 45 | print("malformed letter-probability header at line "+str(i)) 46 | sys.exit() 47 | MOTIF=True 48 | elif MOTIF: 49 | #print _is_number(line[0]) 50 | if not _is_number(line[0]): 51 | MOTIF=False 52 | continue 53 | else: 54 | motif_list.append(list(map(float, line))) 55 | 56 | motif_dict[motif_name]=np.array(motif_list) 57 | return motif_dict 58 | 59 | 60 | 61 | 62 | def motif_compare(motif_data_dict, long_motif_dict, fout, THRESHOLD=-5.0): 63 | with open(fout, "w") as f: 64 | f.write("Motif name\tStart\tEnd\tdistance\n") 65 | for k1, v1 in long_motif_dict.items(): 66 | 67 | v1shape=v1.shape 68 | #print v1 69 | j=0 70 | for k2, v2 in motif_data_dict.items(): 71 | if "secondary" in k2: 72 | continue 73 | #print k2 74 | #j+=1 75 | #print j 76 | v2shape=v2.shape 77 | RAND_DIST=[] 78 | for i in range(12): 79 | rand=np.random.rand(v2shape[0],v2shape[1]) 80 | for k in range(v2shape[0]): 81 | rand[k]=rand[k]/np.sum(rand[k]) 82 | RAND_DIST.append(np.mean(np.diagonal(cdist(v2, rand,metric='cosine')))) 83 | RAND_MEAN=np.mean(RAND_DIST) 84 | RAND_DEV=np.std(RAND_DIST) 85 | #print RAND_MEAN, RAND_DEV 86 | #print("random_dist: "+str(RAND_DIST)) 87 | 88 | 89 | 90 | for i in range(v1shape[0]-v2shape[0]): 91 | partial_motif=v1[i:(i+v2shape[0])] 92 | #print v2shape, partial_motif.shape 93 | """M=0.5*(partial_motif+v2)+0.00001 94 | JSD=0.5*(np.sum(-v2*np.log(M/(v2+0.00001)))+np.sum(-partial_motif*np.log(M/(partial_motif+0.00001))))/v2shape[0] 95 | print JSD""" 96 | DIST=np.mean(np.diagonal(cdist(v2, partial_motif,metric='cosine'))) 97 | Z_SCORE=(DIST-RAND_MEAN)/RAND_DEV 98 | #print Z_SCORE 99 | if Z_SCORE<=THRESHOLD: 100 | f.write(str(k2)+"\t"+str(i)+"\t"+str(i+v2shape[0])+"\t"+str(Z_SCORE)+"\n") 101 | 102 | def main(): 103 | motif_data_dir="/home/fast/onimaru/data/meme/merged.meme" 104 | #long_motif_dir="/home/fast/onimaru/deepgmap/data/reconstructions/conv4frss_Fri_May_11_075425_2018.ckpt-16747Tue_May_15_112518_2018_all_.pdf.meme" 105 | long_motif_dir="/home/fast/onimaru/deepgmap/data/reconstructions/conv4frss_Fri_May_11_075425_2018.ckpt-16747Tue_May_15_104419_2018_es_e14_.pdf.meme" 106 | fout=os.path.splitext(long_motif_dir)[0]+".matches" 107 | #fout="/home/fast/onimaru/data/output/network_constructor_deepsea_1d3_Fri_Oct_13_133809_2017.ckpt-15899Mon_Oct_16_105338_2017.npz.matches" 108 | motif_data_dict=motif_reader(motif_data_dir) 109 | #print len(motif_data_dict) 110 | long_motif_dict=motif_reader(long_motif_dir) 111 | 
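# mc (cython_util.motif_compare) slides every motif from motif_data_dict along the long reconstructed motif, scores each window by Jensen-Shannon divergence, turns that distance into a z-score against a random-PWM baseline, and records windows whose z-score is at or below THRESHOLD in the .matches file.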
#print len(long_motif_dict) 112 | #motif_compare(motif_data_dict, long_motif_dict, fout) 113 | Z_SCORE_list=mc(motif_data_dict, long_motif_dict, fout, THRESHOLD=-5) 114 | plt.hist(Z_SCORE_list, 1000) 115 | plt.xticks(np.arange(min(Z_SCORE_list), max(Z_SCORE_list)+1, 1.0)) 116 | plt.show() 117 | 118 | if __name__== '__main__': 119 | main() 120 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/inputfileGeneratorForGenomeScan_gwas.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os.path 3 | import multiprocessing 4 | import sys 5 | import deepgmap.data_preprocessing_tools.seq_to_binary2 as sb2 6 | import psutil 7 | import getopt 8 | import time 9 | 10 | 11 | def div_roundup(x, y): 12 | if y%x==0: 13 | return y//x 14 | else: 15 | return y//x+1 16 | 17 | 18 | def DNA_to_array_converter(input_file_read,seq_num,target_chr): 19 | seq_list=[] 20 | seq_list_append=seq_list.append 21 | position_list=[] 22 | position_list_append=position_list.append 23 | b1=0.0 24 | i=0 25 | 26 | data_width=len(input_file_read[1].strip("\n")) 27 | print(data_width) 28 | SEQ=False 29 | #print seq_list 30 | for l, line in enumerate(input_file_read): 31 | if line.startswith('>'): 32 | #if not "_" in line and not line.startswith('>chrM'): 33 | if not line.startswith('>chrM'): 34 | #print line, 35 | position_list_append(line.strip('\n')) 36 | SEQ=True 37 | else: 38 | SEQ=False 39 | if i%100000==0: 40 | print(line) 41 | elif SEQ: 42 | line=line.strip('\n') 43 | 44 | #a1=time.time() 45 | seq_list_append(sb2.AGCTtoArray4(line.encode('utf-8'),data_width)) 46 | 47 | #b1+=time.time()-a1 48 | i+=1 49 | #if i%100000==0: 50 | #print b1 51 | #sys.exit() 52 | 53 | return position_list, seq_list 54 | 55 | 56 | def array_saver(outfile,positions,sequences): 57 | print('saving '+outfile) 58 | np.savez_compressed(outfile,positions=positions,sequences=sequences) 59 | 60 | def run(args): 61 | 62 | main(args) 63 | 64 | def main(args=None): 65 | if args is not None: 66 | input_file=args.input_genome 67 | target_chr=args.chromosome 68 | output_file=args.out_directory 69 | threads=args.thread_number 70 | chunck_data=args.chunck_data 71 | print(args) 72 | else: 73 | try: 74 | options, args =getopt.getopt(sys.argv[1:], 'i:t:o:p:', ['input_dir=','target_chr=', 'output_dir=','process=']) 75 | except getopt.GetoptError as err: 76 | print(str(err)) 77 | sys.exit(2) 78 | if len(options)<3: 79 | print('too few argument') 80 | sys.exit(0) 81 | 82 | threads=psutil.cpu_count() 83 | 84 | for opt, arg in options: 85 | if opt in ('-i', '--input_dir'): 86 | input_file=arg 87 | elif opt in ('-t', '--target_chr'): 88 | target_chr=arg 89 | elif opt in ('-o', '--output_dir'): 90 | output_file=arg 91 | elif opt in ('-p', '--process'): 92 | threads=int(arg) 93 | 94 | print(options) 95 | file_size=os.path.getsize(input_file) 96 | print(file_size) 97 | 98 | loop_to_reduce_ram=div_roundup(1000000000, file_size) 99 | try: 100 | with open(input_file, "r") as fin: 101 | input_file_read=fin.readlines() 102 | except IOError: 103 | print('cannot open', input_file) 104 | sys.exit(1) 105 | line_num=len(input_file_read) 106 | #print line_num 107 | seq_num=line_num//2 108 | 109 | sub_seq_num=div_roundup(loop_to_reduce_ram, seq_num) 110 | DIVIDES_NUM=div_roundup(120000, sub_seq_num) 111 | 112 | for l1 in range(loop_to_reduce_ram): 113 | 114 | position_list, seq_list=DNA_to_array_converter(input_file_read[2*l1*sub_seq_num:2*(l1+1)*sub_seq_num],sub_seq_num,target_chr) 115 | 116 | 
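# The FASTA is processed in roughly 1 GB slices (loop_to_reduce_ram) to bound memory use; each slice is further split into chunks of at most 120000 sequences (DIVIDES_NUM), which the worker processes below compress and save in parallel.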
print(position_list[0], input_file_read[2*l1*sub_seq_num]) 117 | 118 | 119 | outerloop=div_roundup(threads, DIVIDES_NUM) 120 | chunk_num=div_roundup(DIVIDES_NUM, sub_seq_num) 121 | 122 | if DIVIDES_NUM>=threads: 123 | job_num=threads 124 | else: 125 | job_num=DIVIDES_NUM 126 | 127 | print(DIVIDES_NUM, threads, outerloop, job_num) 128 | 129 | 130 | for l in range(outerloop): 131 | jobs = [] 132 | for i in range(job_num): 133 | if i*chunk_num+l*job_num*chunk_num>sub_seq_num: 134 | break 135 | jobs.append(multiprocessing.Process(target=array_saver, 136 | args=(str(output_file)+"_"+str(l1)+"_"+str(i+l*job_num), 137 | position_list[i*chunk_num+l*job_num*chunk_num:(i+1)*chunk_num+l*job_num*chunk_num], 138 | seq_list[i*chunk_num+l*job_num*chunk_num:(i+1)*chunk_num+l*job_num*chunk_num]))) 139 | for j in jobs: 140 | j.start() 141 | 142 | for j in jobs: 143 | j.join() 144 | 145 | 146 | 147 | if __name__== '__main__': 148 | main() 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/inputfileGeneratorForGenomeScan_p.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | import gzip 4 | #output_handle=open("/home/koh/MLData/test.txt", 'w') 5 | import time 6 | import gzip 7 | import math 8 | import os.path 9 | import multiprocessing 10 | import sys 11 | import glob 12 | import deepgmap.data_preprocessing_tools.seq_to_binary2 as sb2 13 | import time 14 | import psutil 15 | import getopt 16 | #from __future__ import print_function 17 | PATH_SEP=os.path.sep 18 | def DNA_to_array_converter(input_file,target_chr): 19 | 20 | 21 | if "," in target_chr: 22 | target_chr=set(target_chr.split(',')) 23 | else: 24 | target_chr=set([target_chr]) 25 | #print target_chr 26 | seq_list=[] 27 | position_list=[] 28 | b1=0.0 29 | i=0 30 | with open(input_file, 'r') as fin: 31 | 32 | SEQ=False 33 | for line in fin: 34 | if line.startswith('>'): 35 | _line=line.strip('>').split(':')[0] 36 | if _line in target_chr: 37 | print(line) 38 | position_list.append(line.strip('\n')) 39 | SEQ=True 40 | 41 | else: 42 | SEQ=False 43 | elif SEQ: 44 | line=line.strip('\n') 45 | data_width=len(line) 46 | #sequence=np.zeros([1,1000,4,1], np.int16) 47 | 48 | seq_list.append(sb2.AGCTtoArray4(line.encode('utf-8'),data_width)) 49 | #seq_list.append(sb2.ACGTtoaltArray(line,data_width)) 50 | return position_list, seq_list 51 | 52 | 53 | def array_saver(outfile,positions,sequences): 54 | print('saving '+outfile) 55 | np.savez_compressed(outfile,positions=positions,sequences=sequences) 56 | 57 | def run(args): 58 | 59 | main(args) 60 | 61 | def main(args=None): 62 | 63 | """ 64 | argparser_generate_test = subparsers.add_parser( "generate_test", 65 | help = "Generate a data set for a test or an application of a trained model." ) 66 | argparser_generate_test.add_argument( "-i", "--in_file", dest = "input_genome" , type = str, required = True, 67 | help = "A multiple fasta file containing genome DNA sequences. REQUIRED" ) 68 | argparser_generate_test.add_argument("-C", "--chromosome", dest = "chromosome", type = str, default = "chr2", 69 | help = "Set a target chromosome or a contig for prediction. 
Default: chr2" ) 70 | argparser_generate_test.add_argument( "-o", "--out_dir", dest = "out_directory", type = str, required = True, 71 | help = "") 72 | argparser_generate_test.add_argument( "-t", "--threads", dest = "thread_number", type = int, 73 | help = "The number of threads. Multithreading is performed only when saving output numpy arrays. Default: 1", default = 1 ) 74 | """ 75 | input_file=args.input_genome 76 | if not input_file.endswith(".fa") and not input_file.endswith(".fasta"): 77 | input_file+=PATH_SEP+"genome.fa" 78 | if not os.path.isfile(input_file): 79 | print("input file must be a dirctory containing genome.fa or a fasta file.") 80 | target_chr=args.chromosome 81 | output_file=args.out_directory+"_"+target_chr 82 | threads=args.thread_number 83 | if threads==0: 84 | threads=multiprocessing.cpu_count()//2 85 | print(args) 86 | 87 | 88 | os.makedirs(output_file) 89 | output_file+=PATH_SEP 90 | position_list, seq_list=DNA_to_array_converter(input_file,target_chr) 91 | seq_num=len(position_list) 92 | print(seq_num) 93 | 94 | if seq_num%120000==0: 95 | DIVIDES_NUM=seq_num//120000 96 | else: 97 | DIVIDES_NUM=seq_num//120000+1 98 | 99 | if DIVIDES_NUM%threads==0: 100 | outerloop=DIVIDES_NUM//threads 101 | else: 102 | outerloop=DIVIDES_NUM//threads+1 103 | 104 | 105 | 106 | 107 | if seq_num%DIVIDES_NUM==0: 108 | chunk_num=seq_num//DIVIDES_NUM 109 | else: 110 | chunk_num=seq_num//DIVIDES_NUM+1 111 | if DIVIDES_NUM>=threads: 112 | job_num=threads 113 | else: 114 | job_num=DIVIDES_NUM 115 | 116 | print(DIVIDES_NUM, threads, outerloop, job_num) 117 | 118 | 119 | for l in range(outerloop): 120 | jobs = [] 121 | for i in range(job_num): 122 | if i*chunk_num+l*threads>seq_num: 123 | break 124 | jobs.append(multiprocessing.Process(target=array_saver, 125 | args=(str(output_file)+str(i+l*threads), 126 | position_list[i*chunk_num+l*threads*chunk_num:(i+1)*chunk_num+l*threads*chunk_num], 127 | seq_list[i*chunk_num+l*threads*chunk_num:(i+1)*chunk_num+l*threads*chunk_num]))) 128 | for j in jobs: 129 | j.start() 130 | 131 | for j in jobs: 132 | j.join() 133 | 134 | 135 | 136 | if __name__== '__main__': 137 | main() 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /deepgmap/misc/igv_session.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/unpooling.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | 5 | """def unpool(updates, mask, ksize=[1, 2, 1, 1]): 6 | input_shape = updates.get_shape().as_list() 7 | # calculation new shape 8 | output_shape = (input_shape[0], input_shape[1] * ksize[1], input_shape[2] * ksize[2], input_shape[3]) 9 | # calculation indices for batch, height, width and feature maps 10 | one_like_mask = tf.ones_like(mask) 11 | batch_range = tf.reshape(tf.range(output_shape[0], dtype=tf.int64), shape=[input_shape[0], 1, 1, 1]) 12 | b = one_like_mask * batch_range 13 | y = mask // (output_shape[2] * output_shape[3]) 14 | x = mask % (output_shape[2] * output_shape[3]) // output_shape[3] 15 | feature_range = tf.range(output_shape[3], dtype=tf.int64) 16 | f = one_like_mask * feature_range 17 
| # transpose indices & reshape update values to one dimension 18 | updates_size = tf.size(updates) 19 | indices = tf.transpose(tf.reshape(tf.stack([b, y, x, f]), [4, updates_size])) 20 | values = tf.reshape(updates, [updates_size]) 21 | ret = tf.scatter_nd(indices, values, output_shape) 22 | return ret""" 23 | 24 | 25 | def unpool(updates, mask, ksize=[1, 2, 1, 1], output_shape=None, name=''): 26 | with tf.variable_scope(name): 27 | mask = tf.cast(mask, tf.int32) 28 | input_shape = tf.shape(updates, out_type=tf.int32) 29 | # calculation new shape 30 | if output_shape is None: 31 | output_shape = (input_shape[0], input_shape[1] * ksize[1], input_shape[2] * ksize[2], input_shape[3]) 32 | 33 | # calculation indices for batch, height, width and feature maps 34 | one_like_mask = tf.ones_like(mask, dtype=tf.int32) 35 | batch_shape = tf.concat([[input_shape[0]], [1], [1], [1]], 0) 36 | batch_range = tf.reshape(tf.range(output_shape[0], dtype=tf.int32), shape=batch_shape) 37 | b = one_like_mask * batch_range 38 | y = mask // (output_shape[2] * output_shape[3]) 39 | x = (mask // output_shape[3]) % output_shape[2] #mask % (output_shape[2] * output_shape[3]) // output_shape[3] 40 | feature_range = tf.range(output_shape[3], dtype=tf.int32) 41 | f = one_like_mask * feature_range 42 | 43 | # transpose indices & reshape update values to one dimension 44 | updates_size = tf.size(updates) 45 | indices = tf.transpose(tf.reshape(tf.stack([b, y, x, f]), [4, updates_size])) 46 | values = tf.reshape(updates, [updates_size]) 47 | ret = tf.scatter_nd(indices, values, output_shape) 48 | print(ret) 49 | return ret 50 | 51 | 52 | def unpool2(pool, ind, ksize=[1, 2, 1, 1], scope='unpool'): 53 | """ 54 | Unpooling layer after max_pool_with_argmax. 55 | Args: 56 | updates: max pooled output tensor 57 | mask: argmax indices 58 | ksize: ksize is the same as for the pool 59 | Return: 60 | unpool: unpooling tensor 61 | """ 62 | with tf.variable_scope(scope): 63 | input_shape = pool.get_shape().as_list() 64 | output_shape = (input_shape[0], input_shape[1] * ksize[1], input_shape[2] * ksize[2], input_shape[3]) 65 | pool_ = tf.reshape(pool, [input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]]) 66 | batch_range = tf.reshape(tf.range(output_shape[0], dtype=ind.dtype), shape=[input_shape[0], 1, 1, 1]) 67 | b = tf.ones_like(ind) * batch_range 68 | b = tf.reshape(b, [input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3], 1]) 69 | ind_ = tf.reshape(ind, [input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3], 1]) 70 | ind_ = tf.concat([b, ind_],1) 71 | ref = tf.Variable(tf.zeros([output_shape[0], output_shape[1] * output_shape[2] * output_shape[3]])) 72 | ret = tf.scatter_nd_update(ref, ind_, pool_) 73 | ret = tf.reshape(ret, [output_shape[0], output_shape[1], output_shape[2], output_shape[3]]) 74 | return ret 75 | 76 | 77 | def unpool3(pool, ind, ksize=[1, 2, 1, 1], scope='unpool3'): 78 | """ 79 | Unpooling layer after max_pool_with_argmax. 
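The dense output shape is computed from tf.shape at run time, so unlike unpool2 above this variant also works when the batch size is unknown at graph-construction time; set_shape restores the static shape information afterwards.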
80 | Args: 81 | pool: max pooled output tensor 82 | ind: argmax indices 83 | ksize: ksize is the same as for the pool 84 | Return: 85 | unpool: unpooling tensor 86 | """ 87 | with tf.variable_scope(scope): 88 | input_shape = tf.shape(pool) 89 | output_shape = [input_shape[0], input_shape[1] * ksize[1], input_shape[2] * ksize[2], input_shape[3]] 90 | 91 | flat_input_size = tf.reduce_prod(input_shape) 92 | flat_output_shape = [output_shape[0], output_shape[1] * output_shape[2] * output_shape[3]] 93 | 94 | pool_ = tf.reshape(pool, [flat_input_size]) 95 | batch_range = tf.reshape(tf.range(tf.cast(output_shape[0], tf.int64), dtype=ind.dtype), 96 | shape=[input_shape[0], 1, 1, 1]) 97 | b = tf.ones_like(ind) * batch_range 98 | b1 = tf.reshape(b, [flat_input_size, 1]) 99 | ind_ = tf.reshape(ind, [flat_input_size, 1]) 100 | ind_ = tf.concat([b1, ind_], 1) 101 | 102 | ret = tf.scatter_nd(ind_, pool_, shape=tf.cast(flat_output_shape, tf.int64)) 103 | ret = tf.reshape(ret, output_shape) 104 | 105 | set_input_shape = pool.get_shape() 106 | set_output_shape = [set_input_shape[0], set_input_shape[1] * ksize[1], set_input_shape[2] * ksize[2], set_input_shape[3]] 107 | ret.set_shape(set_output_shape) 108 | return ret 109 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/precision_recall_handmade.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from scipy.sparse import csr_matrix 4 | from scipy.sparse import csc_matrix 5 | def pr_curve_writer(label, pred): 6 | 7 | a=len(label) 8 | 9 | b=380000 10 | curve_resolution=10000 11 | linspace=np.linspace(0.0000, 1.0, curve_resolution,endpoint=True, dtype=np.float64) 12 | TPR_array=np.zeros([curve_resolution], dtype=np.float64) 13 | FPR_array=np.zeros([curve_resolution], dtype=np.float64) 14 | PPV_array=np.zeros([curve_resolution], dtype=np.float64) 15 | if a>=b: 16 | label1=csr_matrix(label) 17 | label2=csr_matrix(1*np.logical_not(label)) 18 | 19 | print('calculating the first ROC space') 20 | 21 | for i in range(curve_resolution): 22 | print("creating binary array") 23 | pred_ = np.where(pred >= linspace[i], np.ones_like(pred), np.zeros_like(pred)) 24 | pred2=1*np.logical_not(pred_) 25 | #pred_=csc_matrix(pred_) 26 | #print pred_ 27 | #print "calc logical and" 28 | tp = label1.dot(pred_) 29 | 30 | #print sum(tp) 31 | 32 | fp = label2.dot(pred_) 33 | #print fp 34 | fn = label1.dot(pred2) 35 | #print fn 36 | tn = label2.dot(pred2) 37 | #print tn 38 | 39 | FPR_array[i] += np.true_divide(fp,tn+fp) 40 | TPR_array[i] += np.true_divide(tp,tp+fn) 41 | if tp+fp==0.0: 42 | PPV_array[i]+=0.0 43 | else: 44 | PPV_array[i] += np.true_divide(tp,tp+fp) 45 | #print i 46 | #if i>=curve_resolution-3: 47 | #print TPR_array[i],PPV_array[i] 48 | 49 | else: 50 | for i in range(curve_resolution): 51 | pred_ = np.where(pred >= linspace[i], np.ones_like(pred), np.zeros_like(pred)) 52 | #print pred_ 53 | tp = np.logical_and(pred_, label) 54 | fp = np.logical_and(pred_, np.logical_not(label)) 55 | fn = np.logical_and(np.logical_not(pred_), label) 56 | tn = np.logical_and(np.logical_not(pred_), np.logical_not(label)) 57 | FPR_array[i] = np.true_divide(np.nansum(fp), 58 | np.nansum(np.logical_or(tn, fp))) 59 | TPR_array[i] = np.true_divide(np.nansum(tp), 60 | np.nansum(np.logical_or(tp, fn))) 61 | if np.nansum(np.logical_or(tp, fp))==0.0: 62 | PPV_array[i]=0.0 63 | else: 64 | PPV_array[i] = np.true_divide(np.nansum(tp), 65 | 
np.nansum(np.logical_or(tp, fp))) 66 | 67 | #if i>=curve_resolution-3: 68 | #print TPR_array[i],PPV_array[i] 69 | #rint i 70 | area=0.0 71 | k=curve_resolution-1 72 | for i in range(curve_resolution): 73 | area+=0.500*(PPV_array[k]+PPV_array[k-1])*(TPR_array[k-1]-TPR_array[k]) 74 | #print area 75 | k-=1 76 | if k==0: 77 | break 78 | 79 | 80 | return FPR_array, TPR_array, PPV_array, area 81 | 82 | array_file='/home/fast/onimaru/data/prediction/network_constructor_deepsea_1d3_Tue_Sep_19_150851_2017.ckpt-10734_label_prediction.npz' 83 | #genome_bed='' 84 | np_in=np.load(array_file) 85 | pred=np_in["prediction"] 86 | #print len(pred) 87 | label_array=np_in["label_array"] 88 | #print pred[:,0] 89 | if len(label_array.shape)==1: 90 | num_label=1 91 | else: 92 | num_label=label_array.shape[1] 93 | 94 | fpr_list=[] 95 | tpr_list=[] 96 | roc_auc_list=[] 97 | precision_list=[] 98 | recall_list=[] 99 | average_precision_list=[] 100 | if num_label>1: 101 | for i in range(num_label): 102 | 103 | 104 | fpr, tpr, ppv, area=pr_curve_writer(label_array[:,i], pred[:,i]) 105 | precision_list.append(ppv) 106 | #tpr_list.append(tpr) 107 | recall_list.append(tpr) 108 | average_precision_list.append(area) 109 | else: 110 | fpr, tpr, ppv, area=pr_curve_writer(label_array, pred) 111 | 112 | precision_list.append(ppv) 113 | recall_list.append(tpr) 114 | average_precision = area 115 | average_precision_list.append(average_precision) 116 | plt.figure(1, figsize=(8,8)) 117 | """ax1=plt.subplot(211) 118 | i=0 119 | for i in range(num_label): 120 | f,t,r=fpr_list[i],tpr_list[i],roc_auc_list[i] 121 | plt.plot(f, t, color='darkorange', 122 | label='ROC curve ('+str(i)+') (area = %0.2f)' % r) 123 | i+=1 124 | plt.plot([0, 1], [0, 1], color='navy', linestyle='--') 125 | plt.axis('equal') 126 | plt.xlim([0.0, 1.0]) 127 | plt.ylim([0.0, 1.0]) 128 | plt.xlabel('False Positive Rate') 129 | plt.ylabel('True Positive Rate') 130 | 131 | plt.title('Receiver operating characteristic curve ('+str(model_name)+')') 132 | plt.legend(loc="lower right")""" 133 | 134 | #ax2=plt.subplot(212) 135 | i=0 136 | for i in range(num_label): 137 | r,p,a =recall_list[i],precision_list[i], average_precision_list[i] 138 | plt.plot(r, p, lw=2, color='navy',label='Precision-Recall curve ('+str(i)+') (area = %0.2f)' % a) 139 | i+=1 140 | plt.axis('equal') 141 | plt.xlabel('Recall') 142 | plt.ylabel('Precision') 143 | plt.ylim([0.0, 1.00]) 144 | plt.xlim([0.0, 1.0]) 145 | 146 | #plt.title('Precision-Recall curve ('+str(model_name)+')') 147 | plt.legend(loc="lower left") 148 | 149 | #plt.savefig(out_dir+"ROC_space_curve_"+str(model_name)+".pdf", format='pdf') 150 | 151 | 152 | plt.show() 153 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/fimo_to_numpy_array.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from matplotlib import mlab 4 | import subprocess as sp 5 | 6 | """ 7 | fimo_format 8 | # motif_id motif_alt_id sequence_name start stop strand score p-value q-value matched_sequence 9 | MA0139.1 CTCF chr2 231612721 231612739 + 27.1967 4.53e-12 0.00802 TGGCCACCAGGGGGCGCCG 10 | MA0139.1 CTCF chr16 84970710 84970728 - 27.1311 6.58e-12 0.00802 CGGCCACCAGGGGGCGCCA 11 | MA0139.1 CTCF chr3 98023412 98023430 - 27.1311 6.58e-12 0.00802 CGGCCACCAGGGGGCGCCA 12 | MA0139.1 CTCF chr5 137499397 137499415 - 27.1311 6.58e-12 0.00802 CGGCCACCAGGGGGCGCCA 13 | """ 14 | 15 | 16 | 17 | x=[] 18 | 19 | y=[] 20 | 21 | 
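# Overall flow of this script: filter fimo.txt hits by -log10(q-value) into a narrowPeak file, intersect them with fixed 1 kb genome windows using bedtools, keep the best q-value per window, and save the normalized per-window scores as a "prediction" array that can be compared against the network predictions.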
fimo_file="/home/fast/onimaru/mouse/fimo_out/fimo.txt" 22 | narrowpeak_file="/home/fast/onimaru/mouse/fimo_out/fimo_logq33.narrowPeak" 23 | intersect_file='/home/fast/onimaru/mouse/fimo_out/fimo_1000_logq33.narrowPeak' 24 | bed_file='/home/fast/onimaru/mouse/fimo_out/fimo_1000_logq33.bed' 25 | genome_1000='/home/slow/onimaru/data/genome_fasta/mm10_1000.bed' 26 | prediction_array="/home/fast/onimaru/mouse/fimo_out/fimo_prediction_all" 27 | target="all" 28 | logq_threshold=0.33 29 | 30 | with open(fimo_file, 'r') as fin: 31 | with open(narrowpeak_file, 'w') as fout: 32 | i=0 33 | for line in fin: 34 | if not line[0]=="#": 35 | a=line.split() 36 | chromo=a[2] 37 | start=int(a[3]) 38 | end=int(a[4]) 39 | name='fimo_'+str(a[0])+'_'+str(a[1]) 40 | orientation=a[5] 41 | score=float(a[6]) 42 | logp=-np.log10(float(a[7])) 43 | logq=-np.log10(float(a[8])) 44 | if logq>=logq_threshold: 45 | fout.write(str(chromo)+"\t"+ 46 | str(start)+"\t"+ 47 | str(end)+"\t"+ 48 | str(name)+"\t"+ 49 | str(logq*400)+"\t"+ 50 | str(orientation)+"\t"+ 51 | str(score)+"\t"+ 52 | str(logp)+"\t"+ 53 | str(logq)+"\t"+ 54 | "-1\n" 55 | ) 56 | 57 | 58 | i+=1 59 | if i%10000==0: 60 | print("reading "+str(i) + "th line of fimo file") 61 | 62 | print("converting narrowPeak to 1000 binned peaks") 63 | intersectout=open(intersect_file, 'w') 64 | sp.check_call(["bedtools", "intersect","-F","0.4","-wo", "-a", str(genome_1000), "-b", str(narrowpeak_file)], stdout=intersectout) 65 | intersectout.close() 66 | print("conversion is done") 67 | """ 68 | chr1 10500 11500 chr1 11223 11241 fimo_MA0139.1_CTCF 675.298455578 - 24.4754 8.87289520164 1.68824613894 -1 18 69 | chr1 10500 11500 chr1 11281 11299 fimo_MA0139.1_CTCF 566.267510253 - 22.7377 7.99567862622 1.41566877563 -1 18 70 | chr1 11000 12000 chr1 11223 11241 fimo_MA0139.1_CTCF 675.298455578 - 24.4754 8.87289520164 1.68824613894 -1 18 71 | chr1 11000 12000 chr1 11281 11299 fimo_MA0139.1_CTCF 566.267510253 - 22.7377 7.99567862622 1.41566877563 -1 18 72 | """ 73 | 74 | 75 | 76 | 77 | #intersect_file='/home/fast/onimaru/human/fimo_out_1e3/fimo_cutoff_0p33_logq.narrowPeak_test.bed' 78 | fimo_peak_dict={} 79 | 80 | if target=="all": 81 | startswith_str="chr" 82 | else: 83 | startswith_str=str(target)+"\t" 84 | 85 | 86 | with open(intersect_file,"r") as fin: 87 | for line in fin: 88 | if line.startswith(startswith_str): 89 | a=line.split() 90 | position=str(a[0])+"\t"+str(a[1])+"\t"+str(a[2]) 91 | logq=float(a[11]) 92 | if not position in fimo_peak_dict: 93 | fimo_peak_dict[position]=logq 94 | elif logq>fimo_peak_dict[position]: 95 | fimo_peak_dict[position]=logq 96 | 97 | 98 | #genome_1000='/home/slow/onimaru/data/genome_fasta/hg38_1000.bed' 99 | qvalue_list=[] 100 | with open(genome_1000, "r") as fin, open(bed_file,'w') as fout: 101 | for line in fin: 102 | if line.startswith(startswith_str): 103 | a=line.strip('\n') 104 | if a in fimo_peak_dict: 105 | fout.write(a+"\n") 106 | qvalue_list.append(fimo_peak_dict[a]) 107 | print(line) 108 | else: 109 | qvalue_list.append(0.00) 110 | 111 | 112 | qvalue_array=np.array(qvalue_list)/np.max(qvalue_list) 113 | 114 | np.savez_compressed(prediction_array, prediction=qvalue_array) 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | """ 125 | x=np.array(x)/np.max(x) 126 | 127 | np.savez_compressed("/home/fast/onimaru/human/fimo_out_1e3/fimo_prediction", prediction=x) 128 | 129 | # the histogram of the data 130 | 131 | plt.subplot(211) 132 | n, bins, patches = plt.hist(x, 50, facecolor='green', alpha=0.5) 133 | plt.yscale('log', nonposy='clip') 
134 | #plt.hist(x, 50, facecolor='red', alpha=0.5, cumulative=True) 135 | # add a 'best fit' line 136 | 137 | #l = plt.plot(bins, y, 'r--', linewidth=1) 138 | 139 | plt.xlabel('-Log10(p value)') 140 | #plt.ylabel('Scores') 141 | plt.title('fimo_prediction_dist') 142 | #plt.axis([40, 160, 0, 0.03]) 143 | plt.subplot(212) 144 | plt.hist(y, 50, facecolor='blue', alpha=0.5) 145 | plt.grid(True) 146 | 147 | plt.show()""" -------------------------------------------------------------------------------- /deepgmap/post_train_tools/trained_deepshark_local_multiple_label.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import sys 3 | import gzip 4 | import pickle 5 | import tensorflow as tf 6 | import numpy as np 7 | import time 8 | import math 9 | import os 10 | from natsort import natsorted, ns 11 | import network_constructor_multiple_label as nc 12 | import subprocess as sp 13 | start=time.time() 14 | #dimension1_2=16 15 | 16 | #with gzip.open('/media/koh/HD-PCFU3/mouse/filter1_999_Tue_Oct_25_122720_2016.cpickle.gz', 'r') as f: 17 | # saved_variables=pickle.load(f) 18 | # W_conv1, W_conv2, W_conv3, b_conv1, b_conv2, b_conv3, W_fc1, W_fc2, W_fc3, W_fc4, b_fc1, b_fc2, b_fc3, b_fc4=saved_variables 19 | 20 | import glob 21 | def genome_scan(filename): 22 | #/media/koh/HD-PCFU3/mouse/test_genome/genome_chr1_06_250plus.cpickle.gz 23 | with open(filename, 'rb') as f1: 24 | file_name=f1.name 25 | path_sep=os.path.sep 26 | file_name1=file_name.split(path_sep) 27 | file_name2=file_name1[-1].split('_') 28 | chromosome=file_name2[2] 29 | a=file_name2[3] 30 | b=a.split('.') 31 | chr_position=int(b[0]) 32 | #window_id=(file_name2[3])[:3] 33 | genome_seq=np.load(f1) 34 | shape_of_genome=genome_seq['genome'].shape 35 | genome_seq_re=np.reshape(genome_seq['genome'], (shape_of_genome[0], shape_of_genome[1], 4, 1)) 36 | genome_seq_re_list=np.array_split(genome_seq_re, 100) 37 | return genome_seq_re_list, chromosome, chr_position #, window_id 38 | 39 | def process(f, out_dir): 40 | sess = tf.Session() 41 | x_image = tf.placeholder(tf.float32, shape=[None, 1000, 4, 1]) 42 | y_ = tf.placeholder(tf.float32, shape=[None, 19]) 43 | keep_prob = tf.placeholder(tf.float32) 44 | keep_prob2 = tf.placeholder(tf.float32) 45 | keep_prob3 = tf.placeholder(tf.float32) 46 | phase=tf.placeholder(tf.bool) 47 | data_length=1000 48 | if 'ckpt' in sys.argv[1].rsplit('.', 1)[1]: 49 | input_dir=sys.argv[1] 50 | elif 'meta' in sys.argv[1].rsplit('.', 1)[1] or 'index' in sys.argv[1].rsplit('.', 1)[1]: 51 | input_dir=sys.argv[1].rsplit('.', 1)[0] 52 | else: 53 | print("the input file should be a ckpt file") 54 | sys.exit(1) 55 | 56 | model = nc.Model(image=x_image, label_dim=19, label=y_, phase=phase, output_dir=None, start_at=None, keep_prob=keep_prob, keep_prob2=keep_prob2, keep_prob3=keep_prob3, data_length=data_length) 57 | sess.run(tf.global_variables_initializer()) 58 | saver=model.saver 59 | try: 60 | saver.restore(sess, input_dir) 61 | except: 62 | print("can't open "+str(input_dir)) 63 | sys.exit(0) 64 | for seq in f: 65 | 66 | try: 67 | genome_seq_re_list, chromosome, chr_position=genome_scan(seq) 68 | except: 69 | print("can't open "+str(seq)) 70 | sys.exit(0) 71 | y_prediction1=[] 72 | i=0 73 | for i in range(len(genome_seq_re_list)): 74 | scanning=genome_seq_re_list[i] 75 | y_prediction2 =np.array(sess.run(model.prediction[0], feed_dict={x_image: scanning, keep_prob: 1.0, keep_prob2: 1.0, keep_prob3: 1.0,phase:False}), np.float64) 76 | y_prediction1.append(1.0 / (1.0 + 
np.exp(-y_prediction2))) 77 | if i%10==0: 78 | print('scanning '+str(chromosome)+'_'+str(chr_position)+', '+str(100*i//len(genome_seq_re_list))+' %') 79 | filename_1=str(out_dir)+str(chromosome)+'.bed' 80 | print('writing '+filename_1) 81 | if os.path.isfile(filename_1): 82 | output_handle=open(filename_1, 'a') 83 | else: 84 | output_handle=open(filename_1, 'w') 85 | i=0 86 | j=0 87 | y_len=len(y_prediction1) 88 | for j in range(y_len): 89 | y_len_j=len(y_prediction1[j]) 90 | for i in range(y_len_j): 91 | value=np.max(y_prediction1[j][i][:-1])-y_prediction1[j][i][-1] 92 | if value>0.0: 93 | if int(sys.argv[4])==500: 94 | start_pos=int(chr_position)*int(1e7)+500*i+200*500*j 95 | end_pos=start_pos+499 96 | elif int(sys.argv[4])==1000: 97 | start_pos=int(chr_position)*int(1e7)+1000*i+100*1000*j 98 | end_pos=start_pos+999 99 | 100 | output_handle.write(str(chromosome)+'\t'+str(start_pos)+'\t'+str(end_pos)+'\t'+str(value)+'\n') 101 | if i%10==0: 102 | print(str(str(chromosome)+'\t'+str(start_pos)+'\t'+str(end_pos)+'\t'+str(value))) 103 | output_handle.close() 104 | print('finished writing '+filename_1) 105 | sess.close() 106 | out=open(str(out_dir)+str(chromosome)+"_srt.bed", 'w') 107 | sp.check_call(["bedtools", "sort","-i", str(filename_1)], stdout=out) 108 | out.close() 109 | 110 | import multiprocessing 111 | def main(): 112 | input_dir=sys.argv[1].rsplit('.', 1)[0] 113 | 114 | 115 | path_sep=os.sep 116 | file_name=input_dir.split(path_sep) 117 | a=time.asctime() 118 | b=a.replace(':', '') 119 | start_at=b.replace(' ', '_') 120 | out_dir=sys.argv[2]+file_name[-1] 121 | 122 | if not os.path.exists(os.path.dirname(out_dir)): 123 | try: 124 | os.makedirs(os.path.dirname(out_dir)) 125 | except OSError as exc: # Guard against race condition 126 | if exc.errno != errno.EEXIST: 127 | raise 128 | 129 | start=time.time() 130 | s=0 131 | try: 132 | f = glob.glob(sys.argv[3]) 133 | process(f, out_dir) 134 | #x=p.apply_async(process, (t_,)) 135 | #x.get() 136 | except : 137 | print("Unexpected error:", sys.exc_info()[0]) 138 | raise 139 | 140 | #for i in f: 141 | # process(i, out_dir) 142 | 143 | 144 | 145 | print(time.time()-start) 146 | 147 | 148 | if __name__== '__main__': 149 | main() 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/genome_divider.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import math 3 | import os 4 | import subprocess as sp 5 | 6 | 7 | def genome_divider(genome_fasta, genome_file, WINDOW_SIZE, outname): 8 | OUTDIR=os.path.splitext(genome_file)[0]+'_'+str(WINDOW_SIZE) 9 | try: 10 | os.makedirs(OUTDIR) 11 | except OSError as err: 12 | print("OS error: {0}".format(err)) 13 | outbed=OUTDIR+'/genome.bed' 14 | outfasta=OUTDIR+'/genome.fa' 15 | #WINDOW_SIZE=1000 16 | #genome_file="/home/fast/onimaru/lamprey/LetJap1.0.1.genome" 17 | #with open(genome_file, 'r') as fin, open('/home/fast/onimaru/data/genome_fasta/hg38_1000_altwindow.bed', 'w') as fout1, open('/home/fast/onimaru/data/genome_fasta/hg38_1000_.bed', 'w') as fout2: 18 | with open(genome_file, 'r') as fin, open(outbed, 'w') as fout1: 19 | 20 | for line in fin: 21 | line=line.split() 22 | chrom=line[0] 23 | chrom_size=int(line[1]) 24 | divide_num=chrom_size//WINDOW_SIZE 25 | #divide_num=chrom_size/WINDOW_SIZE-4 26 | for i in range(divide_num): 27 | 28 | #if i>=2: 29 | 30 | if i*WINDOW_SIZE+WINDOW_SIZE<=chrom_size: 
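# two interleaved tilings are written: windows starting at multiples of WINDOW_SIZE here, and a half-window-shifted set below, so that peaks falling on window borders are still covered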
fout1.write(str(chrom)+'\t'+str(i*WINDOW_SIZE)+'\t'+str(i*WINDOW_SIZE+WINDOW_SIZE)+'\n') 32 | else: 33 | break 34 | if i*WINDOW_SIZE+WINDOW_SIZE+WINDOW_SIZE//2<=chrom_size: 35 | fout1.write(str(chrom)+'\t'+str(i*WINDOW_SIZE+WINDOW_SIZE//2)+'\t'+str(i*WINDOW_SIZE+WINDOW_SIZE+WINDOW_SIZE//2)+'\n') 36 | else: 37 | break 38 | try: 39 | sp.call(["bedtools", "getfasta","-fi",genome_fasta,"-bed",outbed, "-fo", outfasta]) 40 | except OSError as e: 41 | print(e) 42 | sys.exit(1) 43 | 44 | print(outbed+" and "+outfasta+' were successfully generated.') 45 | 46 | def genome_divider2(genome_fasta, genome_file, WINDOW_SIZE, outname, stride=None): 47 | 48 | 49 | if outname is not None: 50 | OUTDIR=outname 51 | elif stride is not None: 52 | OUTDIR=os.path.splitext(genome_file)[0]+'_window'+str(WINDOW_SIZE)+'_stride'+str(stride) 53 | else: 54 | OUTDIR=os.path.splitext(genome_file)[0]+'_'+str(WINDOW_SIZE) 55 | try: 56 | os.makedirs(OUTDIR) 57 | except OSError as err: 58 | #print("OS error: {0}".format(err)) 59 | sys.exit(err) 60 | 61 | outbed=OUTDIR+'/genome.bed' 62 | outfasta=OUTDIR+'/genome.fa' 63 | """ 64 | if outname is not None: 65 | outbed=outname+'.bed' 66 | outfasta=outname+'.fa' 67 | elif stride is not None: 68 | outbed=os.path.splitext(genome_file)[0]+'_window'+str(WINDOW_SIZE)+'_stride'+str(stride)+'.bed' 69 | outfasta=os.path.splitext(genome_file)[0]+'_window'+str(WINDOW_SIZE)+'_stride'+str(stride)+'.fa' 70 | else: 71 | outbed=os.path.splitext(genome_file)[0]+'_'+str(WINDOW_SIZE)+'.bed' 72 | outfasta=os.path.splitext(genome_file)[0]+'_'+str(WINDOW_SIZE)+'.fa' 73 | """ 74 | #if stride==None: 75 | #stride=WINDOW_SIZE/2 76 | #adding=WINDOW_SIZE/stride 77 | 78 | #WINDOW_SIZE=1000 79 | #genome_file="/home/fast/onimaru/lamprey/LetJap1.0.1.genome" 80 | #with open(genome_file, 'r') as fin, open('/home/fast/onimaru/data/genome_fasta/hg38_1000_altwindow.bed', 'w') as fout1, open('/home/fast/onimaru/data/genome_fasta/hg38_1000_.bed', 'w') as fout2: 81 | with open(genome_file, 'r') as fin, open(outbed, 'w') as fout1: 82 | 83 | for line in fin: 84 | line=line.split() 85 | chrom=line[0] 86 | chrom_size=int(line[1]) 87 | #divide_num=chrom_size/WINDOW_SIZE 88 | 89 | #divide_num=chrom_size/WINDOW_SIZE-4 90 | i=0 91 | while WINDOW_SIZE+stride*i<=chrom_size: 92 | fout1.write(str(chrom)+'\t'+str(stride*i)+'\t'+str(WINDOW_SIZE+stride*i)+'\n') 93 | i+=1 94 | 95 | 96 | try: 97 | stdout_file=open(outbed+"_tmp", "w") 98 | sp.check_call(["bedtools", "sort", "-i",outbed], stdout=stdout_file) 99 | stdout_file.close() 100 | 101 | except sp.CalledProcessError as e: 102 | print("bedtools sort failed") 103 | sys.exit(e) 104 | if os.path.exists(outbed) and os.path.exists(outbed+"_tmp"): 105 | os.remove(outbed) 106 | os.rename(outbed+"_tmp", outbed) 107 | else: 108 | sys.exit("bed file was not created.") 109 | 110 | try: 111 | sp.check_call(["bedtools", "getfasta","-fi",genome_fasta,"-bed",outbed, "-fo", outfasta]) 112 | except sp.CalledProcessError as e: 113 | print("bedtools getfasta failed") 114 | sys.exit(e) 115 | 116 | print(outbed+" and "+outfasta+' were successfully generated.') 117 | 118 | def genome_file_maker(genome_fasta, genome_file): 119 | 120 | length_list=[] 121 | 122 | with open(genome_fasta, 'r') as fin, open(genome_file, 'w') as fout: 123 | seq=0 124 | chrom_name='' 125 | for line in fin: 126 | 127 | if '>' in line: 128 | 129 | if not seq==0: 130 | length_list.append(seq) 131 | #if not "_" in chrom_name and not "M" in chrom_name: 132 | fout.write(str(chrom_name)+'\t'+str(seq)+'\n') 
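# a '>' header ends the previous record, so the accumulated length is flushed above before the new chromosome name is parsed below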
line=line.split() 134 | chrom_name=line[0].strip('>') 135 | seq=0 136 | else: 137 | line1=line.strip("\n") 138 | seq+=len(line1) 139 | #if len(chrom_name)==3 and not "M" in chrom_name: 140 | fout.write(str(chrom_name)+'\t'+str(seq)+'\n') 141 | 142 | def run(args): 143 | genome_fasta=args.genome_fasta 144 | windowsize=args.windowsize 145 | genome_file=os.path.splitext(genome_fasta)[0]+'.genome' 146 | outname=args.outname 147 | stride=args.stride 148 | if not os.path.isfile(genome_file): 149 | print("generating genome file.") 150 | genome_file_maker(genome_fasta,genome_file) 151 | else: 152 | print("using a pre-existing genome file: "+genome_file) 153 | genome_divider2(genome_fasta, genome_file, windowsize, outname, stride=stride) 154 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/cython_util.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | import sys 3 | import numpy as np 4 | from scipy.spatial.distance import cdist 5 | 6 | def narrowPeak_writer(str out_dir,list y_prediction2,list position_list): 7 | cdef str filename_1 8 | filename_1=out_dir+'.narrowPeak' 9 | print('writing '+filename_1) 10 | output_handle=open(filename_1, 'w') 11 | cdef int k=0 12 | cdef float value 13 | cdef str chrom, start_, end_ 14 | for i in range(len(y_prediction2)): 15 | 16 | a=position_list[i].strip('>') 17 | #print(str(a)+'\t'+str(y_prediction2[i])) 18 | k+=1 19 | a=a.split(':') 20 | chrom=a[0] 21 | b=a[1].split('-') 22 | start_=b[0] 23 | end_=b[1] 24 | value=y_prediction2[i] 25 | output_handle.write(str(chrom)+'\t' 26 | +str(start_)+'\t' 27 | +str(end_)+'\t.\t' 28 | +str(value*1000).strip('[]')+'\t.\t' 29 | +str(value).strip('[]')+"\t-1\t-1\t-1\n") 30 | 31 | print("prediction num: "+str(k)) 32 | output_handle.close() 33 | print('finished writing '+filename_1) 34 | 35 | 36 | def motif_compare(motif_data_dict, long_motif_dict, fout, THRESHOLD=-5.0): 37 | cdef int i,k, j, l=0 38 | cdef str k1, k2 39 | cdef double RAND_MEAN, RAND_DEV,DIST,Z_SCORE, ic 40 | cdef list comp_result, comp_result2, cpr, Z_SCORE_list=[] 41 | cdef int[2] v2shape,v1shape 42 | #cdef double[4] pm1 43 | with open(fout, "w") as f: 44 | comp_result2=[] 45 | f.write("Motif name\tStart\tEnd\tDistance\n") 46 | for k1, v1 in long_motif_dict.items(): 47 | 48 | v1shape=v1.shape 49 | #print v1shape 50 | for k2, v2 in motif_data_dict.items(): 51 | ic1=0 52 | if "secondary" in k2: 53 | continue 54 | #print k2 55 | #j+=1 56 | #print j 57 | v2shape=v2.shape 58 | #print v2shape 59 | """for i in range(v2shape[0]): 60 | ic=np.nansum(v2[i]*np.log2(v2[i]*4+0.000001)) 61 | v2[i]=v2[i]""" 62 | 63 | RAND_DIST=np.zeros([500], np.float32) 64 | for i in range(500): 65 | rand=np.random.rand(v2shape[0],v2shape[1]) 66 | for k in range(v2shape[0]): 67 | rand[k]=rand[k]/np.sum(rand[k]) 68 | #rand[k]=pm1*(np.sum(pm1*np.log2(pm1*4+0.00001))) 69 | #RAND_DIST.append(np.mean(np.diagonal(cdist(v2, rand,metric='euclidean')))) 70 | M=0.5*(rand+v2)+0.00001 71 | DIST=0.5*(np.sum(-v2*np.log(M/(v2+0.00001)))+np.sum(-rand*np.log(M/(rand+0.00001))))/float(v2shape[0]) 72 | #DIST=-np.sum(v2*np.log(rand+0.00001)+(1.0-v2)*np.log(1.0-rand+0.00001))/float(v2shape[0]) 73 | RAND_DIST[i]+=DIST 74 | 75 | RAND_MEAN=np.mean(RAND_DIST) 76 | RAND_DEV=np.std(RAND_DIST) 77 | #print RAND_MEAN, RAND_DEV 78 | #print("randome_dist: "+str(RAND_DIST)) 79 | comp_result=[] 80 | for i in range(v1shape[0]-v2shape[0]): 81 | #partial_motif=[] 82 | #for j in range(v2shape[0]): 83 | # 
pm1=v1[i+j] 84 | #ic=np.sum(pm1*np.log2(pm1*4+0.000001)) 85 | partial_motif_=v1[i:i+v2shape[0]] 86 | #partial_motif_=np.array(partial_motif) 87 | #partial_motif=v1[i:(i+v2shape[0])] 88 | #print v2shape, np.shape(partial_motif) 89 | M=0.5*(partial_motif_+v2)+0.00001 90 | DIST=0.5*(np.sum(-v2*np.log(M/(v2+0.00001)))+np.sum(-partial_motif_*np.log(M/(partial_motif_+0.00001))))/float(v2shape[0]) 91 | #print JSD 92 | v2_comp=np.flip(np.flip(v2,0),1) 93 | M_comp=0.5*(partial_motif_+v2_comp)+0.00001 94 | DIST_comp=0.5*(np.sum(-v2_comp*np.log(M_comp/(v2_comp+0.00001)))+np.sum(-partial_motif_*np.log(M_comp/(partial_motif_+0.00001))))/float(v2shape[0]) 95 | #DIST=np.mean(np.diagonal(cdist(v2, partial_motif_,metric='euclidean'))) 96 | #DIST=np.mean(np.diagonal(cdist(v2, partial_motif_,metric='euclidean'))) 97 | #DIST_comp=np.mean(np.diagonal(cdist(v2_comp, partial_motif_,metric='euclidean'))) 98 | 99 | #DIST=-np.sum(v2*np.log(partial_motif_+0.00001)+(1.0-v2)*np.log(1.0-partial_motif_+0.00001))/float(v2shape[0]) 100 | #DIST_comp=-np.sum(v2_comp*np.log(partial_motif_+0.00001)+(1.0-v2_comp)*np.log(1.0-partial_motif_+0.00001))/float(v2shape[0]) 101 | ori="+" 102 | if DIST_comp<DIST: 103 | DIST=DIST_comp 104 | ori="-" 105 | Z_SCORE=(DIST-RAND_MEAN)/RAND_DEV 106 | #print Z_SCORE 107 | if Z_SCORE<=THRESHOLD: 108 | l+=1 109 | Z_SCORE_list.append(Z_SCORE) 110 | comp_result.append([k2, i, i+v2shape[0], Z_SCORE, ori]) 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | if len(comp_result)>=10: 119 | comp_result.sort(key = lambda x: x[3]) 120 | for cpr in comp_result[-10:]: 121 | comp_result2.append(cpr) 122 | 123 | comp_result2.sort(key = lambda x: x[1]) 124 | for cpr in comp_result2: 125 | f.write("\t".join([str(cpr[0]),str(cpr[1]),str(cpr[2]),str(cpr[3]),str(cpr[4])])+"\n") 126 | 127 | 128 | print("the number of motif matches: "+str(l)) 129 | return Z_SCORE_list 130 | 131 | 132 | 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /deepgmap/data_preprocessing_tools/genome_labeling2.py: -------------------------------------------------------------------------------- 1 | 2 | import glob as glb 3 | import sys 4 | import numpy as np 5 | from sklearn.decomposition import KernelPCA as pca_f 6 | import os 7 | import matplotlib as mpl 8 | mpl.use("WebAgg") 9 | import matplotlib.pyplot as plt 10 | from scipy.spatial.distance import pdist 11 | import scipy.cluster.hierarchy as sch 12 | import time 13 | import copy 14 | 15 | def genome_label(bed_file_list, genome_1000,out_dir): 16 | 17 | file_num=len(bed_file_list) 18 | 19 | #print file_num 20 | peak_set_list=[] 21 | peak_set_list_append=peak_set_list.append 22 | #start=time.time() 23 | i=0 24 | for f in bed_file_list: 25 | peak_set=set() 26 | peak_set_add=peak_set.add 27 | with open(f, 'r') as fin: 28 | 29 | 30 | for line in fin: 31 | if i==0: 32 | _,a,b=line.split() 33 | check_length=int(b)-int(a) 34 | 35 | peak_set_add(line) 36 | peak_set_list_append(peak_set) 37 | 38 | i+=1 39 | 40 | fo_name=out_dir 41 | label_array_list=[] 42 | label_array_list_append=label_array_list.append 43 | with open(genome_1000,'r') as fin: 44 | with open(fo_name,'w') as fout: 45 | fout.write("#sample_list: "+"\t".join(bed_file_list)+"\n") 46 | i=0 47 | 48 | for line in fin: 49 | k=0 50 | label_array=["0" for h in range(file_num)] 51 | 52 | for s in peak_set_list: 53 | if i==0: 54 | _,a,b=line.split() 55 | assert check_length==int(b)-int(a), "mismatches in sequence lengths" 56 | if line in s: 57 | label_array[k]="1" 58 | k+=1 59 | fout.write(line.strip('\n')+'\t'+' '.join(label_array)+'\n') 60 | #if sum(label_array)>0: 61 | #label_array_list_append(label_array) 62 | i+=1 63 | if i%200000==0: 64 | 65 | sys.stdout.write("\rwriting labeled file "+ line.strip("\n")) 66 | sys.stdout.flush() 67 | #print time.time()-start 68 | #sys.exit() 69 | 
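# genome_label checks every peak set for every window (O(windows x samples)); genome_label2 below inverts this with a single dict keyed by the peak line, which is why main() calls genome_label2.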
print("\n"+fo_name+" has been saved. This file is going to be used when testing a trained model too.") 70 | #return label_array_list 71 | 72 | 73 | def genome_label2(bed_file_list, genome_1000,out_dir): 74 | 75 | file_num=len(bed_file_list) 76 | 77 | #print file_num 78 | peak_set_dict={} 79 | #peak_set_list_append=peak_set_list.append 80 | start=time.time() 81 | i=0 82 | #zero=["0" for h in range(file_num)] 83 | for f in bed_file_list: 84 | with open(f, 'r') as fin: 85 | for line in fin: 86 | if i==0: 87 | _,a,b=line.split() 88 | check_length=int(b)-int(a) 89 | 90 | if not line in peak_set_dict: 91 | peak_set_dict[line]=["0" for h in range(file_num)] 92 | #peak_set_dict[line]=copy.deepcopy(zero) 93 | peak_set_dict[line][i]="1" 94 | i+=1 95 | print(time.time()-start) 96 | fo_name=out_dir 97 | label_array_list=[] 98 | label_array_list_append=label_array_list.append 99 | zero=' '.join(["0" for h in range(file_num)]) 100 | with open(genome_1000,'r') as fin: 101 | with open(fo_name,'w') as fout: 102 | fout.write("#sample_list: "+"\t".join(bed_file_list)+"\n") 103 | #start=time.time() 104 | i=0 105 | for line in fin: 106 | if i==0: 107 | _,a,b=line.split() 108 | assert check_length==int(b)-int(a), "mismatches in sequence lengths" 109 | if line in peak_set_dict: 110 | fout.write(line.strip('\n')+'\t'+' '.join(peak_set_dict[line])+'\n') 111 | else: 112 | fout.write(line.strip('\n')+'\t'+zero+'\n') 113 | #if sum(label_array)>0: 114 | #label_array_list_append(label_array) 115 | i+=1 116 | if i%200000==0: 117 | 118 | sys.stdout.write("\rwriting labeled file "+ line.strip("\n")) 119 | sys.stdout.flush() 120 | print("genome_labeling2 "+str(time.time()-start)) 121 | #sys.exit() 122 | print("\n"+fo_name+" has been saved. This file is going to be used when testing a trained model too.") 123 | #return label_array_list 124 | 125 | 126 | def main(): 127 | #bed_file_dir, genome_1000, out_dir=sys.argv[1:] 128 | bed_file_dir="/home/fast/onimaru/deepgmap/data/inputs/hg38_dnase/peaks_10k/test_hg38_window1000_stride300.bed_list/*" 129 | genome_1000="/home/fast/onimaru/deepgmap/data/genomes/hg38_window1000_stride300.bed" 130 | out_dir="/home/fast/onimaru/deepgmap/data/inputs/hg38_dnase/peaks_10k/test.labeled" 131 | bed_file_list=[] 132 | if not "*" in bed_file_dir and bed_file_dir.endswith('.bed'): 133 | bed_file_list.append(bed_file_dir) 134 | elif not '*' in bed_file_dir: 135 | bed_file_dir=bed_file_dir+"*.bed" 136 | 137 | bed_file_list=sorted(glb.glob(bed_file_dir)) 138 | print(bed_file_list) 139 | if len(bed_file_list)==0: 140 | print("no files in "+str(bed_file_dir)) 141 | sys.exit() 142 | label_array_list=genome_label2(bed_file_list, genome_1000,out_dir) 143 | print(label_array_list[0]) 144 | label_array_list_=np.transpose(label_array_list) 145 | print(sum(label_array_list_[0])) 146 | pca = pca_f(n_components=2, kernel="rbf") 147 | X_pca=pca.fit_transform(label_array_list_) 148 | dist1=pdist(label_array_list_, 'cosine') 149 | _, ax1=plt.subplots() 150 | 151 | Y = sch.linkage(dist1, method='ward') 152 | Z1 = sch.dendrogram(Y) 153 | idx1 = Z1['leaves'] 154 | 155 | new_sample_list=[] 156 | 157 | for i in idx1: 158 | txt=bed_file_list[i].split("/")[-1] 159 | new_sample_list.append(txt) 160 | ax1.set_xticklabels(new_sample_list , rotation=90) 161 | 162 | 163 | print(X_pca.shape) 164 | _, ax2=plt.subplots() 165 | ax2.scatter(X_pca[:,0], X_pca[:,1]) 166 | for i, txt in enumerate(bed_file_list): 167 | txt=txt.split("/")[-1] 168 | ax2.annotate(txt, (X_pca[i,0],X_pca[i,1])) 169 | 170 | #plt.show() 171 | if __name__ == 
'__main__': 172 | main() 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/ROC_space_plotter2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | from sklearn import svm, datasets 5 | from sklearn.metrics import roc_curve, auc 6 | from sklearn.preprocessing import label_binarize 7 | from sklearn.multiclass import OneVsRestClassifier 8 | from scipy import interp 9 | from scipy import stats 10 | import getopt 11 | from glob import glob 12 | from natsort import natsorted 13 | from sklearn.metrics import precision_recall_curve 14 | from sklearn.metrics import average_precision_score 15 | from sklearn.metrics import f1_score 16 | import matplotlib as mpl 17 | import os 18 | from decimal import Decimal 19 | import pandas as pd 20 | 21 | def roc_space_calc(label,pred): 22 | 23 | # Compute ROC curve and ROC area for each class 24 | 25 | fpr, tpr, _ = roc_curve(label, pred) 26 | roc_auc = auc(fpr, tpr) 27 | 28 | return fpr, tpr, roc_auc 29 | 30 | 31 | 32 | def roc_space_plotter(label, predictions1,outfile_name): 33 | ind_list=['es', 'brain', 'limb'] 34 | pos=[0,1] 35 | width=0.25 36 | predictions_list=[] 37 | label_array=np.array(label) 38 | label_array_shape=label_array.shape 39 | for pred in predictions1: 40 | print(pred["prediction"].shape) 41 | predictions_list.append(pred["prediction"]) 42 | """df_rearanged = pd.DataFrame({ 43 | ind_list[0] : [[], []], 44 | ind_list[1] : [[], []], 45 | ind_list[2] : [[], []], 46 | },index = ["deepsea", "conv4-frss"])""" 47 | 48 | data_dict={} 49 | for n, i in enumerate(predictions_list): 50 | if n<3: 51 | _key='deepsea' 52 | 53 | else: 54 | _key='conv4-frss' 55 | 56 | if not _key in data_dict: 57 | data_dict[_key]={} 58 | 59 | for j in range(label_array_shape[1]): 60 | 61 | _tmp_pred=np.where(i[:,j]>=0.5, 1,0) 62 | _tmp_label=label_array[:,j] 63 | #true_pos=((_tmp_pred+_tmp_label) ==2).sum() 64 | false_pos=((_tmp_label-_tmp_pred) <0).sum() 65 | #false_neg=((_tmp_label-_tmp_pred) ==1).sum() 66 | if not ind_list[j] in data_dict[_key]: 67 | data_dict[_key][ind_list[j]]=[] 68 | data_dict[_key][ind_list[j]].append(float(false_pos)) 69 | 70 | for k in ind_list: 71 | a=data_dict['deepsea'][k] 72 | b=data_dict['conv4-frss'][k] 73 | s,p=stats.ttest_ind(a,b) 74 | print(p, k) 75 | 76 | 77 | 78 | df=pd.DataFrame(columns=["class1", "class2", "mean","stdv"]) 79 | 80 | """class1=[] 81 | class2=[] 82 | name_of_class=["model","cell-type"] 83 | data_dict2={}""" 84 | for k, v in data_dict.items(): 85 | #print k, v 86 | for k1,v1 in v.items(): 87 | """for e in v1: 88 | class1.append(k) 89 | class2.append(k1) 90 | if not k in data_dict2: 91 | data_dict2[k]=[] 92 | data_dict2[k].append(e)""" 93 | 94 | df=df.append({"class1":k, "class2":k1, "mean":np.mean(v1),"stdv":np.std(v1)}, ignore_index=True) 95 | 96 | 97 | """print data_dict2 98 | print class1 99 | print class2 100 | ix3 = pd.MultiIndex.from_arrays([class1, class2], names=name_of_class) 101 | df3 = pd.DataFrame(data_dict2, index=ix3) 102 | gp3 = df3.groupby(level=name_of_class) 103 | means = gp3.mean() 104 | errors = gp3.std() 105 | fig, ax = plt.subplots() 106 | means.plot.bar(yerr=errors, ax=ax) 107 | """ 108 | #print df 109 | yerr=df.pivot(index='class2',columns='class1',values='stdv') 110 | #print np.shape(yerr) 111 | #print df.pivot(index='class2',columns='class1',values='mean') 112 | 
df.pivot(index='class2',columns='class1',values='mean').plot(kind='bar', yerr=yerr) 113 | 114 | #df.pivot(index='class1',columns='class2',values='mean').plot(kind='bar', yerr=df.std.reshape((2,3))) 115 | #print df.pivot(index='class1',columns='class2',values='std').values 116 | #df.pivot(index='class1',columns='class2',values='mean').plot(kind='bar') 117 | plt.grid(b=True, which='major', color='gray', linestyle='-',axis= 'y') 118 | plt.grid(b=True, which='minor', color='gray', linestyle='--',axis= 'y') 119 | plt.minorticks_on() 120 | #plt.grid(True) 121 | 122 | #plt.minorticks_on() 123 | plt.show() 124 | #print false_pos 125 | #print round(false_pos/np.float(false_pos+true_pos), 4) 126 | #print f1_score(label_array[:,j], ) 127 | 128 | 129 | def main(): 130 | outfile_name="/home/fast/onimaru/data/prediction/ROC_space_curve_comp_limb_brain.pdf" 131 | npload_list1=[] 132 | npload_list2=[] 133 | label_array=[] 134 | label_array_append=label_array.append 135 | chromosome="chr2" 136 | #name_list=["DeepSEA", "Bidirectional","Conv_plus","Conv+Bidirectional"] 137 | name_list=["conv4frss", "deepsea"] 138 | file_list1=[ 139 | "/home/fast2/onimaru/DeepGMAP-dev/data/predictions/quick_benchmark/deepsea_Fri_Apr_20_140717_2018.ckpt-16747_prediction.npz", 140 | "/home/fast2/onimaru/DeepGMAP-dev/data/predictions/deepsea_Thu_Jun__7_072332_2018.ckpt-16747_prediction.npz", 141 | "/home/fast2/onimaru/DeepGMAP-dev/data/predictions/quick_benchmark/deepsea_Thu_Apr_26_115030_2018.ckpt-16747_prediction.npz", 142 | '/home/fast2/onimaru/DeepGMAP-dev/data/predictions/conv4frss_Fri_Jun__8_101931_2018.ckpt-16747_prediction.npz', 143 | '/home/fast2/onimaru/DeepGMAP-dev/data/predictions/conv4frss_Fri_Jun__8_122816_2018.ckpt-16747_prediction.npz', 144 | "/home/fast2/onimaru/DeepGMAP-dev/data/predictions/quick_benchmark/deepsharktest_Thu_Apr_19_191806_2018.ckpt-16747_prediction.npz", 145 | ] 146 | label_file_array="/home/fast/onimaru/deepgmap/data/inputs/mm10_dnase_subset/dnase_summits_subset_mm10_1000_chr2_testlabels.npz" 147 | if not os.path.isfile(label_file_array): 148 | label_file='/home/fast/onimaru/deepgmap/data/inputs/mm10_dnase_subset/dnase_summits_subset_mm10_1000.bed.labeled' 149 | with open(label_file, 'r') as fin: 150 | for line in fin: 151 | if line.startswith(chromosome): 152 | line=line.split() 153 | #print line 154 | label_array_append(list(map(int, line[3:]))) 155 | label_array=np.array(label_array) 156 | np.savez_compressed( label_file_array, labels=label_array,) 157 | else: 158 | label_array=np.load(label_file_array)["labels"] 159 | for f in file_list1: 160 | npload_list1.append(np.load(f)) 161 | 162 | roc_space_plotter(label_array, npload_list1,outfile_name) 163 | 164 | 165 | if __name__== '__main__': 166 | main() 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /deepgmap/post_train_tools/sequence_visualizer2.py: -------------------------------------------------------------------------------- 1 | 2 | import cairocffi as cairo 3 | import numpy as np 4 | 5 | def _select_color(cr, DNA): 6 | if DNA=="A": 7 | cr.set_source_rgb(1, 0, 0) 8 | elif DNA=="G": 9 | cr.set_source_rgb(0.8, 0.8, 0) 10 | elif DNA=="C": 11 | cr.set_source_rgb(0, 0, 1) 12 | elif DNA=="T": 13 | cr.set_source_rgb(0, 1, 0) 14 | else: 15 | cr.set_source_rgb(0.8, 0.8, 0.8) 16 | def seuquence_visualizer2(npz_file, output_file): 17 | 18 | if type(npz_file)==str: 19 | with np.load(npz_file) as f: 20 | reconstruct=f["recon"] 21 | else: 22 | reconstruct=npz_file 23 | 24 
    line_num = 10
    DNA_len = 1000

    reconstruct = np.reshape(reconstruct, (DNA_len, 4))
    # Rescale so that the largest weight maps to a drawable letter height.
    max_value = np.max(reconstruct)
    reconstruct = 80 * reconstruct / max_value

    width = DNA_len * 30 // line_num + 200
    height = 1024 * 2 * 3
    y_center = 300
    ims1 = cairo.PDFSurface(output_file, width, height)
    cr = cairo.Context(ims1)
    cr.move_to(100, y_center)
    cr.line_to(DNA_len // line_num * 30 + 100, y_center)
    cr.set_line_width(2)
    cr.stroke()

    meme_fileout = open(output_file + '.meme', 'w')
    meme_fileout.write("MEME version 4\n\nALPHABET= ACGT\n\nstrands: + -\n\n"
                       "Background letter frequencies (from uniform background):\n"
                       "A 0.2500 C 0.2500 G 0.2500 T 0.2500\n\n"
                       "MOTIF LONG_MOTIF\n\n"
                       "letter-probability matrix: alength= 4 w= 1000 nsites= 20 E= 0\n")
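    # Each position k below is processed in three steps: (1) the four channel
    # weights are softmaxed into a probability vector, (2) that vector is
    # written out as one row of the MEME letter-probability matrix, and
    # (3) each letter is drawn with a height proportional to its probability
    # times the information content IC = sum_i p_i * log2(4 * p_i) bits.
    # For example, p = (0.97, 0.01, 0.01, 0.01) gives IC of about 1.76 bits,
    # while the uniform p = (0.25, 0.25, 0.25, 0.25) gives IC = 0 (up to the
    # small epsilon), so uninformative positions collapse to nothing.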
    for k in range(DNA_len):
        if k != 0 and k % (DNA_len // line_num) == 0:
            # Start a new row of the logo every DNA_len/line_num bases.
            cr.set_source_rgba(0.0, 0.0, 0, 1.0)
            y_center += 300
            cr.move_to(100, y_center)
            cr.line_to(DNA_len // line_num * 30 + 100, y_center)
            cr.stroke()

        probability = np.round(np.true_divide(np.exp(reconstruct[k]), np.nansum(np.exp(reconstruct[k]))), 6)
        probability /= np.nansum(probability)
        for i in range(4):
            if np.isnan(probability[i]):
                probability[i] = 0.0

        # MEME expects columns in A C G T order, while the channel order here
        # is A, G, C, T, hence the 0, 2, 1, 3 indexing.
        to_print = str(probability[0]) + " " + str(probability[2]) + " " + str(probability[1]) + " " + str(probability[3]) + "\n"
        meme_fileout.write(to_print)

        ic = np.nansum(probability * np.log2(probability * 4 + 0.0001)) * 120
        A = ["A", probability[0] * ic]
        G = ["G", probability[1] * ic]
        C = ["C", probability[2] * ic]
        T = ["T", probability[3] * ic]
        values = [A, G, C, T]
        # filter() returns an iterator under Python 3, so build a list before
        # sorting; letters are drawn smallest first, stacking upwards.
        pos = [v for v in values if v[1] >= 0]
        pos.sort(key=lambda x: x[1])
        Nucpos = 0
        x_pos = k % (DNA_len // line_num)

        for l in range(len(pos)):
            Nuc = pos[l][0]
            Nucsize = abs(pos[l][1]) + 0.1
            cr.move_to(100 + x_pos * 40 * 0.75, y_center - Nucpos * 0.75)
            cr.select_font_face("Sans", cairo.FONT_SLANT_NORMAL, cairo.FONT_WEIGHT_NORMAL)
            _select_color(cr, Nuc)
            font_mat = cairo.Matrix(xx=40.0, yx=0.0, xy=0.0, yy=Nucsize, x0=0.0, y0=0.0)
            cr.set_font_matrix(font_mat)
            cr.show_text(str(Nuc))
            Nucpos += abs(pos[l][1])

    meme_fileout.close()
    cr.show_page()


def main():
    npz_file = '/home/fast2/onimaru/DeepGMAP-dev/data/reconstructions/conv4frss_Fri_May_11_075425_2018.ckpt-16747Tue_May_15_112518_2018_all_.npz'
    output_file = npz_file + '.pdf'
    sequence_visualizer2(npz_file, output_file)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/kernel_visualizer2.py:
--------------------------------------------------------------------------------
import sys
import os
import math

import cairocffi as cairo
import numpy as np
from PIL import Image
# PNG bytes must go through BytesIO; StringIO only handles text on Python 3.
from io import BytesIO


def _select_color(cr, DNA):
    if DNA == "A":
        cr.set_source_rgb(1, 0, 0)
    elif DNA == "G":
        cr.set_source_rgb(0.8, 0.8, 0)
    elif DNA == "C":
        cr.set_source_rgb(0, 0, 1)
    elif DNA == "T":
        cr.set_source_rgb(0, 1, 0)
    else:
        cr.set_source_rgb(0.8, 0.8, 0.8)


def rectangle(x, y, w, h, lw, context):
    context.set_line_width(lw)
    context.move_to(x, y)
    context.rel_line_to(w, 0)
    context.rel_line_to(0, h)
    context.rel_line_to(-w, 0)
    context.close_path()


def sequence_visualizer(npz_file):
    png_list = []
    with np.load(npz_file) as f:
        kernels = f["prediction/W_conv1:0"]

    kernel_shape = kernels.shape
    # Drop the singleton input-channel axis: (filter_len, 4, 1, n_kernels)
    # becomes e.g. (9, 4, 320).
    kernels = np.reshape(kernels, (kernel_shape[0], kernel_shape[1], kernel_shape[3]))
    kernel_shape = kernels.shape
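    # Each of the kernel_shape[2] first-layer kernels is rendered as a small
    # sequence logo: the 9x4 weight matrix is sharpened into a pseudo-PWM
    # with a temperature-scaled softmax (np.exp(w * 100.0), normalized per
    # position), then drawn with cairo and also written in MEME format.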
    width = kernel_shape[0] * 40 + 10
    height = 150
    y_center = height * 0.8

    prefix = os.path.splitext(npz_file)[0] + "_kernels/"
    if not os.path.isdir(prefix):
        try:
            os.mkdir(prefix)
        except OSError:
            sys.exit("could not create " + prefix)

    meme_fileout = open(prefix + 'motifs.meme', 'w')
    meme_def = ("MEME version 4\n\nALPHABET= ACGT\n\nstrands: + -\n\n"
                "Background letter frequencies (from uniform background):\n"
                "A 0.2500 C 0.2500 G 0.2500 T 0.2500\n\n")
    meme_fileout.write(meme_def)
    kernel_shape_ic_list = []
    for k in range(kernel_shape[2]):
        meme_def = "MOTIF kernel_" + str(k) + "\n\nletter-probability matrix: alength= 4 w= 9 nsites= 9 E= 0\n"
        meme_fileout.write(meme_def)
        ims1 = cairo.PDFSurface(None, width, height)
        cr = cairo.Context(ims1)
        cr.set_source_rgb(0.0, 0.0, 0)
        cr.move_to(width * 0.1, y_center)
        cr.line_to(width * 0.9, y_center)
        cr.set_line_width(2)
        cr.stroke()
        cr.move_to(width * 0.1, y_center)
        cr.line_to(width * 0.1, y_center - 120)
        cr.set_line_width(2)
        cr.stroke()
        cr.move_to(width * 0.1, y_center - 60)
        cr.line_to(width * 0.08, y_center - 60)
        cr.set_line_width(2)
        cr.stroke()
        cr.move_to(width * 0.075, y_center - 60 + 4 * 10)
        cr.rotate(-90 * math.pi / 180.0)
        cr.select_font_face("Sans", cairo.FONT_SLANT_NORMAL, cairo.FONT_WEIGHT_NORMAL)
        font_mat = cairo.Matrix(xx=32.0, yx=0.0, xy=0.0, yy=32, x0=0.0, y0=0.0)
        cr.set_font_matrix(font_mat)
        cr.show_text("2 bit")
        cr.rotate(90 * math.pi / 180.0)
        font_mat = cairo.Matrix(xx=12.0, yx=0.0, xy=0.0, yy=12, x0=0.0, y0=0.0)
        cr.move_to(width * 0.5, height)
        cr.show_text("k" + str(k))

        xkernel = kernels[:, :, k]
        # Sharpen the weights into a pseudo-probability matrix: exponentiate
        # and normalize each position across the four channels.
        xkernel = np.exp(xkernel * 100.0)
        probability = xkernel / np.nansum(xkernel, axis=1)[:, None]
        # MEME columns are in A C G T order; channels here are A, G, C, T.
        for p in probability:
            to_print = str(p[0]) + " " + str(p[2]) + " " + str(p[1]) + " " + str(p[3]) + "\n"
            meme_fileout.write(to_print)
        meme_fileout.write("\n\n")
        ic_sum = 0.0
        for pind, p in enumerate(probability):
            ic = np.nansum(p * np.log2(p * 4 + 0.0001)) * 80
            ic_sum += ic
            A = ["A", p[0] * ic]
            G = ["G", p[1] * ic]
            C = ["C", p[2] * ic]
            T = ["T", p[3] * ic]
            values = [A, G, C, T]
            # filter() returns an iterator under Python 3, so build a list
            # before sorting.
            pos = [v for v in values if v[1] >= 0]
            pos.sort(key=lambda x: x[1])
            Nucpos = 0.01
            x_pos = width * 0.1 + pind * 30

            for l in range(len(pos)):
                Nuc = pos[l][0]
                Nucsize = pos[l][1] + 0.01
                cr.move_to(x_pos, y_center - Nucpos * 0.75)
                cr.select_font_face("Sans", cairo.FONT_SLANT_NORMAL, cairo.FONT_WEIGHT_NORMAL)
                _select_color(cr, Nuc)
                font_mat = cairo.Matrix(xx=40.0, yx=0.0, xy=0.0, yy=Nucsize, x0=0.0, y0=0.0)
                cr.set_font_matrix(font_mat)
                cr.show_text(str(Nuc))
                Nucpos += abs(pos[l][1])
        ims1.write_to_png(prefix + "kernel_" + str(k) + '.png')
        png_list.append(prefix + "kernel_" + str(k) + '.png')
        kernel_shape_ic_list.append(ic_sum)
        cr.show_page()
    meme_fileout.close()

    return png_list, kernel_shape_ic_list
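
# A minimal usage sketch (hypothetical path; the real entry point is main()
# below). The npz is assumed to store the first convolutional layer under
# the key "prediction/W_conv1:0", as read above:
#
#   png_list, ic_list = sequence_visualizer("conv4frss_trained_variables.npz")
#   kernel_connector(png_list)   # tiles all logos onto a single PDF page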

def kernel_connector(png_list):
    # Tile the per-kernel PNGs onto a single A4 PDF page, ten logos per row.
    pt_per_mm = 72 / 25.4
    width, height = 210 * pt_per_mm, 297 * pt_per_mm
    upper_lim = height * 0.1
    lateral_lim = width * 0.1

    out_dir = os.path.split(png_list[0])[0] + "/kernels.pdf"
    ims1 = cairo.PDFSurface(out_dir, width, height)
    cr = cairo.Context(ims1)
    im = Image.open(png_list[0])
    xwidth = int(width * 0.8 / 10.0) + 5
    ywidth = int(im.size[1] * xwidth / float(im.size[0]))
    for k, i in enumerate(png_list):
        im = Image.open(i)
        im = im.resize([xwidth, ywidth], Image.ANTIALIAS)
        _buffer = BytesIO()
        im.save(_buffer, format="PNG", quality=100)
        _buffer.seek(0)
        png_image = cairo.ImageSurface.create_from_png(_buffer)
        cr.save()
        # Integer division keeps ten logos per row.
        cr.set_source_surface(png_image, lateral_lim + (xwidth - 5) * (k % 10), upper_lim + ywidth * (k // 10))
        cr.paint()
        cr.restore()

    cr.show_page()


def main():
    if len(sys.argv) > 1:
        npz_file = sys.argv[1]
    else:
        npz_file = '/home/fast2/onimaru/DeepGMAP-dev/data/outputs/conv4frss_Mon_Feb_25_092345_2019_trained_variables.npz'

    png_list, kernel_shape_ic_list = sequence_visualizer(npz_file)
    kernel_connector(png_list)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/deconvolution_to_signal.py:
--------------------------------------------------------------------------------
import sys
import time
import math
import os
import getopt
import importlib as il
from glob import glob

import tensorflow as tf
import numpy as np
from natsort import natsorted
from deepgmap.post_train_tools import unpooling


def test_batch(test_batch_file):
    with np.load(test_batch_file) as f:
        dnase_data_labels1 = f['labels'], f['data_array']
    images = np.reshape(dnase_data_labels1[1], (batch_size, data_length, 4, 1))
    labels = dnase_data_labels1[0]
    return images, labels


def genome_scan(filename):
    with open(filename, 'r') as f1:
        file_name = f1.name
        path_sep = os.path.sep
        file_name1 = file_name.split(path_sep)
        file_name2 = file_name1[-1].split('_')
        chromosome = file_name2[2]
        a = file_name2[3]
        b = a.split('.')
        chr_position = int(b[0])
        genome_seq = np.load(f1)
        shape_of_genome = genome_seq['genome'].shape
        genome_seq_re = np.reshape(genome_seq['genome'], (shape_of_genome[0], shape_of_genome[1], 4, 1))
        genome_seq_re_list = np.array_split(genome_seq_re, 100)
        return genome_seq_re_list, chromosome, chr_position


BATCH_SIZE = 1000
start = time.time()
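# Command-line interface (parsed with getopt below):
#   -m/--model                checkpoint of the trained model to restore
#   -t/--test_genome          glob pattern of .npz genome chunks holding
#                             "positions" and "sequences" arrays
#   -n/--network_constructor  module name under deepgmap.network_constructors
#   -o/--output_dir           directory for the outputs
# A hypothetical invocation:
#   python deconvolution_to_signal.py -m conv4frss_xxx.ckpt-16747 \
#       -n conv4frss -t "mm10_chunk_*.npz" -o /path/to/out/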
try:
    options, args = getopt.getopt(sys.argv[1:], 'm:t:n:o:', ['model=', 'test_genome=', 'network_constructor=', 'output_dir='])
except getopt.GetoptError as err:
    print(str(err))
    sys.exit(2)
if len(options) < 3:
    print('too few arguments')
    sys.exit(0)
for opt, arg in options:
    if opt in ('-m', '--model'):
        trained_model = arg
    elif opt in ('-t', '--test_genome'):
        test_genome = arg
    elif opt in ('-n', '--network_constructor'):
        network_constructor = arg
    elif opt in ('-o', '--output_dir'):
        output_dir = arg

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

keep_prob = tf.placeholder(tf.float32)
keep_prob2 = tf.placeholder(tf.float32)
keep_prob3 = tf.placeholder(tf.float32)

x_image = tf.placeholder(tf.float32, shape=[None, 1000, 4, 1])
y_ = tf.placeholder(tf.float32, shape=[None, 3])
phase = tf.placeholder(tf.bool)
dropout_1 = 0.95
dropout_2 = 0.9
dropout_3 = 0.85
batch_size = 100
data_length = 1000
input_dir = trained_model
nc = il.import_module("deepgmap.network_constructors." + str(network_constructor))
train_speed = 0.00005
a = time.asctime()
b = a.replace(':', '')
start_at = b.replace(' ', '_')

model = nc.Model(image=x_image, label=y_,
                 output_dir=output_dir,
                 phase=phase,
                 start_at=start_at,
                 keep_prob=keep_prob,
                 keep_prob2=keep_prob2,
                 keep_prob3=keep_prob3,
                 data_length=data_length,
                 max_to_keep=2,
                 GPUID="1")

sess.run(tf.global_variables_initializer())
saver = model.saver
saver.restore(sess, input_dir)

test_genome_list = natsorted(glob(test_genome))
if len(test_genome_list) == 0:
    sys.exit(test_genome + " does not exist.")


def conv2d_tp(x, W, output_shape):
    return tf.nn.conv2d_transpose(x, W, output_shape, strides=[1, 1, 1, 1], padding='VALID')


def conv2d_tp2(x, W, output_shape):
    return tf.nn.conv2d_transpose(x, W, output_shape, strides=[1, 2, 1, 1], padding='VALID')


def conv2d_tp4(x, W, output_shape):
    return tf.nn.conv2d_transpose(x, W, output_shape, strides=[1, 4, 1, 1], padding='VALID')


def max_pool_2x1(x):
    return tf.nn.max_pool_with_argmax(x, ksize=[1, 2, 1, 1], strides=[1, 2, 1, 1], padding='SAME')


def max_pool_4x1(x):
    return tf.nn.max_pool_with_argmax(x, ksize=[1, 4, 1, 1], strides=[1, 4, 1, 1], padding='SAME')


position_list = []
y_prediction2 = []
for test_genome_ in test_genome_list:
    print(test_genome_)
    genome_data = np.load(test_genome_)
    position_list_, seq_list = genome_data['positions'], genome_data['sequences']
    if len(position_list) == 0:
        position_list = position_list_
    else:
        position_list = np.concatenate([position_list, position_list_])
    seq_list = np.array(seq_list, np.int16).reshape(-1, data_length, 4, 1)
    seq_length = seq_list.shape[0]
    print(seq_length)

    loop = int(math.ceil(float(seq_length) / BATCH_SIZE))
    for i in range(loop):
        if i * BATCH_SIZE > seq_length:
            break
        scanning = seq_list[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]
        if len(y_prediction2) == 0:
            _, y_prediction2, variable_dict, neurons_dict, _2 = sess.run(model.prediction, feed_dict={x_image: scanning, keep_prob: 1.0, keep_prob2: 1.0, keep_prob3: 1.0, phase: False})
        else:
            _, y_prediction1, variable_dict, neurons_dict, _2 = sess.run(model.prediction, feed_dict={x_image: scanning, keep_prob: 1.0, keep_prob2: 1.0, keep_prob3: 1.0, phase: False})
            y_prediction2 = np.concatenate([y_prediction2, y_prediction1], axis=0)

h_conv11_ = neurons_dict["h_conv11"]
h_conv12_ = neurons_dict["h_conv12"]
h_conv2_ = neurons_dict["h_conv2"]
h_conv21_ = neurons_dict["h_conv21"]
h_conv22_ = neurons_dict["h_conv22"]
h_pool1_ = neurons_dict["h_pool1"]
h_pool1_rc_ = neurons_dict["h_pool1_rc"]
h_pool2_ = neurons_dict["h_pool2"]
h_pool21_ = neurons_dict["h_pool21"]
h_pool22_ = neurons_dict["h_pool22"]
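# The reconstruction below runs the network backwards: each forward
# max-pooling step is undone by unpooling guided by the argmax masks of the
# recorded activations, and each forward convolution is undone with
# tf.nn.conv2d_transpose using the same (or reverse-complemented) weights,
# until the signal is projected back onto the 1000x4 input space.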
sess2 = tf.Session()
h_pool21_shape = list(h_pool21_.shape)
h_pool21_t4 = conv2d_tp(h_conv22_, variable_dict["W_conv22"], h_pool21_shape)
_, mask21 = max_pool_2x1(h_conv21_)
h_unpool21_t4 = unpooling.unpool2(h_pool21_t4, mask21)

h_pool2_shape = list(h_pool2_.shape)
h_pool2_t4 = conv2d_tp(h_unpool21_t4, variable_dict["W_conv21"], h_pool2_shape)
_, mask2 = max_pool_2x1(h_conv2_)
h_unpool2_t4 = unpooling.unpool2(h_pool2_t4, mask2)

h_pool1_shape = list(h_pool1_.shape)
h_pool1_t4 = conv2d_tp(h_unpool2_t4, variable_dict["W_conv2"], h_pool1_shape)
_, mask1 = max_pool_2x1(h_conv11_)
h_unpool1_t4 = unpooling.unpool2(h_pool1_t4, mask1)

# The reverse-complement branch reuses W_conv2/W_conv1 flipped along the
# length and channel axes.
h_pool1_rc_t4 = conv2d_tp(h_unpool2_t4, tf.reverse(variable_dict["W_conv2"], [0, 1]), h_pool1_shape)
_, mask1rc = max_pool_2x1(h_conv12_)
h_unpool1_rc_t4 = unpooling.unpool2(h_pool1_rc_t4, mask1rc)

reconstruction_shape = scanning.shape
reconstruction_conv22 = conv2d_tp(h_unpool1_t4, variable_dict["W_conv1"], reconstruction_shape) + \
    conv2d_tp(h_unpool1_rc_t4, tf.reverse(variable_dict["W_conv1"], [0, 1]), reconstruction_shape)

sess2.run(tf.global_variables_initializer())
units_conv22 = sess2.run(reconstruction_conv22)
reshaped_conv22 = np.reshape(units_conv22, (data_length, 4))

sess2.close()
sess.close()
--------------------------------------------------------------------------------
/deepgmap/post_train_tools/deconv_deepshark_local_extend.py:
--------------------------------------------------------------------------------
import sys
import time
import math
import os
import getopt
import importlib as il

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pylab
from deepgmap.post_train_tools import unpooling


def test_batch(test_batch_file):
    with np.load(test_batch_file) as f:
        dnase_data_labels1 = f['labels'], f['data_array']
    images = np.reshape(dnase_data_labels1[1], (batch_size, data_length, 4, 1))
    labels = dnase_data_labels1[0]
    return images, labels


start = time.time()
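# Command-line interface (parsed with getopt below):
#   -m/--model                checkpoint of the trained model to restore
#   -t/--test_batch           .npz batch holding "labels" and "data_array"
#   -n/--network_constructor  module name under deepgmap.network_constructors
#   -o/--output_dir           directory for the reconstruction images
#   -d/--deconv               accepted and stored, but not used below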
try:
    options, args = getopt.getopt(sys.argv[1:], 'm:t:n:o:d:', ['model=', 'test_batch=', 'network_constructor=', 'output_dir=', 'deconv='])
except getopt.GetoptError as err:
    print(str(err))
    sys.exit(2)
if len(options) < 3:
    print('too few arguments')
    sys.exit(0)
for opt, arg in options:
    if opt in ('-m', '--model'):
        trained_model = arg
    elif opt in ('-t', '--test_batch'):
        test_batch_file = arg
    elif opt in ('-n', '--network_constructor'):
        network_constructor = arg
    elif opt in ('-o', '--output_dir'):
        output_dir = arg
    elif opt in ('-d', '--deconv'):
        deconv = arg

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

keep_prob = tf.placeholder(tf.float32)
keep_prob2 = tf.placeholder(tf.float32)
keep_prob3 = tf.placeholder(tf.float32)

x_image = tf.placeholder(tf.float32, shape=[None, 1000, 4, 1])
y_ = tf.placeholder(tf.float32, shape=[None, 20])
phase = tf.placeholder(tf.bool)
dropout_1 = 0.95
dropout_2 = 0.9
dropout_3 = 0.85
batch_size = 100
data_length = 1000
input_dir = trained_model
nc = il.import_module("deepgmap.network_constructors." + str(network_constructor))
train_speed = 0.00005
a = time.asctime()
b = a.replace(':', '')
start_at = b.replace(' ', '_')

model = nc.Model(image=x_image, label=y_,
                 output_dir=output_dir,
                 phase=phase,
                 start_at=start_at,
                 keep_prob=keep_prob,
                 keep_prob2=keep_prob2,
                 keep_prob3=keep_prob3,
                 data_length=data_length)

sess.run(tf.global_variables_initializer())
saver = model.saver
saver.restore(sess, input_dir)

batch = test_batch(test_batch_file)
# phase is fed here as well, for consistency with the inference runs below.
test_accuracy1, y_label1, y_prediction1 = sess.run([model.error, y_, model.prediction[1]], feed_dict={x_image: batch[0], y_: batch[1], keep_prob: 1.0, keep_prob2: 1.0, keep_prob3: 1.0, phase: False})


def conv2d_tp(x, W, output_shape):
    return tf.nn.conv2d_transpose(x, W, output_shape, strides=[1, 1, 1, 1], padding='VALID')


def conv2d_tp2(x, W, output_shape):
    return tf.nn.conv2d_transpose(x, W, output_shape, strides=[1, 2, 1, 1], padding='VALID')


def conv2d_tp4(x, W, output_shape):
    return tf.nn.conv2d_transpose(x, W, output_shape, strides=[1, 4, 1, 1], padding='VALID')


def max_pool_2x1(x):
    return tf.nn.max_pool_with_argmax(x, ksize=[1, 2, 1, 1], strides=[1, 2, 1, 1], padding='SAME')


def max_pool_4x1(x):
    return tf.nn.max_pool_with_argmax(x, ksize=[1, 4, 1, 1], strides=[1, 4, 1, 1], padding='SAME')


# Collect the indices of windows that carry at least one positive label;
# only those are reconstructed below.
index_of_image = 0
positive_image = []
for y in batch[1]:
    if np.sum(y) > 0:
        positive_image.append(index_of_image)
    index_of_image += 1

for k in range(len(positive_image)):
    images4 = np.reshape(batch[0][positive_image[k]], (1, data_length, 4, 1))

    _2, _1, variable_dict, neurons_dict, _3 = sess.run(model.prediction, feed_dict={x_image: images4, keep_prob: 1.0, keep_prob2: 1.0, keep_prob3: 1.0, phase: False})

    h_conv11_ = neurons_dict["h_conv11"]
    h_conv12_ = neurons_dict["h_conv12"]
    h_conv2_ = neurons_dict["h_conv2"]
    h_conv21_ = neurons_dict["h_conv21"]
    h_conv22_ = neurons_dict["h_conv22"]
    h_pool1_ = neurons_dict["h_pool1"]
    h_pool1_rc_ = neurons_dict["h_pool1_rc"]
    h_pool2_ = neurons_dict["h_pool2"]
    h_pool21_ = neurons_dict["h_pool21"]
    h_pool22_ = neurons_dict["h_pool22"]
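    # unpooling.unpool2 is assumed to implement standard argmax unpooling:
    # each pooled value is scattered back to the position recorded in the
    # corresponding max_pool_with_argmax mask, e.g. a pooled value of 5 whose
    # mask index points at offset 3 yields [0, 0, 0, 5] along the pooled axis.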
neurons_dict["h_pool2"],\ 141 | neurons_dict["h_pool21"],\ 142 | neurons_dict["h_pool22"] 143 | 144 | 145 | sess2 = tf.Session() 146 | #print h_pool21_ 147 | h_pool21_shape=list(h_pool21_.shape) 148 | h_pool21_t4= conv2d_tp(h_conv22_, variavl_dict["W_conv22"], h_pool21_shape) 149 | _, mask21=max_pool_2x1(h_conv21_) 150 | #h_unpool21_t4=unpooling.unpool(h_pool21_t4, mask21,output_shape=h_conv21_.shape) 151 | h_unpool21_t4=unpooling.unpool2(h_pool21_t4, mask21) 152 | 153 | h_pool2_shape=list(h_pool2_.shape) 154 | h_pool2_t4= conv2d_tp(h_unpool21_t4, variavl_dict["W_conv21"], h_pool2_shape) 155 | _, mask2=max_pool_2x1(h_conv2_) 156 | #h_unpool2_t4=unpooling.unpool(h_pool2_t4,mask2,output_shape=h_conv2_.shape) 157 | h_unpool2_t4=unpooling.unpool2(h_pool2_t4,mask2) 158 | 159 | h_pool1_shape=list(h_pool1_.shape) 160 | h_pool1_t4= conv2d_tp(h_unpool2_t4, variavl_dict["W_conv2"], h_pool1_shape) 161 | _,mask1=max_pool_2x1(h_conv11_) 162 | #h_unpool1_t4=unpooling.unpool(h_pool1_t4,mask1,output_shape=h_conv11_.shape) 163 | h_unpool1_t4=unpooling.unpool2(h_pool1_t4,mask1) 164 | 165 | h_pool1_rc_t4=conv2d_tp(h_unpool2_t4, tf.reverse(variavl_dict["W_conv2"], [0,1]), h_pool1_shape) 166 | _,mask1rc=max_pool_2x1(h_conv12_) 167 | #h_unpool1_rc_t4=unpooling.unpool(h_pool1_rc_t4,mask1rc,output_shape=h_conv12_.shape) 168 | h_unpool1_rc_t4=unpooling.unpool2(h_pool1_rc_t4,mask1rc) 169 | 170 | reconstruction_shape=images4.shape 171 | #print reconstruction_shape 172 | reconstruction_conv22=conv2d_tp(h_unpool1_t4, variavl_dict["W_conv1"], reconstruction_shape)+\ 173 | conv2d_tp(h_unpool1_rc_t4, tf.reverse(variavl_dict["W_conv1"], [0,1]), reconstruction_shape) 174 | 175 | 176 | sess2.run(tf.global_variables_initializer()) 177 | units_conv22 = sess2.run(reconstruction_conv22) 178 | 179 | 180 | reshaped_conv22=np.reshape(units_conv22, (data_length, 4)) 181 | 182 | # Compute and plot first dendrogram. 183 | fig = plt.figure(figsize=(12,8)) 184 | 185 | # Plot distance matrix. 186 | 187 | 188 | axmatrix_conv22 = fig.add_axes([0.05,0.05,0.1,0.9]) 189 | im_conv22 = axmatrix_conv22.matshow(reshaped_conv22, aspect='auto', origin='lower', cmap=plt.get_cmap('YlGnBu')) 190 | axmatrix_conv22.set_xticks([]) 191 | axmatrix_conv22.set_yticks([]) 192 | axcolor = fig.add_axes([0.16,0.05,0.02,0.9]) 193 | pylab.colorbar(im_conv22, cax=axcolor) 194 | 195 | 196 | reshaped2=np.reshape(images4, (data_length, 4)) 197 | axmatrix3 = fig.add_axes([0.85,0.05,0.1,0.9]) 198 | im3 = axmatrix3.matshow(reshaped2, aspect='auto', origin='lower', cmap=plt.get_cmap('YlGnBu')) 199 | axmatrix3.set_xticks([]) 200 | axmatrix3.set_yticks([]) 201 | axcolor = fig.add_axes([0.96,0.05,0.02,0.9]) 202 | pylab.colorbar(im3, cax=axcolor) 203 | 204 | np.savez_compressed(str(output_dir)+str(trained_model.split('/')[-1])+"_transpose_"+str(k), 205 | conv22=reshaped_conv22, 206 | original=np.reshape(images4,(data_length, 4))) 207 | 208 | fig.savefig(str(output_dir)+str(trained_model.split('/')[-1])+'_reconstruction_'+str(k)+'.png') 209 | #plt.show() 210 | sess2.close() 211 | sess.close() 212 | --------------------------------------------------------------------------------