├── .#SCATS.py ├── README.md ├── SCATS.py ├── bin ├── PreProcess.pl ├── __init__.py ├── __init__.pyc ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── my_functions.cpython-37.pyc │ └── scats_functions.cpython-37.pyc ├── check_software.py ├── complie_likelihoodumi.sh ├── getCount.py ├── getCount_cellid.py ├── getCount_umi.py ├── getCount_umi_cellid.py ├── getalpha.pl ├── getexonlevelcount_umi.pl ├── getgenelevelcount.pl ├── getgeneleveltheta_umi.pl ├── getgroupinfo.pl ├── gettascdata.pl ├── likelihoodumi.c ├── likelihoodumi.html ├── likelihoodumi.pyx ├── likelihoodumi.so ├── model_selection_das_umi.py ├── my_functions.py ├── my_functions.pyc ├── scats_functions.py ├── scats_functions.pyc ├── scats_isoform.py └── summarizedas.pl ├── doc ├── Clarity_step1.JPG ├── Clarity_step2.JPG ├── Clarity_step3.JPG ├── Fig1.png ├── Install.md └── Usage.md └── example ├── example.gpinfo ├── example.refFile ├── example.refgene ├── metafile ├── mm10refseq.gpinfo └── mm10refseq.refgene /.#SCATS.py: -------------------------------------------------------------------------------- 1 | huy4@l-1-01.cm.cluster.128826:1562181832 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Single Cell Analysis of Transcript Splicing (SCATS) 2 | A statistical tool to detect differential alternative splicing events using single-cell RNA-seq 3 | 4 | ## Computational pipeline of SCATS 5 |

6 | 7 |

8 | 9 | ## System Requirements 10 | For optimal performance, we recommend an HPC with 20+ cores. 11 | 12 | ## Inputs of SCATS 13 | The input of SCATS is single-cell RNA-seq read data in BAM format together with a reference isoform annotation file. 14 | 15 | ## Installation 16 | Please refer to [Installation](https://github.com/huyustats/SCATS/blob/master/doc/Install.md) for how to install SCATS. 17 | 18 | ## Usage 19 | Please refer to [Usage](https://github.com/huyustats/SCATS/blob/master/doc/Usage.md) for how to use SCATS. 20 | 21 | ## Contact 22 | 23 | If you have any questions/issues/bugs, please post them on [GitHub](https://github.com/huyustats/SCATS/issues). They would also be helpful to other users. 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /SCATS.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from bin import my_functions as my 4 | from bin import scats_functions as sc 5 | import os, sys 6 | fileAbsPath = os.path.abspath(os.path.dirname(__file__)) 7 | crtAbsPath = os.getcwd() 8 | 9 | task = "" 10 | taskList = ["refgene", "group", "count", "gene", "das", "sum"] 11 | for i in range(1,len(sys.argv)): 12 | if sys.argv[i] == "-task" and len(sys.argv)!=i+1: 13 | task = sys.argv[i+1] 14 | if (task not in taskList): 15 | print("\nPlease specify task (SCATS.py -task ):\n") 16 | print("\trefgene: preprocess reference file\n") 17 | print("\tgroup: group alternative splicing exon\n") 18 | print("\tcount: count informative reads from indexed BAM file\n") 19 | #print("\tabkt: calculate technical parameters (alpha beta kappa tau)") 20 | print("\tgene: estimate mean gene expression for each single cell condition\n") 21 | print("\tdas: detect differential alternative splicing (DAS) for each exon group between conditions\n") 22 | print("\tsum: summarize DAS test results\n") 23 | 24 | if task == "refgene": 25 | validArgList = ["-task", "-ref", "-out"] 26 
| addAbsPath = [0, 1, 3] 27 | message = "SCATS.py -task refgene -ref -out " 28 | inputs = my.parse_argument(validArgList, addAbsPath, message) 29 | refFile = inputs[1] 30 | outFile = inputs[2] 31 | myCommand = "perl " + fileAbsPath + "/bin/PreProcess.pl -r " + refFile + " -o " + outFile 32 | os.system(myCommand) 33 | 34 | if task == "group": 35 | validArgList = ["-task", "-refgene", "-out"] 36 | addAbsPath = [0, 1, 3] 37 | message = "SCATS.py -task group -refgene -out " 38 | inputs = my.parse_argument(validArgList, addAbsPath, message) 39 | refFile = inputs[1] 40 | outFile = inputs[2] 41 | myCommand = "perl " + fileAbsPath + "/bin/getgroupinfo.pl " + refFile + " > " + outFile 42 | os.system(myCommand) 43 | 44 | if task == "count": 45 | umiRun = "" 46 | onebam = "" 47 | for i in range(1,len(sys.argv)): 48 | if sys.argv[i] == "-umi" and len(sys.argv)!=i+1: 49 | umiRun = sys.argv[i+1] 50 | if sys.argv[i] == "-onebam" and len(sys.argv)!=i+1: 51 | onebam = sys.argv[i+1] 52 | if (umiRun not in ["yes", "no"]) or (onebam not in ["yes", "no"]): 53 | print("\nPlease specify umi and onebam option (SCATS.py -task count -umi -onebam ):\n") 54 | print("\tumi: collect UMI count or not (if yes umitag is required to be specified)\n") 55 | print("\tonebam: whether all aligned reads are merged in one BAM file (if yes celltag and cellbc are required to be specified)\n") 56 | sys.exit() 57 | 58 | validArgList = ["-task", "-umi", "-onebam", "-meta", "-refgene", "-gpinfo"] 59 | addAbsPath = [0, 0, 0, 1, 1, 1] 60 | message = "SCATS.py -task count -umi yes -onebam -yes -meta -refgene -gpinfo " 61 | inputs = my.parse_argument(validArgList, addAbsPath, message) 62 | metaFile = inputs[3] 63 | tmpDir = crtAbsPath + "/tmp" 64 | my.mk_dir(tmpDir) 65 | tmpDir = tmpDir + "/count_script" 66 | my.mk_dir(tmpDir) 67 | refgeneFile = inputs[4] 68 | gpinfoFile = inputs[5] 69 | 70 | # generate sh files for read counting process 71 | sc.write_count_sh(fileAbsPath, umiRun, onebam, metaFile, tmpDir, 
refgeneFile, gpinfoFile) 72 | print("\nPlease run all scripts (count_\*.sh files) under directory: " + tmpDir + "\n") 73 | 74 | if task == "gene": 75 | 76 | validArgList = ["-task", "-ncore", "-meta"] 77 | addAbsPath = [0, 0, 1] 78 | message = "SCATS.py -task gene -ncore <# cores> -meta " 79 | inputs = my.parse_argument(validArgList, addAbsPath, message) 80 | ncore = inputs[1] 81 | metaFile = inputs[2] 82 | tmpDir = crtAbsPath + "/tmp/gene_script" 83 | my.mk_dir(tmpDir) 84 | tmpDir = crtAbsPath + "/tmp/count_script" 85 | cdtList = sc.check_count_file(metaFile, tmpDir) 86 | 87 | outFile = crtAbsPath + "/tmp/celltypes" 88 | OUT = open(outFile, "w") # create celltype file 89 | for i in range(0, len(cdtList)): 90 | OUT.write(cdtList[i]+"\n") 91 | OUT.close() 92 | 93 | # estimate alpha 94 | my.mk_dir(crtAbsPath+"/tmp/abkt") 95 | myCommand = "perl " + fileAbsPath + "/bin/getalpha.pl " + metaFile + " " + tmpDir + " " + crtAbsPath+"/tmp/abkt/abkt_umi" 96 | os.system(myCommand) 97 | 98 | # estimate gene expression 99 | tmpDir = crtAbsPath + "/tmp/gene_script" 100 | my.mk_dir(tmpDir+"/data") 101 | 102 | outFile = crtAbsPath + "/tmp/comparegroup" 103 | OUT = open(outFile, "w") # create compare group file 104 | for i in range(0,1): 105 | for j in range(i+1, len(cdtList)): 106 | OUT.write(cdtList[i]+"\t"+cdtList[j]+"\n") 107 | myCommand = "perl " + fileAbsPath + "/bin/getgenelevelcount.pl " + cdtList[i] + " " + cdtList[j] 108 | myCommand += " " + crtAbsPath + "/tmp/abkt/abkt_umi " + metaFile + " " + crtAbsPath + "/tmp/count_script " + crtAbsPath + "/tmp/gene_script/data" 109 | os.system(myCommand) 110 | myCommand = "perl " + fileAbsPath + "/bin/gettascdata.pl " + cdtList[i] + " " + cdtList[j] 111 | myCommand += " " + crtAbsPath + "/tmp/gene_script/data " + crtAbsPath + "/tmp/gene_script/data"; 112 | os.system(myCommand) 113 | # generate sh files for gene expression estimation 114 | tmpDir = crtAbsPath + "/tmp/gene_script/data" 115 | mywrite = "mpirun -n " + ncore + " --bind-to 
none python " + fileAbsPath + "/bin/model_selection_das_umi.py -y " + tmpDir + "/tascdata_" + cdtList[i] + "_" + cdtList[j] 116 | mywrite += " -k " + tmpDir + "/abktfile_" + cdtList[i] + "_" + cdtList[j] + " -x " + tmpDir + "/condition_" + cdtList[i] + "_" + cdtList[j] 117 | mywrite += " -t 4 -o " + tmpDir + "/outgene_" + cdtList[i] + "_" + cdtList[j] + "\n" 118 | myoutsh = crtAbsPath + "/tmp/gene_script" + "/gene_" + cdtList[i] + "_" + cdtList[j] + ".sh" 119 | os.system("echo \"" + mywrite +"\" > " + myoutsh) 120 | 121 | OUT.close() 122 | tmpDir = crtAbsPath + "/tmp/gene_script" 123 | print("\nPlease run all scripts (gene_\*.sh files) under directory: " + tmpDir + "\n") 124 | 125 | if task == "das": 126 | ############### read count need to be filtered #### check getexonlevelcount.pl file to specify ####################### 127 | validArgList = ["-task", "-ncore", "-meta", "-gpinfo"] 128 | addAbsPath = [0, 0, 1, 1] 129 | message = "SCATS.py -task das -ncore <# cores> -meta -gpinfo " 130 | inputs = my.parse_argument(validArgList, addAbsPath, message) 131 | ncore = inputs[1] 132 | metaFile = inputs[2] 133 | gpinfoFile = inputs[3] 134 | tmpDir = crtAbsPath + "/tmp/das_script" 135 | my.mk_dir(tmpDir) 136 | my.mk_dir(tmpDir+"/data") 137 | cdtList = sc.check_count_file(metaFile, crtAbsPath + "/tmp/count_script") 138 | 139 | #collect gene expression and bursting rate 140 | myCommand = "perl " + fileAbsPath + "/bin/getgeneleveltheta_umi.pl " + crtAbsPath + "/tmp" 141 | os.system(myCommand) 142 | #collect informative read counts 143 | for i in range(0, len(cdtList)-1): 144 | for j in range(i+1, len(cdtList)): 145 | myCommand = "perl " + fileAbsPath + "/bin/getexonlevelcount_umi.pl " + cdtList[i] + " " + cdtList[j] 146 | myCommand += " " + crtAbsPath+"/tmp " + metaFile + " " + gpinfoFile 147 | os.system(myCommand) 148 | # generate sh files 149 | tmpDir = crtAbsPath + "/tmp/das_script/data" 150 | mywrite = "mpirun -n " + ncore + " --bind-to none python " + fileAbsPath + 
"/bin/model_selection_das_umi.py -y " + tmpDir + "/countdata_" + cdtList[i] + "_" + cdtList[j] 151 | mywrite += " -k " + tmpDir + "/abktfile_" + cdtList[i] + "_" + cdtList[j] + " -x " + tmpDir + "/condition_" + cdtList[i] + "_" + cdtList[j] 152 | mywrite += " -t 6 -o " + tmpDir + "/out_" + cdtList[i] + "_" + cdtList[j] + "\n" 153 | myoutsh = crtAbsPath + "/tmp/das_script" + "/das_" + cdtList[i] + "_" + cdtList[j] + ".sh" 154 | os.system("echo \"" + mywrite +"\" > " + myoutsh) 155 | 156 | tmpDir = crtAbsPath + "/tmp/das_script" 157 | print("\nPlease run all scripts (das_\*.sh files) under directory: " + tmpDir + "\n") 158 | 159 | 160 | if task == "sum": 161 | validArgList = ["-task", "-gpinfo"] 162 | addAbsPath = [0, 1] 163 | message = "SCATS.py -task sum -gpinfo " 164 | inputs = my.parse_argument(validArgList, addAbsPath, message) 165 | gpinfoFile = inputs[1] 166 | tmpDir = crtAbsPath + "/summary" 167 | my.mk_dir(tmpDir) 168 | outFile = tmpDir + "/DAS_results" 169 | tmpDir = crtAbsPath + "/tmp" 170 | compareFile = crtAbsPath + "/tmp/comparegroup" 171 | my.check_file(compareFile,"Please run SCATS.py -task gene.") 172 | 173 | myCommand = "perl " + fileAbsPath + "/bin/summarizedas.pl " + tmpDir + " " + gpinfoFile + " " + outFile 174 | os.system(myCommand) 175 | -------------------------------------------------------------------------------- /bin/PreProcess.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perlq 2 | 3 | ###### 4 | 5 | use Getopt::Long; 6 | use Pod::Usage; 7 | 8 | my $refseq; # gene annotation - UCSC 9 | my $output; # length of sequence read 10 | 11 | GetOptions('r=s'=>\$refseq,'o=s'=>\$output); 12 | 13 | if((!($refseq))||(!($output))){ 14 | pod2usage(); 15 | } 16 | 17 | open(REF, $refseq); 18 | open (RRR, ">$output"); 19 | 20 | ######### load transcript annotation 21 | my %knownGene = (); 22 | while () { 23 | chomp($_); 24 | my @transcript = split(/\t/); 25 | if($transcript[1] =~ /NR/){ 26 | next; 27 | 
} 28 | my $name = $transcript[12]; 29 | my $tran = $transcript[1]; 30 | $knownGene{$name}{$tran}{"chrom"} = $transcript[2]; 31 | $knownGene{$name}{$tran}{"strand"} = $transcript[3]; 32 | $knownGene{$name}{$tran}{"txStart"} = $transcript[4]; 33 | $knownGene{$name}{$tran}{"txEnd"} = $transcript[5]; 34 | $knownGene{$name}{$tran}{"exonCount"} = $transcript[8]; 35 | $knownGene{$name}{$tran}{"exonStarts"} = $transcript[9]; 36 | $knownGene{$name}{$tran}{"exonEnds"} = $transcript[10]; 37 | } 38 | 39 | 40 | ########## load isoform annotation 41 | my %isoGene = (); 42 | my %isoStart = (); 43 | my %isoEnd = (); 44 | 45 | foreach my $name (keys %knownGene ) 46 | { 47 | foreach my $tran (keys %{$knownGene{$name}}) 48 | { 49 | 50 | my $i_start = $knownGene{$name}{$tran}{"txStart"}; 51 | my $i_end = $knownGene{$name}{$tran}{"txEnd"}; 52 | $isoGene{$name} = $isoGene{$name}.$tran.","; 53 | if($isoStart{$name} == NULL ) 54 | {$isoStart{$name} = $i_start;} 55 | else 56 | { 57 | if($isoStart{$name} > $i_start){ $isoStart{$name} = $i_start; } 58 | } 59 | if($isoEnd{$name} == NULL ) 60 | {$isoEnd{$name} = $i_end;} 61 | else 62 | { 63 | if($isoEnd{$name} < $i_end){ $isoEnd{$name} = $i_end;} 64 | } 65 | } 66 | } 67 | 68 | 69 | ################## process isoform information 70 | foreach my $ID (keys %isoGene ) 71 | { 72 | 73 | my @genename = split(/,/, $isoGene{$ID}); 74 | my $size = @genename; 75 | my $i_chrom = $knownGene{$ID}{$genename[0]}{"chrom"}; 76 | my $i_strand = $knownGene{$ID}{$genename[0]}{"strand"}; 77 | my $i_start = $isoStart{$ID}; 78 | my $i_end = $isoEnd{$ID}; 79 | my %ISO_Index = (); 80 | 81 | if($size >1) ### you can specify the number of the isoform per gene here 82 | { 83 | print RRR "$ID\t$i_chrom\t$i_strand\t$i_start\t$i_end\t"; 84 | for(my $j=0; $j<= $#genename; $j++) 85 | { 86 | my $name = $genename[$j]; 87 | print RRR "$name,"; 88 | my @start = split(/,/, $knownGene{$ID}{$name}{"exonStarts"}); 89 | my @end = split(/,/, $knownGene{$ID}{$name}{"exonEnds"}); 90 | for 
(my $ijk=0; $ijk<= $#start; $ijk++) 91 | { 92 | my $sss = $start[$ijk]; 93 | my $eee = $end[$ijk]; 94 | for (my $abc=$sss; $abc<=$eee; $abc++) 95 | {$ISO_Index{$abc}{$j} = 1;} 96 | } # ijk 97 | } # j 98 | 99 | print RRR "\n"; 100 | 101 | my %NEW_EXON =(); 102 | my $CCC =0; 103 | my $pre_POS = $i_start-10; 104 | my @pre_Index = (); 105 | for(my $j=0; $j <= $#genename; $j++) 106 | {$pre_Index{$j}=0;} 107 | 108 | for my $ijk (sort {$a<=>$b} keys %ISO_Index) 109 | { 110 | my $tot =0; 111 | my @cur_Index=(); 112 | for(my $j=0; $j<=$#genename; $j++) 113 | { 114 | my $name = $genename[$j]; 115 | my $value = exists $ISO_Index{$ijk}{$j} ? $ISO_Index{$ijk}{$j} : 0; 116 | $cur_Index[$j] = $value; 117 | if($cur_Index[$j] != $pre_Index[$j]) 118 | {$tot = $tot +1;} 119 | } 120 | my $move = $ijk - $pre_POS; 121 | if($move != 1) 122 | { 123 | $NEW_EXON{$CCC}{"start"} = $ijk; 124 | $NEW_EXON{$CCC}{"Index"} = [@cur_Index]; 125 | if($CCC > 0) 126 | { 127 | $NEW_EXON{$CCC-1}{"end"} = $pre_POS; 128 | } 129 | @pre_Index = @cur_Index; 130 | $CCC = $CCC+1; 131 | } 132 | else{ 133 | if($tot >0) 134 | { 135 | $NEW_EXON{$CCC}{"start"} = $ijk; 136 | $NEW_EXON{$CCC-1}{"end"} = $ijk-1; 137 | $NEW_EXON{$CCC}{"Index"} = [@cur_Index]; 138 | @pre_Index = @cur_Index; 139 | $CCC = $CCC+1; 140 | } 141 | } 142 | $pre_POS = $ijk; 143 | } # ijk 144 | $NEW_EXON{$CCC-1}{"end"} = $i_end; 145 | 146 | ### print data structure 147 | for my $CCC (sort {$a<=>$b} keys %NEW_EXON) 148 | { 149 | print RRR "$ID\t$i_chrom\t$i_strand\t"; 150 | my $sss = $NEW_EXON{$CCC}{"start"}; 151 | print RRR "$sss\t"; 152 | my $eee = $NEW_EXON{$CCC}{"end"}; 153 | print RRR "$eee\t"; 154 | my $Read_C = 0; 155 | #for (my $abc=$sss; $abc<=$eee; $abc++) 156 | #{ 157 | # my $value = exists $genome{$i_chrom}{$abc} ? $genome{$i_chrom}{$abc} : 0; 158 | # $Read_C =$Read_C+$value; 159 | #} 160 | #print RRR "$Read_C\t"; 161 | my @index_get = $NEW_EXON{$CCC}{"Index"}; 162 | for my $INDEX ( 0 .. 
$#{$NEW_EXON{$CCC}{"Index"}}){ 163 | my $index_get = $NEW_EXON{$CCC}{"Index"}[$INDEX]; 164 | print RRR "$index_get,"; } 165 | print RRR "\n"; 166 | #print "haha\n"; 167 | } ### for CCC 168 | 169 | } # if size 170 | } #i : Cluster ID -1 171 | 172 | 173 | 174 | 175 | close(SAM); 176 | close(REF); 177 | close(RRR); 178 | 179 | =head1 SYNOPSIS 180 | 181 | -r ---RefSeqAnnotation file 182 | 183 | -o ---The file name that you want to save the results 184 | -------------------------------------------------------------------------------- /bin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/bin/__init__.py -------------------------------------------------------------------------------- /bin/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/bin/__init__.pyc -------------------------------------------------------------------------------- /bin/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/bin/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /bin/__pycache__/my_functions.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/bin/__pycache__/my_functions.cpython-37.pyc -------------------------------------------------------------------------------- /bin/__pycache__/scats_functions.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/bin/__pycache__/scats_functions.cpython-37.pyc -------------------------------------------------------------------------------- /bin/check_software.py: -------------------------------------------------------------------------------- 1 | import my_functions as my 2 | 3 | list = ['pysam', "numpy", "scipy", "cython"] 4 | for x in list: 5 | my.check_module(x) 6 | 7 | 8 | list = ["mpirun", "samtools"] 9 | for x in list: 10 | my.check_program(x) 11 | -------------------------------------------------------------------------------- /bin/complie_likelihoodumi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm ./likelihoodumi.c 3 | rm ./likelihoodumi.html 4 | rm ./likelihoodumi.so 5 | cython -a ./likelihoodumi.pyx 6 | gcc -shared -pthread -fPIC `python-config --cflags` -o likelihoodumi.so likelihoodumi.c 7 | -------------------------------------------------------------------------------- /bin/getCount.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from __future__ import print_function # load print function in python3 4 | from collections import defaultdict 5 | import math, sys, os, re, pysam, time 6 | 7 | # set up auto dictionary function 8 | def auto_dict(): 9 | return defaultdict(auto_dict) 10 | 11 | 12 | ############################################################################### 13 | ### ARGUMENT SETTINGS 14 | ############################################################################### 15 | 16 | # checking whether argument is valid or not 17 | validArgList = ["-bam", "-ref", "-out", "-gpinfo"] 18 | for argIndex in range(1,len(sys.argv)): 19 | if sys.argv[argIndex][0] == "-" and sys.argv[argIndex] not in validArgList : 20 | print("Argument \'"+sys.argv[argIndex]+"\' is invalid!") 21 | sys.exit() 22 | 23 | 24 | bamFileExists = 0 25 | refFileExists = 0 26 
| outFileExists = 0 27 | gpinfoFileExists = 0 28 | for argIndex in range(1,len(sys.argv)): 29 | if sys.argv[argIndex] == "-bam": ## load in BAM file 30 | argIndex += 1 31 | bamFileAbsPath = os.path.dirname(os.path.abspath(sys.argv[argIndex])) 32 | bamTmp = sys.argv[argIndex].split("/") 33 | bamFile = bamFileAbsPath + "/" + bamTmp[len(bamTmp)-1] 34 | bamFileExists = 1 35 | elif sys.argv[argIndex] == "-ref": ## load in annotation file 36 | argIndex += 1 37 | refFileAbsPath = os.path.dirname(os.path.abspath(sys.argv[argIndex])) 38 | refTmp = sys.argv[argIndex].split("/") 39 | refGeneFile = refFileAbsPath + "/" + refTmp[len(refTmp)-1] 40 | refFileExists = 1 41 | elif sys.argv[argIndex] == "-out": ## load in annotation file 42 | argIndex += 1 43 | outFileAbsPath = os.path.dirname(os.path.abspath(sys.argv[argIndex])) 44 | outTmp = sys.argv[argIndex].split("/") 45 | outFile = outFileAbsPath + "/" + outTmp[len(outTmp)-1] 46 | outFileExists = 1 47 | elif sys.argv[argIndex] == "-gpinfo": ## load group information file 48 | argIndex += 1 49 | gpinfoFileAbsPath = os.path.dirname(os.path.abspath(sys.argv[argIndex])) 50 | gpinfoTmp = sys.argv[argIndex].split("/") 51 | gpinfoFile = gpinfoFileAbsPath + "/" + gpinfoTmp[len(gpinfoTmp)-1] 52 | gpinfoFileExists = 1 53 | 54 | 55 | if bamFileExists == 0 or refFileExists == 0 or outFileExists == 0 or gpinfoFileExists == 0: ## lack enough arguments 56 | print("Please provide arguments:") 57 | print("-bam\tIndexed bam file") 58 | print("-ref\tGene annotation file") 59 | print("-out\tOutput file") 60 | print("-gpinfo\tGroup Information file") 61 | sys.exit() 62 | 63 | 64 | # load gene information 65 | geneStructureInformation = auto_dict() 66 | geneLineCount = auto_dict() 67 | 68 | with open(refGeneFile, "r") as FP: 69 | for line in FP: 70 | line = line.strip("\n") 71 | tmpinf = line.split("\t") 72 | gene = tmpinf[0] 73 | 74 | if not bool(geneStructureInformation[gene]): 75 | geneLineCount[gene] = 0 76 | 
geneStructureInformation[gene][geneLineCount[gene]] = line 77 | else: 78 | geneLineCount[gene] += 1 79 | geneStructureInformation[gene][geneLineCount[gene]] = line 80 | 81 | # load group information 82 | 83 | groupInformation = auto_dict() 84 | geneLineCount1 = auto_dict() 85 | with open(gpinfoFile, "r") as FP: 86 | for line in FP: 87 | line = line.strip("\n") 88 | tmpinf = line.split("\t") 89 | tmpinf[5] = tmpinf[5].strip(",") 90 | gene = tmpinf[0] 91 | 92 | groupInformation[gene][tmpinf[1]][tmpinf[3]] = tmpinf[5] 93 | 94 | 95 | 96 | 97 | ##################################### 98 | ## Using pysam to read in bam file !! 99 | ##################################### 100 | bamFilePysam = pysam.Samfile(bamFile,"rb") 101 | 102 | 103 | ## RESULTS FILE 104 | OUT = open(outFile, 'w') 105 | 106 | 107 | ########################################################################################################################### 108 | ### START TO ANALYZE DATA FOR EACH GENE ### 109 | ########################################################################################################################## 110 | 111 | geneCount = 0 112 | 113 | startTime = time.time() 114 | 115 | #OUT.write("GeneName\tIsoformName\tNumberOfReads\tRelativeAbundance\n") ## Header of Results 116 | 117 | for gene in geneStructureInformation: 118 | 119 | countResults = auto_dict() 120 | 121 | geneCount += 1 122 | tmpTime = (time.time() - startTime)/60.0 123 | 124 | 125 | sameReadCount = auto_dict() 126 | readStart = auto_dict() 127 | readEnd = auto_dict() 128 | readCigar = auto_dict() 129 | 130 | numofExons = geneLineCount[gene] 131 | tmpgeneinf = geneStructureInformation[gene][0].split("\t") 132 | geneChr = tmpgeneinf[1] 133 | geneStart = int(tmpgeneinf[3]) 134 | geneEnd = int(tmpgeneinf[4]) 135 | if bamFilePysam.get_tid(geneChr) == -1: 136 | continue 137 | 138 | ## load all reads information which were mapped to the specific gene within this loop using pysam 139 | for read in bamFilePysam.fetch(geneChr, 
geneStart, geneEnd): 140 | line = str(read) 141 | tmpinf = line.split("\t") 142 | tmpReadName = tmpinf[0] 143 | tmpReadChr = geneChr 144 | tmpReadStart = int(tmpinf[3]) + 1 145 | tmpReadCigar = "" 146 | 147 | ## Adjust to different Pysam Version!! ## 148 | 149 | if ")]" in tmpinf[5]: ## vector format 150 | 151 | tmpinf[5] = tmpinf[5].rstrip(")]") 152 | tmpinf[5] = tmpinf[5].lstrip("[(") 153 | tmpinfcigar = tmpinf[5].split("), (") 154 | for cc in tmpinfcigar: 155 | ttcc = cc.split(", ") 156 | if ttcc[0] == "3": 157 | tmpReadCigar = tmpReadCigar + ttcc[1] + "N" 158 | if ttcc[0] == "2": 159 | tmpReadCigar = tmpReadCigar + ttcc[1] + "D" 160 | if ttcc[0] == "1": 161 | tmpReadCigar = tmpReadCigar + ttcc[1] + "I" 162 | if ttcc[0] == "0": 163 | tmpReadCigar = tmpReadCigar + ttcc[1] + "M" 164 | if not (ttcc[0] == "3" or ttcc[0] == "2" or ttcc[0] == "1" or ttcc[0] == "0"): 165 | tmpReadCigar = tmpReadCigar + ttcc[1] + "X" 166 | else: ## 100M10N100M format 167 | tmpReadCigar = tmpinf[5] 168 | 169 | if not bool(sameReadCount[tmpReadName]): 170 | sameReadCount[tmpReadName] = 1 171 | else: 172 | sameReadCount[tmpReadName] += 1 173 | 174 | readStart[tmpReadName][sameReadCount[tmpReadName]] = tmpReadStart 175 | readCigar[tmpReadName][sameReadCount[tmpReadName]] = tmpReadCigar 176 | 177 | 178 | ## load structure information of the specific gene within this loop 179 | 180 | tmpgeneinf[5] = tmpgeneinf[5].rstrip(",") 181 | isoformNames = tmpgeneinf[5].split(",") 182 | exonStarts = [None] * numofExons 183 | exonEnds = [None] * numofExons 184 | exonIndicators = auto_dict() 185 | 186 | for i in range(1,numofExons+1): 187 | tmpinf = geneStructureInformation[gene][i].split("\t") 188 | exonStarts[i-1] = int(tmpinf[3])+1 189 | exonEnds[i-1] = int(tmpinf[4]) 190 | tmpinf[5] = tmpinf[5].rstrip(",") 191 | tmpExonIndicators = tmpinf[5].split(",") 192 | 193 | for j in range(len(tmpExonIndicators)): 194 | exonIndicators[isoformNames[j]][i-1] = int(tmpExonIndicators[j]) 195 | 196 | lociIndicators = 
auto_dict() 197 | for i in range(len(isoformNames)): 198 | for j in range(len(exonStarts)): 199 | if exonIndicators[isoformNames[i]][j] == 1: 200 | for k in range(exonStarts[j], exonEnds[j]+1): 201 | lociIndicators[isoformNames[i]][k] = 1 202 | 203 | ######################################################################################################################################### 204 | ## START TO ANALYZE EACH READ 205 | ################################################################################################################################################## 206 | 207 | qualifiedRead = auto_dict() 208 | readCount = 0 209 | fragmentStart = auto_dict() 210 | fragmentEnd = auto_dict() 211 | CompatibleMatrix = auto_dict() 212 | tmpCompatibleMatrix = auto_dict() 213 | 214 | for readName in sameReadCount: 215 | 216 | # load CIGAR information 217 | cigarNumberRead1 = auto_dict() 218 | cigarNumberRead2 = auto_dict() 219 | cigarMatchRead1 = auto_dict() 220 | cigarMatchRead2 = auto_dict() 221 | cigarInfCountRead1 = 0 222 | cigarInfCountRead2 = 0 223 | cigarInfCountRead1tmp = 0 224 | cigarInfCountRead2tmp = 0 225 | 226 | tmp1 = re.split("([A-Z])",readCigar[readName][1]) 227 | for i in range(len(tmp1)-1): 228 | if tmp1[i].isalpha(): 229 | cigarMatchRead1[cigarInfCountRead1] = tmp1[i] 230 | cigarInfCountRead1 += 1 231 | else: 232 | cigarNumberRead1[cigarInfCountRead1] = int(tmp1[i]) 233 | cigarInfCountRead1tmp += 1 234 | 235 | if sameReadCount[readName] == 2: 236 | tmp2 = re.split("([A-Z])",readCigar[readName][2]) 237 | for i in range(len(tmp2)-1): 238 | if tmp2[i].isalpha(): 239 | cigarMatchRead2[cigarInfCountRead2] = tmp2[i] 240 | cigarInfCountRead2 += 1 241 | else: 242 | cigarNumberRead2[cigarInfCountRead2] = int(tmp2[i]) 243 | cigarInfCountRead2tmp += 1 244 | 245 | # calculate read end positions 246 | readEnd[readName][1] = readStart[readName][1] 247 | for i in range(cigarInfCountRead1): 248 | readEnd[readName][1] += cigarNumberRead1[i] 249 | 250 | if 
sameReadCount[readName] == 2: 251 | readEnd[readName][2] = readStart[readName][2] 252 | for i in range(cigarInfCountRead2): 253 | readEnd[readName][2] += cigarNumberRead2[i] 254 | 255 | # calculate fragment START and END positions 256 | if sameReadCount[readName] == 2: 257 | fragmentStart[readName] = readStart[readName][2] if readStart[readName][1] >= readStart[readName][2] else readStart[readName][1] 258 | fragmentEnd[readName] = readEnd[readName][1] if readEnd[readName][1] >= readEnd[readName][2] else readEnd[readName][2] 259 | 260 | if sameReadCount[readName] == 1: 261 | fragmentStart[readName] = readStart[readName][1] 262 | fragmentEnd[readName] = readEnd[readName][1] 263 | 264 | ################################################################################################################################## 265 | ## Obtain compatible matrix of isoforms with respect to reads 266 | ################################################################################################################################# 267 | 268 | if (readStart[readName][1] >= geneStart and readStart[readName][1] <= geneEnd) or (readStart[readName][2] >= geneStart and readStart[readName][2] <= geneEnd and sameReadCount[readName]==2) : 269 | if cigarInfCountRead1 == cigarInfCountRead1tmp and cigarInfCountRead2 == cigarInfCountRead2tmp: 270 | base1 = readStart[readName][1] - 1 271 | exonIndicatorRead1 = [0] * numofExons 272 | if sameReadCount[readName] == 2: 273 | base2 = readStart[readName][2] - 1 274 | exonIndicatorRead2 = [0] * numofExons 275 | compatibleVector = [1] * len(isoformNames) 276 | 277 | ############################################################################################################################################## 278 | ### SET TUP COMPATIBLE INDICATOR VECTOR ############### 279 | ############################################################################################################################################### 280 | ## READ 1 ## 281 | # find exons 
where read 1 mapped to 282 | for i in range(cigarInfCountRead1): 283 | 284 | if cigarMatchRead1[i] == "M" or cigarMatchRead1[i] == "I": ## matched CIGAR 285 | 286 | for j in range(1,cigarNumberRead1[i]+1): 287 | tmpbase = base1 + j 288 | for k in range(len(exonStarts)): 289 | if exonIndicatorRead1[k] == 1: continue 290 | if tmpbase >= exonStarts[k] and tmpbase <= exonEnds[k]: exonIndicatorRead1[k] = 1 ## confirm that the read covers this exon 291 | 292 | base1 += cigarNumberRead1[i] # jump to next match information 293 | 294 | if cigarMatchRead1[i] == "N": ## skipping area 295 | base1 += cigarNumberRead1[i] # jump to next match information directly 296 | 297 | # set up indicator vector 298 | tmpcount1 = 0 299 | tmpcount11 = 0 ## these two variable are used to rule out skipping exons 300 | for i in range(len(exonIndicatorRead1)): 301 | if exonIndicatorRead1[i] == 1: tmpcount1 += 1 302 | for i in range(len(exonIndicatorRead1)): 303 | 304 | if exonIndicatorRead1[i] == 1: 305 | tmpcount11 += 1 306 | for j in range(len(isoformNames)): 307 | if exonIndicators[isoformNames[j]][i] == 0: compatibleVector[j] = 0 ## rule out isoform j if reads covers skipping area of isoform j 308 | 309 | if exonIndicatorRead1[i] == 0: #aim to rule out isforms which includes exons which skipped by read 310 | if tmpcount1 > 1 and tmpcount11 >= 1 and tmpcount11 < tmpcount1: ## confirm the exon i is skipped by read!! 311 | for j in range(len(isoformNames)): 312 | if exonIndicators[isoformNames[j]][i] == 1: compatibleVector[j] = 0 313 | 314 | 315 | ## READ 2 ## SAME AS READ 1 316 | tmpcount2 = 0 317 | if sameReadCount[readName] == 2: ## ONLY WHEN THE READ IS PAIRED-END READ!!! 
318 | # find exons where read 2 mapped to 319 | for i in range(cigarInfCountRead2): 320 | 321 | if cigarMatchRead2[i] == "M" or cigarMatchRead2[i] == "I": ## matched CIGAR 322 | 323 | for j in range(1,cigarNumberRead2[i]+1): 324 | tmpbase = base2 + j 325 | for k in range(len(exonStarts)): 326 | if exonIndicatorRead2[k] == 1: continue 327 | if tmpbase >= exonStarts[k] and tmpbase <= exonEnds[k]: exonIndicatorRead2[k] = 1 ## confirm that the read covers this exon 328 | 329 | base2 += cigarNumberRead2[i] # jump to next match information 330 | 331 | if cigarMatchRead2[i] == "N": ## skipping area 332 | base2 += cigarNumberRead2[i] # jump to next match information directly 333 | 334 | # set up indicator vector 335 | tmpcount2 = 0 336 | tmpcount22 = 0 ## these two variable are used to rule out skipping exons 337 | for i in range(len(exonIndicatorRead2)): 338 | if exonIndicatorRead2[i] == 1: tmpcount2 += 1 339 | for i in range(len(exonIndicatorRead2)): 340 | 341 | if exonIndicatorRead2[i] == 1: 342 | tmpcount22 += 1 343 | for j in range(len(isoformNames)): 344 | if exonIndicators[isoformNames[j]][i] == 0: compatibleVector[j] = 0 ## rule out isoform j if reads covers skipping area of isoform j 345 | 346 | if exonIndicatorRead2[i] == 0: #aim to rule out isforms which includes exons which skipped by read 347 | if tmpcount2 > 1 and tmpcount22 >= 1 and tmpcount22 < tmpcount2: ## confirm the exon i is skipped by read!! 
348 | for j in range(len(isoformNames)): 349 | if exonIndicators[isoformNames[j]][i] == 1: compatibleVector[j] = 0 350 | 351 | ################################################################################################################################################## 352 | ## fill in compatible matrix ## 353 | if tmpcount1 > 0 or (tmpcount2 > 0 and sameReadCount[readName] == 2): 354 | readCount += 1 355 | qualifiedRead[readName] = 1 356 | for i in range(len(isoformNames)): 357 | CompatibleMatrix[readName][isoformNames[i]] = compatibleVector[i] 358 | tmpCompatibleMatrix[readName][isoformNames[i]] = compatibleVector[i] 359 | else: 360 | qualifiedRead[readName] = 0 361 | 362 | 363 | ### COMPATIBLE MATRIX OBTAINED !!! 364 | ############################################################################################################### 365 | 366 | if readCount == 0: continue 367 | print(gene+"\t"+str(readCount)+" reads detected...") 368 | 369 | for weight in groupInformation[gene]: 370 | countResults[weight]["+"] = 0 371 | countResults[weight]["-"] = 0 372 | isosetplus = groupInformation[gene][weight]["+"].split(",") 373 | isosetminus = groupInformation[gene][weight]["-"].split(",") 374 | 375 | for readName in qualifiedRead: 376 | if qualifiedRead[readName] == 0: continue 377 | sumindexplus = 0 378 | for index in isosetplus: 379 | if CompatibleMatrix[readName][isoformNames[int(index)]] == 1: sumindexplus += 1 380 | sumindexminus = 0 381 | for index in isosetminus: 382 | if CompatibleMatrix[readName][isoformNames[int(index)]] == 1: sumindexminus += 1 383 | if sumindexplus == 0: 384 | countResults[weight]["+"] += 1 385 | if sumindexminus == 0: 386 | countResults[weight]["-"] += 1 387 | 388 | OUT.write(gene+"\t"+str(readCount)+"\t"+weight+"\t"+"+"+"\t"+str(countResults[weight]["+"])+"\n") 389 | OUT.write(gene+"\t"+str(readCount)+"\t"+weight+"\t"+"-"+"\t"+str(countResults[weight]["-"])+"\n") 390 | 391 | OUT.close() 392 | 393 | 394 | 395 | 396 | 397 | 
#!/usr/bin/python
#
# bin/getCount_cellid.py
#
# Count, for ONE cell of a cell-barcoded BAM file, how many qualified reads of
# each gene are compatible with none of the isoforms listed in the "+" set and
# in the "-" set of every exon group of that gene.
#
# Arguments : -bam, -ref, -out, -gpinfo, -cellid, -celltag
# Output    : tab separated, two lines per gene/group
#             gene <TAB> qualifiedReads <TAB> group <TAB> +|- <TAB> count

from __future__ import print_function  # load print function in python3
from collections import OrderedDict
from collections import defaultdict
import math
import os
import re
import sys
import time


# set up auto dictionary function (kept for backward compatibility; the code
# below uses plain dictionaries so that a missing key can never silently turn
# into an empty dictionary that is later compared with a number)
def auto_dict():
    return defaultdict(auto_dict)


# pysam CIGAR operation codes understood by this script; anything else -> "X"
_BAM_OP_LETTERS = {"0": "M", "1": "I", "2": "D", "3": "N"}
_CIGAR_TOKEN = re.compile(r"(\d+)([A-Z])")


def cigar_from_field(field):
    """Return a CIGAR string such as "100M10N100M" from field 5 of str(read).

    Some pysam versions print the CIGAR as a list of (code, length) tuples,
    e.g. "[(0, 100), (3, 10), (0, 100)]"; others print the string directly.
    """
    if ")]" not in field:  # already in 100M10N100M format
        return field
    body = field.rstrip(")]").lstrip("[(")
    pieces = []
    for pair in body.split("), ("):
        parts = pair.split(", ")
        pieces.append(parts[1] + _BAM_OP_LETTERS.get(parts[0], "X"))
    return "".join(pieces)


def parse_cigar(cigar):
    """Split a CIGAR string into a list of (length, operation) tuples.

    Returns None when the string is not a plain sequence of
    <number><UPPERCASE LETTER> tokens, so callers can skip the read instead
    of crashing on it.
    """
    operations = []
    position = 0
    for token in _CIGAR_TOKEN.finditer(cigar):
        if token.start() != position:
            return None
        operations.append((int(token.group(1)), token.group(2)))
        position = token.end()
    if position != len(cigar):
        return None
    return operations


def covered_exons(read_start, operations, exon_starts, exon_ends):
    """Return a 0/1 list telling which exons an alignment touches.

    read_start is the 1-based start of the alignment. "M" and "I" blocks are
    tested against every exon and advance the position; "N" blocks only
    advance it; every other operation is ignored.

    NOTE(review): in the SAM specification insertions ("I") do not consume
    reference bases and deletions ("D") do. The original behaviour is kept
    here so that counts do not change -- confirm whether this is intended.
    """
    covered = [0] * len(exon_starts)
    base = read_start - 1
    for length, op in operations:
        if op == "M" or op == "I":
            first, last = base + 1, base + length
            for k in range(len(exon_starts)):
                # interval overlap; identical to testing every base in turn
                if max(first, exon_starts[k]) <= min(last, exon_ends[k]):
                    covered[k] = 1
            base += length
        elif op == "N":
            base += length
    return covered


def _rule_out_isoforms(covered, exon_flags, isoforms, compatible):
    """Zero the entries of `compatible` contradicted by one alignment.

    An isoform is ruled out when the alignment touches an exon the isoform
    does not contain, or when the isoform contains an exon lying between the
    first and the last exon touched by the alignment that the alignment does
    not touch itself. Returns the number of exons touched.
    """
    total = sum(covered)
    seen = 0
    for i, hit in enumerate(covered):
        flags = exon_flags[i]
        if hit == 1:
            seen += 1
            for j, name in enumerate(isoforms):
                if flags.get(name) == 0:
                    compatible[j] = 0
        elif total > 1 and 1 <= seen < total:  # exon i is skipped by the read
            for j, name in enumerate(isoforms):
                if flags.get(name) == 1:
                    compatible[j] = 0
    return total


def read_compatibility(mates, isoforms, exon_starts, exon_ends, exon_flags,
                       gene_start, gene_end):
    """Return the isoform compatibility vector of one read name, or None.

    mates is the list of (1-based start, CIGAR string) tuples collected for
    the read name, in the order they were fetched. The read is treated as
    paired-end only when exactly two alignments were collected; otherwise
    only the first alignment is used.

    None is returned (the read is not qualified) when no considered start
    lies inside [gene_start, gene_end], when a CIGAR string cannot be
    parsed, or when no exon is touched.
    """
    paired = len(mates) == 2
    in_gene = gene_start <= mates[0][0] <= gene_end
    # The mate is looked at only for real pairs. The original tested the
    # (missing) second start first, which raises TypeError on Python 3.
    if not in_gene and paired:
        in_gene = gene_start <= mates[1][0] <= gene_end
    if not in_gene:
        return None

    operations = [parse_cigar(cigar) for _, cigar in mates[:2 if paired else 1]]
    if None in operations:
        return None

    compatible = [1] * len(isoforms)
    touched = 0
    for (start, _), ops in zip(mates, operations):
        hits = covered_exons(start, ops, exon_starts, exon_ends)
        touched += _rule_out_isoforms(hits, exon_flags, isoforms, compatible)
    if touched == 0:
        return None
    return compatible


def count_incompatible_reads(rows, isoforms, indices):
    """Count the reads that are compatible with none of the given isoforms.

    rows maps read name -> {isoform name: 0/1}; indices are the (string)
    positions in `isoforms` that make up one isoform set of an exon group.
    """
    names = [isoforms[int(index)] for index in indices]
    return sum(1 for row in rows.values()
               if not any(row.get(name) == 1 for name in names))


def load_gene_lines(ref_path):
    """Group the lines of the reference file by gene name (column 1).

    The first line of a gene is its header (chromosome, start, end, isoform
    names); each following line describes one exon.
    """
    genes = OrderedDict()
    with open(ref_path, "r") as handle:
        for line in handle:
            line = line.strip("\n")
            if not line:
                continue
            genes.setdefault(line.split("\t")[0], []).append(line)
    return genes


def load_groups(gpinfo_path):
    """Load the group file as gene -> group -> sign -> "i,j,..." string."""
    groups = OrderedDict()
    with open(gpinfo_path, "r") as handle:
        for line in handle:
            line = line.strip("\n")
            if not line:
                continue
            fields = line.split("\t")
            per_gene = groups.setdefault(fields[0], OrderedDict())
            per_gene.setdefault(fields[1], {})[fields[3]] = fields[5].strip(",")
    return groups


def parse_exons(gene_lines, isoforms):
    """Return (exon_starts, exon_ends, exon_flags) for one gene.

    Starts are shifted by +1 exactly as in the original script; exon_flags
    holds one {isoform name: 0/1} dictionary per exon.
    """
    exon_starts, exon_ends, exon_flags = [], [], []
    for line in gene_lines[1:]:
        fields = line.split("\t")
        exon_starts.append(int(fields[3]) + 1)
        exon_ends.append(int(fields[4]))
        flags = [int(flag) for flag in fields[5].rstrip(",").split(",")]
        exon_flags.append(dict(zip(isoforms, flags)))
    return exon_starts, exon_ends, exon_flags


def main():
    """Parse the command line, scan the BAM file and write the counts."""
    # Imported here rather than at module level so that the helpers above can
    # be imported without pysam / my_functions being available.
    import pysam
    import my_functions as my

    # checking whether argument is valid or not
    validArgList = ["-bam", "-ref", "-out", "-gpinfo", "-cellid", "-celltag"]
    addAbsPath = [1, 1, 3, 1, 0, 0]
    warnMessage = "-bam, -ref, -out, -gpinfo, -cellid, -celltag"
    inputFile = my.parse_argument(validArgList, addAbsPath, warnMessage)
    bamFile, refGeneFile, outFile, gpinfoFile, cellid, celltag = inputFile[:6]

    genes = load_gene_lines(refGeneFile)
    groups = load_groups(gpinfoFile)

    bam = pysam.Samfile(bamFile, "rb")
    try:
        with open(outFile, "w") as out:
            for gene, gene_lines in genes.items():
                header = gene_lines[0].split("\t")
                chrom = header[1]
                gene_start = int(header[3])
                gene_end = int(header[4])
                if bam.get_tid(chrom) == -1:  # chromosome absent from the BAM
                    continue

                # collect the alignments of the requested cell, per read name
                alignments = OrderedDict()
                for read in bam.fetch(chrom, gene_start, gene_end):
                    try:
                        barcode = read.get_tag(celltag)
                    except KeyError:  # read carries no cell barcode tag
                        continue
                    if barcode != cellid:
                        continue
                    # NOTE(review): relies on the str(read) layout (field 3 =
                    # 0-based start, field 5 = CIGAR); confirm for the pysam
                    # version in use.
                    fields = str(read).split("\t")
                    alignments.setdefault(fields[0], []).append(
                        (int(fields[3]) + 1, cigar_from_field(fields[5])))

                isoforms = header[5].rstrip(",").split(",")
                exon_starts, exon_ends, exon_flags = parse_exons(gene_lines, isoforms)

                # compatible matrix: one row per qualified read
                rows = OrderedDict()
                for name, mates in alignments.items():
                    compatible = read_compatibility(
                        mates, isoforms, exon_starts, exon_ends, exon_flags,
                        gene_start, gene_end)
                    if compatible is not None:
                        rows[name] = dict(zip(isoforms, compatible))

                read_count = len(rows)
                if read_count == 0:
                    continue
                print(gene + "\t" + str(read_count) + " reads detected...")

                for weight, iso_sets in groups.get(gene, {}).items():
                    plus = count_incompatible_reads(
                        rows, isoforms, iso_sets["+"].split(","))
                    minus = count_incompatible_reads(
                        rows, isoforms, iso_sets["-"].split(","))
                    for sign, count in (("+", plus), ("-", minus)):
                        out.write("\t".join(
                            [gene, str(read_count), weight, sign, str(count)]) + "\n")
    finally:
        bam.close()


if __name__ == "__main__":
    main()
#!/usr/bin/python
#
# bin/getCount_umi.py
#
# Count, per gene and per exon group, the number of distinct UMIs carried by
# qualified reads that are compatible with none of the isoforms listed in the
# "+" set and in the "-" set of that group. The UMI of a read is the value of
# the BAM tag given with -umitag.
#
# Arguments : -bam, -ref, -out, -gpinfo, -umitag
# Output    : tab separated, two lines per gene/group
#             gene <TAB> qualifiedReads <TAB> group <TAB> +|- <TAB> uniqueUMIs

from __future__ import print_function  # load print function in python3
from collections import OrderedDict
from collections import defaultdict
import math
import os
import re
import sys
import time


# set up auto dictionary function (kept for backward compatibility; the code
# below uses plain dictionaries)
def auto_dict():
    return defaultdict(auto_dict)


# pysam CIGAR operation codes understood by this script; anything else -> "X"
_BAM_OP_LETTERS = {"0": "M", "1": "I", "2": "D", "3": "N"}
_CIGAR_TOKEN = re.compile(r"(\d+)([A-Z])")


def cigar_from_field(field):
    """Return a CIGAR string such as "100M10N100M" from field 5 of str(read).

    Some pysam versions print the CIGAR as a list of (code, length) tuples,
    e.g. "[(0, 100), (3, 10), (0, 100)]"; others print the string directly.
    """
    if ")]" not in field:  # already in 100M10N100M format
        return field
    body = field.rstrip(")]").lstrip("[(")
    pieces = []
    for pair in body.split("), ("):
        parts = pair.split(", ")
        pieces.append(parts[1] + _BAM_OP_LETTERS.get(parts[0], "X"))
    return "".join(pieces)


def parse_cigar(cigar):
    """Split a CIGAR string into a list of (length, operation) tuples.

    Returns None when the string is not a plain sequence of
    <number><UPPERCASE LETTER> tokens, so callers can skip the read instead
    of crashing on it.
    """
    operations = []
    position = 0
    for token in _CIGAR_TOKEN.finditer(cigar):
        if token.start() != position:
            return None
        operations.append((int(token.group(1)), token.group(2)))
        position = token.end()
    if position != len(cigar):
        return None
    return operations


def covered_exons(read_start, operations, exon_starts, exon_ends):
    """Return a 0/1 list telling which exons an alignment touches.

    read_start is the 1-based start of the alignment. "M" and "I" blocks are
    tested against every exon and advance the position; "N" blocks only
    advance it; every other operation is ignored.

    NOTE(review): in the SAM specification insertions ("I") do not consume
    reference bases and deletions ("D") do. The original behaviour is kept
    here so that counts do not change -- confirm whether this is intended.
    """
    covered = [0] * len(exon_starts)
    base = read_start - 1
    for length, op in operations:
        if op == "M" or op == "I":
            first, last = base + 1, base + length
            for k in range(len(exon_starts)):
                # interval overlap; identical to testing every base in turn
                if max(first, exon_starts[k]) <= min(last, exon_ends[k]):
                    covered[k] = 1
            base += length
        elif op == "N":
            base += length
    return covered


def _rule_out_isoforms(covered, exon_flags, isoforms, compatible):
    """Zero the entries of `compatible` contradicted by one alignment.

    An isoform is ruled out when the alignment touches an exon the isoform
    does not contain, or when the isoform contains an exon lying between the
    first and the last exon touched by the alignment that the alignment does
    not touch itself. Returns the number of exons touched.
    """
    total = sum(covered)
    seen = 0
    for i, hit in enumerate(covered):
        flags = exon_flags[i]
        if hit == 1:
            seen += 1
            for j, name in enumerate(isoforms):
                if flags.get(name) == 0:
                    compatible[j] = 0
        elif total > 1 and 1 <= seen < total:  # exon i is skipped by the read
            for j, name in enumerate(isoforms):
                if flags.get(name) == 1:
                    compatible[j] = 0
    return total


def read_compatibility(mates, isoforms, exon_starts, exon_ends, exon_flags,
                       gene_start, gene_end):
    """Return the isoform compatibility vector of one read name, or None.

    mates is the list of (1-based start, CIGAR string) tuples collected for
    the read name, in the order they were fetched. The read is treated as
    paired-end only when exactly two alignments were collected; otherwise
    only the first alignment is used.

    None is returned (the read is not qualified) when the start of the FIRST
    alignment lies outside [gene_start, gene_end] (the mate is deliberately
    not consulted, as in the original script), when a CIGAR string cannot be
    parsed, or when no exon is touched.
    """
    if not gene_start <= mates[0][0] <= gene_end:
        return None

    paired = len(mates) == 2
    operations = [parse_cigar(cigar) for _, cigar in mates[:2 if paired else 1]]
    if None in operations:
        return None

    compatible = [1] * len(isoforms)
    touched = 0
    for (start, _), ops in zip(mates, operations):
        hits = covered_exons(start, ops, exon_starts, exon_ends)
        touched += _rule_out_isoforms(hits, exon_flags, isoforms, compatible)
    if touched == 0:
        return None
    return compatible


def count_incompatible_umis(rows, umis, isoforms, indices):
    """Count distinct UMIs among reads compatible with none of the isoforms.

    rows maps read name -> {isoform name: 0/1}; umis maps read name -> UMI
    read from the BAM tag; indices are the (string) positions in `isoforms`
    that make up one isoform set of an exon group.

    The UMI comes from the tag, not from a "_UMI" suffix of the read name:
    the script filters reads on the tag, so de-duplicating on the read name
    would ignore it and leave reads with suffix-less names uncollapsed.
    """
    names = [isoforms[int(index)] for index in indices]
    return len(set(umis[read] for read, row in rows.items()
                   if not any(row.get(name) == 1 for name in names)))


def load_gene_lines(ref_path):
    """Group the lines of the reference file by gene name (column 1).

    The first line of a gene is its header (chromosome, start, end, isoform
    names); each following line describes one exon.
    """
    genes = OrderedDict()
    with open(ref_path, "r") as handle:
        for line in handle:
            line = line.strip("\n")
            if not line:
                continue
            genes.setdefault(line.split("\t")[0], []).append(line)
    return genes


def load_groups(gpinfo_path):
    """Load the group file as gene -> group -> sign -> "i,j,..." string."""
    groups = OrderedDict()
    with open(gpinfo_path, "r") as handle:
        for line in handle:
            line = line.strip("\n")
            if not line:
                continue
            fields = line.split("\t")
            per_gene = groups.setdefault(fields[0], OrderedDict())
            per_gene.setdefault(fields[1], {})[fields[3]] = fields[5].strip(",")
    return groups


def parse_exons(gene_lines, isoforms):
    """Return (exon_starts, exon_ends, exon_flags) for one gene.

    Starts are shifted by +1 exactly as in the original script; exon_flags
    holds one {isoform name: 0/1} dictionary per exon.
    """
    exon_starts, exon_ends, exon_flags = [], [], []
    for line in gene_lines[1:]:
        fields = line.split("\t")
        exon_starts.append(int(fields[3]) + 1)
        exon_ends.append(int(fields[4]))
        flags = [int(flag) for flag in fields[5].rstrip(",").split(",")]
        exon_flags.append(dict(zip(isoforms, flags)))
    return exon_starts, exon_ends, exon_flags


def main():
    """Parse the command line, scan the BAM file and write the counts."""
    # Imported here rather than at module level so that the helpers above can
    # be imported without pysam / my_functions being available.
    import pysam
    import my_functions as my

    # checking whether argument is valid or not
    validArgList = ["-bam", "-ref", "-out", "-gpinfo", "-umitag"]
    addAbsPath = [1, 1, 3, 1, 0]
    warnMessage = "-bam, -ref, -out, -gpinfo, -umitag"
    inputFile = my.parse_argument(validArgList, addAbsPath, warnMessage)
    bamFile, refGeneFile, outFile, gpinfoFile, umitag = inputFile[:5]

    genes = load_gene_lines(refGeneFile)
    groups = load_groups(gpinfoFile)

    bam = pysam.Samfile(bamFile, "rb")
    try:
        with open(outFile, "w") as out:
            for gene, gene_lines in genes.items():
                header = gene_lines[0].split("\t")
                chrom = header[1]
                gene_start = int(header[3])
                gene_end = int(header[4])
                if bam.get_tid(chrom) == -1:  # chromosome absent from the BAM
                    continue

                # collect the alignments and the UMI of every tagged read
                alignments = OrderedDict()
                umis = {}
                for read in bam.fetch(chrom, gene_start, gene_end):
                    try:
                        umi = read.get_tag(umitag)
                    except KeyError:  # read carries no UMI tag
                        continue
                    # NOTE(review): relies on the str(read) layout (field 3 =
                    # 0-based start, field 5 = CIGAR); confirm for the pysam
                    # version in use.
                    fields = str(read).split("\t")
                    name = fields[0]
                    if name not in alignments:
                        umis[name] = umi  # UMI of the first alignment seen
                    alignments.setdefault(name, []).append(
                        (int(fields[3]) + 1, cigar_from_field(fields[5])))

                isoforms = header[5].rstrip(",").split(",")
                exon_starts, exon_ends, exon_flags = parse_exons(gene_lines, isoforms)

                # compatible matrix: one row per qualified read
                rows = OrderedDict()
                for name, mates in alignments.items():
                    compatible = read_compatibility(
                        mates, isoforms, exon_starts, exon_ends, exon_flags,
                        gene_start, gene_end)
                    if compatible is not None:
                        rows[name] = dict(zip(isoforms, compatible))

                # number of qualified reads (not of distinct UMIs), as before
                read_count = len(rows)
                if read_count == 0:
                    continue
                print(gene + "\t" + str(read_count) + " reads detected...")

                for weight, iso_sets in groups.get(gene, {}).items():
                    plus = count_incompatible_umis(
                        rows, umis, isoforms, iso_sets["+"].split(","))
                    minus = count_incompatible_umis(
                        rows, umis, isoforms, iso_sets["-"].split(","))
                    for sign, count in (("+", plus), ("-", minus)):
                        out.write("\t".join(
                            [gene, str(read_count), weight, sign, str(count)]) + "\n")
    finally:
        bam.close()


if __name__ == "__main__":
    main()
geneStructureInformation[gene][geneLineCount[gene]] = line 45 | else: 46 | geneLineCount[gene] += 1 47 | geneStructureInformation[gene][geneLineCount[gene]] = line 48 | 49 | # load group information 50 | 51 | groupInformation = auto_dict() 52 | geneLineCount1 = auto_dict() 53 | with open(gpinfoFile, "r") as FP: 54 | for line in FP: 55 | line = line.strip("\n") 56 | tmpinf = line.split("\t") 57 | tmpinf[5] = tmpinf[5].strip(",") 58 | gene = tmpinf[0] 59 | 60 | groupInformation[gene][tmpinf[1]][tmpinf[3]] = tmpinf[5] 61 | 62 | 63 | 64 | 65 | ##################################### 66 | ## Using pysam to read in bam file !! 67 | ##################################### 68 | bamFilePysam = pysam.Samfile(bamFile,"rb") 69 | 70 | 71 | ## RESULTS FILE 72 | OUT = open(outFile, 'w') 73 | 74 | 75 | ########################################################################################################################### 76 | ### START TO ANALYZE DATA FOR EACH GENE ### 77 | ########################################################################################################################## 78 | 79 | geneCount = 0 80 | 81 | startTime = time.time() 82 | 83 | umiSet = auto_dict() 84 | 85 | #OUT.write("GeneName\tIsoformName\tNumberOfReads\tRelativeAbundance\n") ## Header of Results 86 | 87 | for gene in geneStructureInformation: 88 | 89 | countResults = auto_dict() 90 | 91 | geneCount += 1 92 | tmpTime = (time.time() - startTime)/60.0 93 | 94 | 95 | sameReadCount = auto_dict() 96 | readStart = auto_dict() 97 | readEnd = auto_dict() 98 | readCigar = auto_dict() 99 | 100 | numofExons = geneLineCount[gene] 101 | tmpgeneinf = geneStructureInformation[gene][0].split("\t") 102 | geneChr = tmpgeneinf[1] 103 | geneStart = int(tmpgeneinf[3]) 104 | geneEnd = int(tmpgeneinf[4]) 105 | readCount = 0 106 | if bamFilePysam.get_tid(geneChr) == -1: 107 | continue 108 | 109 | ## load all reads information which were mapped to the specific gene within this loop using pysam 110 | for read in 
bamFilePysam.fetch(geneChr, geneStart, geneEnd): 111 | line = str(read) 112 | tmpinf = line.split("\t") 113 | tmpReadName = tmpinf[0] 114 | try: 115 | tmpCellBarcode = read.get_tag(celltag) 116 | tmpUMI = read.get_tag(umitag) 117 | except: 118 | continue 119 | if cellid != tmpCellBarcode: 120 | continue 121 | 122 | 123 | tmpReadChr = geneChr 124 | tmpReadStart = int(tmpinf[3]) + 1 125 | tmpReadCigar = "" 126 | 127 | ## Adjust to different Pysam Version!! ## 128 | 129 | if ")]" in tmpinf[5]: ## vector format 130 | 131 | tmpinf[5] = tmpinf[5].rstrip(")]") 132 | tmpinf[5] = tmpinf[5].lstrip("[(") 133 | tmpinfcigar = tmpinf[5].split("), (") 134 | for cc in tmpinfcigar: 135 | ttcc = cc.split(", ") 136 | if ttcc[0] == "3": 137 | tmpReadCigar = tmpReadCigar + ttcc[1] + "N" 138 | if ttcc[0] == "2": 139 | tmpReadCigar = tmpReadCigar + ttcc[1] + "D" 140 | if ttcc[0] == "1": 141 | tmpReadCigar = tmpReadCigar + ttcc[1] + "I" 142 | if ttcc[0] == "0": 143 | tmpReadCigar = tmpReadCigar + ttcc[1] + "M" 144 | if not (ttcc[0] == "3" or ttcc[0] == "2" or ttcc[0] == "1" or ttcc[0] == "0"): 145 | tmpReadCigar = tmpReadCigar + ttcc[1] + "X" 146 | else: ## 100M10N100M format 147 | tmpReadCigar = tmpinf[5] 148 | #print(tmpReadCigar) 149 | if not bool(sameReadCount[tmpReadName]): 150 | sameReadCount[tmpReadName] = 1 151 | umiSet[tmpReadName] = tmpUMI 152 | else: 153 | sameReadCount[tmpReadName] += 1 154 | 155 | readStart[tmpReadName][sameReadCount[tmpReadName]] = tmpReadStart 156 | readCigar[tmpReadName][sameReadCount[tmpReadName]] = tmpReadCigar 157 | 158 | 159 | ## load structure information of the specific gene within this loop 160 | 161 | tmpgeneinf[5] = tmpgeneinf[5].rstrip(",") 162 | isoformNames = tmpgeneinf[5].split(",") 163 | exonStarts = [None] * numofExons 164 | exonEnds = [None] * numofExons 165 | exonIndicators = auto_dict() 166 | 167 | for i in range(1,numofExons+1): 168 | tmpinf = geneStructureInformation[gene][i].split("\t") 169 | exonStarts[i-1] = int(tmpinf[3])+1 170 | 
exonEnds[i-1] = int(tmpinf[4]) 171 | tmpinf[5] = tmpinf[5].rstrip(",") 172 | tmpExonIndicators = tmpinf[5].split(",") 173 | 174 | for j in range(len(tmpExonIndicators)): 175 | exonIndicators[isoformNames[j]][i-1] = int(tmpExonIndicators[j]) 176 | 177 | lociIndicators = auto_dict() 178 | for i in range(len(isoformNames)): 179 | for j in range(len(exonStarts)): 180 | if exonIndicators[isoformNames[i]][j] == 1: 181 | for k in range(exonStarts[j], exonEnds[j]+1): 182 | lociIndicators[isoformNames[i]][k] = 1 183 | 184 | ######################################################################################################################################### 185 | ## START TO ANALYZE EACH READ 186 | ################################################################################################################################################## 187 | 188 | qualifiedRead = auto_dict() 189 | readSet = [] 190 | fragmentStart = auto_dict() 191 | fragmentEnd = auto_dict() 192 | CompatibleMatrix = auto_dict() 193 | tmpCompatibleMatrix = auto_dict() 194 | 195 | for readName in sameReadCount: 196 | 197 | # load CIGAR information 198 | cigarNumberRead1 = auto_dict() 199 | cigarNumberRead2 = auto_dict() 200 | cigarMatchRead1 = auto_dict() 201 | cigarMatchRead2 = auto_dict() 202 | cigarInfCountRead1 = 0 203 | cigarInfCountRead2 = 0 204 | cigarInfCountRead1tmp = 0 205 | cigarInfCountRead2tmp = 0 206 | 207 | tmp1 = re.split("([A-Z])",readCigar[readName][1]) 208 | for i in range(len(tmp1)-1): 209 | if tmp1[i].isalpha(): 210 | cigarMatchRead1[cigarInfCountRead1] = tmp1[i] 211 | cigarInfCountRead1 += 1 212 | else: 213 | cigarNumberRead1[cigarInfCountRead1] = int(tmp1[i]) 214 | cigarInfCountRead1tmp += 1 215 | 216 | if sameReadCount[readName] == 2: 217 | tmp2 = re.split("([A-Z])",readCigar[readName][2]) 218 | for i in range(len(tmp2)-1): 219 | if tmp2[i].isalpha(): 220 | cigarMatchRead2[cigarInfCountRead2] = tmp2[i] 221 | cigarInfCountRead2 += 1 222 | else: 223 | 
cigarNumberRead2[cigarInfCountRead2] = int(tmp2[i]) 224 | cigarInfCountRead2tmp += 1 225 | 226 | # calculate read end positions 227 | readEnd[readName][1] = readStart[readName][1] 228 | for i in range(cigarInfCountRead1): 229 | readEnd[readName][1] += cigarNumberRead1[i] 230 | 231 | if sameReadCount[readName] == 2: 232 | readEnd[readName][2] = readStart[readName][2] 233 | for i in range(cigarInfCountRead2): 234 | readEnd[readName][2] += cigarNumberRead2[i] 235 | 236 | # calculate fragment START and END positions 237 | if sameReadCount[readName] == 2: 238 | fragmentStart[readName] = readStart[readName][2] if readStart[readName][1] >= readStart[readName][2] else readStart[readName][1] 239 | fragmentEnd[readName] = readEnd[readName][1] if readEnd[readName][1] >= readEnd[readName][2] else readEnd[readName][2] 240 | 241 | if sameReadCount[readName] == 1: 242 | fragmentStart[readName] = readStart[readName][1] 243 | fragmentEnd[readName] = readEnd[readName][1] 244 | 245 | ################################################################################################################################## 246 | ## Obtain compatible matrix of isoforms with respect to reads 247 | ################################################################################################################################# 248 | if (readStart[readName][1] >= geneStart and readStart[readName][1] <= geneEnd): 249 | #if (readStart[readName][1] >= geneStart and readStart[readName][1] <= geneEnd) or (readStart[readName][2] >= geneStart and readStart[readName][2] <= geneEnd and sameReadCount[readName]==2) : 250 | if cigarInfCountRead1 == cigarInfCountRead1tmp and cigarInfCountRead2 == cigarInfCountRead2tmp: 251 | base1 = readStart[readName][1] - 1 252 | exonIndicatorRead1 = [0] * numofExons 253 | if sameReadCount[readName] == 2: 254 | base2 = readStart[readName][2] - 1 255 | exonIndicatorRead2 = [0] * numofExons 256 | compatibleVector = [1] * len(isoformNames) 257 | 258 | 
############################################################################################################################################## 259 | ### SET TUP COMPATIBLE INDICATOR VECTOR ############### 260 | ############################################################################################################################################### 261 | ## READ 1 ## 262 | # find exons where read 1 mapped to 263 | for i in range(cigarInfCountRead1): 264 | 265 | if cigarMatchRead1[i] == "M" or cigarMatchRead1[i] == "I": ## matched CIGAR 266 | 267 | for j in range(1,cigarNumberRead1[i]+1): 268 | tmpbase = base1 + j 269 | for k in range(len(exonStarts)): 270 | if exonIndicatorRead1[k] == 1: continue 271 | if tmpbase >= exonStarts[k] and tmpbase <= exonEnds[k]: exonIndicatorRead1[k] = 1 ## confirm that the read covers this exon 272 | 273 | base1 += cigarNumberRead1[i] # jump to next match information 274 | 275 | if cigarMatchRead1[i] == "N": ## skipping area 276 | base1 += cigarNumberRead1[i] # jump to next match information directly 277 | 278 | # set up indicator vector 279 | tmpcount1 = 0 280 | tmpcount11 = 0 ## these two variable are used to rule out skipping exons 281 | for i in range(len(exonIndicatorRead1)): 282 | if exonIndicatorRead1[i] == 1: tmpcount1 += 1 283 | for i in range(len(exonIndicatorRead1)): 284 | 285 | if exonIndicatorRead1[i] == 1: 286 | tmpcount11 += 1 287 | for j in range(len(isoformNames)): 288 | if exonIndicators[isoformNames[j]][i] == 0: compatibleVector[j] = 0 ## rule out isoform j if reads covers skipping area of isoform j 289 | 290 | if exonIndicatorRead1[i] == 0: #aim to rule out isforms which includes exons which skipped by read 291 | if tmpcount1 > 1 and tmpcount11 >= 1 and tmpcount11 < tmpcount1: ## confirm the exon i is skipped by read!! 
292 | for j in range(len(isoformNames)): 293 | if exonIndicators[isoformNames[j]][i] == 1: compatibleVector[j] = 0 294 | 295 | 296 | ## READ 2 ## SAME AS READ 1 297 | tmpcount2 = 0 298 | if sameReadCount[readName] == 2: ## ONLY WHEN THE READ IS PAIRED-END READ!!! 299 | # find exons where read 2 mapped to 300 | for i in range(cigarInfCountRead2): 301 | 302 | if cigarMatchRead2[i] == "M" or cigarMatchRead2[i] == "I": ## matched CIGAR 303 | 304 | for j in range(1,cigarNumberRead2[i]+1): 305 | tmpbase = base2 + j 306 | for k in range(len(exonStarts)): 307 | if exonIndicatorRead2[k] == 1: continue 308 | if tmpbase >= exonStarts[k] and tmpbase <= exonEnds[k]: exonIndicatorRead2[k] = 1 ## confirm that the read covers this exon 309 | 310 | base2 += cigarNumberRead2[i] # jump to next match information 311 | 312 | if cigarMatchRead2[i] == "N": ## skipping area 313 | base2 += cigarNumberRead2[i] # jump to next match information directly 314 | 315 | # set up indicator vector 316 | tmpcount2 = 0 317 | tmpcount22 = 0 ## these two variable are used to rule out skipping exons 318 | for i in range(len(exonIndicatorRead2)): 319 | if exonIndicatorRead2[i] == 1: tmpcount2 += 1 320 | for i in range(len(exonIndicatorRead2)): 321 | 322 | if exonIndicatorRead2[i] == 1: 323 | tmpcount22 += 1 324 | for j in range(len(isoformNames)): 325 | if exonIndicators[isoformNames[j]][i] == 0: compatibleVector[j] = 0 ## rule out isoform j if reads covers skipping area of isoform j 326 | 327 | if exonIndicatorRead2[i] == 0: #aim to rule out isforms which includes exons which skipped by read 328 | if tmpcount2 > 1 and tmpcount22 >= 1 and tmpcount22 < tmpcount2: ## confirm the exon i is skipped by read!! 
329 | for j in range(len(isoformNames)): 330 | if exonIndicators[isoformNames[j]][i] == 1: compatibleVector[j] = 0 331 | 332 | ################################################################################################################################################## 333 | ## fill in compatible matrix ## 334 | if tmpcount1 > 0 or (tmpcount2 > 0 and sameReadCount[readName] == 2): 335 | #umibarcode = readName.split("_") 336 | #umibarcode = umibarcode[len(umibarcode)-1] 337 | umibarcode = umiSet[readName] 338 | readSet.append(umibarcode) 339 | qualifiedRead[readName] = 1 340 | readCount += 1 341 | for i in range(len(isoformNames)): 342 | CompatibleMatrix[readName][isoformNames[i]] = compatibleVector[i] 343 | tmpCompatibleMatrix[readName][isoformNames[i]] = compatibleVector[i] 344 | 345 | 346 | 347 | ### COMPATIBLE MATRIX OBTAINED !!! 348 | ############################################################################################################### 349 | 350 | #readCount = len(set(readSet)) 351 | if readCount == 0: continue 352 | print(gene+"\t"+str(readCount)+" reads detected...") 353 | #print(umibarcode) 354 | for weight in groupInformation[gene]: 355 | countResults[weight]["+"] = [] 356 | countResults[weight]["-"] = [] 357 | 358 | isosetplus = groupInformation[gene][weight]["+"].split(",") 359 | isosetminus = groupInformation[gene][weight]["-"].split(",") 360 | 361 | for readName in qualifiedRead: 362 | umibarcode = readName.split("_") 363 | umibarcode = umibarcode[len(umibarcode)-1] 364 | 365 | if qualifiedRead[readName] == 0: continue 366 | #print(umibarcode) 367 | sumindexplus = 0 368 | for index in isosetplus: 369 | if CompatibleMatrix[readName][isoformNames[int(index)]] == 1: sumindexplus += 1 370 | sumindexminus = 0 371 | for index in isosetminus: 372 | if CompatibleMatrix[readName][isoformNames[int(index)]] == 1: sumindexminus += 1 373 | if sumindexplus == 0: 374 | countResults[weight]["+"].append(umibarcode) 375 | if sumindexminus == 0: 376 | 
countResults[weight]["-"].append(umibarcode) 377 | count_plus = len(set(countResults[weight]["+"])) 378 | count_minus = len(set(countResults[weight]["-"])) 379 | OUT.write(gene+"\t"+str(readCount)+"\t"+weight+"\t"+"+"+"\t"+str(count_plus)+"\n") 380 | OUT.write(gene+"\t"+str(readCount)+"\t"+weight+"\t"+"-"+"\t"+str(count_minus)+"\n") 381 | 382 | OUT.close() 383 | 384 | 385 | 386 | 387 | 388 | -------------------------------------------------------------------------------- /bin/getalpha.pl: -------------------------------------------------------------------------------- 1 | my %nonzeroct; 2 | my %logct; 3 | my %data; 4 | my $metafile = $ARGV[0]; 5 | my $countdir = $ARGV[1]; 6 | my $outfile = $ARGV[2]; 7 | open FP1, "$metafile"; 8 | while() { 9 | chomp(); 10 | my @b = split("\t"); 11 | open FP, "$countdir\/count_$b[0]\.out"; 12 | while() { 13 | chomp(); 14 | my @a = split("\t"); 15 | $data{$b[0]}{$a[0]} = $a[1]; 16 | } 17 | close FP; 18 | } 19 | close FP1; 20 | 21 | foreach my $ccc (keys %data) { 22 | foreach my $gene (keys %{$data{$ccc}}) { 23 | $logct{$ccc} = $logct{$ccc} + log($data{$ccc}{$gene}); 24 | $nonzeroct{$ccc}++; 25 | } 26 | } 27 | 28 | open OUT, ">$outfile"; 29 | open FP, "$metafile"; 30 | while() { 31 | chomp(); 32 | my @a = split("\t"); 33 | if($nonzeroct{$a[0]} > 0) { 34 | my $aaa = $logct{$a[0]} / $nonzeroct{$a[0]}; 35 | print OUT "$a[0]\t$aaa\t0\t0\t0\n"; 36 | } else { 37 | print OUT "NA\n"; 38 | } 39 | } 40 | close FP; 41 | close OUT; 42 | -------------------------------------------------------------------------------- /bin/getexonlevelcount_umi.pl: -------------------------------------------------------------------------------- 1 | my $cdt0 = $ARGV[0]; 2 | my $cdt1 = $ARGV[1]; 3 | my $tmpdir = $ARGV[2]; 4 | my $metafile = $ARGV[3]; 5 | my $gpinfofile = $ARGV[4]; 6 | 7 | my $gpp = $cdt0."_".$cdt1; 8 | 9 | my @condition; 10 | my %qout; 11 | open FP, "$tmpdir\/abkt/abkt_umi"; 12 | while() { 13 | chomp(); 14 | my @a = split("\t"); 15 | $qout{$a[0]} = 
1; 16 | } 17 | close FP; 18 | 19 | my %count; 20 | my %quality1; 21 | my %quality2; 22 | open FP, "$metafile"; 23 | while() { 24 | chomp(); 25 | my @a = split("\t"); 26 | if($qout{$a[0]} == 1) { 27 | #print "$a[0]\n"; 28 | open FP1, "$tmpdir\/count_script\/count_$a[0]\.out"; 29 | while() { 30 | chomp(); 31 | my @b = split("\t"); 32 | #$count{$b[0].":".$b[2]}{$a[0]}{$b[3]} = $b[4] if $qgene1{$b[0]} == 1; 33 | $count{$b[0].":".$b[2]}{$a[0]}{$b[3]} = $b[4]; 34 | if($a[$#a] =~ $cdt0 || $a[$#a] =~ $cdt1) { 35 | $quality1{$b[0].":".$b[2]}{$b[3]} = $quality1{$b[0].":".$b[2]}{$b[3]} + $b[4]; 36 | $quality2{$b[0].":".$b[2]}{$b[3]}++ if $b[4] > 0; 37 | } 38 | } 39 | close FP1; 40 | } 41 | } 42 | close FP; 43 | 44 | my %abkt; 45 | open FP, "$tmpdir\/abkt/abkt_umi"; 46 | while() { 47 | chomp(); 48 | my @a = split("\t"); 49 | $abkt{$a[0]} = "$a[1]\t$a[2]\t$a[3]\t$a[4]"; 50 | } 51 | close FP; 52 | 53 | my @abktfile; 54 | my @cell; 55 | open FP, "$metafile"; 56 | while() { 57 | chomp(); 58 | my @a = split("\t"); 59 | if($qout{$a[0]} == 1) { 60 | if($a[1] eq $cdt0) { 61 | @condition = (@condition, 0); 62 | @cell = (@cell, $a[0]); 63 | @abktfile = (@abktfile, $abkt{$a[0]}); 64 | } 65 | if($a[1] eq $cdt1) { 66 | @condition = (@condition, 1); 67 | @cell = (@cell, $a[0]); 68 | @abktfile = (@abktfile, $abkt{$a[0]}); 69 | } 70 | } 71 | } 72 | close FP; 73 | 74 | open OUT1, ">$tmpdir\/das_script/data/condition_$gpp"; 75 | foreach my $i (0..$#condition) { 76 | print OUT1 "$condition[$i]\n"; 77 | } 78 | close OUT1; 79 | 80 | open OUT3, ">$tmpdir\/das_script/data/abktfile_$gpp"; 81 | foreach my $i (0..$#abktfile) { 82 | print OUT3 "$abktfile[$i]\n"; 83 | } 84 | close OUT3; 85 | 86 | my %prop; 87 | my %qgroup; 88 | open FP, "$gpinfofile"; 89 | while() { 90 | chomp(); 91 | my @a = split("\t"); 92 | my $gp = $a[0].":".$a[1]; 93 | $prop{$gp}{$a[3]} = log($a[4]); 94 | $qgroup{$gp} = 1; 95 | } 96 | close FP; 97 | 98 | my %qgene; 99 | my %mean; 100 | my %bursting; 101 | open FP, 
"$tmpdir\/gene_script/geneleveltheta_umi"; 102 | while() { 103 | chomp(); 104 | next if /theta_rd/; 105 | my @a = split("\t"); 106 | next if ($a[0] ne $cdt0 && $a[0] ne $cdt1); 107 | $qgene{$a[0]}{$a[1]} = 1; 108 | $mean{$a[0]}{$a[1]} = $a[2]; 109 | } 110 | close FP; 111 | 112 | open FP, "$tmpdir\/gene_script/genelevelbursting_umi"; 113 | while() { 114 | chomp(); 115 | my @a = split("\t"); 116 | $bursting{$a[0]}{$a[1]} = $a[2]; 117 | } 118 | close FP; 119 | 120 | 121 | open OUT2, ">$tmpdir\/das_script/data/countdata_$gpp"; 122 | foreach my $gp (keys %count) { 123 | my @a = split(":",$gp); 124 | next if !($qgene{$cdt0}{$a[0]} == 1); 125 | next if !($qgene{$cdt1}{$a[0]} == 1); 126 | next if !($qgroup{$gp} == 1); 127 | #print "yes\n"; 128 | my $genect; 129 | my $plus; 130 | my $minus; 131 | my $tmpquality = 1; 132 | if($quality1{$gp}{"+"} > 50 || $quality1{$gp}{"-"} > 50) { 133 | $tmpquality = 0 if !($quality2{$gp}{"+"} > 15 || $quality2{$gp}{"-"} > 15); 134 | } else { 135 | $tmpquality = 0 if !($quality2{$gp}{"+"} > 10 && $quality2{$gp}{"-"} > 10); 136 | } 137 | #next if $tmpquality == 0; 138 | foreach my $i (0..$#cell) { 139 | $genect = $genect."100," if $i < $#cell; 140 | $genect = $genect."100" if $i == $#cell; 141 | if(!($count{$gp}{$cell[$i]}{"+"}>0)) { 142 | $count{$gp}{$cell[$i]}{"+"} = 0; 143 | } 144 | if(!($count{$gp}{$cell[$i]}{"-"}>0)) { 145 | $count{$gp}{$cell[$i]}{"-"} = 0; 146 | } 147 | $plus = $plus.$count{$gp}{$cell[$i]}{"+"}."," if $i < $#cell; 148 | $plus = $plus.$count{$gp}{$cell[$i]}{"+"} if $i == $#cell; 149 | $minus = $minus.$count{$gp}{$cell[$i]}{"-"}."," if $i < $#cell; 150 | $minus = $minus.$count{$gp}{$cell[$i]}{"-"} if $i == $#cell; 151 | } 152 | #print OUT2 "$gp\t$genect\t$plus\t$minus\t$mean{$cdt0}{$a[0]},1,$mean{$cdt1}{$a[0]},1,$prop{$gp}{\"+\"},$prop{$gp}{\"-\"}\t+\\-\n"; 153 | my $tmpgp = $cdt0."_".$cdt1; 154 | print OUT2 "$gp\t$genect\t$plus\t$minus\t$mean{$cdt0}{$a[0]},1,$mean{$cdt1}{$a[0]},1,$bursting{$tmpgp}{$a[0]}\t+\\-\n"; 155 | 
} 156 | close OUT2; 157 | -------------------------------------------------------------------------------- /bin/getgenelevelcount.pl: -------------------------------------------------------------------------------- 1 | my $cdt0 = $ARGV[0]; 2 | my $cdt1 = $ARGV[1]; 3 | my @condition; 4 | my %qout; 5 | my $abktfile = $ARGV[2]; 6 | my $metafile = $ARGV[3]; 7 | my $tmpdir = $ARGV[4]; 8 | my $tmpdirgene = $ARGV[5]; 9 | open FP, "$abktfile"; 10 | while() { 11 | chomp(); 12 | my @a = split("\t"); 13 | $qout{$a[0]} = 1; 14 | } 15 | close FP; 16 | 17 | open FP, "$metafile"; 18 | while() { 19 | chomp(); 20 | my @a = split("\t"); 21 | if($qout{$a[0]} == 1) { 22 | #print "$a[0]\n"; 23 | open FP1, "$tmpdir\/count_$a[0]\.out"; 24 | while() { 25 | chomp(); 26 | my @b = split("\t"); 27 | #$count{$b[0]}{$a[0]} = $b[1] if $qgene{$b[0]} == 1; 28 | $count{$b[0]}{$a[0]} = $b[1]; 29 | } 30 | close FP1; 31 | } 32 | } 33 | close FP; 34 | 35 | my %abkt; 36 | open FP, "$abktfile"; 37 | while() { 38 | chomp(); 39 | my @a = split("\t"); 40 | $abkt{$a[0]} = "$a[1]\t$a[2]\t$a[3]\t$a[4]"; 41 | } 42 | close FP; 43 | 44 | my @abktfile; 45 | my @cell; 46 | open FP, "$metafile"; 47 | while() { 48 | chomp(); 49 | my @a = split("\t"); 50 | if($qout{$a[0]} == 1) { 51 | if($a[1] eq $cdt0) { 52 | @condition = (@condition, 0); 53 | @cell = (@cell, $a[0]); 54 | @abktfile = (@abktfile, $abkt{$a[0]}); 55 | } 56 | if($a[1] eq $cdt1) { 57 | @condition = (@condition, 1); 58 | @cell = (@cell, $a[0]); 59 | @abktfile = (@abktfile, $abkt{$a[0]}); 60 | } 61 | } 62 | } 63 | close FP; 64 | 65 | open OUT1, ">$tmpdirgene\/condition_$cdt0\_$cdt1"; 66 | foreach my $i (0..$#condition) { 67 | print OUT1 "$condition[$i]\n"; 68 | } 69 | close OUT1; 70 | 71 | open OUT2, ">$tmpdirgene\/countdata_$cdt0\_$cdt1"; 72 | foreach my $gene (keys %count) { 73 | print OUT2 "$gene\t"; 74 | foreach my $i (0..$#cell) { 75 | if($i < $#cell) { 76 | if($count{$gene}{$cell[$i]} > 0) { 77 | print OUT2 "$count{$gene}{$cell[$i]}\t"; 78 | } else { 
#!/usr/bin/perl
# getgeneleveltheta_umi.pl
#
# Usage: getgeneleveltheta_umi.pl <compare_dir>
#
# Reads  <compare_dir>/comparegroup            (tab-separated: typeA, typeB)
#        <compare_dir>/celltypes               (one cell type per line)
#        <compare_dir>/gene_script/data/outgene_<typeA>_<typeB>
# Writes <compare_dir>/gene_script/geneleveltheta_umi     (type, gene, theta)
#        <compare_dir>/gene_script/genelevelbursting_umi  (pair, gene, value)
#
# A row of an outgene file is used only when column 1 is "True" and neither
# column 5 nor column 6 is "nan"; column 5 is stored for typeA, column 6 for
# typeB and column 8 for the pair.
#
# This revision restores the file reads inside the while loops (without a
# readline operator `while ()` never terminates), checks every open, uses
# lexical filehandles, and sorts genes so that the output is reproducible.
use strict;
use warnings;

my $comparedir = $ARGV[0];
die "Usage: getgeneleveltheta_umi.pl <compare_dir>\n" unless defined $comparedir;

my $genedir = "$comparedir/gene_script";
my $datadir = "$genedir/data";

my %theta;       # cell type -> gene -> value of column 5 / 6
my %bursting;    # "typeA_typeB" -> gene -> value of column 8
my @pairs;       # [typeA, typeB] in comparegroup order

# ---- comparison pairs ------------------------------------------------------
open(my $cmp_fh, '<', "$comparedir/comparegroup")
    or die "Cannot open $comparedir/comparegroup: $!\n";
while (my $line = <$cmp_fh>) {
    chomp $line;
    my @col = split /\t/, $line;
    next if @col < 2;    # a pair needs two names; blank lines produced nothing before either
    push @pairs, [ $col[0], $col[1] ];
}
close $cmp_fh;

# ---- per-pair gene-level estimates ------------------------------------------
for my $pair (@pairs) {
    my ($type0, $type1) = @$pair;
    my $gp   = $type0 . "_" . $type1;
    my $path = "$datadir/outgene_$gp";

    my $fh;
    unless (open($fh, '<', $path)) {
        # A missing result file used to be skipped silently; keep skipping,
        # but say so.
        warn "Skipping $gp: cannot open $path: $!\n";
        next;
    }
    while (my $line = <$fh>) {
        chomp $line;
        my @col = split /\t/, $line;
        next if @col < 7;    # columns 1, 5 and 6 are all required
        next unless $col[1] eq "True" && $col[5] ne "nan" && $col[6] ne "nan";
        $theta{$type0}{ $col[0] } = $col[5];
        $theta{$type1}{ $col[0] } = $col[6];
        $bursting{$gp}{ $col[0] } = defined $col[8] ? $col[8] : "";
    }
    close $fh;
}

# ---- theta per cell type ----------------------------------------------------
open(my $theta_out, '>', "$genedir/geneleveltheta_umi")
    or die "Cannot write $genedir/geneleveltheta_umi: $!\n";
open(my $type_fh, '<', "$comparedir/celltypes")
    or die "Cannot open $comparedir/celltypes: $!\n";
while (my $type = <$type_fh>) {
    chomp $type;
    my $by_gene = $theta{$type} || {};
    for my $gene (sort keys %$by_gene) {
        print {$theta_out} "$type\t$gene\t$by_gene->{$gene}\n";
    }
}
close $type_fh;
close $theta_out or die "Cannot close $genedir/geneleveltheta_umi: $!\n";

# ---- bursting value per comparison pair --------------------------------------
open(my $burst_out, '>', "$genedir/genelevelbursting_umi")
    or die "Cannot write $genedir/genelevelbursting_umi: $!\n";
for my $pair (@pairs) {
    my $gp      = $pair->[0] . "_" . $pair->[1];
    my $by_gene = $bursting{$gp} || {};
    for my $gene (sort keys %$by_gene) {
        print {$burst_out} "$gp\t$gene\t$by_gene->{$gene}\n";
    }
}
close $burst_out or die "Cannot close $genedir/genelevelbursting_umi: $!\n";
#!/usr/bin/perl
# getgroupinfo.pl
#
# Usage: getgroupinfo.pl <refgene file>  > <gpinfo file>
#
# Groups the exons of every gene by the set of isoforms that contain them
# and prints, on STDOUT, two tab-separated lines ("+" then "-") per group:
#
#   gene, group weight, status, sign, length fraction, isoform indices, exons
#
# Input rows are tab-separated.  The first row of a gene is its header (last
# column = comma-separated isoform names); every later row of that gene is
# an exon (column 1 chromosome, columns 3/4 start/end, last column =
# comma-separated 0/1 flags, one per isoform).  Only exon rows whose flag
# column contains "0," form a group.
#
# The group weight is sum(2**i) over the isoforms i flagged 1.  A weight of
# at least half of the all-isoforms weight is replaced by its complement and
# the exon is filed on the "-" side (0); otherwise on the "+" side (1).
#
# NOTE(review): the all-isoforms weight is taken from the most recent header
# row, so the rows of a gene are assumed to be contiguous -- confirm against
# the step that writes the refgene file.
#
# This revision restores the file read inside the while loop (without a
# readline operator `while ()` never terminates), checks the open, skips
# blank lines (which used to reset the weight state in the middle of a
# gene), guards the division by the gene length, and prints genes and
# groups in sorted order so that the output is reproducible.
use strict;
use warnings;

my $input = $ARGV[0];
die "Usage: getgroupinfo.pl <refgene file>\n" unless defined $input;

my %rows_seen;       # gene -> number of rows read so far
my %gene_length;     # gene -> summed exon length
my %group;           # gene -> weight -> side (1|0) -> { len, with, without, exons }
my $total_weight = 0;    # weight of "all isoforms" for the current gene

open(my $in, '<', $input) or die "Cannot open $input: $!\n";
while (my $line = <$in>) {
    chomp $line;
    my @col = split /\t/, $line;
    next unless @col;

    my $gene       = $col[0];
    my $last_field = $col[-1];

    if (++$rows_seen{$gene} == 1) {
        # Header row: one weight bit per isoform name.
        my @isoforms = split /,/, $last_field;
        $total_weight = 0;
        $total_weight += 2**$_ for 0 .. $#isoforms;
        next;
    }

    my ($chrom, $start, $end) = @col[ 1, 3, 4 ];
    my $exon_length = $end - $start + 1;
    $gene_length{$gene} += $exon_length;

    # Exons present in every isoform carry no splicing information.
    next unless $last_field =~ /0,/;

    my @flags  = split /,/, $last_field;
    my $weight = 0;
    my (@with, @without);    # isoform indices flagged 1 / flagged 0
    for my $i (0 .. $#flags) {
        if ($flags[$i] == 1) {
            $weight += 2**$i;
            push @with, $i;
        }
        elsif ($flags[$i] == 0) {
            push @without, $i;
        }
    }

    my $side = 1;
    if ($weight >= $total_weight / 2) {
        $weight = $total_weight - $weight;
        $side   = 0;
    }

    my $rec = ($group{$gene}{$weight}{$side} ||= { len => 0, exons => "" });
    # Index lists keep the trailing comma of the original output format; the
    # lists of the latest exon of a group win, as before.
    $rec->{with}    = join("", map { "$_," } @with);
    $rec->{without} = join("", map { "$_," } @without);
    $rec->{len}   += $exon_length;
    $rec->{exons} .= "$chrom,$start,$end;";
}
close $in;

for my $gene (sort keys %group) {
    my $gene_len = $gene_length{$gene};
    unless ($gene_len) {
        warn "Skipping $gene: total exon length is zero\n";
        next;
    }

    for my $weight (sort { $a <=> $b } keys %{ $group{$gene} }) {
        my $plus  = $group{$gene}{$weight}{1} || { len => 0, with => "", without => "", exons => "" };
        my $minus = $group{$gene}{$weight}{0} || { len => 0, with => "", without => "", exons => "" };

        my $frac_plus  = $plus->{len} / $gene_len;
        my $frac_minus = $minus->{len} / $gene_len;

        if ($plus->{len} > 0 && $minus->{len} > 0) {
            print "$gene\t$weight\tboth\t+\t$frac_plus\t$plus->{without}\t$plus->{exons}\n";
            print "$gene\t$weight\tboth\t-\t$frac_minus\t$minus->{without}\t$minus->{exons}\n";
            next;
        }

        # Only one side has exons: the other side's line reuses this side's
        # fraction, lists the isoforms flagged 1, and has "NA" for exons.
        if ($plus->{len} > 0) {
            print "$gene\t$weight\tplus\t+\t$frac_plus\t$plus->{without}\t$plus->{exons}\n";
        }
        else {
            print "$gene\t$weight\tminus\t+\t$frac_minus\t$minus->{with}\tNA\n";
        }
        if ($minus->{len} > 0) {
            print "$gene\t$weight\tminus\t-\t$frac_minus\t$minus->{without}\t$minus->{exons}\n";
        }
        else {
            print "$gene\t$weight\tplus\t-\t$frac_plus\t$plus->{with}\tNA\n";
        }
    }
}
# Evaluate one of two closed-form expressions in mu_cg for a single
# (cell, gene) observation.
#
# Parameters
#   abkt_c   -- indexable; elements 0..3 are read as alpha, beta, kappa, tau.
#   params_g -- indexable; elements 0..2 are read as theta_g, sigma_g, p_g.
#   mu_cg    -- point at which the expression is evaluated.
#   y_cg     -- observed value; only tested against zero here.
#
# Returns
#   +inf when sigma_g == 0 (both expressions divide by powers of sigma_g);
#   otherwise the value of the expression selected by ``y_cg == 0``.
#
# NOTE(review): going by the function name this is presumably the second
# derivative, with respect to mu_cg, of a per-observation log-likelihood
# term, and the long expression looks machine-generated from a symbolic
# derivation -- TODO confirm against the model write-up before editing any
# term by hand.
cdef double second_order_derivative(abkt_c, params_g, mu_cg, y_cg):
    cdef double alpha = abkt_c[0]
    cdef double beta = abkt_c[1]
    cdef double kappa = abkt_c[2]
    cdef double tau = abkt_c[3]
    cdef double theta_g = params_g[0]
    cdef double sigma_g = params_g[1]
    cdef double p_g = params_g[2]
    cdef double cg

    # Degenerate sigma: avoid dividing by zero below.
    if sigma_g == 0:
        return float('inf')

    if y_cg == 0:
        # Zero observation: expression involving p_g, exp(tau * mu_cg + kappa),
        # exp(-exp(beta * mu_cg + alpha)) and exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2).
        cg = ((2 * p_g / (1 + exp(tau * mu_cg + kappa)) ** 3 * tau ** 2 * exp(tau * mu_cg + kappa) ** 2 - p_g / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau ** 2 * exp(tau * mu_cg + kappa) + 2 * p_g / (1 + exp(-tau * mu_cg - kappa)) ** 3 * exp(-exp(beta * mu_cg + alpha)) * tau ** 2 * exp(-tau * mu_cg - kappa) ** 2 - 2 * p_g / (1 + exp(-tau * mu_cg - kappa)) ** 2 * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - p_g / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau ** 2 * exp(-tau * mu_cg - kappa) - p_g / (1 + exp(-tau * mu_cg - kappa)) * beta ** 2 * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * beta ** 2 * exp(beta * mu_cg + alpha) ** 2 * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) / 2 - 2 * (-p_g / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau * exp(tau * mu_cg + kappa) + p_g / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - p_g / (1 + exp(-tau * mu_cg - kappa)) * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * (mu_cg - theta_g) / sigma_g ** 2 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) - (1 - p_g + p_g / (1 + exp(tau * mu_cg + kappa)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) / sigma_g ** 2 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) + 2 * (1 - p_g + p_g / (1 + exp(tau * mu_cg + kappa)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * (mu_cg - theta_g) ** 2 / sigma_g ** 4 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2)) / (1 - p_g + p_g / (1 + exp(tau * mu_cg + kappa)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * sqrt(pi * sigma_g ** 2) / exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) - ((-p_g / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau * exp(tau * mu_cg + kappa) + p_g / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - p_g / (1 + exp(-tau * mu_cg - kappa)) * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) / 2 - (1 - p_g + p_g / (1 + exp(tau * mu_cg + kappa)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * (mu_cg - theta_g) / sigma_g ** 2 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2)) / (1 - p_g + p_g / (1 + exp(tau * mu_cg + kappa)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) ** 2 * sqrt(2) * sqrt(pi * sigma_g ** 2) / exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) * (-p_g / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau * exp(tau * mu_cg + kappa) + p_g / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - p_g / (1 + exp(-tau * mu_cg - kappa)) * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha))) + 2 * ((-p_g / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau * exp(tau * mu_cg + kappa) + p_g / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - p_g / (1 + exp(-tau * mu_cg - kappa)) * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) / 2 - (1 - p_g + p_g / (1 + exp(tau * mu_cg + kappa)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * (mu_cg - theta_g) / sigma_g ** 2 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2)) / (1 - p_g + p_g / (1 + exp(tau * mu_cg + kappa)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * sqrt(pi * sigma_g ** 2) / exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) * (mu_cg - theta_g) / sigma_g ** 2
    else:
        # Non-zero observation: shorter expression that does not use p_g or theta_g.
        cg = -(2 * beta ** 2 * exp(beta * mu_cg - tau * mu_cg + alpha - kappa) * sigma_g ** 2 + exp(beta * mu_cg - 2 * tau * mu_cg + alpha - 2 * kappa) * beta ** 2 * sigma_g ** 2 + tau ** 2 * exp(-tau * mu_cg - kappa) * sigma_g ** 2 + beta ** 2 * exp(beta * mu_cg + alpha) * sigma_g ** 2 + 2 * exp(-2 * tau * mu_cg - 2 * kappa) + 4 * exp(-tau * mu_cg - kappa) + 2) / sigma_g ** 2 / (1 + exp(-tau * mu_cg - kappa)) ** 2
    return cg
+ alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) / sigma_g ** 2 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) + 2 * (1 / (1 + exp(tau * mu_cg + kappa)) + 1 / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * (mu_cg - theta_g) ** 2 / sigma_g ** 4 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2)) / (1 / (1 + exp(tau * mu_cg + kappa)) + 1 / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * sqrt(pi * sigma_g ** 2) / exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) - ((-1 / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau * exp(tau * mu_cg + kappa) + 1 / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - 1 / (1 + exp(-tau * mu_cg - kappa)) * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) / 2 - (1 / (1 + exp(tau * mu_cg + kappa)) + 1 / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * (mu_cg - theta_g) / sigma_g ** 2 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2)) / (1 / (1 + exp(tau * mu_cg + kappa)) + 1 / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) ** 2 * sqrt(2) * sqrt(pi * sigma_g ** 2) / exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) * (-1 / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau * exp(tau * mu_cg + kappa) + 1 / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - 1 / (1 + exp(-tau * mu_cg - kappa)) * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha))) + 2 * ((-1 / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau * exp(tau * mu_cg + kappa) + 1 / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - 1 / (1 + exp(-tau * mu_cg - kappa)) * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 
2) ** (-0.5) * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) / 2 - (1 / (1 + exp(tau * mu_cg + kappa)) + 1 / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * (mu_cg - theta_g) / sigma_g ** 2 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2)) / (1 / (1 + exp(tau * mu_cg + kappa)) + 1 / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * sqrt(pi * sigma_g ** 2) / exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) * (mu_cg - theta_g) / sigma_g ** 2 48 | else: 49 | cg = -(2 * beta ** 2 * exp(beta * mu_cg - tau * mu_cg + alpha - kappa) * sigma_g ** 2 + exp(beta * mu_cg - 2 * tau * mu_cg + alpha - 2 * kappa) * beta ** 2 * sigma_g ** 2 + tau ** 2 * exp(-tau * mu_cg - kappa) * sigma_g ** 2 + beta ** 2 * exp(beta * mu_cg + alpha) * sigma_g ** 2 + 2 * exp(-2 * tau * mu_cg - 2 * kappa) + 4 * exp(-tau * mu_cg - kappa) + 2) / sigma_g ** 2 / (1 + exp(-tau * mu_cg - kappa)) ** 2 50 | return cg 51 | 52 | 53 | cdef double log_dpois0(double log_mean): 54 | return -exp(log_mean) 55 | 56 | 57 | cdef double log_dpois(double count, double log_mean): 58 | return count * log_mean - lgamma(long(count + 1.5)) - exp(log_mean) 59 | 60 | 61 | cdef double log_expit(double x): 62 | return -log(1.0+exp(-x)) 63 | 64 | 65 | cdef double log_sum_exp(double a, double b, double c): 66 | cdef double max_el = max(a, b, c) 67 | return max_el + log(exp(a - max_el) + exp(b - max_el) + exp(c - max_el)) 68 | 69 | 70 | cdef double log_sum_exp2(double a, double b): 71 | cdef double max_el = max(a, b) 72 | return max_el + log(exp(a - max_el) + exp(b - max_el)) 73 | 74 | cdef double log_dnorm(double x, double mu, double sigma): 75 | if sigma == 0.0: 76 | if x == mu: 77 | return 0.0 78 | else: 79 | return -float('inf') 80 | else: 81 | return -0.918938533204672669540968854562379419803619384765625 - log(sigma) - (x-mu) * (x-mu) / sigma / sigma / 2 82 | 83 | 84 | #### Beta distrubtion approach: NOT Completed 
#############################################################################
# Log-density of a Beta(alpha, beta) distribution evaluated at x.
cdef double log_dbeta(double x, double alpha, double beta):
    return (alpha-1) * log(x) + (beta-1) * log(1-x) + lgamma(alpha+beta) - lgamma(alpha) - lgamma(beta)


# Negative complete-data log-likelihood for one cell given mu_cg and psi_ce.
#   params_e : [alpha_e, beta_e], the Beta parameters applied to psi_ce
#   abkt_c   : only entries 0 and 1 (a_c, b_c) are read
#   y_ce1, y_ce0 : the two counts, scored by log_dpois with log-means
#                  a_c + b_c * mu_cg * psi_ce and a_c + b_c * mu_cg * (1 - psi_ce)
# params_g is accepted so the signature lines up with the sibling functions,
# but it is not read.
cdef double neg_log_single_complete_likelihood_nob_psi(double mu_cg, double psi_ce, params_e, params_g, abkt_c, y_ce1, y_ce0):
    cdef double alpha_e = params_e[0]
    cdef double beta_e = params_e[1]
    cdef double a_c = abkt_c[0]
    cdef double b_c = abkt_c[1]

    return -(log_dpois(y_ce1, a_c + b_c * mu_cg * psi_ce)
             + log_dpois(y_ce0, a_c + b_c * mu_cg * (1-psi_ce))
             + log_dbeta(x=psi_ce, alpha=alpha_e, beta=beta_e))


# Same objective as neg_log_single_complete_likelihood_nob_psi, but taking the
# unconstrained value param_ce first (psi_ce = expit(param_ce)) so that it can be
# handed to minimize_scalar, which optimises over the first positional argument.
cdef double neg_log_single_complete_likelihood_nob_psi_forminimize(double param_ce, double mu_cg, params_e, params_g, abkt_c, y_ce1, y_ce0):
    return neg_log_single_complete_likelihood_nob_psi(mu_cg, expit(param_ce), params_e, params_g, abkt_c, y_ce1, y_ce0)


# Integrand for quad over psi_ce: the complete likelihood multiplied by
# exp(scale_factor_ce).
# The former `print "haha", ...` debug statement was removed: it is Python-2-only
# syntax, ran on every integrand evaluation and evaluated the likelihood twice.
cdef double single_complete_likelihood_nob_psi(double psi_ce, double mu_cg, params_e, params_g, abkt_c, y_ce1, y_ce0, double scale_factor_ce):
    return exp(-neg_log_single_complete_likelihood_nob_psi(mu_cg, psi_ce, params_e, params_g, abkt_c, y_ce1, y_ce0) + scale_factor_ce)


# Negative log marginal likelihood with psi_ce integrated over [0, 1].
# Returns NaN when the Brent minimisation reports failure.
# NOTE(review): mu_cg is fixed to the literal 9 in both the minimisation and the
# integration, and y_cg is never read -- consistent with the "NOT Completed"
# banner on this section; confirm before relying on this function.
cdef double neg_log_single_marginal_likelihood_nob_psi(params_e, params_g, abkt_c, y_ce1, y_ce0, y_cg):
    cdef double min_val
    # first get the min of the neg log-likelihood (brent method); its value is
    # used as a scale factor so the integrand does not underflow
    min_neg_log = minimize_scalar(neg_log_single_complete_likelihood_nob_psi_forminimize, args=(9, params_e, params_g, abkt_c, y_ce1, y_ce0), method='brent')

    if not min_neg_log.success:
        return float('nan')

    min_val = min_neg_log.fun
    integral = quad(single_complete_likelihood_nob_psi, 0, 1, args=(9, params_e, params_g, abkt_c, y_ce1, y_ce0, min_val))
    # undo the scale factor on the log scale
    return -(log(integral[0]) - min_val)
#############################################################################################################################

# Negative complete-data log-likelihood for one cell, two-parameter model
# (params_g = [theta_g, sigma_g], no p_g term).
#   abkt_c = [a_c, b_c, k_c, t_c]
# For y_cg == 0 the two zero-generating paths are combined with log_sum_exp2:
#   log_expit(-(k_c + t_c * mu_cg))                                    and
#   log_expit(k_c + t_c * mu_cg) + log_dpois0(a_c + b_c * mu_cg).
# A log_dnorm(mu_cg; theta_g, sigma_g) term is added in both branches.
cdef double neg_log_single_complete_likelihood_nob(double mu_cg, params_g, abkt_c, y_cg):
    cdef double theta_g = params_g[0]
    cdef double sigma_g = params_g[1]
    cdef double a_c = abkt_c[0]
    cdef double b_c = abkt_c[1]
    cdef double k_c = abkt_c[2]
    cdef double t_c = abkt_c[3]
    cdef double log_prior = log_dnorm(x=mu_cg, mu=theta_g, sigma=sigma_g)
    if y_cg==0:
        return -(log_sum_exp2(log_expit(-(k_c + t_c * mu_cg)), log_expit(k_c + t_c * mu_cg) + log_dpois0(a_c + b_c * mu_cg)) + log_prior)
    return -(log_expit(k_c + t_c * mu_cg) + log_dpois(y_cg, a_c + b_c * mu_cg) + log_prior)


# Integrand for quad over mu_cg: complete likelihood times exp(scale_factor_cg).
cdef double single_complete_likelihood_nob(double mu_cg, params_g, abkt_c, y_cg, double scale_factor_cg):
    return exp(-neg_log_single_complete_likelihood_nob(mu_cg, params_g, abkt_c, y_cg) + scale_factor_cg)


# Negative log marginal likelihood for one cell (two-parameter model), mu_cg
# integrated out numerically.  Returns NaN when the minimisation fails.
cdef double neg_log_single_marginal_likelihood_nob(params_g, abkt_c, y_cg):
    cdef double min_val
    cdef double hessian
    cdef double half_width
    cdef double lower_b
    cdef double upper_b
    # first get the min of the neg log-likelihood (brent method)
    min_neg_log = minimize_scalar(neg_log_single_complete_likelihood_nob, args=(params_g, abkt_c, y_cg), method='brent')
    if not min_neg_log.success:
        return float('nan')

    arg_min = min_neg_log.x
    min_val = min_neg_log.fun
    # integrate over arg_min +/- 20 / sqrt(|second derivative at the optimum|)
    hessian = second_order_derivative_nob(abkt_c, params_g, arg_min, y_cg)
    half_width = 20 / sqrt(abs(hessian))
    lower_b = arg_min - half_width
    upper_b = arg_min + half_width
    integral = quad(single_complete_likelihood_nob, lower_b, upper_b, args=(params_g, abkt_c, y_cg, min_val))
    return -(log(integral[0]) - min_val)


# Sum of per-cell negative log marginal likelihoods (two-parameter model).
#   real_params_g = [theta_g, log(sigma_g)];  abkt[i,:] and y_g[i] belong to cell i.
def neg_log_sum_marginal_likelihood_nob(real_params_g, abkt, y_g):
    params_g = [real_params_g[0], exp(real_params_g[1])]
    cdef double sum_marginal_likelihood = 0
    for i in range(len(y_g)):
        sum_marginal_likelihood += neg_log_single_marginal_likelihood_nob(params_g, abkt[i,:], y_g[i])
    return sum_marginal_likelihood


# Negative complete-data log-likelihood for one cell, three-parameter model
# (params_g = [theta_g, sigma_g, p_g]).  For y_cg == 0 three paths are combined
# with log_sum_exp: log(1-p_g), log(p_g) + log_expit(-(k_c + t_c * mu_cg)) and
# log(p_g) + log_expit(k_c + t_c * mu_cg) + log_dpois0(a_c + b_c * mu_cg).
cdef double neg_log_single_complete_likelihood(double mu_cg, params_g, abkt_c, long y_cg):
    cdef double theta_g = params_g[0]
    cdef double sigma_g = params_g[1]
    cdef double p_g = params_g[2]
    cdef double a_c = abkt_c[0]
    cdef double b_c = abkt_c[1]
    cdef double k_c = abkt_c[2]
    cdef double t_c = abkt_c[3]
    cdef double log_prior = log_dnorm(x=mu_cg, mu=theta_g, sigma=sigma_g)
    if y_cg==0:
        return -(log_sum_exp(log(1-p_g), log(p_g) + log_expit(-(k_c + t_c * mu_cg)), log(p_g) + log_expit(k_c + t_c * mu_cg) + log_dpois0(a_c + b_c * mu_cg)) + log_prior)
    return -(log(p_g) + log_expit(k_c + t_c * mu_cg) + log_dpois(y_cg, a_c + b_c * mu_cg) + log_prior)


# UMI variant of neg_log_single_complete_likelihood: only abkt_c[0] (a_c) is
# read, the count log-mean is a_c + mu_cg, and there is no log_expit term.
# (The unused locals b_c = k_c = t_c = 1 of the previous version were dropped.)
cdef double neg_log_single_complete_likelihood_umi(double mu_cg, params_g, abkt_c, long y_cg):
    cdef double theta_g = params_g[0]
    cdef double sigma_g = params_g[1]
    cdef double p_g = params_g[2]
    cdef double a_c = abkt_c[0]
    cdef double log_prior = log_dnorm(x=mu_cg, mu=theta_g, sigma=sigma_g)
    if y_cg==0:
        return -(log_sum_exp2(log(1-p_g), log(p_g) + log_dpois0(a_c + mu_cg)) + log_prior)
    return -(log(p_g) + log_dpois(y_cg, a_c + mu_cg) + log_prior)


# Integrands for quad over mu_cg (complete likelihood times exp(scale_factor_cg)).
cdef double single_complete_likelihood(double mu_cg, params_g, abkt_c, y_cg, scale_factor_cg):
    return exp(-neg_log_single_complete_likelihood(mu_cg, params_g, abkt_c, y_cg) + scale_factor_cg)

cdef double single_complete_likelihood_umi(double mu_cg, params_g, abkt_c, y_cg, scale_factor_cg):
    return exp(-neg_log_single_complete_likelihood_umi(mu_cg, params_g, abkt_c, y_cg) + scale_factor_cg)


# Negative log marginal likelihood for one cell (three-parameter model).
# Same scheme as neg_log_single_marginal_likelihood_nob; NaN on failed minimisation.
cdef double neg_log_single_marginal_likelihood(params_g, abkt_c, y_cg):
    cdef double min_val
    cdef double hessian
    cdef double half_width
    cdef double lower_b
    cdef double upper_b
    # first get the min of the neg log-likelihood (brent method)
    min_neg_log = minimize_scalar(neg_log_single_complete_likelihood, args=(params_g, abkt_c, y_cg), method='brent')
    if not min_neg_log.success:
        return float('nan')

    arg_min = min_neg_log.x
    min_val = min_neg_log.fun
    hessian = second_order_derivative(abkt_c, params_g, arg_min, y_cg)
    half_width = 20 / sqrt(abs(hessian))
    lower_b = arg_min - half_width
    upper_b = arg_min + half_width
    integral = quad(single_complete_likelihood, lower_b, upper_b, args=(params_g, abkt_c, y_cg, min_val))
    return -(log(integral[0]) - min_val)


# UMI counterpart of neg_log_single_marginal_likelihood.
# NOTE(review): the integration window is still sized with
# second_order_derivative (the same routine the non-UMI model uses), not with a
# UMI-specific second derivative -- confirm that this is intended.
cdef double neg_log_single_marginal_likelihood_umi(params_g, abkt_c, y_cg):
    cdef double min_val
    cdef double hessian
    cdef double half_width
    cdef double lower_b
    cdef double upper_b
    # first get the min of the neg log-likelihood (brent method)
    min_neg_log = minimize_scalar(neg_log_single_complete_likelihood_umi, args=(params_g, abkt_c, y_cg), method='brent')
    if not min_neg_log.success:
        return float('nan')

    arg_min = min_neg_log.x
    min_val = min_neg_log.fun
    hessian = second_order_derivative(abkt_c, params_g, arg_min, y_cg)
    half_width = 20 / sqrt(abs(hessian))
    lower_b = arg_min - half_width
    upper_b = arg_min + half_width
    integral = quad(single_complete_likelihood_umi, lower_b, upper_b, args=(params_g, abkt_c, y_cg, min_val))
    return -(log(integral[0]) - min_val)


# Sum over cells, shared parameters.
#   real_params_g = [theta_g, log(sigma_g), logit(p_g)]
def neg_log_sum_marginal_likelihood(real_params_g, abkt, y_g):
    params_g = [real_params_g[0], exp(real_params_g[1]), expit(real_params_g[2])]
    cdef double sum_marginal_likelihood = 0
    for i in range(len(y_g)):
        sum_marginal_likelihood += neg_log_single_marginal_likelihood(params_g, abkt[i,:], y_g[i])
    return sum_marginal_likelihood

# UMI counterpart of neg_log_sum_marginal_likelihood.
def neg_log_sum_marginal_likelihood_umi(real_params_g, abkt, y_g):
    params_g = [real_params_g[0], exp(real_params_g[1]), expit(real_params_g[2])]
    cdef double sum_marginal_likelihood = 0
    for i in range(len(y_g)):
        sum_marginal_likelihood += neg_log_single_marginal_likelihood_umi(params_g, abkt[i,:], y_g[i])
    return sum_marginal_likelihood

# Sum over cells with a group-specific p_g.  x_g[i] weights the two groups:
# (1 - x_g[i]) applies to the first value, x_g[i] to the second.
#   real_params_g = [theta_g, log(sigma_g), logit(p_g group 1), logit(p_g group 2)]
def neg_log_sum_marginal_likelihood_free_p(real_params_g, abkt, y_g, x_g):
    cdef double sum_marginal_likelihood = 0
    for i in range(len(y_g)):
        params_g = [real_params_g[0], exp(real_params_g[1]), expit(real_params_g[2]) * (1 - x_g[i]) + expit(real_params_g[3]) * x_g[i]]
        sum_marginal_likelihood += neg_log_single_marginal_likelihood(params_g, abkt[i,:], y_g[i])
    return sum_marginal_likelihood

# Sum over cells with a group-specific theta_g.
#   real_params_g = [theta_g group 1, theta_g group 2, log(sigma_g), logit(p_g)]
def neg_log_sum_marginal_likelihood_free_theta(real_params_g, abkt, y_g, x_g):
    cdef double sum_marginal_likelihood = 0
    for i in range(len(y_g)):
        params_g = [real_params_g[0] * (1-x_g[i]) + real_params_g[1] * x_g[i], exp(real_params_g[2]), expit(real_params_g[3])]
        sum_marginal_likelihood += neg_log_single_marginal_likelihood(params_g, abkt[i,:], y_g[i])
    return sum_marginal_likelihood

# UMI counterpart of neg_log_sum_marginal_likelihood_free_theta.
def neg_log_sum_marginal_likelihood_free_theta_umi(real_params_g, abkt, y_g, x_g):
    cdef double sum_marginal_likelihood = 0
    for i in range(len(y_g)):
        params_g = [real_params_g[0] * (1-x_g[i]) + real_params_g[1] * x_g[i], exp(real_params_g[2]), expit(real_params_g[3])]
        sum_marginal_likelihood += neg_log_single_marginal_likelihood_umi(params_g, abkt[i,:], y_g[i])
    return sum_marginal_likelihood

# Sum over cells with group-specific theta_g AND p_g.
#   real_params_g = [theta_g group 1, theta_g group 2, log(sigma_g),
#                    logit(p_g group 1), logit(p_g group 2)]
# The previous version only contained the group-1 term
# expit(real_params_g[3]) * (1 - x_g[i]), so p_g was exactly 0 -- and log(p_g)
# was -inf -- for every cell with x_g[i] == 1.  The group-2 term is now added,
# mirroring neg_log_sum_marginal_likelihood_free_p.
# NOTE(review): when only four parameters are supplied the old expression is
# kept unchanged so existing callers do not hit an IndexError; such callers
# should be updated to pass the fifth parameter.
def neg_log_sum_marginal_likelihood_free_both(real_params_g, abkt, y_g, x_g):
    cdef double sum_marginal_likelihood = 0
    has_group2_p = len(real_params_g) > 4
    for i in range(len(y_g)):
        p_g = expit(real_params_g[3]) * (1 - x_g[i])
        if has_group2_p:
            p_g = p_g + expit(real_params_g[4]) * x_g[i]
        params_g = [real_params_g[0] * (1-x_g[i]) + real_params_g[1] * x_g[i], exp(real_params_g[2]), p_g]
        sum_marginal_likelihood += neg_log_single_marginal_likelihood(params_g, abkt[i,:], y_g[i])
    return sum_marginal_likelihood

# testing Beta parameters (NOT completed)


# Sum over the cells that have at least one non-zero count (y_e1[i] > 0 or
# y_e0[i] > 0) of the psi-marginal negative log-likelihood.
#   real_params_e = [log(alpha_e) group 1, log(beta_e) group 1,
#                    log(alpha_e) group 2, log(beta_e) group 2]
#   est_params_g  = [value 0 group 1, value 1 group 1, value 0 group 2, value 1 group 2]
# Both are blended per cell with weights (1 - x_g[i]) and x_g[i].
def neg_log_sum_marginal_likelihood_psi_both(real_params_e, est_params_g, abkt, y_g, y_e1, y_e0, x_g):
    cdef double sum_marginal_likelihood = 0
    for i in range(len(y_g)):
        if y_e1[i] > 0 or y_e0[i] > 0:
            params_e = [exp(real_params_e[0]) * (1-x_g[i]) + exp(real_params_e[2]) * x_g[i], exp(real_params_e[1]) * (1-x_g[i]) + exp(real_params_e[3]) * x_g[i] ]
            params_g = [est_params_g[0] * (1-x_g[i]) + est_params_g[2] * x_g[i], est_params_g[1] * (1-x_g[i]) + est_params_g[3] * x_g[i]]
            sum_marginal_likelihood += neg_log_single_marginal_likelihood_nob_psi(params_e, params_g, abkt[i,:], y_e1[i], y_e0[i], y_g[i])
    return sum_marginal_likelihood


# testing
# PSI

# Shared worker for the four neg_log_sum_marginal_likelihood_psi_* entry points.
#   theta_e1, theta_e0 : (group 1, group 2) pairs of means for the two count series
#   sigma_e1, sigma_e0 : standard deviations, common to both groups
#   group_status       : "+\-" scores y_ce1 and y_ce0, "+" only y_ce1, "-" only
#                        y_ce0; any other value yields 0
#   umi, p_bursting    : when umi is true and group_status is "+\-", p_bursting is
#                        appended to each parameter list and the *_umi marginal is
#                        used; every other case uses the *_nob marginal
# NOTE(review): for UMI data the "+" and "-" cases therefore ignore p_bursting
# and go through the non-UMI *_nob marginal, exactly as the original code did --
# confirm whether that is intended.
# This is a plain `def` so that exceptions propagate to the caller exactly as
# they did when the loops lived in the public functions.
def _psi_sum_marginal_likelihood(abkt, y_ce1, y_ce0, x_g, theta_e1, theta_e0, sigma_e1, sigma_e0, group_status, p_bursting=None, umi=False):
    cdef double sum_marginal_likelihood = 0
    # "+\\-" is the three characters + \ - ; the doubled backslash avoids the
    # invalid escape sequence "\-" while keeping the same value.
    both = group_status == "+\\-"
    only_e1 = group_status == "+"
    only_e0 = group_status == "-"
    if not (both or only_e1 or only_e0):
        return sum_marginal_likelihood

    for i in range(len(y_ce1)):
        x = x_g[i]
        abkt_c = abkt[i,:]
        # per-cell blend of the two groups: weight (1 - x) on group 1, x on group 2
        params_e1 = [theta_e1[0] * (1-x) + (theta_e1[1] * x), sigma_e1 * (1-x) + sigma_e1 * x]
        params_e0 = [theta_e0[0] * (1-x) + (theta_e0[1] * x), sigma_e0 * (1-x) + sigma_e0 * x]
        if both and umi:
            sum_marginal_likelihood += neg_log_single_marginal_likelihood_umi(params_e1 + [p_bursting], abkt_c, y_ce1[i]) + neg_log_single_marginal_likelihood_umi(params_e0 + [p_bursting], abkt_c, y_ce0[i])
        elif both:
            sum_marginal_likelihood += neg_log_single_marginal_likelihood_nob(params_e1, abkt_c, y_ce1[i]) + neg_log_single_marginal_likelihood_nob(params_e0, abkt_c, y_ce0[i])
        elif only_e1:
            sum_marginal_likelihood += neg_log_single_marginal_likelihood_nob(params_e1, abkt_c, y_ce1[i])
        else:
            sum_marginal_likelihood += neg_log_single_marginal_likelihood_nob(params_e0, abkt_c, y_ce0[i])

    return sum_marginal_likelihood


# One psi shared by both groups.
#   real_params_g = [logit(psi), log(sigma_e1), log(sigma_e0)]
# theta_g1 / theta_g2 are split into theta * psi and theta - theta * psi.
# sigma_g1 and sigma_g2 are accepted but not used.
def neg_log_sum_marginal_likelihood_psi_equal_variance(real_params_g, abkt, y_ce1, y_ce0, x_g, theta_g1, theta_g2, sigma_g1, sigma_g2, group_status):
    cdef double psi_ce = expit(real_params_g[0])
    cdef double theta_e1_grp1 = theta_g1 * psi_ce
    cdef double theta_e0_grp1 = theta_g1 - theta_e1_grp1
    cdef double theta_e1_grp2 = theta_g2 * psi_ce
    cdef double theta_e0_grp2 = theta_g2 - theta_e1_grp2
    cdef double sigma_e1 = exp(real_params_g[1])
    cdef double sigma_e0 = exp(real_params_g[2])
    return _psi_sum_marginal_likelihood(abkt, y_ce1, y_ce0, x_g, (theta_e1_grp1, theta_e1_grp2), (theta_e0_grp1, theta_e0_grp2), sigma_e1, sigma_e0, group_status)


# One psi per group.
#   real_params_g = [logit(psi group 1), logit(psi group 2), log(sigma_e1), log(sigma_e0)]
# sigma_g1 and sigma_g2 are accepted but not used.
def neg_log_sum_marginal_likelihood_psi_free_equal_variance(real_params_g, abkt, y_ce1, y_ce0, x_g, theta_g1, theta_g2, sigma_g1, sigma_g2, group_status):
    cdef double psi_ce_grp1 = expit(real_params_g[0])
    cdef double psi_ce_grp2 = expit(real_params_g[1])
    cdef double theta_e1_grp1 = theta_g1 * psi_ce_grp1
    cdef double theta_e0_grp1 = theta_g1 - theta_e1_grp1
    cdef double theta_e1_grp2 = theta_g2 * psi_ce_grp2
    cdef double theta_e0_grp2 = theta_g2 - theta_e1_grp2
    cdef double sigma_e1 = exp(real_params_g[2])
    cdef double sigma_e0 = exp(real_params_g[3])
    return _psi_sum_marginal_likelihood(abkt, y_ce1, y_ce0, x_g, (theta_e1_grp1, theta_e1_grp2), (theta_e0_grp1, theta_e0_grp2), sigma_e1, sigma_e0, group_status)


### testing psi for UMI data

# UMI counterpart of neg_log_sum_marginal_likelihood_psi_equal_variance;
# p_bursting is passed through as the third model parameter for "+\-".
def neg_log_sum_marginal_likelihood_psi_equal_variance_umi(real_params_g, abkt, y_ce1, y_ce0, x_g, theta_g1, theta_g2, sigma_g1, sigma_g2, p_bursting, group_status):
    cdef double psi_ce = expit(real_params_g[0])
    cdef double theta_e1_grp1 = theta_g1 * psi_ce
    cdef double theta_e0_grp1 = theta_g1 - theta_e1_grp1
    cdef double theta_e1_grp2 = theta_g2 * psi_ce
    cdef double theta_e0_grp2 = theta_g2 - theta_e1_grp2
    cdef double sigma_e1 = exp(real_params_g[1])
    cdef double sigma_e0 = exp(real_params_g[2])
    return _psi_sum_marginal_likelihood(abkt, y_ce1, y_ce0, x_g, (theta_e1_grp1, theta_e1_grp2), (theta_e0_grp1, theta_e0_grp2), sigma_e1, sigma_e0, group_status, p_bursting, True)


# UMI counterpart of neg_log_sum_marginal_likelihood_psi_free_equal_variance.
def neg_log_sum_marginal_likelihood_psi_free_equal_variance_umi(real_params_g, abkt, y_ce1, y_ce0, x_g, theta_g1, theta_g2, sigma_g1, sigma_g2, p_bursting, group_status):
    cdef double psi_ce_grp1 = expit(real_params_g[0])
    cdef double psi_ce_grp2 = expit(real_params_g[1])
    cdef double theta_e1_grp1 = theta_g1 * psi_ce_grp1
    cdef double theta_e0_grp1 = theta_g1 - theta_e1_grp1
    cdef double theta_e1_grp2 = theta_g2 * psi_ce_grp2
    cdef double theta_e0_grp2 = theta_g2 - theta_e1_grp2
    cdef double sigma_e1 = exp(real_params_g[2])
    cdef double sigma_e0 = exp(real_params_g[3])
    return _psi_sum_marginal_likelihood(abkt, y_ce1, y_ce0, x_g, (theta_e1_grp1, theta_e1_grp2), (theta_e0_grp1, theta_e0_grp2), sigma_e1, sigma_e0, group_status, p_bursting, True)

--------------------------------------------------------------------------------
/bin/likelihoodumi.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/bin/likelihoodumi.so
--------------------------------------------------------------------------------
/bin/my_functions.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

from __future__ import print_function
from collections import defaultdict
import math, sys, os, re, time

# set up auto dictionary function:
# a defaultdict whose missing keys are themselves auto_dict() instances, so
# nested levels can be assigned without creating the intermediate dicts first
def auto_dict():
    return defaultdict(auto_dict)

# make a directory (no-op when it already exists)
# Uses os.makedirs instead of building a shell command from the path, so paths
# containing spaces or shell metacharacters are handled literally.  A failure
# is reported on stderr rather than raised, as the shell `mkdir` call did.
def mk_dir(path):
    if os.path.isdir(path):
        return
    try:
        os.makedirs(path)
    except OSError as err:
        print("Cannot create directory \'" + path + "\': " + str(err), file=sys.stderr)
    return

# parse arguments
# Reads "-flag value" pairs from sys.argv.
#   validArgList : accepted flags; the result follows this order
#   addAbsPath   : per-flag handling of the value
#                    0 = plain value (text after the last "/")
#                    1 = existing file   -> absolute path, must exist
#                    2 = existing directory -> absolute path, must be a directory
#                    3 = file to create  -> absolute path, existence not checked
#   warnMessage  : printed before exiting when not every flag received a value
# Exits via sys.exit() on an unknown "-..." argument, a missing file/directory,
# or an incomplete argument list.  Values containing "~" are expanded and
# written back to sys.argv.
def parse_argument(validArgList, addAbsPath, warnMessage):
    for arg in sys.argv[1:]:
        # startswith() is also safe for an empty-string argument
        if arg.startswith("-") and arg not in validArgList:
            print("Argument \'"+arg+"\' is invalid!")
            sys.exit()

    # assign arguments to a list
    outList = []
    for i, flag in enumerate(validArgList):
        for argIndex in range(1, len(sys.argv)):
            if sys.argv[argIndex] != flag:
                continue
            valueIndex = argIndex + 1
            if valueIndex >= len(sys.argv):
                # flag is the last argument and has no value: leave it
                # unassigned so the length check below reports warnMessage
                continue
            if "~" in sys.argv[valueIndex]:
                sys.argv[valueIndex] = os.path.expanduser(sys.argv[valueIndex])
            value = sys.argv[valueIndex]
            fileAbsPath = os.path.dirname(os.path.abspath(value))
            baseName = value.split("/")[-1]
            mode = addAbsPath[i]
            if mode == 1: # target file
                fileTmp = fileAbsPath + "/" + baseName
                if not os.path.exists(fileTmp):
                    print(fileTmp+" does not exist!")
                    sys.exit()
            elif mode == 3: # create target file
                fileTmp = fileAbsPath + "/" + baseName
            elif mode == 0: # value
                fileTmp = baseName
            elif mode == 2: # target directory
                fileTmp = os.path.abspath(value)
                if not os.path.isdir(fileTmp):
                    print(fileTmp+" does not exist!")
                    sys.exit()
            else:
                # unknown mode: the "/"-split list is passed through unchanged
                fileTmp = value.split("/")

            outList.append(fileTmp)

    if len(outList) != len(validArgList):
        print(warnMessage)
        sys.exit()
    return outList

# check modules ### NOT WORKING!!
60 | import imp 61 | def check_module_exists(name): 62 | try: 63 | imp.find_module(name) 64 | except ImportError: 65 | return False 66 | return True 67 | 68 | def check_module(module): 69 | x = check_module_exists(module) 70 | if x: 71 | print("Module \'" + module + "\' is installed.") 72 | if not x: 73 | print("Module \'" + module + "\' is NOT installed!") 74 | return 75 | 76 | # check program 77 | from subprocess import Popen, PIPE 78 | 79 | def check_program_exists(name): 80 | p = Popen(['/usr/bin/which', name], stdout=PIPE, stderr=PIPE) 81 | p.communicate() 82 | return p.returncode == 0 83 | 84 | def check_program(program): 85 | x = check_program_exists(program) 86 | if x: 87 | print("Program \'" + program + "\' is installed.") 88 | if not x: 89 | print("Program \'" + program + "\' is NOT installed!") 90 | return 91 | 92 | # check file 93 | def check_file(name, othermessage): 94 | check = os.path.exists(name) 95 | if not check: 96 | print(name+" does not exist!"+othermessage) 97 | sys.exit() 98 | -------------------------------------------------------------------------------- /bin/my_functions.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/bin/my_functions.pyc -------------------------------------------------------------------------------- /bin/scats_functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from __future__ import print_function 4 | from collections import defaultdict 5 | import math, sys, os, re, time 6 | 7 | # check whether meta file is qualified 8 | def check_meta(metaFile, umiRun, onebam): 9 | cdtset = [] 10 | with open (metaFile, "r") as FP: 11 | for line in FP: 12 | line = line.strip("\n") 13 | tmpinf = line.split("\t") 14 | cellbc = tmpinf[0] 15 | condition = tmpinf[1] 16 | bamfile = tmpinf[2] 17 | 18 | cdtset.append(condition) 19 | exists = 
os.path.isfile(bamfile) 20 | if not exists: 21 | print(bamfile+" does not exist!") 22 | sys.exit() 23 | exists = os.path.isfile(bamfile+".bai") 24 | if not exists: 25 | print(bamfile+".bai does not exist! Please index BAM file.") 26 | sys.exit() 27 | 28 | if umiRun == "yes": 29 | umitag = tmpinf[3] 30 | if not umitag: 31 | print("Please specify UMI tag name for each cell at 4th column of meta file!") 32 | sys.exit() 33 | if onebam == "yes": 34 | celltag = tmpinf[4] 35 | if not celltag: 36 | print("Please specify cell tag name for each cell at 5th column of meta file!") 37 | sys.exit() 38 | if umiRun == "no": 39 | if onebam == "yes": 40 | celltag = tmpinf[3] 41 | if not celltag: 42 | print("Please specify cell tag name for each cell at 4th column of meta file!") 43 | sys.exit() 44 | 45 | # check number of conditions 46 | #cdtset = len(set(cdtset)) 47 | #if cdtset < 2: 48 | #print("Please specify 2 conditions at 2nd column of meta file!") 49 | #sys.exit() 50 | #if cdtset > 2: 51 | #print("Please specify only 2 conditions at 2nd column of meta file!") 52 | #sys.exit() 53 | return 54 | 55 | # check count file 56 | def check_count_file(metaFile, tmpDir): 57 | check_meta(metaFile, "no", "no") 58 | cdtset = [] 59 | with open (metaFile, "r") as FP: 60 | for line in FP: 61 | line = line.strip("\n") 62 | tmpinf = line.split("\t") 63 | countFile = tmpDir + "/count_" + tmpinf[0] + ".out" 64 | check = os.path.exists(countFile) 65 | if not check: 66 | print(countFile+" does not exist! 
## write count sh file to tmp directory
def write_count_sh(fileAbsPath, umiRun, onebam, metaFile, tmpDir, refgeneFile, gpinfoFile):
    """Write one ``count_<cell>.sh`` launcher per meta-file line into tmpDir.

    The meta file is validated with check_meta() first.  The counting script
    and the extra command-line flags depend on the two "yes"/"no" switches:

      umiRun  onebam  script                   extra flags
      yes     yes     getCount_umi_cellid.py   -cellid -celltag -umitag
      yes     no      getCount_umi.py          -umitag
      no      yes     getCount_cellid.py       -cellid -celltag
      no      no      getCount.py              (none)

    Any other switch value writes nothing, as before.  Every generated
    command now ends with a newline (the no-UMI / per-cell-BAM case used to
    omit it), and each output file is closed even if the write fails.
    """
    check_meta(metaFile, umiRun, onebam)

    scripts = {
        ("yes", "yes"): "getCount_umi_cellid.py",
        ("yes", "no"): "getCount_umi.py",
        ("no", "yes"): "getCount_cellid.py",
        ("no", "no"): "getCount.py",
    }
    script = scripts.get((umiRun, onebam))
    if script is None:
        return

    with open(metaFile, "r") as FP:
        for line in FP:
            tmpinf = line.strip("\n").split("\t")
            cellbc = tmpinf[0]
            bamfile = tmpinf[2]

            outwrite = "python " + fileAbsPath + "/bin/" + script + " -bam " + bamfile
            outwrite += " -ref " + refgeneFile + " -gpinfo " + gpinfoFile
            outwrite += " -out " + tmpDir + "/count_" + cellbc + ".out"

            if onebam == "yes":
                # The cell tag sits after the UMI tag when a UMI column is present.
                celltag = tmpinf[4] if umiRun == "yes" else tmpinf[3]
                outwrite += " -cellid " + cellbc + " -celltag " + celltag
            if umiRun == "yes":
                outwrite += " -umitag " + tmpinf[3]

            with open(tmpDir + "/count_" + cellbc + ".sh", "w") as OUT:
                OUT.write(outwrite + "\n")

    return
def get_psi_range_grp(y_ce1, y_ce0, x):
    """Return per-group search bounds for PSI around the observed value.

    y_ce1 -- numpy array of per-cell inclusion counts
    y_ce0 -- numpy array of per-cell exclusion counts
    x     -- numpy array of group labels (0 or 1), same length as the counts

    For each group the observed PSI is sum(inclusion) / (sum(inclusion) +
    sum(exclusion)); the bounds are that value -/+ 0.25, clipped to [0, 1].

    Returns (psi_lower_grp1, psi_upper_grp1, psi_lower_grp2, psi_upper_grp2),
    where grp1 is x == 0 and grp2 is x == 1.
    """
    def bounds(mask):
        included = np.sum(y_ce1[mask])
        # The exclusion counts must come from y_ce0; slicing y_ce1 here made
        # the observed PSI 0.5 for every exon.
        excluded = np.sum(y_ce0[mask])
        # float() keeps the ratio fractional for integer counts on Python 2.
        psi_obs = included / float(included + excluded)
        return max(psi_obs - 0.25, 0), min(psi_obs + 0.25, 1)

    psi_lower_grp1, psi_upper_grp1 = bounds(x == 0)
    psi_lower_grp2, psi_upper_grp2 = bounds(x == 1)

    return psi_lower_grp1, psi_upper_grp1, psi_lower_grp2, psi_upper_grp2
def get_parsed_options():
    """Parse the command line (sys.argv) and return the argparse namespace.

    Attributes on the result: y_filename, x_filename, abkt_filename, type_op,
    out_filename, minNR, maxNR.  Only -y/--counts is mandatory.
    """
    parser=argparse.ArgumentParser(description='TASC-B, a quantifier for gene expression incorporating gene bursting.')
    parser.add_argument('-y', '--counts', required = True, type=str, dest='y_filename', action='store', default='y.tsv',
                        help='name of the file containing the counts')
    parser.add_argument('-x', '--group', type=str, dest='x_filename', action='store', default='x.tsv',
                        help='name of the file containing group info')
    parser.add_argument('-k', '--abkt', type=str, dest='abkt_filename', action='store', default='abkt.tsv',
                        help='name of the file containing given abkt values')
    # NOTE(review): this help text lists operations 1-4 only; confirm it
    # against the dispatch on type_op used by the driver.
    parser.add_argument('-t', '--type', type=int, dest='type_op', action='store', default=1,
                        help='type of operation: \n1 - test p < 1; \n2 - test p1 != p2, \n3 - test t1 != t2, \n4 - test 2 and 3 simultaneously')
    parser.add_argument('-o', '--outdest', type=str, dest='out_filename', action='store', default='tasc_out.tsv',
                        help='name of the output file')
    # The help used to advertise defaults of 2 and 8 while the real defaults
    # are 1 and 3; %(default)s keeps the text and the value in sync.
    parser.add_argument('-r', '--minrestart', type=int, dest='minNR', action='store', default=1,
                        help='minimum number of restarts for optimization (default: %(default)s)')
    parser.add_argument('-m', '--maxrestart', type=int, dest='maxNR', action='store', default=3,
                        help='max number of restarts for optimization (default: %(default)s)')
    args=parser.parse_args()
    return args
def opt_neg_log_sum_marginal_likelihood(gene_name, abkt, y_g, num_random_restarts, minrr):
    """Fit the model without and with the p parameter and write an LRT row.

    gene_name           -- identifier written in the first output column
    abkt                -- technical-parameter array handed to the likelihood
    y_g                 -- numpy array of counts for this gene
    num_random_restarts -- maximum number of random starting points per model
    minrr               -- stop restarting once this many fits were accepted

    A fit is accepted when L-BFGS-B reports success and the objective is
    neither NaN nor exactly 0.  One row is written through ``res_fh``: on
    success (gene, True, theta_nob, sigma_nob, theta, sigma, pg, lrt_stat,
    lrt_pval); otherwise (gene, False, NaN x 7).  Returns None.
    """
    nan = float('nan')
    failure_row = (gene_name, False, nan, nan, nan, nan, nan, nan, nan)

    def best_fit(objective, starts, label):
        # Try the random starts in order and keep the accepted fit with the
        # smallest objective; None when no start produced an accepted fit.
        accepted = []
        for i in range(num_random_restarts):
            log_fh.log(label + ' optimization #' + str(i) + ' for gene ' + gene_name)
            fit = minimize(objective, x0=starts[i, :], args=(abkt, y_g), method='L-BFGS-B')
            if fit.success and (not np.isnan(fit.fun)) and fit.fun != 0:
                accepted.append(fit)
                if len(accepted) >= minrr:
                    break
        if not accepted:
            return None
        return accepted[np.argmin([fit.fun for fit in accepted])]

    theta_lower, theta_upper, p_lower, p_upper, std_lower, std_upper = get_rr_range(y_g)

    # Model without p: parameters are (theta, log sigma).
    starts = column_stack((uniform(theta_lower, theta_upper, num_random_restarts),
                           log(uniform(std_lower, std_upper, num_random_restarts))))
    tasc_nob_res = best_fit(likelihoodumi.neg_log_sum_marginal_likelihood_nob, starts, 'tasc')
    if tasc_nob_res is None:
        res_fh.log(failure_row)
        return

    # Model with p: parameters are (theta, log sigma, logit p).
    starts = column_stack((uniform(theta_lower, theta_upper, num_random_restarts),
                           log(uniform(std_lower, std_upper, num_random_restarts)),
                           logit(uniform(p_lower, p_upper, num_random_restarts))))
    tasc_b_res = best_fit(likelihoodumi.neg_log_sum_marginal_likelihood, starts, 'tasc-b')
    if tasc_b_res is None:
        res_fh.log(failure_row)
        return

    lrt_stat = 2 * (tasc_nob_res.fun - tasc_b_res.fun)

    if np.isnan(lrt_stat):
        res_fh.log(failure_row)
    else:
        # Survival function instead of 1 - cdf: the subtraction rounds every
        # p-value below roughly 1e-16 to exactly 0.
        lrt_pval = chi2.sf(lrt_stat, df=1)
        res_fh.log((gene_name, True, tasc_nob_res.x[0], exp(tasc_nob_res.x[1]), tasc_b_res.x[0], exp(tasc_b_res.x[1]), expit(tasc_b_res.x[2]), lrt_stat, lrt_pval))
float('nan'), float('nan'))) 281 | return 282 | tasc_b_res = arg_min_x[np.argmin(val_min_x)] 283 | 284 | theta_lower0, theta_upper0, p_lower0, p_upper0, std_lower0, std_upper0, theta_lower1, theta_upper1, p_lower1, p_upper1, std_lower1, std_upper1 = get_rr_range_grp( 285 | y_g, group_info) 286 | real_params_g_rtimes=column_stack((uniform(min(theta_lower0, theta_lower1), max(theta_upper0, theta_upper1), num_random_restarts), 287 | log(uniform(min(std_lower0, std_lower1), max(std_upper0, std_upper1), num_random_restarts)), 288 | logit(uniform(p_lower0, p_upper0, num_random_restarts)), 289 | logit(uniform(p_lower1, p_upper1, num_random_restarts)))) 290 | arg_min_x=[] 291 | val_min_x=[] 292 | 293 | for i in range(num_random_restarts): 294 | log_fh.log('tasc free p optimization #' + str(i) + ' for gene ' + gene_name) 295 | real_params_g=real_params_g_rtimes[i,:] 296 | optim_result_obj=minimize(likelihoodumi.neg_log_sum_marginal_likelihood_free_p, x0=real_params_g, args=(abkt, y_g, group_info), method='L-BFGS-B') 297 | if optim_result_obj.success and (not np.isnan(optim_result_obj.fun)) and (optim_result_obj.fun != 0): 298 | arg_min_x.append(optim_result_obj) 299 | val_min_x.append(optim_result_obj.fun) 300 | if len(arg_min_x) >= minrr: 301 | break 302 | 303 | if len(arg_min_x) == 0: 304 | res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'), 305 | float('nan'), float('nan'), float('nan'), float('nan'), float('nan'))) 306 | return 307 | tasc_freep_res = arg_min_x[np.argmin(val_min_x)] 308 | 309 | lrt_stat = 2 * (tasc_b_res.fun - tasc_freep_res.fun) 310 | 311 | if np.isnan(lrt_stat): 312 | res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'), 313 | float('nan'), float('nan'), float('nan'), float('nan'), float('nan'))) 314 | else: 315 | lrt_pval = 1 - chi2.cdf(lrt_stat, df=1) 316 | res_fh.log(((gene_name, True, tasc_b_res.x[0], exp(tasc_b_res.x[1]), expit(tasc_b_res.x[2]), 317 | tasc_freep_res.x[0], 
def lrt_free_theta(gene_name, abkt, y_g, num_random_restarts, minrr):
    """Likelihood-ratio test of a shared theta against one theta per group.

    gene_name           -- identifier written in the first output column
    abkt                -- technical-parameter array handed to the likelihood
    y_g                 -- numpy array of counts for this gene
    num_random_restarts -- maximum number of random starting points per model
    minrr               -- stop restarting once this many fits were accepted

    Reads the module-level ``group_info`` for the group labels.  A fit is
    accepted when L-BFGS-B reports success and the objective is neither NaN
    nor exactly 0.  One row is written through ``res_fh``: on success (gene,
    True, theta_rd, sigma_rd, pg_rd, theta_full_0, theta_full_1, sigma_full,
    pg_full, lrt_stat, lrt_pval); otherwise (gene, False, NaN x 9).
    Returns None.
    """
    nan = float('nan')
    failure_row = (gene_name, False, nan, nan, nan, nan, nan, nan, nan, nan, nan)

    def best_fit(objective, starts, extra_args, label):
        # Try the random starts in order and keep the accepted fit with the
        # smallest objective; None when no start produced an accepted fit.
        accepted = []
        for i in range(num_random_restarts):
            log_fh.log(label + ' optimization #' + str(i) + ' for gene ' + gene_name)
            fit = minimize(objective, x0=starts[i, :], args=extra_args, method='L-BFGS-B')
            if fit.success and (not np.isnan(fit.fun)) and (fit.fun != 0):
                accepted.append(fit)
                if len(accepted) >= minrr:
                    break
        if not accepted:
            return None
        return accepted[np.argmin([fit.fun for fit in accepted])]

    # Reduced model: (theta, log sigma, logit p) shared by both groups.
    theta_lower, theta_upper, p_lower, p_upper, std_lower, std_upper = get_rr_range(y_g)
    starts = column_stack((uniform(theta_lower, theta_upper, num_random_restarts),
                           log(uniform(std_lower, std_upper, num_random_restarts)),
                           logit(uniform(p_lower, p_upper, num_random_restarts))))
    tasc_b_res = best_fit(likelihoodumi.neg_log_sum_marginal_likelihood, starts, (abkt, y_g), 'tasc-b')
    if tasc_b_res is None:
        res_fh.log(failure_row)
        return

    # Full model: (theta group 0, theta group 1, log sigma, logit p).
    theta_lower0, theta_upper0, p_lower0, p_upper0, std_lower0, std_upper0, theta_lower1, theta_upper1, p_lower1, p_upper1, std_lower1, std_upper1 = get_rr_range_grp(
        y_g, group_info)
    starts = column_stack((uniform(theta_lower0, theta_upper0, num_random_restarts),
                           uniform(theta_lower1, theta_upper1, num_random_restarts),
                           log(uniform(min(std_lower0, std_lower1), max(std_upper0, std_upper1), num_random_restarts)),
                           logit(uniform(min(p_lower0, p_lower1), max(p_upper0, p_upper1), num_random_restarts))))
    tasc_free_theta = best_fit(likelihoodumi.neg_log_sum_marginal_likelihood_free_theta, starts,
                               (abkt, y_g, group_info), 'tasc free theta')
    if tasc_free_theta is None:
        res_fh.log(failure_row)
        return

    lrt_stat = 2 * (tasc_b_res.fun - tasc_free_theta.fun)

    if np.isnan(lrt_stat):
        res_fh.log(failure_row)
    else:
        # Survival function instead of 1 - cdf: the subtraction rounds every
        # p-value below roughly 1e-16 to exactly 0.
        lrt_pval = chi2.sf(lrt_stat, df=1)
        res_fh.log((gene_name, True, tasc_b_res.x[0], exp(tasc_b_res.x[1]), expit(tasc_b_res.x[2]),
                    tasc_free_theta.x[0], tasc_free_theta.x[1], exp(tasc_free_theta.x[2]),
                    expit(tasc_free_theta.x[3]), lrt_stat, lrt_pval))
| log_fh.log('tasc-b optimization #' + str(i) + ' for gene ' + gene_name) 394 | real_params_g=real_params_g_rtimes[i,:] 395 | optim_result_obj=minimize(likelihoodumi.neg_log_sum_marginal_likelihood_umi, x0=real_params_g, args=(abkt, y_g), method='L-BFGS-B') 396 | if optim_result_obj.success and (not np.isnan(optim_result_obj.fun)) and (optim_result_obj.fun != 0): 397 | arg_min_x.append(optim_result_obj) 398 | val_min_x.append(optim_result_obj.fun) 399 | if len(arg_min_x) >= minrr: 400 | break 401 | 402 | if len(arg_min_x) == 0: 403 | res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'), 404 | float('nan'), float('nan'), float('nan'), float('nan'), float('nan'))) 405 | return 406 | tasc_b_res=arg_min_x[np.argmin(val_min_x)] 407 | 408 | theta_lower0, theta_upper0, p_lower0, p_upper0, std_lower0, std_upper0, theta_lower1, theta_upper1, p_lower1, p_upper1, std_lower1, std_upper1 = get_rr_range_grp_umi( 409 | y_g, group_info) 410 | real_params_g_rtimes = column_stack((uniform(theta_lower0, theta_upper0, num_random_restarts), 411 | uniform(theta_lower1, theta_upper1, num_random_restarts), 412 | log(uniform(min(std_lower0, std_lower1), max(std_upper0, std_upper1), num_random_restarts)), 413 | logit(uniform(min(p_lower0, p_lower1), max(p_upper0, p_upper1), num_random_restarts)))) 414 | arg_min_x=[] 415 | val_min_x=[] 416 | for i in range(num_random_restarts): 417 | log_fh.log('tasc free theta optimization #' + str(i) + ' for gene ' + gene_name) 418 | real_params_g=real_params_g_rtimes[i,:] 419 | optim_result_obj=minimize(likelihoodumi.neg_log_sum_marginal_likelihood_free_theta_umi, x0=real_params_g, args=(abkt, y_g, group_info), method='L-BFGS-B') 420 | if optim_result_obj.success and (not np.isnan(optim_result_obj.fun)) and (optim_result_obj.fun != 0): 421 | arg_min_x.append(optim_result_obj) 422 | val_min_x.append(optim_result_obj.fun) 423 | if len(arg_min_x) >= minrr: 424 | break 425 | 426 | if len(arg_min_x) == 0: 427 | 
res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'), 428 | float('nan'), float('nan'), float('nan'), float('nan'), float('nan'))) 429 | return 430 | tasc_free_theta = arg_min_x[np.argmin(val_min_x)] 431 | 432 | lrt_stat = 2 * (tasc_b_res.fun - tasc_free_theta.fun) 433 | 434 | if np.isnan(lrt_stat): 435 | res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'), 436 | float('nan'), float('nan'), float('nan'), float('nan'), float('nan'))) 437 | else: 438 | lrt_pval = 1 - chi2.cdf(lrt_stat, df=1) 439 | res_fh.log((gene_name, True, tasc_b_res.x[0], exp(tasc_b_res.x[1]), expit(tasc_b_res.x[2]), 440 | tasc_free_theta.x[0], tasc_free_theta.x[1], exp(tasc_free_theta.x[2]), 441 | expit(tasc_free_theta.x[3]), lrt_stat, lrt_pval)) 442 | 443 | 444 | 445 | def lrt_free_p_and_theta(gene_name, abkt, y_g, num_random_restarts, minrr): 446 | theta_lower, theta_upper, p_lower, p_upper, std_lower, std_upper = get_rr_range(y_g) 447 | 448 | real_params_g_rtimes = column_stack((uniform(theta_lower, theta_upper, num_random_restarts), 449 | log(uniform(std_lower, std_upper, num_random_restarts)), 450 | logit(uniform(p_lower, p_upper, num_random_restarts)))) 451 | arg_min_x=[] 452 | val_min_x=[] 453 | 454 | for i in range(num_random_restarts): 455 | log_fh.log('tasc-b optimization #' + str(i) + ' for gene ' + gene_name) 456 | real_params_g=real_params_g_rtimes[i,:] 457 | optim_result_obj=minimize(likelihoodumi.neg_log_sum_marginal_likelihood, x0=real_params_g, args=(abkt, y_g), method='L-BFGS-B') 458 | if optim_result_obj.success and (not np.isnan(optim_result_obj.fun)) and (optim_result_obj.fun != 0): 459 | arg_min_x.append(optim_result_obj) 460 | val_min_x.append(optim_result_obj.fun) 461 | if len(arg_min_x) >= minrr: 462 | break 463 | 464 | if len(arg_min_x) == 0: 465 | print("xx") 466 | res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'), 467 | float('nan'), float('nan'), float('nan'), 
float('nan'), float('nan'), float('nan'))) 468 | return 469 | tasc_b_res=arg_min_x[np.argmin(val_min_x)] 470 | 471 | theta_lower0, theta_upper0, p_lower0, p_upper0, std_lower0, std_upper0, theta_lower1, theta_upper1, p_lower1, p_upper1, std_lower1, std_upper1 = get_rr_range_grp( 472 | y_g, group_info) 473 | real_params_g_rtimes = column_stack((uniform(theta_lower0, theta_upper0, num_random_restarts), 474 | uniform(theta_lower1, theta_upper1, num_random_restarts), 475 | log(uniform(min(std_lower0, std_lower1), max(std_upper0, std_upper1), num_random_restarts)), 476 | logit(uniform(p_lower0, p_upper0, num_random_restarts)))) 477 | arg_min_x=[] 478 | val_min_x=[] 479 | for i in range(num_random_restarts): 480 | log_fh.log('tasc free both optimization #' + str(i) + ' for gene ' + gene_name) 481 | real_params_g=real_params_g_rtimes[i,:] 482 | optim_result_obj=minimize(likelihoodumi.neg_log_sum_marginal_likelihood_free_both, x0=real_params_g, args=(abkt, y_g, group_info), method='L-BFGS-B') 483 | if optim_result_obj.success and (not np.isnan(optim_result_obj.fun)) and (optim_result_obj.fun != 0): 484 | arg_min_x.append(optim_result_obj) 485 | val_min_x.append(optim_result_obj.fun) 486 | if len(arg_min_x) >= minrr: 487 | break 488 | 489 | if len(arg_min_x) == 0: 490 | print("xxx") 491 | res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'), 492 | float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), float('nan'))) 493 | return 494 | tasc_freeboth_res = arg_min_x[np.argmin(val_min_x)] 495 | 496 | lrt_stat = 2 * (tasc_b_res.fun - tasc_freeboth_res.fun) 497 | 498 | if np.isnan(lrt_stat): 499 | res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'), 500 | float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), float('nan'))) 501 | else: 502 | lrt_pval = 1 - chi2.cdf(lrt_stat, df=1) 503 | res_fh.log((gene_name, True, tasc_b_res.x[0], exp(tasc_b_res.x[1]), expit(tasc_b_res.x[2]), 504 | 
# ---------------------------------------------------------------------------
# Script driver: rank 0 reads the inputs, the genes are scattered across the
# MPI ranks, and every rank runs the tests selected by args.type_op on its
# share, writing through the shared log/result file handles.
# ---------------------------------------------------------------------------
np.seterr(all='ignore')

#parse args
args=get_parsed_options()

# init mpi env
comm=MPI.COMM_WORLD
rank=comm.Get_rank()
size=comm.Get_size()

# init logger file handle
log_fh = logger(MPI.File.Open(comm, args.out_filename + '.log', MPI.MODE_CREATE | MPI.MODE_WRONLY))
res_fh = result_writer(MPI.File.Open(comm, args.out_filename, MPI.MODE_CREATE | MPI.MODE_WRONLY))

# all nodes init
genes_grouped_by_worker=None
abkt_params=None
py_stan_input=None
tasc_sm=None
group_info=None

# master node init
if rank == 0:
    log_fh.log('opened MPI World with size ' + str(size))
    log_fh.log('input counts filename: ' + str(args.y_filename))
    log_fh.log('input abkt filename: ' + str(args.abkt_filename))
    log_fh.log('output filename: ' + str(args.out_filename))
    log_fh.log('max number of restarts: ' + str(args.maxNR))
    log_fh.log('min number of restarts: ' + str(args.minNR))

    log_fh.log('parsing abkt file: ' + args.abkt_filename)
    res_fh.write_header(args.type_op)
    abkt_params = np.genfromtxt(args.abkt_filename)

    log_fh.log('parsing x file: ' + args.x_filename)
    group_info = np.genfromtxt(args.x_filename, dtype=np.int8)

    py_stan_input={
        'C': abkt_params.shape[0],
        'abkt' : abkt_params
    }

    # one list of genes per rank
    genes_grouped_by_worker=parse_filter_counts(args.y_filename, size)

part_data = comm.scatter(genes_grouped_by_worker, root=0)

# A rank can receive no genes when fewer genes than ranks pass the filter.
# Indexing part_data[0] then raised IndexError before the collective bcast
# calls below, leaving the remaining ranks blocked in them.
if len(part_data) > 0:
    log_fh.log('rank ' + str(rank) + ' has ' + str(len(part_data)) + ' genes. the first gene is ' + part_data[0][0])
else:
    log_fh.log('rank ' + str(rank) + ' has 0 genes.')

py_stan_input = comm.bcast(py_stan_input, root=0)
abkt_params = comm.bcast(abkt_params, root=0)
group_info = comm.bcast(group_info, root=0)

opt_marg_results = get_min_marginal(part_data)

log_fh.close()
res_fh.close()
https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/doc/Clarity_step1.JPG -------------------------------------------------------------------------------- /doc/Clarity_step2.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/doc/Clarity_step2.JPG -------------------------------------------------------------------------------- /doc/Clarity_step3.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/doc/Clarity_step3.JPG -------------------------------------------------------------------------------- /doc/Fig1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/doc/Fig1.png -------------------------------------------------------------------------------- /doc/Install.md: -------------------------------------------------------------------------------- 1 | ## Prerequisites: 2 | 3 | #### Make sure you have the following libraries and packages installed on your system. 4 | ``` 5 | cmake >= 2.6.4 6 | gcc >= 4.4 7 | Python 2.7 8 | python packages: 9 | pysam 10 | numpy 11 | scipy 12 | cython 13 | OpenMPI 14 | SAMTOOLS 15 | Perl 16 | ``` 17 | ## Installation 18 | 19 | #### Download SCATS from github. 
20 | ``` 21 | git clone https://github.com/huyustats/SCATS.git 22 | ``` 23 | 24 | #### Compile C functions using `Cython` and `gcc` 25 | ``` 26 | cd SCATS/bin/ 27 | bash complie_likelihoodumi.sh 28 | ``` 29 | 30 | #### Check whether the required programs and Python packages are installed 31 | ``` 32 | python check_software.py 33 | ``` 34 | 35 | 36 | -------------------------------------------------------------------------------- /doc/Usage.md: -------------------------------------------------------------------------------- 1 | # Instructions on how to use SCATS 2 | 3 | The inputs of SCATS are aligned single-cell RNA-seq data in BAM format and a reference isoform annotation file (Ensembl/Refseq). The user needs to specify the `-task` to perform in each step: 4 | ``` 5 | SCATS.py -task : 6 | 7 | refgene: preprocess reference file 8 | 9 | group: group alternative splicing exon 10 | 11 | count: count informative reads from indexed BAM file 12 | 13 | gene: estimate mean gene expression for each single cell condition 14 | 15 | das: detect differential alternative splicing (DAS) for each exon group between conditions 16 | 17 | sum: summarize DAS test results 18 | 19 | ``` 20 | 21 | ## Step 1: Group exons based on reference annotation file 22 | SCATS requires a reference annotation file `example.refFile` in the following format: 23 | ``` 24 | 749 NM_001397 chr1 - 21543739 21616982 21546447 21616907 19 21543739,21548239,21551742,21553651,21554423,21560050,21562342,21563238,21564626,21571481,21573713,21582439,21584017,21585185 25 | 
,21586763,21599191,21605683,21616562,21671868, 21546624,21548335,21551933,21553719,21554534,21560154,21562420,21563337,21564737,21571596,21573856,21582631,21584083,21585332,21586885,21599404,21605825,21616649,21672034, 0 ECE1cmpl cmpl 0,0,1,2,2,0,0,0,0,2,0,0,0,0,1,1,0,0,0, 28 | 749 NM_001113349 chr1 - 21543739 21616766 21546447 21616691 18 21543739,21548239,21551742,21553651,21554423,21560050,21562342,21563238,21564626,21571481,21573713,21582439,21584017,21585185 29 | ,21586763,21599191,21605683,21616562, 21546624,21548335,21551933,21553719,21554534,21560154,21562420,21563337,21564737,21571596,21573856,21582631,21584083,21585332,21586885,21599404,21605825,21616766, 0 ECE1 cmpl cmpl0,0,1,2,2,0,0,0,0,2,0,0,0,0,1,1,0,0, 30 | 749 NM_001113347 chr1 - 21543739 21606183 21546447 21605927 17 21543739,21548239,21551742,21553651,21554423,21560050,21562342,21563238,21564626,21571481,21573713,21582439,21584017,21585185 31 | ,21586763,21599191,21605683, 21546624,21548335,21551933,21553719,21554534,21560154,21562420,21563337,21564737,21571596,21573856,21582631,21584083,21585332,21586885,21599404,21606183, 0 ECE1 cmpl cmpl 0,0,1,2,2,0,0 32 | ,0,0,2,0,0,0,0,1,1,0, 33 | ``` 34 | Reference file in this format can be downloaded at [UCSC](https://genome.ucsc.edu/cgi-bin/hgTables?command=start) by selecting "all fields from selected table" in output format. 35 | 36 | We preprocess `example.refFile` by using `python SCATS.py -task refgene`. An example is given below. 37 | ``` 38 | python SCATS.py -task refgene -ref example.refFile -out example.refgene 39 | ``` 40 | Next, the command for exon grouping is to run `python SCATS.py -task group`. 41 | ``` 42 | python SCATS.py -task group -refgene example.refgene -out example.gpinfo 43 | ``` 44 | `example.refgene` and `example.gpinfo` are two important calibrated annotation files for following steps. 
45 | 46 | ## Step 2: Extract informative read count for each exon group from alignment file 47 | SCATS requires a headerless `metafile` in this step to tell SCATS how and where to find the alignment BAM files from which to extract cell-specific informative read counts. BAM files have to be indexed. Here is an example of `metafile`: 48 | ``` 49 | AACACGTCACATAACC-1 A ~/1dot1/outs/possorted_genome_bam.bam UB CB 50 | GGACAAGTCTCCCTGA-1 A ~/1dot1/outs/possorted_genome_bam.bam UB CB 51 | CACAGGCAGATCCCGC-1 B ~/1dot1/outs/possorted_genome_bam.bam UB CB 52 | ATCTGCCGTCATCGGC-1 B ~/1dot1/outs/possorted_genome_bam.bam UB CB 53 | GGAAAGCGTTGCTCCT-1 C ~/1dot1/outs/possorted_genome_bam.bam UB CB 54 | CGAGCACGTGTTCTTT-1 C ~/1dot1/outs/possorted_genome_bam.bam UB CB 55 | CCTATTACAATGGATA-1 D ~/1dot1/outs/possorted_genome_bam.bam UB CB 56 | AAGGAGCAGCGTCAAG-1 D ~/1dot1/outs/possorted_genome_bam.bam UB CB 57 | ``` 58 | where the 1st column contains the cell barcode/cell name, the 2nd column represents the condition group, and the 3rd column gives the location of the BAM file. The 4th and 5th columns give the tag names of the UMI barcode and the cell barcode in the BAM file. For example: 59 | ``` 60 | NS500497:57:H27CKBGX2:3:12506:1885:16376 272 1 3014861 1 98M * 0 0 TGGCGTTCCCCTGTACTGGGGCTTATAAAGTTTGCAAGTCCAATGGGCCTCTCTTTGCAGTGATGGCCGACTAGGCCATCTTTTGATACATATGCAGC //A/A/A/EEE -refgene -gpinfo 68 | 69 | [count options] type 'python SCATS.py -task count' to check two important count options. 70 | 71 | -umi collect UMI count or not 72 | 73 | -onebam whether all aligned reads are merged in one BAM files 74 | 75 | OUTPUT: 76 | 77 | count_*.sh script files will be generated under directory `./tmp/count_script`. 78 | ``` 79 | where '-umi' and '-onebam' are two important options: 80 | * `-umi yes -onebam yes`: UMI and cell barcode tag names have to be specified in the 4th and 5th columns of `metafile`. 81 | * `-umi yes -onebam no`: only UMI barcode tag name is needed. It has to be specified in the 4th column of `metafile`. 
82 | * `-umi no -onebam yes`: only cell barcode tag name is needed. It has to be specified in the 4th column of `metafile`. 83 | * `-umi no -onebam no`: no tag name is needed. 84 | 85 | Outputs of `python SCATS.py -task count` are `count_*.sh` script files located at `./tmp/count_script`. The user needs to run all of them to obtain the informative read count for each single cell. 86 | 87 | ## Step 3: Quantify gene-level expression accounting for technical noise 88 | In this step, the user needs to give `metafile` to SCATS and specify the number of cores to use for each pairwise comparison between conditions: 89 | ``` 90 | python SCATS.py -task gene -ncore 20 -meta metafile 91 | ``` 92 | Outputs of `python SCATS.py -task gene` are `gene_*.sh` script files located at `./tmp/gene_script`. The user needs to run all of them to obtain accurate gene expression estimates for each cell condition group. 93 | 94 | ## Step 4: Detect differential alternative splicing (DAS) across cell conditions accounting for technical noise 95 | In this step, the user needs to give `metafile` and `example.gpinfo` to SCATS and specify the number of cores to use for each pairwise comparison between conditions: 96 | ``` 97 | python SCATS.py -task das -ncore 20 -meta metafile -gpinfo example.gpinfo 98 | ``` 99 | Outputs of `python SCATS.py -task das` are `das_*.sh` script files located at `./tmp/das_script`. The user needs to run all of them to obtain differential alternative splicing events at the exon-group level across cell conditions. 
100 | 101 | ## Step 5: Summarize DAS test results 102 | ``` 103 | python SCATS.py -task sum -gpinfo example.gpinfo 104 | ``` 105 | -------------------------------------------------------------------------------- /example/example.gpinfo: -------------------------------------------------------------------------------- 1 | ECE1 4 plus + 0.0290081639742922 0,1,3, chr1,21671868,21672034; 2 | ECE1 4 plus - 0.0290081639742922 2, NA 3 | ECE1 1 both + 0.0621851658850096 1,2,3, chr1,21605826,21606183; 4 | ECE1 1 both - 0.0152857391002258 0, chr1,21616562,21616649; 5 | ECE1 7 minus + 0.0220601007469168 3, NA 6 | ECE1 7 minus - 0.0220601007469168 0,1,2, chr1,21616856,21616982; 7 | ECE1 2 plus + 0.020323084940073 0,2,3, chr1,21616650,21616766; 8 | ECE1 2 plus - 0.020323084940073 1, NA 9 | -------------------------------------------------------------------------------- /example/example.refFile: -------------------------------------------------------------------------------- 1 | 749 NM_001397 chr1 - 21543739 21616982 21546447 21616907 19 21543739,21548239,21551742,21553651,21554423,21560050,21562342,21563238,21564626,21571481,21573713,21582439,21584017,21585185,21586763,21599191,21605683,21616562,21616856, 21546624,21548335,21551933,21553719,21554534,21560154,21562420,21563337,21564737,21571596,21573856,21582631,21584083,21585332,21586885,21599404,21605825,21616649,21616982, 0 ECE1 cmpl cmpl 0,0,1,2,2,0,0,0,0,2,0,0,0,0,1,1,0,0,0, 2 | 93 NM_001113348 chr1 - 21543739 21672034 21546447 21671871 19 21543739,21548239,21551742,21553651,21554423,21560050,21562342,21563238,21564626,21571481,21573713,21582439,21584017,21585185,21586763,21599191,21605683,21616562,21671868, 21546624,21548335,21551933,21553719,21554534,21560154,21562420,21563337,21564737,21571596,21573856,21582631,21584083,21585332,21586885,21599404,21605825,21616649,21672034, 0 ECE1 cmpl cmpl 0,0,1,2,2,0,0,0,0,2,0,0,0,0,1,1,0,0,0, 3 | 749 NM_001113349 chr1 - 21543739 21616766 21546447 21616691 18 
21543739,21548239,21551742,21553651,21554423,21560050,21562342,21563238,21564626,21571481,21573713,21582439,21584017,21585185,21586763,21599191,21605683,21616562, 21546624,21548335,21551933,21553719,21554534,21560154,21562420,21563337,21564737,21571596,21573856,21582631,21584083,21585332,21586885,21599404,21605825,21616766, 0 ECE1 cmpl cmpl 0,0,1,2,2,0,0,0,0,2,0,0,0,0,1,1,0,0, 4 | 749 NM_001113347 chr1 - 21543739 21606183 21546447 21605927 17 21543739,21548239,21551742,21553651,21554423,21560050,21562342,21563238,21564626,21571481,21573713,21582439,21584017,21585185,21586763,21599191,21605683, 21546624,21548335,21551933,21553719,21554534,21560154,21562420,21563337,21564737,21571596,21573856,21582631,21584083,21585332,21586885,21599404,21606183, 0 ECE1 cmpl cmpl 0,0,1,2,2,0,0,0,0,2,0,0,0,0,1,1,0, 5 | -------------------------------------------------------------------------------- /example/example.refgene: -------------------------------------------------------------------------------- 1 | ECE1 chr1 - 21543739 21672034 NM_001113349,NM_001113348,NM_001113347,NM_001397, 2 | ECE1 chr1 - 21543739 21546624 1,1,1,1, 3 | ECE1 chr1 - 21548239 21548335 1,1,1,1, 4 | ECE1 chr1 - 21551742 21551933 1,1,1,1, 5 | ECE1 chr1 - 21553651 21553719 1,1,1,1, 6 | ECE1 chr1 - 21554423 21554534 1,1,1,1, 7 | ECE1 chr1 - 21560050 21560154 1,1,1,1, 8 | ECE1 chr1 - 21562342 21562420 1,1,1,1, 9 | ECE1 chr1 - 21563238 21563337 1,1,1,1, 10 | ECE1 chr1 - 21564626 21564737 1,1,1,1, 11 | ECE1 chr1 - 21571481 21571596 1,1,1,1, 12 | ECE1 chr1 - 21573713 21573856 1,1,1,1, 13 | ECE1 chr1 - 21582439 21582631 1,1,1,1, 14 | ECE1 chr1 - 21584017 21584083 1,1,1,1, 15 | ECE1 chr1 - 21585185 21585332 1,1,1,1, 16 | ECE1 chr1 - 21586763 21586885 1,1,1,1, 17 | ECE1 chr1 - 21599191 21599404 1,1,1,1, 18 | ECE1 chr1 - 21605683 21605825 1,1,1,1, 19 | ECE1 chr1 - 21605826 21606183 0,0,1,0, 20 | ECE1 chr1 - 21616562 21616649 1,1,0,1, 21 | ECE1 chr1 - 21616650 21616766 1,0,0,0, 22 | ECE1 chr1 - 21616856 21616982 0,0,0,1, 
23 | ECE1 chr1 - 21671868 21672034 0,1,0,0, 24 | -------------------------------------------------------------------------------- /example/metafile: -------------------------------------------------------------------------------- 1 | TTTGGTTGTACTCAAC-1 1 /home/huyu1/SCATS/data/TTTGGTTGTACTCAAC-1.bam UB CB 2 | TTTGGTTGTGCACCAC-1 2 /home/huyu1/SCATS/data/TTTGGTTGTGCACCAC-1.bam UB CB --------------------------------------------------------------------------------