├── .#SCATS.py
├── README.md
├── SCATS.py
├── bin
├── PreProcess.pl
├── __init__.py
├── __init__.pyc
├── __pycache__
│ ├── __init__.cpython-37.pyc
│ ├── my_functions.cpython-37.pyc
│ └── scats_functions.cpython-37.pyc
├── check_software.py
├── complie_likelihoodumi.sh
├── getCount.py
├── getCount_cellid.py
├── getCount_umi.py
├── getCount_umi_cellid.py
├── getalpha.pl
├── getexonlevelcount_umi.pl
├── getgenelevelcount.pl
├── getgeneleveltheta_umi.pl
├── getgroupinfo.pl
├── gettascdata.pl
├── likelihoodumi.c
├── likelihoodumi.html
├── likelihoodumi.pyx
├── likelihoodumi.so
├── model_selection_das_umi.py
├── my_functions.py
├── my_functions.pyc
├── scats_functions.py
├── scats_functions.pyc
├── scats_isoform.py
└── summarizedas.pl
├── doc
├── Clarity_step1.JPG
├── Clarity_step2.JPG
├── Clarity_step3.JPG
├── Fig1.png
├── Install.md
└── Usage.md
└── example
├── example.gpinfo
├── example.refFile
├── example.refgene
├── metafile
├── mm10refseq.gpinfo
└── mm10refseq.refgene
/.#SCATS.py:
--------------------------------------------------------------------------------
1 | huy4@l-1-01.cm.cluster.128826:1562181832
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Single Cell Analysis of Transcript Splicing (SCATS)
2 | A statistical tool to detect differential alternative splicing events using single-cell RNA-seq data
3 |
4 | ## Computational pipeline of SCATS
5 |
6 |
7 |
8 |
9 | ## System Requirements
10 | For optimal performance, we recommend an HPC system with 20+ cores.
11 |
12 | ## Inputs of SCATS
13 | The input of SCATS is single-cell RNA-seq read data in BAM format together with a reference isoform annotation file.
14 |
15 | ## Installation
16 | Please refer to [Installation](https://github.com/huyustats/SCATS/blob/master/doc/Install.md) for how to install SCATS.
17 |
18 | ## Usage
19 | Please refer to [Usage](https://github.com/huyustats/SCATS/blob/master/doc/Usage.md) for how to use SCATS.
20 |
21 | ## Contact
22 |
23 | If you have any questions/issues/bugs, please post them on [GitHub](https://github.com/huyustats/SCATS/issues). They would also be helpful to other users.
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/SCATS.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
"""Command-line driver for the SCATS pipeline.

Usage: SCATS.py -task <task> [task options]

The task is one of the names in ``taskList``.  Each task validates its own
options through ``my.parse_argument`` and then either runs the Perl helpers
shipped in ``bin/`` or writes shell scripts that the user is asked to run
afterwards.  All intermediate files live under ``tmp/`` in the current
working directory.
"""

import os
import subprocess
import sys

from bin import my_functions as my
from bin import scats_functions as sc

try:
    from shlex import quote as shell_quote
except ImportError:  # Python 2 ships the same helper in ``pipes``
    from pipes import quote as shell_quote

fileAbsPath = os.path.abspath(os.path.dirname(__file__))
crtAbsPath = os.getcwd()

taskList = ["refgene", "group", "count", "gene", "das", "sum"]


def get_option(flag, default=""):
    """Return the value that follows the last ``flag`` on the command line.

    A flag that is the final argument (and so has no value) is ignored and
    ``default`` is returned instead.
    """
    value = default
    for i in range(1, len(sys.argv) - 1):
        if sys.argv[i] == flag:
            value = sys.argv[i + 1]
    return value


def run_command(args, stdout=None):
    """Run an external program without going through a shell.

    Passing an argument list keeps paths that contain spaces or shell
    metacharacters intact.  Failures are reported on stderr but do not abort
    the pipeline, matching the best-effort behaviour of the previous
    ``os.system`` calls.  Returns the exit status (127 if the program could
    not be started).
    """
    try:
        status = subprocess.call(args, stdout=stdout)
    except OSError as err:
        sys.stderr.write("Could not start %s: %s\n" % (args[0], err))
        return 127
    if status != 0:
        sys.stderr.write("Command exited with status %d: %s\n"
                         % (status, " ".join(args)))
    return status


def write_model_script(script_path, ncore, data_dir, input_prefix,
                       output_prefix, model_type, first, other):
    """Write the one-line mpirun script for a pair of conditions.

    The file is written directly instead of via ``echo ... > file`` so that
    nothing in the command line is expanded by a shell while it is being
    saved; every argument is shell-quoted for when the script is run later.
    """
    pair = first + "_" + other
    args = [
        "mpirun", "-n", ncore, "--bind-to", "none", "python",
        fileAbsPath + "/bin/model_selection_das_umi.py",
        "-y", data_dir + "/" + input_prefix + pair,
        "-k", data_dir + "/abktfile_" + pair,
        "-x", data_dir + "/condition_" + pair,
        "-t", model_type,
        "-o", data_dir + "/" + output_prefix + pair,
    ]
    with open(script_path, "w") as handle:
        handle.write(" ".join(shell_quote(arg) for arg in args) + "\n")


def print_task_usage():
    """Print the list of available tasks."""
    print("\nPlease specify task (SCATS.py -task <task>):\n")
    print("\trefgene: preprocess reference file\n")
    print("\tgroup: group alternative splicing exon\n")
    print("\tcount: count informative reads from indexed BAM file\n")
    #print("\tabkt: calculate technical parameters (alpha beta kappa tau)")
    print("\tgene: estimate mean gene expression for each single cell condition\n")
    print("\tdas: detect differential alternative splicing (DAS) for each exon group between conditions\n")
    print("\tsum: summarize DAS test results\n")


def task_refgene():
    """Preprocess the reference annotation with bin/PreProcess.pl."""
    validArgList = ["-task", "-ref", "-out"]
    addAbsPath = [0, 1, 3]
    message = "SCATS.py -task refgene -ref <reference file> -out <output file>"
    inputs = my.parse_argument(validArgList, addAbsPath, message)
    refFile = inputs[1]
    outFile = inputs[2]
    run_command(["perl", fileAbsPath + "/bin/PreProcess.pl",
                 "-r", refFile, "-o", outFile])


def task_group():
    """Group alternative splicing exons with bin/getgroupinfo.pl."""
    validArgList = ["-task", "-refgene", "-out"]
    addAbsPath = [0, 1, 3]
    message = "SCATS.py -task group -refgene <refgene file> -out <output file>"
    inputs = my.parse_argument(validArgList, addAbsPath, message)
    refFile = inputs[1]
    outFile = inputs[2]
    # The helper prints to stdout; capture it in the output file.
    with open(outFile, "w") as out:
        run_command(["perl", fileAbsPath + "/bin/getgroupinfo.pl", refFile],
                    stdout=out)


def task_count():
    """Generate the read-counting shell scripts under tmp/count_script."""
    umiRun = get_option("-umi")
    onebam = get_option("-onebam")
    if (umiRun not in ["yes", "no"]) or (onebam not in ["yes", "no"]):
        print("\nPlease specify umi and onebam option (SCATS.py -task count -umi <yes/no> -onebam <yes/no>):\n")
        print("\tumi: collect UMI count or not (if yes umitag is required to be specified)\n")
        print("\tonebam: whether all aligned reads are merged in one BAM file (if yes celltag and cellbc are required to be specified)\n")
        sys.exit()

    validArgList = ["-task", "-umi", "-onebam", "-meta", "-refgene", "-gpinfo"]
    addAbsPath = [0, 0, 0, 1, 1, 1]
    message = ("SCATS.py -task count -umi <yes/no> -onebam <yes/no> "
               "-meta <metafile> -refgene <refgene file> -gpinfo <gpinfo file>")
    inputs = my.parse_argument(validArgList, addAbsPath, message)
    metaFile = inputs[3]
    refgeneFile = inputs[4]
    gpinfoFile = inputs[5]

    tmpDir = crtAbsPath + "/tmp"
    my.mk_dir(tmpDir)
    tmpDir = tmpDir + "/count_script"
    my.mk_dir(tmpDir)

    # generate sh files for read counting process
    sc.write_count_sh(fileAbsPath, umiRun, onebam, metaFile, tmpDir,
                      refgeneFile, gpinfoFile)
    print("\nPlease run all scripts (count_*.sh files) under directory: " + tmpDir + "\n")


def task_gene():
    """Prepare gene-level data and the gene_*.sh estimation scripts."""
    validArgList = ["-task", "-ncore", "-meta"]
    addAbsPath = [0, 0, 1]
    message = "SCATS.py -task gene -ncore <# cores> -meta <metafile>"
    inputs = my.parse_argument(validArgList, addAbsPath, message)
    ncore = inputs[1]
    metaFile = inputs[2]

    tmpRoot = crtAbsPath + "/tmp"
    geneDir = tmpRoot + "/gene_script"
    countDir = tmpRoot + "/count_script"
    dataDir = geneDir + "/data"
    abktFile = tmpRoot + "/abkt/abkt_umi"

    my.mk_dir(geneDir)
    cdtList = sc.check_count_file(metaFile, countDir)

    # create celltype file
    with open(tmpRoot + "/celltypes", "w") as out:
        for cdt in cdtList:
            out.write(cdt + "\n")

    # estimate alpha
    my.mk_dir(tmpRoot + "/abkt")
    run_command(["perl", fileAbsPath + "/bin/getalpha.pl",
                 metaFile, countDir, abktFile])

    # estimate gene expression
    my.mk_dir(dataDir)

    # create compare group file; only the first condition is paired with
    # each of the others (the original loop ran ``i`` over range(0, 1)).
    with open(tmpRoot + "/comparegroup", "w") as out:
        for j in range(1, len(cdtList)):
            first = cdtList[0]
            other = cdtList[j]
            out.write(first + "\t" + other + "\n")
            run_command(["perl", fileAbsPath + "/bin/getgenelevelcount.pl",
                         first, other, abktFile, metaFile, countDir, dataDir])
            run_command(["perl", fileAbsPath + "/bin/gettascdata.pl",
                         first, other, dataDir, dataDir])
            # generate sh files for gene expression estimation
            write_model_script(
                geneDir + "/gene_" + first + "_" + other + ".sh",
                ncore, dataDir, "tascdata_", "outgene_", "4", first, other)

    print("\nPlease run all scripts (gene_*.sh files) under directory: " + geneDir + "\n")


def task_das():
    """Prepare exon-level data and the das_*.sh test scripts."""
    # NOTE: read count need to be filtered -- check getexonlevelcount.pl
    # file to specify.
    validArgList = ["-task", "-ncore", "-meta", "-gpinfo"]
    addAbsPath = [0, 0, 1, 1]
    message = "SCATS.py -task das -ncore <# cores> -meta <metafile> -gpinfo <gpinfo file>"
    inputs = my.parse_argument(validArgList, addAbsPath, message)
    ncore = inputs[1]
    metaFile = inputs[2]
    gpinfoFile = inputs[3]

    tmpRoot = crtAbsPath + "/tmp"
    dasDir = tmpRoot + "/das_script"
    dataDir = dasDir + "/data"
    my.mk_dir(dasDir)
    my.mk_dir(dataDir)
    cdtList = sc.check_count_file(metaFile, tmpRoot + "/count_script")

    # collect gene expression and bursting rate
    run_command(["perl", fileAbsPath + "/bin/getgeneleveltheta_umi.pl",
                 tmpRoot])

    # collect informative read counts for every pair of conditions
    for i in range(0, len(cdtList) - 1):
        for j in range(i + 1, len(cdtList)):
            first = cdtList[i]
            other = cdtList[j]
            run_command(["perl", fileAbsPath + "/bin/getexonlevelcount_umi.pl",
                         first, other, tmpRoot, metaFile, gpinfoFile])
            # generate sh files
            write_model_script(
                dasDir + "/das_" + first + "_" + other + ".sh",
                ncore, dataDir, "countdata_", "out_", "6", first, other)

    print("\nPlease run all scripts (das_*.sh files) under directory: " + dasDir + "\n")


def task_sum():
    """Summarize the DAS test results into summary/DAS_results."""
    validArgList = ["-task", "-gpinfo"]
    addAbsPath = [0, 1]
    message = "SCATS.py -task sum -gpinfo <gpinfo file>"
    inputs = my.parse_argument(validArgList, addAbsPath, message)
    gpinfoFile = inputs[1]

    summaryDir = crtAbsPath + "/summary"
    my.mk_dir(summaryDir)
    outFile = summaryDir + "/DAS_results"
    tmpDir = crtAbsPath + "/tmp"
    compareFile = crtAbsPath + "/tmp/comparegroup"
    my.check_file(compareFile, "Please run SCATS.py -task gene.")

    run_command(["perl", fileAbsPath + "/bin/summarizedas.pl",
                 tmpDir, gpinfoFile, outFile])


taskHandlers = {
    "refgene": task_refgene,
    "group": task_group,
    "count": task_count,
    "gene": task_gene,
    "das": task_das,
    "sum": task_sum,
}

task = get_option("-task")
if task not in taskList:
    print_task_usage()
else:
    taskHandlers[task]()
175 |
--------------------------------------------------------------------------------
/bin/PreProcess.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl

# PreProcess.pl
#
# Reads a tab-separated transcript annotation (-r) and, for every gene that
# has more than one transcript, writes to the output file (-o):
#   * one header line: gene, chrom, strand, start, end, comma-separated
#     transcript names;
#   * one line per non-overlapping segment: gene, chrom, strand, segment
#     start, segment end, and a comma-separated 0/1 vector telling which
#     transcripts (in header order) cover that segment.

use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;

my $refseq;    # gene annotation file (NOTE(review): columns match the UCSC refGene layout -- confirm)
my $output;    # path of the result file to write

GetOptions('r=s' => \$refseq, 'o=s' => \$output);

if (!$refseq || !$output) {
    pod2usage();
}

open(my $ref_fh, '<', $refseq)
    or die "Cannot open annotation file '$refseq': $!\n";
open(my $out_fh, '>', $output)
    or die "Cannot open output file '$output': $!\n";

######### load transcript annotation
my %knownGene = ();
while (my $line = <$ref_fh>) {
    chomp($line);
    next if $line =~ /^#/;              # header/comment line
    my @transcript = split(/\t/, $line);
    next if @transcript < 13;           # gene name lives in column 13
    next if $transcript[1] =~ /NR/;     # skip transcripts whose ID contains "NR"

    my $name = $transcript[12];
    my $tran = $transcript[1];
    $knownGene{$name}{$tran}{"chrom"}      = $transcript[2];
    $knownGene{$name}{$tran}{"strand"}     = $transcript[3];
    $knownGene{$name}{$tran}{"txStart"}    = $transcript[4];
    $knownGene{$name}{$tran}{"txEnd"}      = $transcript[5];
    $knownGene{$name}{$tran}{"exonCount"}  = $transcript[8];
    $knownGene{$name}{$tran}{"exonStarts"} = $transcript[9];
    $knownGene{$name}{$tran}{"exonEnds"}   = $transcript[10];
}
close($ref_fh);

########## load isoform annotation
my %isoGene  = ();    # gene -> list of transcript names
my %isoStart = ();    # gene -> smallest txStart over its transcripts
my %isoEnd   = ();    # gene -> largest txEnd over its transcripts

# Keys are sorted so the transcript order (and therefore the column order of
# the 0/1 vectors) is the same on every run.
foreach my $name (sort keys %knownGene) {
    foreach my $tran (sort keys %{ $knownGene{$name} }) {
        my $i_start = $knownGene{$name}{$tran}{"txStart"};
        my $i_end   = $knownGene{$name}{$tran}{"txEnd"};
        push @{ $isoGene{$name} }, $tran;

        # "defined" rather than a comparison with 0, so that a legitimate
        # coordinate of 0 is not mistaken for "not set yet".
        if (!defined $isoStart{$name} || $isoStart{$name} > $i_start) {
            $isoStart{$name} = $i_start;
        }
        if (!defined $isoEnd{$name} || $isoEnd{$name} < $i_end) {
            $isoEnd{$name} = $i_end;
        }
    }
}

################## process isoform information
foreach my $ID (sort keys %isoGene) {
    my @genename = @{ $isoGene{$ID} };

    ### you can specify the number of the isoform per gene here
    next if @genename <= 1;

    my $i_chrom  = $knownGene{$ID}{ $genename[0] }{"chrom"};
    my $i_strand = $knownGene{$ID}{ $genename[0] }{"strand"};
    my $i_start  = $isoStart{$ID};
    my $i_end    = $isoEnd{$ID};

    # position -> { transcript index -> 1 } for every base inside an exon
    my %ISO_Index = ();

    print {$out_fh} "$ID\t$i_chrom\t$i_strand\t$i_start\t$i_end\t";
    for (my $j = 0; $j <= $#genename; $j++) {
        my $name = $genename[$j];
        print {$out_fh} "$name,";
        my @start = split(/,/, $knownGene{$ID}{$name}{"exonStarts"});
        my @end   = split(/,/, $knownGene{$ID}{$name}{"exonEnds"});
        for (my $e = 0; $e <= $#start; $e++) {
            next unless defined $end[$e];    # unmatched exon start
            for (my $pos = $start[$e]; $pos <= $end[$e]; $pos++) {
                $ISO_Index{$pos}{$j} = 1;
            }
        }
    }
    print {$out_fh} "\n";

    # Walk the covered positions in order and open a new segment whenever
    # there is a gap or the set of covering transcripts changes.
    my @segments  = ();
    my $pre_POS   = $i_start - 10;
    my @pre_Index = (0) x @genename;

    for my $pos (sort { $a <=> $b } keys %ISO_Index) {
        my @cur_Index = ();
        my $changed   = 0;
        for (my $j = 0; $j <= $#genename; $j++) {
            $cur_Index[$j] = exists $ISO_Index{$pos}{$j} ? $ISO_Index{$pos}{$j} : 0;
            $changed++ if $cur_Index[$j] != $pre_Index[$j];
        }

        if ($pos - $pre_POS != 1 || $changed > 0) {
            # The previous segment ends at the last covered position.
            $segments[-1]{"end"} = $pre_POS if @segments;
            push @segments, { "start" => $pos, "Index" => [@cur_Index] };
            @pre_Index = @cur_Index;
        }
        $pre_POS = $pos;
    }
    $segments[-1]{"end"} = $i_end if @segments;

    ### print data structure
    for my $segment (@segments) {
        print {$out_fh} "$ID\t$i_chrom\t$i_strand\t";
        print {$out_fh} $segment->{"start"}, "\t";
        print {$out_fh} $segment->{"end"}, "\t";
        for my $flag (@{ $segment->{"Index"} }) {
            print {$out_fh} "$flag,";
        }
        print {$out_fh} "\n";
    }
}

close($out_fh) or die "Cannot finish writing '$output': $!\n";

__END__

=head1 SYNOPSIS

-r ---RefSeqAnnotation file

-o ---The file name that you want to save the results
184 |
--------------------------------------------------------------------------------
/bin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/bin/__init__.py
--------------------------------------------------------------------------------
/bin/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/bin/__init__.pyc
--------------------------------------------------------------------------------
/bin/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/bin/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/bin/__pycache__/my_functions.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/bin/__pycache__/my_functions.cpython-37.pyc
--------------------------------------------------------------------------------
/bin/__pycache__/scats_functions.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/bin/__pycache__/scats_functions.cpython-37.pyc
--------------------------------------------------------------------------------
/bin/check_software.py:
--------------------------------------------------------------------------------
"""Check that the Python modules and external programs SCATS uses are available."""

import my_functions as my

# Names are passed one by one to the project's checker helpers; dedicated
# variable names avoid shadowing the builtin ``list``.
required_modules = ["pysam", "numpy", "scipy", "cython"]
for module_name in required_modules:
    my.check_module(module_name)


required_programs = ["mpirun", "samtools"]
for program_name in required_programs:
    my.check_program(program_name)
11 |
--------------------------------------------------------------------------------
/bin/complie_likelihoodumi.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Rebuild the likelihoodumi shared library from likelihoodumi.pyx.

# Stop at the first failing step so gcc never runs after a failed cython.
set -e

# The paths below are relative, so work from the directory holding this script.
cd "$(dirname "$0")"

# -f: a missing file from a previous build is not an error.
rm -f ./likelihoodumi.c ./likelihoodumi.html ./likelihoodumi.so

cython -a ./likelihoodumi.pyx
gcc -shared -pthread -fPIC $(python-config --cflags) -o likelihoodumi.so likelihoodumi.c
7 |
--------------------------------------------------------------------------------
/bin/getCount.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | from __future__ import print_function # load print function in python3
4 | from collections import defaultdict
5 | import math, sys, os, re, pysam, time
6 |
7 | # set up auto dictionary function
def auto_dict():
    """Return a dictionary that creates nested dictionaries on demand.

    Looking up a missing key inserts (and returns) another ``auto_dict``, so
    values can be assigned at any depth without creating the intermediate
    levels first.
    """
    nested = defaultdict(auto_dict)
    return nested
10 |
11 |
12 | ###############################################################################
13 | ### ARGUMENT SETTINGS
14 | ###############################################################################
15 |
16 | # checking whether argument is valid or not
17 | validArgList = ["-bam", "-ref", "-out", "-gpinfo"]
18 | for argIndex in range(1,len(sys.argv)):
19 | if sys.argv[argIndex][0] == "-" and sys.argv[argIndex] not in validArgList :
20 | print("Argument \'"+sys.argv[argIndex]+"\' is invalid!")
21 | sys.exit()
22 |
23 |
24 | bamFileExists = 0
25 | refFileExists = 0
26 | outFileExists = 0
27 | gpinfoFileExists = 0
28 | for argIndex in range(1,len(sys.argv)):
29 | if sys.argv[argIndex] == "-bam": ## load in BAM file
30 | argIndex += 1
31 | bamFileAbsPath = os.path.dirname(os.path.abspath(sys.argv[argIndex]))
32 | bamTmp = sys.argv[argIndex].split("/")
33 | bamFile = bamFileAbsPath + "/" + bamTmp[len(bamTmp)-1]
34 | bamFileExists = 1
35 | elif sys.argv[argIndex] == "-ref": ## load in annotation file
36 | argIndex += 1
37 | refFileAbsPath = os.path.dirname(os.path.abspath(sys.argv[argIndex]))
38 | refTmp = sys.argv[argIndex].split("/")
39 | refGeneFile = refFileAbsPath + "/" + refTmp[len(refTmp)-1]
40 | refFileExists = 1
41 | elif sys.argv[argIndex] == "-out": ## load in annotation file
42 | argIndex += 1
43 | outFileAbsPath = os.path.dirname(os.path.abspath(sys.argv[argIndex]))
44 | outTmp = sys.argv[argIndex].split("/")
45 | outFile = outFileAbsPath + "/" + outTmp[len(outTmp)-1]
46 | outFileExists = 1
47 | elif sys.argv[argIndex] == "-gpinfo": ## load group information file
48 | argIndex += 1
49 | gpinfoFileAbsPath = os.path.dirname(os.path.abspath(sys.argv[argIndex]))
50 | gpinfoTmp = sys.argv[argIndex].split("/")
51 | gpinfoFile = gpinfoFileAbsPath + "/" + gpinfoTmp[len(gpinfoTmp)-1]
52 | gpinfoFileExists = 1
53 |
54 |
55 | if bamFileExists == 0 or refFileExists == 0 or outFileExists == 0 or gpinfoFileExists == 0: ## lack enough arguments
56 | print("Please provide arguments:")
57 | print("-bam\tIndexed bam file")
58 | print("-ref\tGene annotation file")
59 | print("-out\tOutput file")
60 | print("-gpinfo\tGroup Information file")
61 | sys.exit()
62 |
63 |
64 | # load gene information
65 | geneStructureInformation = auto_dict()
66 | geneLineCount = auto_dict()
67 |
68 | with open(refGeneFile, "r") as FP:
69 | for line in FP:
70 | line = line.strip("\n")
71 | tmpinf = line.split("\t")
72 | gene = tmpinf[0]
73 |
74 | if not bool(geneStructureInformation[gene]):
75 | geneLineCount[gene] = 0
76 | geneStructureInformation[gene][geneLineCount[gene]] = line
77 | else:
78 | geneLineCount[gene] += 1
79 | geneStructureInformation[gene][geneLineCount[gene]] = line
80 |
81 | # load group information
82 |
83 | groupInformation = auto_dict()
84 | geneLineCount1 = auto_dict()
85 | with open(gpinfoFile, "r") as FP:
86 | for line in FP:
87 | line = line.strip("\n")
88 | tmpinf = line.split("\t")
89 | tmpinf[5] = tmpinf[5].strip(",")
90 | gene = tmpinf[0]
91 |
92 | groupInformation[gene][tmpinf[1]][tmpinf[3]] = tmpinf[5]
93 |
94 |
95 |
96 |
97 | #####################################
98 | ## Using pysam to read in bam file !!
99 | #####################################
100 | bamFilePysam = pysam.Samfile(bamFile,"rb")
101 |
102 |
103 | ## RESULTS FILE
104 | OUT = open(outFile, 'w')
105 |
106 |
107 | ###########################################################################################################################
108 | ### START TO ANALYZE DATA FOR EACH GENE ###
109 | ##########################################################################################################################
110 |
111 | geneCount = 0
112 |
113 | startTime = time.time()
114 |
115 | #OUT.write("GeneName\tIsoformName\tNumberOfReads\tRelativeAbundance\n") ## Header of Results
116 |
117 | for gene in geneStructureInformation:
118 |
119 | countResults = auto_dict()
120 |
121 | geneCount += 1
122 | tmpTime = (time.time() - startTime)/60.0
123 |
124 |
125 | sameReadCount = auto_dict()
126 | readStart = auto_dict()
127 | readEnd = auto_dict()
128 | readCigar = auto_dict()
129 |
130 | numofExons = geneLineCount[gene]
131 | tmpgeneinf = geneStructureInformation[gene][0].split("\t")
132 | geneChr = tmpgeneinf[1]
133 | geneStart = int(tmpgeneinf[3])
134 | geneEnd = int(tmpgeneinf[4])
135 | if bamFilePysam.get_tid(geneChr) == -1:
136 | continue
137 |
138 | ## load all reads information which were mapped to the specific gene within this loop using pysam
139 | for read in bamFilePysam.fetch(geneChr, geneStart, geneEnd):
140 | line = str(read)
141 | tmpinf = line.split("\t")
142 | tmpReadName = tmpinf[0]
143 | tmpReadChr = geneChr
144 | tmpReadStart = int(tmpinf[3]) + 1
145 | tmpReadCigar = ""
146 |
147 | ## Adjust to different Pysam Version!! ##
148 |
149 | if ")]" in tmpinf[5]: ## vector format
150 |
151 | tmpinf[5] = tmpinf[5].rstrip(")]")
152 | tmpinf[5] = tmpinf[5].lstrip("[(")
153 | tmpinfcigar = tmpinf[5].split("), (")
154 | for cc in tmpinfcigar:
155 | ttcc = cc.split(", ")
156 | if ttcc[0] == "3":
157 | tmpReadCigar = tmpReadCigar + ttcc[1] + "N"
158 | if ttcc[0] == "2":
159 | tmpReadCigar = tmpReadCigar + ttcc[1] + "D"
160 | if ttcc[0] == "1":
161 | tmpReadCigar = tmpReadCigar + ttcc[1] + "I"
162 | if ttcc[0] == "0":
163 | tmpReadCigar = tmpReadCigar + ttcc[1] + "M"
164 | if not (ttcc[0] == "3" or ttcc[0] == "2" or ttcc[0] == "1" or ttcc[0] == "0"):
165 | tmpReadCigar = tmpReadCigar + ttcc[1] + "X"
166 | else: ## 100M10N100M format
167 | tmpReadCigar = tmpinf[5]
168 |
169 | if not bool(sameReadCount[tmpReadName]):
170 | sameReadCount[tmpReadName] = 1
171 | else:
172 | sameReadCount[tmpReadName] += 1
173 |
174 | readStart[tmpReadName][sameReadCount[tmpReadName]] = tmpReadStart
175 | readCigar[tmpReadName][sameReadCount[tmpReadName]] = tmpReadCigar
176 |
177 |
178 | ## load structure information of the specific gene within this loop
179 |
180 | tmpgeneinf[5] = tmpgeneinf[5].rstrip(",")
181 | isoformNames = tmpgeneinf[5].split(",")
182 | exonStarts = [None] * numofExons
183 | exonEnds = [None] * numofExons
184 | exonIndicators = auto_dict()
185 |
186 | for i in range(1,numofExons+1):
187 | tmpinf = geneStructureInformation[gene][i].split("\t")
188 | exonStarts[i-1] = int(tmpinf[3])+1
189 | exonEnds[i-1] = int(tmpinf[4])
190 | tmpinf[5] = tmpinf[5].rstrip(",")
191 | tmpExonIndicators = tmpinf[5].split(",")
192 |
193 | for j in range(len(tmpExonIndicators)):
194 | exonIndicators[isoformNames[j]][i-1] = int(tmpExonIndicators[j])
195 |
196 | lociIndicators = auto_dict()
197 | for i in range(len(isoformNames)):
198 | for j in range(len(exonStarts)):
199 | if exonIndicators[isoformNames[i]][j] == 1:
200 | for k in range(exonStarts[j], exonEnds[j]+1):
201 | lociIndicators[isoformNames[i]][k] = 1
202 |
203 | #########################################################################################################################################
204 | ## START TO ANALYZE EACH READ
205 | ##################################################################################################################################################
206 |
207 | qualifiedRead = auto_dict()
208 | readCount = 0
209 | fragmentStart = auto_dict()
210 | fragmentEnd = auto_dict()
211 | CompatibleMatrix = auto_dict()
212 | tmpCompatibleMatrix = auto_dict()
213 |
214 | for readName in sameReadCount:
215 |
216 | # load CIGAR information
217 | cigarNumberRead1 = auto_dict()
218 | cigarNumberRead2 = auto_dict()
219 | cigarMatchRead1 = auto_dict()
220 | cigarMatchRead2 = auto_dict()
221 | cigarInfCountRead1 = 0
222 | cigarInfCountRead2 = 0
223 | cigarInfCountRead1tmp = 0
224 | cigarInfCountRead2tmp = 0
225 |
226 | tmp1 = re.split("([A-Z])",readCigar[readName][1])
227 | for i in range(len(tmp1)-1):
228 | if tmp1[i].isalpha():
229 | cigarMatchRead1[cigarInfCountRead1] = tmp1[i]
230 | cigarInfCountRead1 += 1
231 | else:
232 | cigarNumberRead1[cigarInfCountRead1] = int(tmp1[i])
233 | cigarInfCountRead1tmp += 1
234 |
235 | if sameReadCount[readName] == 2:
236 | tmp2 = re.split("([A-Z])",readCigar[readName][2])
237 | for i in range(len(tmp2)-1):
238 | if tmp2[i].isalpha():
239 | cigarMatchRead2[cigarInfCountRead2] = tmp2[i]
240 | cigarInfCountRead2 += 1
241 | else:
242 | cigarNumberRead2[cigarInfCountRead2] = int(tmp2[i])
243 | cigarInfCountRead2tmp += 1
244 |
245 | # calculate read end positions
246 | readEnd[readName][1] = readStart[readName][1]
247 | for i in range(cigarInfCountRead1):
248 | readEnd[readName][1] += cigarNumberRead1[i]
249 |
250 | if sameReadCount[readName] == 2:
251 | readEnd[readName][2] = readStart[readName][2]
252 | for i in range(cigarInfCountRead2):
253 | readEnd[readName][2] += cigarNumberRead2[i]
254 |
255 | # calculate fragment START and END positions
256 | if sameReadCount[readName] == 2:
257 | fragmentStart[readName] = readStart[readName][2] if readStart[readName][1] >= readStart[readName][2] else readStart[readName][1]
258 | fragmentEnd[readName] = readEnd[readName][1] if readEnd[readName][1] >= readEnd[readName][2] else readEnd[readName][2]
259 |
260 | if sameReadCount[readName] == 1:
261 | fragmentStart[readName] = readStart[readName][1]
262 | fragmentEnd[readName] = readEnd[readName][1]
263 |
264 | ##################################################################################################################################
265 | ## Obtain compatible matrix of isoforms with respect to reads
266 | #################################################################################################################################
267 |
268 | if (readStart[readName][1] >= geneStart and readStart[readName][1] <= geneEnd) or (readStart[readName][2] >= geneStart and readStart[readName][2] <= geneEnd and sameReadCount[readName]==2) :
269 | if cigarInfCountRead1 == cigarInfCountRead1tmp and cigarInfCountRead2 == cigarInfCountRead2tmp:
270 | base1 = readStart[readName][1] - 1
271 | exonIndicatorRead1 = [0] * numofExons
272 | if sameReadCount[readName] == 2:
273 | base2 = readStart[readName][2] - 1
274 | exonIndicatorRead2 = [0] * numofExons
275 | compatibleVector = [1] * len(isoformNames)
276 |
277 | ##############################################################################################################################################
278 | ### SET TUP COMPATIBLE INDICATOR VECTOR ###############
279 | ###############################################################################################################################################
280 | ## READ 1 ##
281 | # find exons where read 1 mapped to
282 | for i in range(cigarInfCountRead1):
283 |
284 | if cigarMatchRead1[i] == "M" or cigarMatchRead1[i] == "I": ## matched CIGAR
285 |
286 | for j in range(1,cigarNumberRead1[i]+1):
287 | tmpbase = base1 + j
288 | for k in range(len(exonStarts)):
289 | if exonIndicatorRead1[k] == 1: continue
290 | if tmpbase >= exonStarts[k] and tmpbase <= exonEnds[k]: exonIndicatorRead1[k] = 1 ## confirm that the read covers this exon
291 |
292 | base1 += cigarNumberRead1[i] # jump to next match information
293 |
294 | if cigarMatchRead1[i] == "N": ## skipping area
295 | base1 += cigarNumberRead1[i] # jump to next match information directly
296 |
297 | # set up indicator vector
298 | tmpcount1 = 0
299 | tmpcount11 = 0 ## these two variable are used to rule out skipping exons
300 | for i in range(len(exonIndicatorRead1)):
301 | if exonIndicatorRead1[i] == 1: tmpcount1 += 1
302 | for i in range(len(exonIndicatorRead1)):
303 |
304 | if exonIndicatorRead1[i] == 1:
305 | tmpcount11 += 1
306 | for j in range(len(isoformNames)):
307 | if exonIndicators[isoformNames[j]][i] == 0: compatibleVector[j] = 0 ## rule out isoform j if reads covers skipping area of isoform j
308 |
309 | if exonIndicatorRead1[i] == 0: #aim to rule out isforms which includes exons which skipped by read
310 | if tmpcount1 > 1 and tmpcount11 >= 1 and tmpcount11 < tmpcount1: ## confirm the exon i is skipped by read!!
311 | for j in range(len(isoformNames)):
312 | if exonIndicators[isoformNames[j]][i] == 1: compatibleVector[j] = 0
313 |
314 |
315 | ## READ 2 ## SAME AS READ 1
316 | tmpcount2 = 0
317 | if sameReadCount[readName] == 2: ## ONLY WHEN THE READ IS PAIRED-END READ!!!
318 | # find exons where read 2 mapped to
319 | for i in range(cigarInfCountRead2):
320 |
321 | if cigarMatchRead2[i] == "M" or cigarMatchRead2[i] == "I": ## matched CIGAR
322 |
323 | for j in range(1,cigarNumberRead2[i]+1):
324 | tmpbase = base2 + j
325 | for k in range(len(exonStarts)):
326 | if exonIndicatorRead2[k] == 1: continue
327 | if tmpbase >= exonStarts[k] and tmpbase <= exonEnds[k]: exonIndicatorRead2[k] = 1 ## confirm that the read covers this exon
328 |
329 | base2 += cigarNumberRead2[i] # jump to next match information
330 |
331 | if cigarMatchRead2[i] == "N": ## skipping area
332 | base2 += cigarNumberRead2[i] # jump to next match information directly
333 |
334 | # set up indicator vector
335 | tmpcount2 = 0
336 | tmpcount22 = 0 ## these two variable are used to rule out skipping exons
337 | for i in range(len(exonIndicatorRead2)):
338 | if exonIndicatorRead2[i] == 1: tmpcount2 += 1
339 | for i in range(len(exonIndicatorRead2)):
340 |
341 | if exonIndicatorRead2[i] == 1:
342 | tmpcount22 += 1
343 | for j in range(len(isoformNames)):
344 | if exonIndicators[isoformNames[j]][i] == 0: compatibleVector[j] = 0 ## rule out isoform j if reads covers skipping area of isoform j
345 |
346 | if exonIndicatorRead2[i] == 0: #aim to rule out isforms which includes exons which skipped by read
347 | if tmpcount2 > 1 and tmpcount22 >= 1 and tmpcount22 < tmpcount2: ## confirm the exon i is skipped by read!!
348 | for j in range(len(isoformNames)):
349 | if exonIndicators[isoformNames[j]][i] == 1: compatibleVector[j] = 0
350 |
351 | ##################################################################################################################################################
352 | ## fill in compatible matrix ##
353 | if tmpcount1 > 0 or (tmpcount2 > 0 and sameReadCount[readName] == 2):
354 | readCount += 1
355 | qualifiedRead[readName] = 1
356 | for i in range(len(isoformNames)):
357 | CompatibleMatrix[readName][isoformNames[i]] = compatibleVector[i]
358 | tmpCompatibleMatrix[readName][isoformNames[i]] = compatibleVector[i]
359 | else:
360 | qualifiedRead[readName] = 0
361 |
362 |
363 | ### COMPATIBLE MATRIX OBTAINED !!!
364 | ###############################################################################################################
365 |
366 | if readCount == 0: continue
367 | print(gene+"\t"+str(readCount)+" reads detected...")
368 |
369 | for weight in groupInformation[gene]:
370 | countResults[weight]["+"] = 0
371 | countResults[weight]["-"] = 0
372 | isosetplus = groupInformation[gene][weight]["+"].split(",")
373 | isosetminus = groupInformation[gene][weight]["-"].split(",")
374 |
375 | for readName in qualifiedRead:
376 | if qualifiedRead[readName] == 0: continue
377 | sumindexplus = 0
378 | for index in isosetplus:
379 | if CompatibleMatrix[readName][isoformNames[int(index)]] == 1: sumindexplus += 1
380 | sumindexminus = 0
381 | for index in isosetminus:
382 | if CompatibleMatrix[readName][isoformNames[int(index)]] == 1: sumindexminus += 1
383 | if sumindexplus == 0:
384 | countResults[weight]["+"] += 1
385 | if sumindexminus == 0:
386 | countResults[weight]["-"] += 1
387 |
388 | OUT.write(gene+"\t"+str(readCount)+"\t"+weight+"\t"+"+"+"\t"+str(countResults[weight]["+"])+"\n")
389 | OUT.write(gene+"\t"+str(readCount)+"\t"+weight+"\t"+"-"+"\t"+str(countResults[weight]["-"])+"\n")
390 |
391 | OUT.close()
392 |
393 |
394 |
395 |
396 |
397 |
--------------------------------------------------------------------------------
/bin/getCount_cellid.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | from __future__ import print_function # load print function in python3
4 | from collections import defaultdict
5 | import math, sys, os, re, pysam, time
6 | import my_functions as my
7 |
8 | # set up auto dictionary function
def auto_dict():
    """Return an auto-vivifying dictionary.

    Looking up a missing key creates (and stores) another auto_dict, so
    nested assignments such as d[a][b][c] = value work without any setup.
    """
    nested = defaultdict(auto_dict)
    return nested
11 |
12 |
13 | ###############################################################################
14 | ### ARGUMENT SETTINGS
15 | ###############################################################################
16 |
# Validate the command-line options and collect their values.
validArgList = ["-bam", "-ref", "-out", "-gpinfo", "-cellid", "-celltag"]
# One flag per option, in the same order as validArgList; interpreted by
# my.parse_argument (presumably controls path handling -- TODO confirm).
addAbsPath = [1,1,3,1,0,0]
warnMessage = "-bam, -ref, -out, -gpinfo, -cellid, -celltag"
inputFile = my.parse_argument(validArgList, addAbsPath, warnMessage)

# Values come back in the same order as validArgList.
(bamFile, refGeneFile, outFile,
 gpinfoFile, cellid, celltag) = [inputFile[position] for position in range(6)]
28 |
29 |
# Load the reference annotation.  Records are grouped by their first column
# (gene name): the first record seen for a gene is stored under index 0 and
# every later record under 1, 2, ...  geneLineCount[gene] therefore ends up
# as the number of records that follow the first one.
geneStructureInformation = auto_dict()
geneLineCount = auto_dict()

with open(refGeneFile, "r") as FP:
    for rawLine in FP:
        record = rawLine.strip("\n")
        geneName = record.split("\t")[0]
        geneRecords = geneStructureInformation[geneName]

        if geneRecords:
            geneLineCount[geneName] += 1
        else:
            # nothing stored for this gene yet
            geneLineCount[geneName] = 0
        geneRecords[geneLineCount[geneName]] = record
46 |
# Load the isoform group definitions.  Columns used (0-based, tab separated):
#   0 = gene, 1 = group key, 3 = sign ("+" / "-" are the keys read later),
#   5 = comma-separated isoform indexes (surrounding commas are dropped).

groupInformation = auto_dict()
geneLineCount1 = auto_dict()  # not referenced again in this script
with open(gpinfoFile, "r") as FP:
    for rawLine in FP:
        fields = rawLine.strip("\n").split("\t")
        groupInformation[fields[0]][fields[1]][fields[3]] = fields[5].strip(",")
59 |
60 |
61 |
62 |
###############################################################################
### HELPERS
###############################################################################

# Numeric CIGAR operation codes (as printed in tuple form) -> CIGAR letters.
_CIGAR_OP_LETTERS = {"0": "M", "1": "I", "2": "D", "3": "N"}
_CIGAR_LETTER_PATTERN = re.compile("([A-Z])")


def _normalize_cigar(cigarField):
    """Return the CIGAR column of str(read) in compact '100M10N100M' form.

    Depending on the pysam version the column is either already compact or a
    printed list of (operation, length) tuples such as '[(0, 100), (3, 10)]'.
    Operation codes other than 0-3 are written with the letter 'X'.
    """
    if ")]" not in cigarField:  ## 100M10N100M format
        return cigarField
    pieces = []
    for pair in cigarField.rstrip(")]").lstrip("[(").split("), ("):
        items = pair.split(", ")
        pieces.append(items[1] + _CIGAR_OP_LETTERS.get(items[0], "X"))
    return "".join(pieces)


def _parse_cigar(cigar):
    """Split a compact CIGAR string into two lists: (operations, lengths).

    The CIGAR is well formed only when both lists have the same length;
    callers must check that before pairing the entries up.
    """
    operations = []
    lengths = []
    # Splitting on a captured letter alternates text / letter / text ...; the
    # last element is whatever follows the final letter and is ignored.
    for token in _CIGAR_LETTER_PATTERN.split(cigar)[:-1]:
        if token.isalpha():
            operations.append(token)
        else:
            lengths.append(int(token))
    return operations, lengths


def _covered_exons(readStartPos, operations, lengths, exonStarts, exonEnds):
    """Return a 0/1 list flagging every exon touched by an aligned segment.

    readStartPos is expressed in the same inclusive coordinates as
    exonStarts / exonEnds.
    """
    covered = [0] * len(exonStarts)
    base = readStartPos - 1
    for operation, length in zip(operations, lengths):
        if operation == "M" or operation == "I":  ## matched CIGAR
            # Positions base+1 .. base+length hit exon k exactly when the two
            # closed ranges intersect; this replaces a base-by-base walk.
            segmentStart = base + 1
            segmentEnd = base + length
            for k in range(len(exonStarts)):
                if covered[k] == 0 and max(segmentStart, exonStarts[k]) <= min(segmentEnd, exonEnds[k]):
                    covered[k] = 1
            base += length  # jump to next match information
        elif operation == "N":  ## skipping area
            base += length
        # NOTE(review): per the SAM specification 'I' does not consume
        # reference bases while 'D' does.  The position bookkeeping here
        # (advance on M/I/N, ignore D) is kept exactly as it was -- confirm
        # whether that is intended.
    return covered


def _rule_out_isoforms(covered, isoformNames, exonIndicators, compatibleVector):
    """Clear compatibleVector entries of isoforms contradicted by one read.

    An isoform is ruled out when the read covers an exon the isoform lacks,
    or when the isoform contains an exon lying between two covered exons
    that the read itself does not cover.  compatibleVector is modified in
    place.  Returns the number of exons the read covers.
    """
    totalCovered = sum(covered)
    coveredSoFar = 0
    for i, hit in enumerate(covered):
        if hit == 1:
            coveredSoFar += 1
            for j, isoform in enumerate(isoformNames):
                if exonIndicators[isoform][i] == 0:
                    compatibleVector[j] = 0
        elif totalCovered > 1 and 1 <= coveredSoFar < totalCovered:
            ## exon i is skipped by the read
            for j, isoform in enumerate(isoformNames):
                if exonIndicators[isoform][i] == 1:
                    compatibleVector[j] = 0
    return totalCovered


#####################################
## Using pysam to read in bam file !!
#####################################
# NOTE(review): pysam.Samfile is the legacy name of pysam.AlignmentFile; it is
# kept because the str(read) parsing below appears tied to older pysam output.
bamFilePysam = pysam.Samfile(bamFile,"rb")

## RESULTS FILE
OUT = open(outFile, 'w')

###########################################################################################################################
### START TO ANALYZE DATA FOR EACH GENE ###
##########################################################################################################################

try:
    for gene in geneStructureInformation:

        # Record 0 of a gene carries chromosome (col 1), start (col 3),
        # end (col 4) and the comma-separated isoform names (col 5).
        numofExons = geneLineCount[gene]
        tmpgeneinf = geneStructureInformation[gene][0].split("\t")
        geneChr = tmpgeneinf[1]
        geneStart = int(tmpgeneinf[3])
        geneEnd = int(tmpgeneinf[4])
        if bamFilePysam.get_tid(geneChr) == -1:
            continue

        ## load all reads of the requested cell which were mapped to this gene
        sameReadCount = auto_dict()
        readStart = auto_dict()
        readCigar = auto_dict()

        for read in bamFilePysam.fetch(geneChr, geneStart, geneEnd):
            tmpinf = str(read).split("\t")
            tmpReadName = tmpinf[0]
            tmpReadStart = int(tmpinf[3]) + 1
            try:
                tmpCellBarcode = read.get_tag(celltag)
            except KeyError:
                # read carries no cell-barcode tag
                continue
            if cellid != tmpCellBarcode:
                continue

            tmpReadCigar = _normalize_cigar(tmpinf[5])

            if not bool(sameReadCount[tmpReadName]):
                sameReadCount[tmpReadName] = 1
            else:
                sameReadCount[tmpReadName] += 1

            readStart[tmpReadName][sameReadCount[tmpReadName]] = tmpReadStart
            readCigar[tmpReadName][sameReadCount[tmpReadName]] = tmpReadCigar

        ## load structure information of the specific gene within this loop
        isoformNames = tmpgeneinf[5].rstrip(",").split(",")
        exonStarts = []
        exonEnds = []
        exonIndicators = auto_dict()

        for i in range(1, numofExons + 1):
            tmpinf = geneStructureInformation[gene][i].split("\t")
            exonStarts.append(int(tmpinf[3]) + 1)
            exonEnds.append(int(tmpinf[4]))
            tmpExonIndicators = tmpinf[5].rstrip(",").split(",")
            for j, flag in enumerate(tmpExonIndicators):
                exonIndicators[isoformNames[j]][i - 1] = int(flag)

        #########################################################################################################################################
        ## START TO ANALYZE EACH READ
        ##################################################################################################################################################

        readCount = 0
        CompatibleMatrix = {}  # qualified read name -> {isoform name: 0/1}

        for readName in sameReadCount:
            isPaired = sameReadCount[readName] == 2

            operations1, lengths1 = _parse_cigar(readCigar[readName][1])
            if isPaired:
                operations2, lengths2 = _parse_cigar(readCigar[readName][2])
            else:
                operations2, lengths2 = [], []

            # The mate is only inspected for paired reads.  Looking at
            # readStart[readName][2] of a single-end read would compare an
            # auto-created dictionary with an int (TypeError on Python 3).
            read1InGene = geneStart <= readStart[readName][1] <= geneEnd
            read2InGene = isPaired and geneStart <= readStart[readName][2] <= geneEnd
            if not (read1InGene or read2InGene):
                continue
            if len(operations1) != len(lengths1) or len(operations2) != len(lengths2):
                continue  # malformed CIGAR

            compatibleVector = [1] * len(isoformNames)

            ## READ 1 ##
            exonIndicatorRead1 = _covered_exons(readStart[readName][1], operations1, lengths1, exonStarts, exonEnds)
            tmpcount1 = _rule_out_isoforms(exonIndicatorRead1, isoformNames, exonIndicators, compatibleVector)

            ## READ 2 ## SAME AS READ 1, ONLY WHEN THE READ IS PAIRED-END
            tmpcount2 = 0
            if isPaired:
                exonIndicatorRead2 = _covered_exons(readStart[readName][2], operations2, lengths2, exonStarts, exonEnds)
                tmpcount2 = _rule_out_isoforms(exonIndicatorRead2, isoformNames, exonIndicators, compatibleVector)

            ## fill in compatible matrix ##
            if tmpcount1 > 0 or tmpcount2 > 0:
                readCount += 1
                CompatibleMatrix[readName] = dict(zip(isoformNames, compatibleVector))

        ### COMPATIBLE MATRIX OBTAINED !!!
        ###############################################################################################################

        if readCount == 0: continue
        print(gene+"\t"+str(readCount)+" reads detected...")

        for weight in groupInformation[gene]:
            # isoform indexes of each side, resolved once per group
            plusIsoforms = [isoformNames[int(index)] for index in groupInformation[gene][weight]["+"].split(",")]
            minusIsoforms = [isoformNames[int(index)] for index in groupInformation[gene][weight]["-"].split(",")]

            # A read is counted for a side when it is compatible with none
            # of that side's isoforms.
            countPlus = 0
            countMinus = 0
            for readCompatibility in CompatibleMatrix.values():
                if not any(readCompatibility[name] == 1 for name in plusIsoforms):
                    countPlus += 1
                if not any(readCompatibility[name] == 1 for name in minusIsoforms):
                    countMinus += 1

            OUT.write(gene+"\t"+str(readCount)+"\t"+weight+"\t"+"+"+"\t"+str(countPlus)+"\n")
            OUT.write(gene+"\t"+str(readCount)+"\t"+weight+"\t"+"-"+"\t"+str(countMinus)+"\n")
finally:
    OUT.close()
    bamFilePysam.close()
365 |
366 |
367 |
368 |
369 |
370 |
--------------------------------------------------------------------------------
/bin/getCount_umi.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | from __future__ import print_function # load print function in python3
4 | from collections import defaultdict
5 | import math, sys, os, re, pysam, time
6 | import my_functions as my
7 |
8 | # set up auto dictionary function
def auto_dict():
    """Return an auto-vivifying dictionary.

    Looking up a missing key creates (and stores) another auto_dict, so
    nested assignments such as d[a][b][c] = value work without any setup.
    """
    nested = defaultdict(auto_dict)
    return nested
11 |
12 |
13 | ###############################################################################
14 | ### ARGUMENT SETTINGS
15 | ###############################################################################
16 |
# Validate the command-line options and collect their values.
validArgList = ["-bam", "-ref", "-out", "-gpinfo", "-umitag"]
# One flag per option, in the same order as validArgList; interpreted by
# my.parse_argument (presumably controls path handling -- TODO confirm).
addAbsPath = [1,1,3,1,0]
warnMessage = "-bam, -ref, -out, -gpinfo, -umitag"
inputFile = my.parse_argument(validArgList, addAbsPath, warnMessage)

# Values come back in the same order as validArgList.
(bamFile, refGeneFile, outFile,
 gpinfoFile, umitag) = [inputFile[position] for position in range(5)]
27 |
28 |
# Load the reference annotation.  Records are grouped by their first column
# (gene name): the first record seen for a gene is stored under index 0 and
# every later record under 1, 2, ...  geneLineCount[gene] therefore ends up
# as the number of records that follow the first one.
geneStructureInformation = auto_dict()
geneLineCount = auto_dict()

with open(refGeneFile, "r") as FP:
    for rawLine in FP:
        record = rawLine.strip("\n")
        geneName = record.split("\t")[0]
        geneRecords = geneStructureInformation[geneName]

        if geneRecords:
            geneLineCount[geneName] += 1
        else:
            # nothing stored for this gene yet
            geneLineCount[geneName] = 0
        geneRecords[geneLineCount[geneName]] = record
45 |
# Load the isoform group definitions.  Columns used (0-based, tab separated):
#   0 = gene, 1 = group key, 3 = sign ("+" / "-" are the keys read later),
#   5 = comma-separated isoform indexes (surrounding commas are dropped).

groupInformation = auto_dict()
geneLineCount1 = auto_dict()  # not referenced again in this script
with open(gpinfoFile, "r") as FP:
    for rawLine in FP:
        fields = rawLine.strip("\n").split("\t")
        groupInformation[fields[0]][fields[1]][fields[3]] = fields[5].strip(",")
58 |
59 |
60 |
61 |
###############################################################################
### HELPERS
###############################################################################

# Numeric CIGAR operation codes (as printed in tuple form) -> CIGAR letters.
_CIGAR_OP_LETTERS = {"0": "M", "1": "I", "2": "D", "3": "N"}
_CIGAR_LETTER_PATTERN = re.compile("([A-Z])")


def _normalize_cigar(cigarField):
    """Return the CIGAR column of str(read) in compact '100M10N100M' form.

    Depending on the pysam version the column is either already compact or a
    printed list of (operation, length) tuples such as '[(0, 100), (3, 10)]'.
    Operation codes other than 0-3 are written with the letter 'X'.
    """
    if ")]" not in cigarField:  ## 100M10N100M format
        return cigarField
    pieces = []
    for pair in cigarField.rstrip(")]").lstrip("[(").split("), ("):
        items = pair.split(", ")
        pieces.append(items[1] + _CIGAR_OP_LETTERS.get(items[0], "X"))
    return "".join(pieces)


def _parse_cigar(cigar):
    """Split a compact CIGAR string into two lists: (operations, lengths).

    The CIGAR is well formed only when both lists have the same length;
    callers must check that before pairing the entries up.
    """
    operations = []
    lengths = []
    # Splitting on a captured letter alternates text / letter / text ...; the
    # last element is whatever follows the final letter and is ignored.
    for token in _CIGAR_LETTER_PATTERN.split(cigar)[:-1]:
        if token.isalpha():
            operations.append(token)
        else:
            lengths.append(int(token))
    return operations, lengths


def _covered_exons(readStartPos, operations, lengths, exonStarts, exonEnds):
    """Return a 0/1 list flagging every exon touched by an aligned segment.

    readStartPos is expressed in the same inclusive coordinates as
    exonStarts / exonEnds.
    """
    covered = [0] * len(exonStarts)
    base = readStartPos - 1
    for operation, length in zip(operations, lengths):
        if operation == "M" or operation == "I":  ## matched CIGAR
            # Positions base+1 .. base+length hit exon k exactly when the two
            # closed ranges intersect; this replaces a base-by-base walk.
            segmentStart = base + 1
            segmentEnd = base + length
            for k in range(len(exonStarts)):
                if covered[k] == 0 and max(segmentStart, exonStarts[k]) <= min(segmentEnd, exonEnds[k]):
                    covered[k] = 1
            base += length  # jump to next match information
        elif operation == "N":  ## skipping area
            base += length
        # NOTE(review): per the SAM specification 'I' does not consume
        # reference bases while 'D' does.  The position bookkeeping here
        # (advance on M/I/N, ignore D) is kept exactly as it was -- confirm
        # whether that is intended.
    return covered


def _rule_out_isoforms(covered, isoformNames, exonIndicators, compatibleVector):
    """Clear compatibleVector entries of isoforms contradicted by one read.

    An isoform is ruled out when the read covers an exon the isoform lacks,
    or when the isoform contains an exon lying between two covered exons
    that the read itself does not cover.  compatibleVector is modified in
    place.  Returns the number of exons the read covers.
    """
    totalCovered = sum(covered)
    coveredSoFar = 0
    for i, hit in enumerate(covered):
        if hit == 1:
            coveredSoFar += 1
            for j, isoform in enumerate(isoformNames):
                if exonIndicators[isoform][i] == 0:
                    compatibleVector[j] = 0
        elif totalCovered > 1 and 1 <= coveredSoFar < totalCovered:
            ## exon i is skipped by the read
            for j, isoform in enumerate(isoformNames):
                if exonIndicators[isoform][i] == 1:
                    compatibleVector[j] = 0
    return totalCovered


#####################################
## Using pysam to read in bam file !!
#####################################
# NOTE(review): pysam.Samfile is the legacy name of pysam.AlignmentFile; it is
# kept because the str(read) parsing below appears tied to older pysam output.
bamFilePysam = pysam.Samfile(bamFile,"rb")

## RESULTS FILE
OUT = open(outFile, 'w')

###########################################################################################################################
### START TO ANALYZE DATA FOR EACH GENE ###
##########################################################################################################################

# read name -> value of the UMI tag, recorded the first time a read name is
# seen; shared across genes
umiSet = {}

try:
    for gene in geneStructureInformation:

        # Record 0 of a gene carries chromosome (col 1), start (col 3),
        # end (col 4) and the comma-separated isoform names (col 5).
        numofExons = geneLineCount[gene]
        tmpgeneinf = geneStructureInformation[gene][0].split("\t")
        geneChr = tmpgeneinf[1]
        geneStart = int(tmpgeneinf[3])
        geneEnd = int(tmpgeneinf[4])
        readCount = 0
        if bamFilePysam.get_tid(geneChr) == -1:
            continue

        ## load all UMI-tagged reads which were mapped to this gene
        sameReadCount = auto_dict()
        readStart = auto_dict()
        readCigar = auto_dict()

        for read in bamFilePysam.fetch(geneChr, geneStart, geneEnd):
            tmpinf = str(read).split("\t")
            tmpReadName = tmpinf[0]
            try:
                tmpUMI = read.get_tag(umitag)
            except KeyError:
                # read carries no UMI tag
                continue

            tmpReadStart = int(tmpinf[3]) + 1
            tmpReadCigar = _normalize_cigar(tmpinf[5])

            if not bool(sameReadCount[tmpReadName]):
                sameReadCount[tmpReadName] = 1
                umiSet[tmpReadName] = tmpUMI
            else:
                sameReadCount[tmpReadName] += 1

            readStart[tmpReadName][sameReadCount[tmpReadName]] = tmpReadStart
            readCigar[tmpReadName][sameReadCount[tmpReadName]] = tmpReadCigar

        ## load structure information of the specific gene within this loop
        isoformNames = tmpgeneinf[5].rstrip(",").split(",")
        exonStarts = []
        exonEnds = []
        exonIndicators = auto_dict()

        for i in range(1, numofExons + 1):
            tmpinf = geneStructureInformation[gene][i].split("\t")
            exonStarts.append(int(tmpinf[3]) + 1)
            exonEnds.append(int(tmpinf[4]))
            tmpExonIndicators = tmpinf[5].rstrip(",").split(",")
            for j, flag in enumerate(tmpExonIndicators):
                exonIndicators[isoformNames[j]][i - 1] = int(flag)

        #########################################################################################################################################
        ## START TO ANALYZE EACH READ
        ##################################################################################################################################################

        CompatibleMatrix = {}  # qualified read name -> {isoform name: 0/1}

        for readName in sameReadCount:
            isPaired = sameReadCount[readName] == 2

            operations1, lengths1 = _parse_cigar(readCigar[readName][1])
            if isPaired:
                operations2, lengths2 = _parse_cigar(readCigar[readName][2])
            else:
                operations2, lengths2 = [], []

            # only reads whose first alignment starts inside the gene are used
            if not (geneStart <= readStart[readName][1] <= geneEnd):
                continue
            if len(operations1) != len(lengths1) or len(operations2) != len(lengths2):
                continue  # malformed CIGAR

            compatibleVector = [1] * len(isoformNames)

            ## READ 1 ##
            exonIndicatorRead1 = _covered_exons(readStart[readName][1], operations1, lengths1, exonStarts, exonEnds)
            tmpcount1 = _rule_out_isoforms(exonIndicatorRead1, isoformNames, exonIndicators, compatibleVector)

            ## READ 2 ## SAME AS READ 1, ONLY WHEN THE READ IS PAIRED-END
            tmpcount2 = 0
            if isPaired:
                exonIndicatorRead2 = _covered_exons(readStart[readName][2], operations2, lengths2, exonStarts, exonEnds)
                tmpcount2 = _rule_out_isoforms(exonIndicatorRead2, isoformNames, exonIndicators, compatibleVector)

            ## fill in compatible matrix ##
            if tmpcount1 > 0 or tmpcount2 > 0:
                readCount += 1
                CompatibleMatrix[readName] = dict(zip(isoformNames, compatibleVector))

        ### COMPATIBLE MATRIX OBTAINED !!!
        ###############################################################################################################

        if readCount == 0: continue
        print(gene+"\t"+str(readCount)+" reads detected...")

        for weight in groupInformation[gene]:
            # isoform indexes of each side, resolved once per group
            plusIsoforms = [isoformNames[int(index)] for index in groupInformation[gene][weight]["+"].split(",")]
            minusIsoforms = [isoformNames[int(index)] for index in groupInformation[gene][weight]["-"].split(",")]

            # Distinct UMIs of the reads that are compatible with none of the
            # isoforms on a side.
            plusUMIs = set()
            minusUMIs = set()
            for readName, readCompatibility in CompatibleMatrix.items():
                # Use the UMI taken from the -umitag tag (same source as when
                # the read was loaded) rather than a suffix of the read name,
                # so de-duplication follows the tag the user asked for.
                umibarcode = umiSet[readName]
                if not any(readCompatibility[name] == 1 for name in plusIsoforms):
                    plusUMIs.add(umibarcode)
                if not any(readCompatibility[name] == 1 for name in minusIsoforms):
                    minusUMIs.add(umibarcode)

            count_plus = len(plusUMIs)
            count_minus = len(minusUMIs)
            OUT.write(gene+"\t"+str(readCount)+"\t"+weight+"\t"+"+"+"\t"+str(count_plus)+"\n")
            OUT.write(gene+"\t"+str(readCount)+"\t"+weight+"\t"+"-"+"\t"+str(count_minus)+"\n")
finally:
    OUT.close()
    bamFilePysam.close()
377 |
378 |
379 |
380 |
381 |
382 |
--------------------------------------------------------------------------------
/bin/getCount_umi_cellid.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | from __future__ import print_function # load print function in python3
4 | from collections import defaultdict
5 | import math, sys, os, re, pysam, time
6 | import my_functions as my
7 |
# set up auto dictionary function
def auto_dict():
    """Return a dictionary whose missing keys become nested auto-dictionaries.

    The script itself now uses plain dictionaries; the function is kept so the
    module still exposes the same name.
    """
    return defaultdict(auto_dict)


# pysam operation codes spelled out by the tuple-format branch; every other
# code (soft clip, hard clip, padding, ...) is written as "X".
CIGAR_CODE_TO_LETTER = {"0": "M", "1": "I", "2": "D", "3": "N"}

# Splits "100M10N100M" into ['100', 'M', '10', 'N', '100', 'M', ''].
CIGAR_TOKEN = re.compile("([A-Z])")


def cigar_field_to_string(field):
    """Normalise the CIGAR column of ``str(read)`` to the ``100M10N100M`` form.

    Depending on the pysam version the column is either already a CIGAR
    string or a printed list of ``(code, length)`` tuples such as
    ``[(0, 50), (3, 200), (0, 50)]``.
    """
    if ")]" not in field:
        return field
    inner = field.rstrip(")]").lstrip("[(")
    pieces = []
    for pair in inner.split("), ("):
        parts = pair.split(", ")
        pieces.append(parts[1] + CIGAR_CODE_TO_LETTER.get(parts[0], "X"))
    return "".join(pieces)


def parse_cigar(cigar):
    """Split a CIGAR string into ``(lengths, operations)`` lists.

    The two lists have the same size for a well-formed string; callers must
    discard the alignment when they do not.
    """
    lengths = []
    operations = []
    # The last token is whatever follows the final operation letter.
    for token in CIGAR_TOKEN.split(cigar)[:-1]:
        if token.isalpha():
            operations.append(token)
        else:
            lengths.append(int(token))
    return lengths, operations


def mark_covered_exons(start, lengths, operations, exon_starts, exon_ends):
    """Return a 0/1 list flagging the exons overlapped by one alignment.

    ``start`` is the 1-based position of the first aligned base.  "M" and "I"
    segments are tested against the exons and advance the position, "N"
    segments only advance it, every other operation is ignored.
    NOTE(review): "I" advancing and "D" not advancing the reference position
    is kept exactly as in the original script.
    """
    covered = [0] * len(exon_starts)
    base = start - 1
    for length, operation in zip(lengths, operations):
        if operation == "M" or operation == "I":
            first = base + 1
            last = base + length
            if length > 0:
                for k in range(len(exon_starts)):
                    # Same result as testing every base of the segment.
                    if max(first, exon_starts[k]) <= min(last, exon_ends[k]):
                        covered[k] = 1
            base += length
        elif operation == "N":
            base += length
    return covered


def rule_out_isoforms(covered, isoform_names, exon_flags, compatible):
    """Clear the entries of ``compatible`` contradicted by one alignment.

    An isoform is ruled out when the alignment covers an exon the isoform
    lacks, or when the alignment skips (has covered exons on both sides of)
    an exon the isoform contains.  Returns the number of covered exons.
    """
    total_covered = sum(covered)
    seen = 0
    for i, hit in enumerate(covered):
        if hit == 1:
            seen += 1
            for j, name in enumerate(isoform_names):
                if exon_flags[name][i] == 0:
                    compatible[j] = 0
        elif total_covered > 1 and 1 <= seen < total_covered:
            for j, name in enumerate(isoform_names):
                if exon_flags[name][i] == 1:
                    compatible[j] = 0
    return total_covered


def read_compatibility(hits, gene_start, gene_end, isoform_names,
                       exon_starts, exon_ends, exon_flags):
    """Return the 0/1 isoform compatibility vector of one read name.

    ``hits`` is the list of ``(start, cigar)`` alignments collected for the
    read name.  The second alignment is used only when there are exactly two.
    ``None`` is returned when the first alignment starts outside the gene,
    when a CIGAR string is malformed, or when no exon is covered.
    """
    if not (gene_start <= hits[0][0] <= gene_end):
        return None
    mates = hits if len(hits) == 2 else hits[:1]
    parsed = [parse_cigar(cigar) for _, cigar in mates]
    for lengths, operations in parsed:
        if len(lengths) != len(operations):
            return None

    compatible = [1] * len(isoform_names)
    covers_exon = False
    for (start, _), (lengths, operations) in zip(mates, parsed):
        covered = mark_covered_exons(start, lengths, operations,
                                     exon_starts, exon_ends)
        if rule_out_isoforms(covered, isoform_names, exon_flags, compatible) > 0:
            covers_exon = True
    return compatible if covers_exon else None


def load_gene_structure(ref_gene_file):
    """Group the lines of the reference file by gene name (first column).

    Returns ``{gene: [line, ...]}``; the first line of each gene carries the
    isoform names, the following lines describe its exons.
    """
    genes = {}
    with open(ref_gene_file, "r") as handle:
        for line in handle:
            line = line.strip("\n")
            if not line:
                continue  # a blank line has no gene column to group on
            genes.setdefault(line.split("\t")[0], []).append(line)
    return genes


def load_group_information(gpinfo_file):
    """Read the group file into ``{gene: {weight: {sign: isoform_indices}}}``."""
    groups = {}
    with open(gpinfo_file, "r") as handle:
        for line in handle:
            fields = line.strip("\n").split("\t")
            weights = groups.setdefault(fields[0], {})
            weights.setdefault(fields[1], {})[fields[3]] = fields[5].strip(",")
    return groups


def load_exons(exon_lines, isoform_names):
    """Parse the exon lines of one gene.

    Returns ``(exon_starts, exon_ends, exon_flags)`` where starts are shifted
    by one to be 1-based and ``exon_flags[isoform][i]`` is the 0/1 indicator
    of exon ``i`` for that isoform.
    """
    exon_starts = []
    exon_ends = []
    exon_flags = dict((name, [None] * len(exon_lines)) for name in isoform_names)
    for i, line in enumerate(exon_lines):
        fields = line.split("\t")
        exon_starts.append(int(fields[3]) + 1)
        exon_ends.append(int(fields[4]))
        for j, flag in enumerate(fields[5].rstrip(",").split(",")):
            exon_flags[isoform_names[j]][i] = int(flag)
    return exon_starts, exon_ends, exon_flags


def collect_gene_reads(bam, chrom, start, end, cellid, celltag, umitag):
    """Collect the alignments of one cell that overlap a gene.

    Returns ``(alignments, umis)``: ``alignments[name]`` is the list of
    ``(start, cigar)`` pairs seen for a read name and ``umis[name]`` the UMI
    tag of its first alignment.  Reads lacking either tag, or carrying a
    different cell barcode, are skipped.
    """
    alignments = {}
    umis = {}
    for read in bam.fetch(chrom, start, end):
        try:
            barcode = read.get_tag(celltag)
            umi = read.get_tag(umitag)
        except KeyError:
            continue  # tag absent on this read
        if cellid != barcode:
            continue
        fields = str(read).split("\t")
        name = fields[0]
        if name not in alignments:
            alignments[name] = []
            umis[name] = umi
        ## Adjust to different Pysam Version!! ##
        alignments[name].append((int(fields[3]) + 1,
                                 cigar_field_to_string(fields[5])))
    return alignments, umis


def write_gene_counts(out, gene, lines, weights, bam, cellid, celltag, umitag):
    """Analyse one gene and append its per-group UMI counts to ``out``."""
    header = lines[0].split("\t")
    chrom = header[1]
    gene_start = int(header[3])
    gene_end = int(header[4])
    if bam.get_tid(chrom) == -1:
        return

    alignments, umis = collect_gene_reads(bam, chrom, gene_start, gene_end,
                                          cellid, celltag, umitag)
    isoform_names = header[5].rstrip(",").split(",")
    exon_starts, exon_ends, exon_flags = load_exons(lines[1:], isoform_names)

    # read name -> {isoform: 0/1}, only for reads that cover at least one exon
    compatibility = {}
    for name, hits in alignments.items():
        vector = read_compatibility(hits, gene_start, gene_end, isoform_names,
                                    exon_starts, exon_ends, exon_flags)
        if vector is not None:
            compatibility[name] = dict(zip(isoform_names, vector))

    read_count = len(compatibility)
    if read_count == 0:
        return
    print(gene + "\t" + str(read_count) + " reads detected...")

    for weight, signs in weights.items():
        plus_isoforms = [isoform_names[int(i)] for i in signs["+"].split(",")]
        minus_isoforms = [isoform_names[int(i)] for i in signs["-"].split(",")]
        plus_umis = set()
        minus_umis = set()
        for name, flags in compatibility.items():
            # The UMI comes from the BAM tag.  The original took the last
            # "_"-separated piece of the read name here, which ignores the
            # -umitag argument and counts reads instead of UMIs whenever the
            # read name does not embed the UMI.
            umi = umis[name]
            if not any(flags[iso] == 1 for iso in plus_isoforms):
                plus_umis.add(umi)
            if not any(flags[iso] == 1 for iso in minus_isoforms):
                minus_umis.add(umi)
        prefix = gene + "\t" + str(read_count) + "\t" + weight + "\t"
        out.write(prefix + "+" + "\t" + str(len(plus_umis)) + "\n")
        out.write(prefix + "-" + "\t" + str(len(minus_umis)) + "\n")


def main():
    """Count, for one cell barcode, the UMIs assigned to each exon group."""
    ###############################################################################
    ### ARGUMENT SETTINGS
    ###############################################################################
    # checking whether argument is valid or not
    validArgList = ["-bam", "-ref", "-out", "-gpinfo", "-cellid", "-celltag", "-umitag"]
    addAbsPath = [1, 1, 3, 1, 0, 0, 0]
    warnMessage = "-bam, -ref, -out, -gpinfo, -cellid, -celltag, -umitag"
    inputFile = my.parse_argument(validArgList, addAbsPath, warnMessage)
    bamFile = inputFile[0]
    refGeneFile = inputFile[1]
    outFile = inputFile[2]
    gpinfoFile = inputFile[3]
    cellid = inputFile[4]
    celltag = inputFile[5]
    umitag = inputFile[6]

    genes = load_gene_structure(refGeneFile)
    groups = load_group_information(gpinfoFile)

    ## Using pysam to read in bam file !!
    bam = pysam.Samfile(bamFile, "rb")
    try:
        with open(outFile, "w") as out:
            for gene in genes:
                write_gene_counts(out, gene, genes[gene], groups.get(gene, {}),
                                  bam, cellid, celltag, umitag)
    finally:
        bam.close()


if __name__ == "__main__":
    main()
383 |
384 |
385 |
386 |
387 |
388 |
--------------------------------------------------------------------------------
/bin/getalpha.pl:
--------------------------------------------------------------------------------
# getalpha.pl <metafile> <countdir> <outfile>
#
# For every cell named in the first column of <metafile>, reads
# <countdir>/count_<cell>.out (gene in column 1, count in column 2) and writes
# "<cell>\t<mean of log(count)>\t0\t0\t0" to <outfile>, or "NA" when the cell
# has no gene with a positive count.
use strict;
use warnings;

my ($metafile, $countdir, $outfile) = @ARGV;
die "Usage: getalpha.pl <metafile> <countdir> <outfile>\n" unless defined $outfile;

my %nonzeroct;    # cell -> number of genes with a positive count
my %logct;        # cell -> sum of log(count) over those genes
my @cells;        # cells in metafile order, one entry per metafile line

open(my $meta, "<", $metafile) or die "Cannot open $metafile: $!\n";
while (my $line = <$meta>) {
    chomp $line;
    my @b = split("\t", $line);
    my $cell = defined $b[0] ? $b[0] : "";
    push @cells, $cell;

    # gene -> count; a later line for the same gene replaces an earlier one
    my %count;
    if (open(my $fh, "<", "$countdir/count_$cell.out")) {
        while (my $row = <$fh>) {
            chomp $row;
            my @a = split("\t", $row);
            next unless defined $a[0];
            $count{$a[0]} = $a[1];
        }
        close $fh;
    } else {
        # A cell without a count file simply ends up as "NA".
        warn "Cannot open $countdir/count_$cell.out: $!\n";
    }

    $logct{$cell}     = 0;
    $nonzeroct{$cell} = 0;
    foreach my $gene (keys %count) {
        my $n = $count{$gene};
        next unless defined $n && $n > 0;    # log() dies on zero
        $logct{$cell} += log($n);
        $nonzeroct{$cell}++;
    }
}
close $meta;

open(my $out, ">", $outfile) or die "Cannot write $outfile: $!\n";
foreach my $cell (@cells) {
    if ($nonzeroct{$cell} > 0) {
        my $aaa = $logct{$cell} / $nonzeroct{$cell};
        print $out "$cell\t$aaa\t0\t0\t0\n";
    } else {
        print $out "NA\n";
    }
}
close $out;
42 |
--------------------------------------------------------------------------------
/bin/getexonlevelcount_umi.pl:
--------------------------------------------------------------------------------
# getexonlevelcount_umi.pl <cdt0> <cdt1> <tmpdir> <metafile> <gpinfofile>
#
# Assembles the exon-group level input of the differential splicing test for
# the pair of conditions <cdt0>/<cdt1>.  Reads, below <tmpdir>:
#   abkt/abkt_umi, count_script/count_<cell>.out,
#   gene_script/geneleveltheta_umi, gene_script/genelevelbursting_umi
# and writes, below <tmpdir>/das_script/data:
#   condition_<cdt0>_<cdt1>, abktfile_<cdt0>_<cdt1>, countdata_<cdt0>_<cdt1>
use strict;
use warnings;

my ($cdt0, $cdt1, $tmpdir, $metafile, $gpinfofile) = @ARGV;
die "Usage: getexonlevelcount_umi.pl <cdt0> <cdt1> <tmpdir> <metafile> <gpinfofile>\n"
    unless defined $gpinfofile;

my $gpp = $cdt0."_".$cdt1;

# Cells that have technical parameters: cell -> 1 and cell -> "a\tb\tk\tt".
my %qout;
my %abkt;
open(my $abktfh, "<", "$tmpdir/abkt/abkt_umi") or die "Cannot open $tmpdir/abkt/abkt_umi: $!\n";
while (my $line = <$abktfh>) {
    chomp $line;
    my @a = split("\t", $line);
    next unless @a;
    $qout{$a[0]} = 1;
    $abkt{$a[0]} = join("\t", map { defined $_ ? $_ : "" } @a[1..4]);
}
close $abktfh;

my %count;       # "gene:weight" -> cell -> sign -> count
my %quality1;    # "gene:weight" -> sign -> summed count over both conditions
my %quality2;    # "gene:weight" -> sign -> number of cells with a positive count
my @condition;   # 0 for a <cdt0> cell, 1 for a <cdt1> cell
my @cell;        # cell names, parallel to @condition
my @abktfile;    # technical parameters, parallel to @condition

open(my $meta, "<", $metafile) or die "Cannot open $metafile: $!\n";
while (my $line = <$meta>) {
    chomp $line;
    my @a = split("\t", $line);
    next unless @a && $qout{$a[0]};

    # Counts are loaded for every qualified cell, whatever its condition.
    if (open(my $cfh, "<", "$tmpdir/count_script/count_$a[0].out")) {
        while (my $row = <$cfh>) {
            chomp $row;
            my @b = split("\t", $row);
            next unless @b >= 5;
            my $gp = $b[0].":".$b[2];
            $count{$gp}{$a[0]}{$b[3]} = $b[4];
            if ($a[$#a] =~ $cdt0 || $a[$#a] =~ $cdt1) {
                $quality1{$gp}{$b[3]} += $b[4];
                $quality2{$gp}{$b[3]}++ if $b[4] > 0;
            }
        }
        close $cfh;
    } else {
        warn "Cannot open $tmpdir/count_script/count_$a[0].out: $!\n";
    }

    next unless defined $a[1];
    if ($a[1] eq $cdt0) {
        push @condition, 0;
        push @cell, $a[0];
        push @abktfile, $abkt{$a[0]};
    }
    if ($a[1] eq $cdt1) {
        push @condition, 1;
        push @cell, $a[0];
        push @abktfile, $abkt{$a[0]};
    }
}
close $meta;

open(my $out1, ">", "$tmpdir/das_script/data/condition_$gpp")
    or die "Cannot write $tmpdir/das_script/data/condition_$gpp: $!\n";
print $out1 "$_\n" foreach @condition;
close $out1;

open(my $out3, ">", "$tmpdir/das_script/data/abktfile_$gpp")
    or die "Cannot write $tmpdir/das_script/data/abktfile_$gpp: $!\n";
print $out3 "$_\n" foreach @abktfile;
close $out3;

my %prop;      # "gene:weight" -> sign -> log(length proportion); not written out
my %qgroup;    # "gene:weight" -> 1 for every group listed in the group file
open(my $gpfh, "<", $gpinfofile) or die "Cannot open $gpinfofile: $!\n";
while (my $line = <$gpfh>) {
    chomp $line;
    my @a = split("\t", $line);
    next unless @a >= 2;
    my $gp = $a[0].":".$a[1];
    # log() dies on zero, so only positive proportions are converted.
    $prop{$gp}{$a[3]} = log($a[4]) if defined $a[4] && $a[4] > 0;
    $qgroup{$gp} = 1;
}
close $gpfh;

my %qgene;       # condition -> gene -> 1 when a gene-level estimate exists
my %mean;        # condition -> gene -> estimate (third column)
my %bursting;    # "<cdt0>_<cdt1>" -> gene -> value (third column)
open(my $thfh, "<", "$tmpdir/gene_script/geneleveltheta_umi")
    or die "Cannot open $tmpdir/gene_script/geneleveltheta_umi: $!\n";
while (my $line = <$thfh>) {
    chomp $line;
    next if $line =~ /theta_rd/;
    my @a = split("\t", $line);
    next unless @a >= 2;
    next if ($a[0] ne $cdt0 && $a[0] ne $cdt1);
    $qgene{$a[0]}{$a[1]} = 1;
    $mean{$a[0]}{$a[1]} = $a[2];
}
close $thfh;

open(my $bfh, "<", "$tmpdir/gene_script/genelevelbursting_umi")
    or die "Cannot open $tmpdir/gene_script/genelevelbursting_umi: $!\n";
while (my $line = <$bfh>) {
    chomp $line;
    my @a = split("\t", $line);
    next unless @a >= 2;
    $bursting{$a[0]}{$a[1]} = $a[2];
}
close $bfh;

# Returns the value, or the empty string when it is undefined.
sub text {
    my ($value) = @_;
    return defined $value ? $value : "";
}

# Returns the count, or 0 when it is missing or not positive.
sub positive_or_zero {
    my ($value) = @_;
    return (defined $value && $value > 0) ? $value : 0;
}

open(my $out2, ">", "$tmpdir/das_script/data/countdata_$gpp")
    or die "Cannot write $tmpdir/das_script/data/countdata_$gpp: $!\n";
foreach my $gp (keys %count) {
    my @a = split(":", $gp);
    my $gene = text($a[0]);
    next unless $qgene{$cdt0}{$gene};
    next unless $qgene{$cdt1}{$gene};
    next unless $qgroup{$gp};

    # Coverage summary of the group; the filter itself is switched off below.
    my $tmpquality = 1;
    if (($quality1{$gp}{"+"} || 0) > 50 || ($quality1{$gp}{"-"} || 0) > 50) {
        $tmpquality = 0 if !(($quality2{$gp}{"+"} || 0) > 15 || ($quality2{$gp}{"-"} || 0) > 15);
    } else {
        $tmpquality = 0 if !(($quality2{$gp}{"+"} || 0) > 10 && ($quality2{$gp}{"-"} || 0) > 10);
    }
    #next if $tmpquality == 0;

    my $genect = join(",", ("100") x scalar(@cell));
    my $plus   = join(",", map { positive_or_zero($count{$gp}{$_}{"+"}) } @cell);
    my $minus  = join(",", map { positive_or_zero($count{$gp}{$_}{"-"}) } @cell);
    my $mean0  = text($mean{$cdt0}{$gene});
    my $mean1  = text($mean{$cdt1}{$gene});
    my $burst  = text($bursting{$gpp}{$gene});
    print $out2 "$gp\t$genect\t$plus\t$minus\t$mean0,1,$mean1,1,$burst\t+\\-\n";
}
close $out2;
157 |
--------------------------------------------------------------------------------
/bin/getgenelevelcount.pl:
--------------------------------------------------------------------------------
# getgenelevelcount.pl <cdt0> <cdt1> <abktfile> <metafile> <countdir> <outdir>
#
# Builds the gene-level count table of the cells belonging to condition
# <cdt0> or <cdt1>.  Writes into <outdir>:
#   condition_<cdt0>_<cdt1>  one 0/1 label per cell (0 = <cdt0>, 1 = <cdt1>)
#   countdata_<cdt0>_<cdt1>  gene followed by one count per cell
#   abktfile_<cdt0>_<cdt1>   technical parameters of each cell
use strict;
use warnings;

my ($cdt0, $cdt1, $abktfile, $metafile, $tmpdir, $tmpdirgene) = @ARGV;
die "Usage: getgenelevelcount.pl <cdt0> <cdt1> <abktfile> <metafile> <countdir> <outdir>\n"
    unless defined $tmpdirgene;

# Cells that have technical parameters: cell -> 1 and cell -> "a\tb\tk\tt".
my %qout;
my %abkt;
open(my $abktfh, "<", $abktfile) or die "Cannot open $abktfile: $!\n";
while (my $line = <$abktfh>) {
    chomp $line;
    my @a = split("\t", $line);
    next unless @a;
    $qout{$a[0]} = 1;
    $abkt{$a[0]} = join("\t", map { defined $_ ? $_ : "" } @a[1..4]);
}
close $abktfh;

my %count;       # gene -> cell -> count (second column of count_<cell>.out)
my @condition;   # 0 for a <cdt0> cell, 1 for a <cdt1> cell
my @cell;        # cell names, parallel to @condition
my @abktrows;    # technical parameters, parallel to @condition

open(my $meta, "<", $metafile) or die "Cannot open $metafile: $!\n";
while (my $line = <$meta>) {
    chomp $line;
    my @a = split("\t", $line);
    next unless @a && $qout{$a[0]};

    # Counts are loaded for every qualified cell, whatever its condition, so
    # genes seen only in other conditions still get an all-zero row.
    if (open(my $cfh, "<", "$tmpdir/count_$a[0].out")) {
        while (my $row = <$cfh>) {
            chomp $row;
            my @b = split("\t", $row);
            next unless defined $b[0];
            $count{$b[0]}{$a[0]} = $b[1];
        }
        close $cfh;
    } else {
        warn "Cannot open $tmpdir/count_$a[0].out: $!\n";
    }

    next unless defined $a[1];
    if ($a[1] eq $cdt0) {
        push @condition, 0;
        push @cell, $a[0];
        push @abktrows, $abkt{$a[0]};
    }
    if ($a[1] eq $cdt1) {
        push @condition, 1;
        push @cell, $a[0];
        push @abktrows, $abkt{$a[0]};
    }
}
close $meta;

my $suffix = $cdt0."_".$cdt1;

open(my $out1, ">", "$tmpdirgene/condition_$suffix")
    or die "Cannot write $tmpdirgene/condition_$suffix: $!\n";
print $out1 "$_\n" foreach @condition;
close $out1;

open(my $out2, ">", "$tmpdirgene/countdata_$suffix")
    or die "Cannot write $tmpdirgene/countdata_$suffix: $!\n";
foreach my $gene (keys %count) {
    # Missing or non-positive counts are written as 0.
    my @row = map {
        my $n = $count{$gene}{$_};
        (defined $n && $n > 0) ? $n : 0;
    } @cell;
    print $out2 "$gene\t", join("\t", @row), "\n";
}
close $out2;

open(my $out3, ">", "$tmpdirgene/abktfile_$suffix")
    or die "Cannot write $tmpdirgene/abktfile_$suffix: $!\n";
print $out3 "$_\n" foreach @abktrows;
close $out3;
98 |
--------------------------------------------------------------------------------
/bin/getgeneleveltheta_umi.pl:
--------------------------------------------------------------------------------
# getgeneleveltheta_umi.pl <comparedir>
#
# Collects the gene-level estimates of every comparison listed in
# <comparedir>/comparegroup from <comparedir>/gene_script/data/outgene_<a>_<b>
# and writes, into <comparedir>/gene_script:
#   geneleveltheta_umi     "<celltype>\t<gene>\t<theta>"
#   genelevelbursting_umi  "<a>_<b>\t<gene>\t<value of column 9>"
use strict;
use warnings;

my $comparedir = $ARGV[0];
die "Usage: getgeneleveltheta_umi.pl <comparedir>\n" unless defined $comparedir;
my $genedir = $comparedir."/gene_script";
my $datadir = $genedir."/data";

my %theta;       # cell type -> gene -> estimate
my %bursting;    # "<a>_<b>" -> gene -> value
my @groups;      # comparison names in comparegroup order

open(my $cmp, "<", "$comparedir/comparegroup") or die "Cannot open $comparedir/comparegroup: $!\n";
while (my $line = <$cmp>) {
    chomp $line;
    my @a = split("\t", $line);
    next unless @a >= 2;
    my $gp = $a[0]."_".$a[1];
    push @groups, $gp;

    my $fh;
    unless (open($fh, "<", "$datadir/outgene_$gp")) {
        warn "Cannot open $datadir/outgene_$gp: $!\n";
        next;
    }
    while (my $row = <$fh>) {
        chomp $row;
        my @b = split("\t", $row);
        next unless @b >= 7;
        # Keep only genes flagged "True" whose two estimates are numbers.
        if ($b[1] eq "True" && $b[5] ne "nan" && $b[6] ne "nan") {
            $theta{$a[0]}{$b[0]} = $b[5];
            $theta{$a[1]}{$b[0]} = $b[6];
            $bursting{$gp}{$b[0]} = defined $b[8] ? $b[8] : "";
        }
    }
    close $fh;
}
close $cmp;

open(my $out, ">", "$genedir/geneleveltheta_umi") or die "Cannot write $genedir/geneleveltheta_umi: $!\n";
open(my $types, "<", "$comparedir/celltypes") or die "Cannot open $comparedir/celltypes: $!\n";
while (my $type = <$types>) {
    chomp $type;
    next unless exists $theta{$type};
    foreach my $g (keys %{$theta{$type}}) {
        print $out "$type\t$g\t$theta{$type}{$g}\n";
    }
}
close $types;
close $out;

open($out, ">", "$genedir/genelevelbursting_umi") or die "Cannot write $genedir/genelevelbursting_umi: $!\n";
foreach my $gp (@groups) {
    next unless exists $bursting{$gp};
    foreach my $g (keys %{$bursting{$gp}}) {
        print $out "$gp\t$g\t$bursting{$gp}{$g}\n";
    }
}
close $out;
57 |
--------------------------------------------------------------------------------
/bin/getgroupinfo.pl:
--------------------------------------------------------------------------------
# getgroupinfo.pl <refgene file>
#
# Groups the exons of every gene by the set of isoforms that contain them and
# prints one "+" and one "-" line per group to standard output.
#
# The first line of a gene lists its isoform names in the last column; each
# following line is an exon whose last column holds one 0/1 flag per isoform.
# An exon pattern and its complement share one group: the pattern with the
# smaller binary weight is side 1 ("+"), the complement is side 0 ("-").
use strict;
use warnings;

my $input = $ARGV[0];
die "Usage: getgroupinfo.pl <refgene file>\n" unless defined $input;

my %genecount;              # gene -> number of lines seen so far
my $totalweight = 0;        # weight of the all-isoforms pattern of the current gene
my @tmpiso;                 # isoform names of the current gene
my %isosetinf;              # gene -> weight -> side -> isoform names
my %genelength;             # gene -> summed exon length
my %grouplength;            # gene -> weight -> side -> summed exon length
my %isoindex;               # gene -> weight -> side -> indices of isoforms with flag 1
my %isoindex_complement;    # gene -> weight -> side -> indices of isoforms with flag 0
my %exonset;                # gene -> weight -> side -> "chr,start,end;" list

open(my $fp, "<", $input) or die "Cannot open $input: $!\n";
while (my $line = <$fp>) {
    chomp $line;
    my @a = split("\t", $line);
    next unless @a;
    my $gene = $a[0];
    my @b = split(",", $a[$#a]);

    $genecount{$gene}++;
    if ($genecount{$gene} == 1) {
        # Gene line: remember the isoform names and the maximum weight.
        @tmpiso = @b;
        $totalweight = 0;
        $totalweight += 2 ** $_ foreach 0..$#tmpiso;
        next;
    }

    my $length = $a[4] - $a[3] + 1;
    $genelength{$gene} += $length;
    # Exons present in every isoform carry no splicing information.
    next unless $a[$#a] =~ /0,/;

    my $tmpexon = $a[1].",".$a[3].",".$a[4];
    my $tmpweight = 0;
    my ($tmpisoset, $tmpisoindex, $tmpisoindex1) = ("", "", "");
    foreach my $i (0..$#b) {
        if ($b[$i] == 1) {
            $tmpweight += 2 ** $i;
            $tmpisoset .= (defined $tmpiso[$i] ? $tmpiso[$i] : "").",";
            $tmpisoindex1 .= $i.",";
        } elsif ($b[$i] == 0) {
            $tmpisoindex .= $i.",";
        }
    }

    my $side = 1;
    if ($tmpweight >= $totalweight / 2) {
        # File the exon under the weight of its complement pattern.
        $tmpweight = $totalweight - $tmpweight;
        $side = 0;
    }
    $isoindex{$gene}{$tmpweight}{$side} = $tmpisoindex1;
    $isoindex_complement{$gene}{$tmpweight}{$side} = $tmpisoindex;
    $isosetinf{$gene}{$tmpweight}{$side} = $tmpisoset;
    $grouplength{$gene}{$tmpweight}{$side} += $length;
    $exonset{$gene}{$tmpweight}{$side} .= $tmpexon.";";
}
close $fp;

# Returns the value, or the empty string when it is undefined.
sub text {
    my ($value) = @_;
    return defined $value ? $value : "";
}

foreach my $gene (keys %isosetinf) {
    foreach my $weight (keys %{$isosetinf{$gene}}) {
        my $len1 = $grouplength{$gene}{$weight}{1} || 0;
        my $len0 = $grouplength{$gene}{$weight}{0} || 0;
        my $tmph1 = $len1 / $genelength{$gene};
        my $tmph0 = $len0 / $genelength{$gene};

        my $comp1 = text($isoindex_complement{$gene}{$weight}{1});
        my $comp0 = text($isoindex_complement{$gene}{$weight}{0});
        my $exon1 = text($exonset{$gene}{$weight}{1});
        my $exon0 = text($exonset{$gene}{$weight}{0});

        if ($len1 > 0 && $len0 > 0) {
            print "$gene\t$weight\tboth\t+\t$tmph1\t$comp1\t$exon1\n";
            print "$gene\t$weight\tboth\t-\t$tmph0\t$comp0\t$exon0\n";
            next;
        }

        # Only one side has exons: the missing side is described through the
        # isoform indices of the side that exists and gets "NA" as exon set.
        if ($len1 > 0) {
            print "$gene\t$weight\tplus\t+\t$tmph1\t$comp1\t$exon1\n";
        } else {
            my $idx0 = text($isoindex{$gene}{$weight}{0});
            print "$gene\t$weight\tminus\t+\t$tmph0\t$idx0\tNA\n";
        }
        if ($len0 > 0) {
            print "$gene\t$weight\tminus\t-\t$tmph0\t$comp0\t$exon0\n";
        } else {
            my $idx1 = text($isoindex{$gene}{$weight}{1});
            print "$gene\t$weight\tplus\t-\t$tmph1\t$idx1\tNA\n";
        }
    }
}
92 |
--------------------------------------------------------------------------------
/bin/gettascdata.pl:
--------------------------------------------------------------------------------
# gettascdata.pl <cdt0> <cdt1> <datadir> <outdir>
#
# Reformats <datadir>/countdata_<cdt0>_<cdt1> into
# <outdir>/tascdata_<cdt0>_<cdt1>: "<gene>\t<comma separated counts>\t1\t1\t1\t1".
# A gene is kept when its counts sum to at least 1 in each condition and its
# line does not contain "ERCC".
use strict;
use warnings;

my ($cdt0, $cdt1, $tmpdir, $outdir) = @ARGV;
die "Usage: gettascdata.pl <cdt0> <cdt1> <datadir> <outdir>\n" unless defined $outdir;

my $suffix = $cdt0."_".$cdt1;

# Column index (starting at 1) -> condition label of that cell.
my %cdt;
my $ct = 0;
open(my $cond, "<", "$tmpdir/condition_$suffix") or die "Cannot open $tmpdir/condition_$suffix: $!\n";
while (my $line = <$cond>) {
    chomp $line;
    $ct++;
    $cdt{$ct} = $line;
}
close $cond;

# Returns the total stored under the condition name, falling back to the
# numeric code when the condition file holds 0/1 labels instead of names.
sub condition_total {
    my ($totals, $name, $code) = @_;
    return $totals->{$name} if exists $totals->{$name};
    return $totals->{$code} if exists $totals->{$code};
    return 0;
}

open(my $out, ">", "$outdir/tascdata_$suffix") or die "Cannot write $outdir/tascdata_$suffix: $!\n";
open(my $fp, "<", "$tmpdir/countdata_$suffix") or die "Cannot open $tmpdir/countdata_$suffix: $!\n";
while (my $line = <$fp>) {
    chomp $line;
    next if $line =~ /ERCC/;
    my @a = split("\t", $line);
    next unless @a >= 2;

    my %tmp;    # condition label -> summed count of the gene
    foreach my $i (1..$#a) {
        next unless defined $cdt{$i};
        $tmp{$cdt{$i}} += $a[$i];
    }
    # NOTE(review): the original looked the totals up by condition name only;
    # getgenelevelcount.pl writes 0/1 labels, for which that lookup never
    # matches. Confirm which label form the pipeline uses.
    next unless condition_total(\%tmp, $cdt0, 0) >= 1
             && condition_total(\%tmp, $cdt1, 1) >= 1;

    print $out "$a[0]\t", join(",", @a[1..$#a]), "\t1\t1\t1\t1\n";
}
close $fp;
close $out;
50 |
--------------------------------------------------------------------------------
/bin/likelihoodumi.pyx:
--------------------------------------------------------------------------------
1 | from libc.math cimport lgamma
2 | from libc.math cimport exp
3 | from libc.math cimport log
4 | from libc.math cimport sqrt
5 | from scipy.optimize import minimize_scalar
6 | from scipy.optimize import minimize
7 | from math import pi
8 | from scipy.integrate import quad
9 | from scipy.integrate import dblquad
10 |
cdef double expit(double p):
    # Logistic function 1 / (1 + e^(-p)), evaluated with the C-level exp.
    return 1.0 / (1 + exp(-p))
13 |
14 | cdef double second_order_derivative(abkt_c, params_g, mu_cg, y_cg):
15 | cdef double alpha = abkt_c[0]
16 | cdef double beta = abkt_c[1]
17 | cdef double kappa = abkt_c[2]
18 | cdef double tau = abkt_c[3]
19 | cdef double theta_g = params_g[0]
20 | cdef double sigma_g = params_g[1]
21 | cdef double p_g = params_g[2]
22 | cdef double cg
23 |
24 | if sigma_g == 0:
25 | return float('inf')
26 |
27 | if y_cg == 0:
28 | cg = ((2 * p_g / (1 + exp(tau * mu_cg + kappa)) ** 3 * tau ** 2 * exp(tau * mu_cg + kappa) ** 2 - p_g / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau ** 2 * exp(tau * mu_cg + kappa) + 2 * p_g / (1 + exp(-tau * mu_cg - kappa)) ** 3 * exp(-exp(beta * mu_cg + alpha)) * tau ** 2 * exp(-tau * mu_cg - kappa) ** 2 - 2 * p_g / (1 + exp(-tau * mu_cg - kappa)) ** 2 * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - p_g / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau ** 2 * exp(-tau * mu_cg - kappa) - p_g / (1 + exp(-tau * mu_cg - kappa)) * beta ** 2 * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * beta ** 2 * exp(beta * mu_cg + alpha) ** 2 * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) / 2 - 2 * (-p_g / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau * exp(tau * mu_cg + kappa) + p_g / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - p_g / (1 + exp(-tau * mu_cg - kappa)) * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * (mu_cg - theta_g) / sigma_g ** 2 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) - (1 - p_g + p_g / (1 + exp(tau * mu_cg + kappa)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) / sigma_g ** 2 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) + 2 * (1 - p_g + p_g / (1 + exp(tau * mu_cg + kappa)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * (mu_cg - theta_g) ** 2 / sigma_g ** 4 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2)) / (1 - p_g + p_g / (1 + exp(tau * mu_cg + kappa)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * sqrt(pi * sigma_g ** 2) / exp(-(mu_cg - 
theta_g) ** 2 / sigma_g ** 2) - ((-p_g / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau * exp(tau * mu_cg + kappa) + p_g / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - p_g / (1 + exp(-tau * mu_cg - kappa)) * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) / 2 - (1 - p_g + p_g / (1 + exp(tau * mu_cg + kappa)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * (mu_cg - theta_g) / sigma_g ** 2 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2)) / (1 - p_g + p_g / (1 + exp(tau * mu_cg + kappa)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) ** 2 * sqrt(2) * sqrt(pi * sigma_g ** 2) / exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) * (-p_g / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau * exp(tau * mu_cg + kappa) + p_g / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - p_g / (1 + exp(-tau * mu_cg - kappa)) * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha))) + 2 * ((-p_g / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau * exp(tau * mu_cg + kappa) + p_g / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - p_g / (1 + exp(-tau * mu_cg - kappa)) * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) / 2 - (1 - p_g + p_g / (1 + exp(tau * mu_cg + kappa)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * (mu_cg - theta_g) / sigma_g ** 2 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2)) / (1 - p_g + p_g / (1 + exp(tau * mu_cg + kappa)) + p_g / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * sqrt(pi * sigma_g ** 2) / 
exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) * (mu_cg - theta_g) / sigma_g ** 2
29 | else:
30 | cg = -(2 * beta ** 2 * exp(beta * mu_cg - tau * mu_cg + alpha - kappa) * sigma_g ** 2 + exp(beta * mu_cg - 2 * tau * mu_cg + alpha - 2 * kappa) * beta ** 2 * sigma_g ** 2 + tau ** 2 * exp(-tau * mu_cg - kappa) * sigma_g ** 2 + beta ** 2 * exp(beta * mu_cg + alpha) * sigma_g ** 2 + 2 * exp(-2 * tau * mu_cg - 2 * kappa) + 4 * exp(-tau * mu_cg - kappa) + 2) / sigma_g ** 2 / (1 + exp(-tau * mu_cg - kappa)) ** 2
31 | return cg
32 |
33 |
# Closed-form second derivative, with respect to mu_cg, used to size the
# integration window in neg_log_single_marginal_likelihood_nob (which only
# uses its absolute value).
#
#   abkt_c   : indexable; entries 0..3 are read as alpha, beta, kappa, tau.
#   params_g : indexable; entries 0..1 are read as theta_g, sigma_g.
#   mu_cg    : point at which the derivative is evaluated.
#   y_cg     : observed count; only tested against zero here.
#
# Returns +inf when sigma_g == 0.
#
# For y_cg != 0 the expression simplifies algebraically to
#   -(beta**2 * exp(beta*mu + alpha)
#     + tau**2 * E / (1 + E)**2
#     + 2 / sigma_g**2),        with E = exp(-tau*mu - kappa).
# NOTE(review): the Gaussian factor here is exp(-(mu - theta)**2 / sigma**2),
# i.e. without the 1/2 that log_dnorm applies — confirm this is intended.
# NOTE(review): the y_cg == 0 branch looks like a machine-generated expansion
# of the same derivative for the zero-count mixture; it has not been
# re-derived here, so treat that reading as an assumption.
cdef double second_order_derivative_nob(abkt_c, params_g, mu_cg, y_cg):
    cdef double alpha = abkt_c[0]
    cdef double beta = abkt_c[1]
    cdef double kappa = abkt_c[2]
    cdef double tau = abkt_c[3]
    cdef double theta_g = params_g[0]
    cdef double sigma_g = params_g[1]
    cdef double cg

    # A zero standard deviation would divide by zero below.
    if sigma_g == 0:
        return float('inf')

    if y_cg == 0:
        cg = ((2 / (1 + exp(tau * mu_cg + kappa)) ** 3 * tau ** 2 * exp(tau * mu_cg + kappa) ** 2 - 1 / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau ** 2 * exp(tau * mu_cg + kappa) + 2 / (1 + exp(-tau * mu_cg - kappa)) ** 3 * exp(-exp(beta * mu_cg + alpha)) * tau ** 2 * exp(-tau * mu_cg - kappa) ** 2 - 2 / (1 + exp(-tau * mu_cg - kappa)) ** 2 * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - 1 / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau ** 2 * exp(-tau * mu_cg - kappa) - 1 / (1 + exp(-tau * mu_cg - kappa)) * beta ** 2 * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha)) + 1 / (1 + exp(-tau * mu_cg - kappa)) * beta ** 2 * exp(beta * mu_cg + alpha) ** 2 * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) / 2 - 2 * (-1 / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau * exp(tau * mu_cg + kappa) + 1 / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - 1 / (1 + exp(-tau * mu_cg - kappa)) * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * (mu_cg - theta_g) / sigma_g ** 2 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) - (1 / (1 + exp(tau * mu_cg + kappa)) + 1 / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) / sigma_g ** 2 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) + 2 * (1 / (1 + exp(tau * mu_cg + kappa)) + 1 / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * (mu_cg - theta_g) ** 2 / sigma_g ** 4 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2)) / (1 / (1 + exp(tau * mu_cg + kappa)) + 1 / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * sqrt(pi * sigma_g ** 2) / exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) - ((-1 / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau * exp(tau * mu_cg + kappa) + 1 / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - 1 / (1 + exp(-tau * mu_cg - kappa)) * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) / 2 - (1 / (1 + exp(tau * mu_cg + kappa)) + 1 / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * (mu_cg - theta_g) / sigma_g ** 2 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2)) / (1 / (1 + exp(tau * mu_cg + kappa)) + 1 / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) ** 2 * sqrt(2) * sqrt(pi * sigma_g ** 2) / exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) * (-1 / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau * exp(tau * mu_cg + kappa) + 1 / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - 1 / (1 + exp(-tau * mu_cg - kappa)) * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha))) + 2 * ((-1 / (1 + exp(tau * mu_cg + kappa)) ** 2 * tau * exp(tau * mu_cg + kappa) + 1 / (1 + exp(-tau * mu_cg - kappa)) ** 2 * exp(-exp(beta * mu_cg + alpha)) * tau * exp(-tau * mu_cg - kappa) - 1 / (1 + exp(-tau * mu_cg - kappa)) * beta * exp(beta * mu_cg + alpha) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) / 2 - (1 / (1 + exp(tau * mu_cg + kappa)) + 1 / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * (pi * sigma_g ** 2) ** (-0.5) * (mu_cg - theta_g) / sigma_g ** 2 * exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2)) / (1 / (1 + exp(tau * mu_cg + kappa)) + 1 / (1 + exp(-tau * mu_cg - kappa)) * exp(-exp(beta * mu_cg + alpha))) * sqrt(2) * sqrt(pi * sigma_g ** 2) / exp(-(mu_cg - theta_g) ** 2 / sigma_g ** 2) * (mu_cg - theta_g) / sigma_g ** 2
    else:
        cg = -(2 * beta ** 2 * exp(beta * mu_cg - tau * mu_cg + alpha - kappa) * sigma_g ** 2 + exp(beta * mu_cg - 2 * tau * mu_cg + alpha - 2 * kappa) * beta ** 2 * sigma_g ** 2 + tau ** 2 * exp(-tau * mu_cg - kappa) * sigma_g ** 2 + beta ** 2 * exp(beta * mu_cg + alpha) * sigma_g ** 2 + 2 * exp(-2 * tau * mu_cg - 2 * kappa) + 4 * exp(-tau * mu_cg - kappa) + 2) / sigma_g ** 2 / (1 + exp(-tau * mu_cg - kappa)) ** 2
    return cg
51 |
52 |
# Log of the Poisson probability of observing zero, given the log of the mean:
# log P(Y = 0) = -exp(log_mean).
cdef double log_dpois0(double log_mean):
    cdef double poisson_mean = exp(log_mean)
    return -poisson_mean
55 |
56 |
# Poisson log-probability of `count` given the log of the mean:
# count * log_mean - log(count!) - exp(log_mean).
# long(count + 1.5) truncates to count + 1 for an integer-valued count, so
# lgamma of it yields log(count!).
cdef double log_dpois(double count, double log_mean):
    cdef double log_factorial = lgamma(long(count + 1.5))
    return count * log_mean - log_factorial - exp(log_mean)
59 |
60 |
# Log of the logistic function: log(1 / (1 + exp(-x))).
#
# The textbook form -log(1 + exp(-x)) overflows for large negative x
# (exp(-x) -> inf, so the result collapses to -inf or raises), although the
# true value is approximately x.  Splitting on the sign of x keeps the
# argument of exp() non-positive, so it can never overflow:
#   x >= 0 : -log(1 + exp(-x))
#   x <  0 :  x - log(1 + exp(x))
cdef double log_expit(double x):
    if x >= 0.0:
        return -log(1.0 + exp(-x))
    return x - log(1.0 + exp(x))
63 |
64 |
# log(exp(a) + exp(b) + exp(c)), computed by factoring out the largest
# argument so that the exponentials cannot overflow.
cdef double log_sum_exp(double a, double b, double c):
    cdef double pivot = max(a, b, c)
    cdef double shifted_total = exp(a - pivot) + exp(b - pivot) + exp(c - pivot)
    return pivot + log(shifted_total)
68 |
69 |
# Two-argument variant: log(exp(a) + exp(b)), with the larger argument
# factored out so that the exponentials cannot overflow.
cdef double log_sum_exp2(double a, double b):
    cdef double pivot = max(a, b)
    cdef double shifted_total = exp(a - pivot) + exp(b - pivot)
    return pivot + log(shifted_total)
73 |
# Log-density of a normal distribution with mean `mu` and standard deviation
# `sigma`, evaluated at `x`.  The leading constant is -0.5 * log(2 * pi).
# With sigma == 0 the distribution is treated as a point mass at mu:
# 0.0 when x == mu, -inf otherwise.
cdef double log_dnorm(double x, double mu, double sigma):
    cdef double resid
    if sigma != 0.0:
        resid = x - mu
        return -0.918938533204672669540968854562379419803619384765625 - log(sigma) - resid * resid / sigma / sigma / 2
    if x == mu:
        return 0.0
    return -float('inf')
82 |
83 |
84 | #### Beta distribution approach: NOT completed #############################################################################
# Log-density of a Beta(alpha, beta) distribution evaluated at x:
# (alpha-1)*log(x) + (beta-1)*log(1-x) + log of the normalising constant.
cdef double log_dbeta(double x, double alpha, double beta):
    cdef double log_kernel = (alpha-1) * log(x) + (beta-1) * log(1-x)
    return log_kernel + lgamma(alpha+beta) - lgamma(alpha) - lgamma(beta)
87 |
# Negative log complete-data likelihood for one cell/exon in the (unfinished)
# Beta-distributed psi model.
#
#   mu_cg    : value plugged into both Poisson log-means.
#   psi_ce   : inclusion proportion; weights mu_cg for y_ce1 and (1 - psi_ce)
#              weights it for y_ce0.
#   params_e : indexable; entries 0..1 are the Beta shape parameters.
#   params_g : indexable; entries 0..1 are read but not used in the result.
#   abkt_c   : indexable; entries 0..1 are the intercept and slope of the
#              Poisson log-mean.
#   y_ce1, y_ce0 : the two observed counts.
#
# Returns -(log_dpois(y_ce1, .) + log_dpois(y_ce0, .) + log_dbeta(psi_ce)).
cdef double neg_log_single_complete_likelihood_nob_psi(double mu_cg, double psi_ce, params_e, params_g, abkt_c, y_ce1, y_ce0):
    # theta_g / sigma_g are read here but never used below.
    cdef double theta_g = params_g[0]
    cdef double sigma_g = params_g[1]
    cdef double alpha_e = params_e[0]
    cdef double beta_e = params_e[1]
    cdef double a_c = abkt_c[0]
    cdef double b_c = abkt_c[1]

    return -(log_dpois(y_ce1, a_c + b_c * mu_cg * psi_ce) + log_dpois(y_ce0, a_c + b_c * mu_cg * (1-psi_ce) ) + log_dbeta(x=psi_ce, alpha=alpha_e, beta=beta_e))
97 |
# Optimiser-facing wrapper around neg_log_single_complete_likelihood_nob_psi.
# The first argument is an unconstrained real number that is mapped to
# psi_ce in (0, 1) with expit(); every other argument is forwarded unchanged.
cdef double neg_log_single_complete_likelihood_nob_psi_forminimize(double param_ce, double mu_cg, params_e, params_g, abkt_c, y_ce1, y_ce0):
    cdef double psi_ce = expit(param_ce)
    return neg_log_single_complete_likelihood_nob_psi(mu_cg, psi_ce, params_e, params_g, abkt_c, y_ce1, y_ce0)
109 |
# Integrand for the psi marginal likelihood: the complete-data likelihood at
# psi_ce, rescaled by exp(scale_factor_ce) so the quadrature works on values
# of manageable magnitude.
#
#   psi_ce          : integration variable (first positional argument).
#   mu_cg, params_e, params_g, abkt_c, y_ce1, y_ce0 :
#                     forwarded to neg_log_single_complete_likelihood_nob_psi.
#   scale_factor_ce : added to the log-likelihood before exponentiating.
#
# The leftover debug print was removed: it ran on every quadrature evaluation
# and recomputed the likelihood twice more per call.
cdef double single_complete_likelihood_nob_psi(double psi_ce, double mu_cg, params_e, params_g, abkt_c, y_ce1, y_ce0, double scale_factor_ce):
    cdef double log_lik = -neg_log_single_complete_likelihood_nob_psi(mu_cg, psi_ce, params_e, params_g, abkt_c, y_ce1, y_ce0)
    return exp(log_lik + scale_factor_ce)
113 |
114 |
# Negative log marginal likelihood for one cell/exon in the (unfinished)
# Beta-distributed psi model, integrating psi over [0, 1].
#
# Steps:
#   1. minimise the negative log complete likelihood over the unconstrained
#      psi parameter (Brent) to obtain a scale factor;
#   2. integrate the rescaled likelihood over psi in [0, 1] with quad;
#   3. undo the rescaling on the log scale.
#
# Returns NaN when the minimiser reports failure.
#
# NOTE(review): mu_cg is hard-coded to 9 in both the minimisation and the
# integration, and y_cg is accepted but unused — this looks like placeholder
# code; confirm before relying on it.
# The leftover debug print and the unused locals were removed.
cdef double neg_log_single_marginal_likelihood_nob_psi(params_e, params_g, abkt_c, y_ce1, y_ce0, y_cg):
    cdef double min_val
    min_neg_log = minimize_scalar(neg_log_single_complete_likelihood_nob_psi_forminimize, args=(9, params_e, params_g, abkt_c, y_ce1, y_ce0), method='brent')

    if not min_neg_log.success:
        return float('nan')

    min_val = min_neg_log.fun
    # Passing min_val as the scale factor makes the integrand peak near exp(0) = 1.
    integral = quad(single_complete_likelihood_nob_psi, 0, 1, args = (9, params_e, params_g, abkt_c, y_ce1, y_ce0, min_val))
    return -(log(integral[0]) - min_val)
133 | #############################################################################################################################
134 |
# Negative log complete-data likelihood for one cell in the model without
# the p_g mixing weight.
#
#   mu_cg    : latent value being integrated over.
#   params_g : [theta_g, sigma_g], mean and standard deviation of the normal
#              term on mu_cg.
#   abkt_c   : entries 0..3 are a_c, b_c (Poisson log-mean a_c + b_c * mu_cg)
#              and k_c, t_c (logistic term k_c + t_c * mu_cg).
#   y_cg     : observed count.
#
# A zero count is either "logistic term off" or "logistic term on and Poisson
# gives zero"; a positive count requires the logistic term on.
cdef double neg_log_single_complete_likelihood_nob(double mu_cg, params_g, abkt_c, y_cg):
    cdef double prior_mean = params_g[0]
    cdef double prior_sd = params_g[1]
    cdef double a_c = abkt_c[0]
    cdef double b_c = abkt_c[1]
    cdef double k_c = abkt_c[2]
    cdef double t_c = abkt_c[3]
    cdef double log_obs

    if y_cg==0:
        log_obs = log_sum_exp2(log_expit(-(k_c + t_c * mu_cg)), log_expit(k_c + t_c * mu_cg) + log_dpois0(a_c + b_c * mu_cg))
    else:
        log_obs = log_expit(k_c + t_c * mu_cg) + log_dpois(y_cg, a_c + b_c * mu_cg)
    return -(log_obs + log_dnorm(x=mu_cg, mu=prior_mean, sigma=prior_sd))
146 |
147 |
# Integrand for the marginal likelihood without the p_g mixing weight: the
# complete-data likelihood at mu_cg, multiplied by exp(scale_factor_cg).
cdef double single_complete_likelihood_nob(double mu_cg, params_g, abkt_c, y_cg, double scale_factor_cg):
    cdef double log_lik = -neg_log_single_complete_likelihood_nob(mu_cg, params_g, abkt_c, y_cg)
    return exp(log_lik + scale_factor_cg)
151 |
152 |
153 |
154 |
# Negative log marginal likelihood for one cell (model without p_g),
# obtained by integrating the complete-data likelihood over mu_cg.
#
# Steps:
#   1. locate the minimum of the negative log complete likelihood (Brent);
#   2. centre an integration window on it, with half-width
#      20 / sqrt(|second derivative|);
#   3. integrate the likelihood, rescaled by exp(minimum), with quad;
#   4. undo the rescaling on the log scale.
#
# Returns NaN when the minimiser reports failure.
cdef double neg_log_single_marginal_likelihood_nob(params_g, abkt_c, y_cg):
    cdef double peak_neg_log
    cdef double curvature
    cdef double half_width

    fit = minimize_scalar(neg_log_single_complete_likelihood_nob, args=(params_g, abkt_c, y_cg), method='brent')
    if not fit.success:
        return float('nan')

    mode = fit.x
    peak_neg_log = fit.fun
    curvature = second_order_derivative_nob(abkt_c, params_g, mode, y_cg)
    half_width = 20 / sqrt(abs(curvature))
    area = quad(single_complete_likelihood_nob, mode - half_width, mode + half_width, args = (params_g, abkt_c, y_cg, peak_neg_log))
    return -(log(area[0]) - peak_neg_log)
173 |
174 |
# Objective function: sum over cells of the negative log marginal likelihood
# for the model without the p_g mixing weight.
#
#   real_params_g : unconstrained parameters; entry 0 is used as theta_g and
#                   exp(entry 1) as sigma_g.
#   abkt          : 2-D indexable; row i is passed on as that cell's abkt_c.
#   y_g           : per-cell observed counts.
def neg_log_sum_marginal_likelihood_nob(real_params_g, abkt, y_g):
    constrained = [real_params_g[0], exp(real_params_g[1])]
    cdef double total = 0
    n_cells = len(y_g)
    for cell in range(n_cells):
        total += neg_log_single_marginal_likelihood_nob(constrained, abkt[cell,:], y_g[cell])
    return total
181 |
182 |
# Negative log complete-data likelihood for one cell in the full model.
#
#   mu_cg    : latent value being integrated over.
#   params_g : [theta_g, sigma_g, p_g]: mean and standard deviation of the
#              normal term on mu_cg, and the mixing probability p_g.
#   abkt_c   : entries 0..3 are a_c, b_c (Poisson log-mean a_c + b_c * mu_cg)
#              and k_c, t_c (logistic term k_c + t_c * mu_cg).
#   y_cg     : observed count.
#
# A zero count mixes three cases: weight 1 - p_g; weight p_g with the
# logistic term off; weight p_g with the logistic term on and a Poisson zero.
# A positive count requires weight p_g and the logistic term on.
cdef double neg_log_single_complete_likelihood(double mu_cg, params_g, abkt_c, long y_cg):
    cdef double prior_mean = params_g[0]
    cdef double prior_sd = params_g[1]
    cdef double p_on = params_g[2]
    cdef double a_c = abkt_c[0]
    cdef double b_c = abkt_c[1]
    cdef double k_c = abkt_c[2]
    cdef double t_c = abkt_c[3]
    cdef double log_obs

    if y_cg==0:
        log_obs = log_sum_exp(log(1-p_on), log(p_on) + log_expit(-(k_c + t_c * mu_cg)), log(p_on) + log_expit(k_c + t_c * mu_cg) + log_dpois0(a_c + b_c * mu_cg))
        return -(log_obs + log_dnorm(x=mu_cg, mu=prior_mean, sigma=prior_sd))
    return -(log(p_on) + log_expit(k_c + t_c * mu_cg) + log_dpois(y_cg, a_c + b_c * mu_cg) + log_dnorm(x=mu_cg, mu=prior_mean, sigma=prior_sd))
195 |
# Negative log complete-data likelihood for one cell in the UMI model.
#
# Differs from the non-UMI version in that the Poisson log-mean is simply
# a_c + mu_cg and there is no logistic term: only entry 0 of abkt_c is read.
#
#   params_g : [theta_g, sigma_g, p_g].
#   y_cg     : observed count; zero mixes weight 1 - p_g with weight p_g
#              times a Poisson zero.
cdef double neg_log_single_complete_likelihood_umi(double mu_cg, params_g, abkt_c, long y_cg):
    cdef double prior_mean = params_g[0]
    cdef double prior_sd = params_g[1]
    cdef double p_on = params_g[2]
    cdef double a_c = abkt_c[0]
    cdef double log_obs

    if y_cg==0:
        log_obs = log_sum_exp2(log(1-p_on), log(p_on) + log_dpois0(a_c + mu_cg))
        return -(log_obs + log_dnorm(x=mu_cg, mu=prior_mean, sigma=prior_sd))
    return -(log(p_on) + log_dpois(y_cg, a_c + mu_cg) + log_dnorm(x=mu_cg, mu=prior_mean, sigma=prior_sd))
208 |
209 |
# Integrand for the full-model marginal likelihood: the complete-data
# likelihood at mu_cg, multiplied by exp(scale_factor_cg).
cdef double single_complete_likelihood(double mu_cg, params_g, abkt_c, y_cg, scale_factor_cg):
    neg_log_lik = neg_log_single_complete_likelihood(mu_cg, params_g, abkt_c, y_cg)
    return exp(-neg_log_lik + scale_factor_cg)
212 |
# Integrand for the UMI marginal likelihood: the UMI complete-data likelihood
# at mu_cg, multiplied by exp(scale_factor_cg).
cdef double single_complete_likelihood_umi(double mu_cg, params_g, abkt_c, y_cg, scale_factor_cg):
    neg_log_lik = neg_log_single_complete_likelihood_umi(mu_cg, params_g, abkt_c, y_cg)
    return exp(-neg_log_lik + scale_factor_cg)
215 |
# Negative log marginal likelihood for one cell in the full model, obtained
# by integrating the complete-data likelihood over mu_cg.
#
# Steps:
#   1. locate the minimum of the negative log complete likelihood (Brent);
#   2. centre an integration window on it, with half-width
#      20 / sqrt(|second derivative|);
#   3. integrate the likelihood, rescaled by exp(minimum), with quad;
#   4. undo the rescaling on the log scale.
#
# Returns NaN when the minimiser reports failure.
cdef double neg_log_single_marginal_likelihood(params_g, abkt_c, y_cg):
    cdef double peak_neg_log
    cdef double curvature
    cdef double half_width

    fit = minimize_scalar(neg_log_single_complete_likelihood, args = (params_g, abkt_c, y_cg), method='brent')
    if not fit.success:
        return float('nan')

    mode = fit.x
    peak_neg_log = fit.fun
    curvature = second_order_derivative(abkt_c, params_g, mode, y_cg)
    half_width = 20 / sqrt(abs(curvature))
    area = quad(single_complete_likelihood, mode - half_width, mode + half_width, args = (params_g, abkt_c, y_cg, peak_neg_log))
    return -(log(area[0]) - peak_neg_log)
234 |
# Negative log marginal likelihood for one cell in the UMI model, obtained by
# integrating the UMI complete-data likelihood over mu_cg.
#
# Same recipe as the non-UMI version: Brent minimisation, an integration
# window of half-width 20 / sqrt(|second derivative|) around the minimiser,
# quad on the rescaled likelihood, then undo the rescaling on the log scale.
# Returns NaN when the minimiser reports failure.
#
# NOTE(review): the window is sized with second_order_derivative(), which is
# fed the full abkt_c, whereas the UMI likelihood only reads abkt_c[0] —
# confirm that reusing this curvature for the bounds is intended.
cdef double neg_log_single_marginal_likelihood_umi(params_g, abkt_c, y_cg):
    cdef double peak_neg_log
    cdef double curvature
    cdef double half_width

    fit = minimize_scalar(neg_log_single_complete_likelihood_umi, args = (params_g, abkt_c, y_cg), method='brent')
    if not fit.success:
        return float('nan')

    mode = fit.x
    peak_neg_log = fit.fun
    curvature = second_order_derivative(abkt_c, params_g, mode, y_cg)
    half_width = 20 / sqrt(abs(curvature))
    area = quad(single_complete_likelihood_umi, mode - half_width, mode + half_width, args = (params_g, abkt_c, y_cg, peak_neg_log))
    return -(log(area[0]) - peak_neg_log)
253 |
254 |
# Objective function: sum over cells of the full-model negative log marginal
# likelihood, with one parameter set shared by all cells.
#
#   real_params_g : unconstrained parameters; mapped to
#                   [entry 0, exp(entry 1), expit(entry 2)]
#                   = [theta_g, sigma_g, p_g].
#   abkt          : 2-D indexable; row i is passed on as that cell's abkt_c.
#   y_g           : per-cell observed counts.
def neg_log_sum_marginal_likelihood(real_params_g, abkt, y_g):
    constrained = [real_params_g[0], exp(real_params_g[1]), expit(real_params_g[2])]
    cdef double total = 0
    n_cells = len(y_g)
    for cell in range(n_cells):
        total += neg_log_single_marginal_likelihood(constrained, abkt[cell,:], y_g[cell])
    return total
261 |
# Objective function: sum over cells of the UMI negative log marginal
# likelihood, with one parameter set shared by all cells.
#
#   real_params_g : unconstrained parameters; mapped to
#                   [entry 0, exp(entry 1), expit(entry 2)]
#                   = [theta_g, sigma_g, p_g].
#   abkt          : 2-D indexable; row i is passed on as that cell's abkt_c.
#   y_g           : per-cell observed counts.
def neg_log_sum_marginal_likelihood_umi(real_params_g, abkt, y_g):
    constrained = [real_params_g[0], exp(real_params_g[1]), expit(real_params_g[2])]
    cdef double total = 0
    n_cells = len(y_g)
    for cell in range(n_cells):
        total += neg_log_single_marginal_likelihood_umi(constrained, abkt[cell,:], y_g[cell])
    return total
268 |
# Objective function for the model in which p_g differs between groups while
# theta_g and sigma_g are shared.
#
#   real_params_g : [theta, log sigma, logit p (weight 1 - x), logit p (weight x)].
#   abkt          : 2-D indexable; row i is passed on as that cell's abkt_c.
#   y_g           : per-cell observed counts.
#   x_g           : per-cell weight of the second group (presumably a 0/1
#                   group indicator — confirm against the caller).
def neg_log_sum_marginal_likelihood_free_p(real_params_g, abkt, y_g, x_g):
    cdef double total = 0
    for cell in range(len(y_g)):
        grp2 = x_g[cell]
        p_cell = expit(real_params_g[2]) * (1 - grp2) + expit(real_params_g[3]) * grp2
        cell_params = [real_params_g[0], exp(real_params_g[1]), p_cell]
        total += neg_log_single_marginal_likelihood(cell_params, abkt[cell,:], y_g[cell])
    return total
275 |
# Objective function for the model in which theta_g differs between groups
# while sigma_g and p_g are shared.
#
#   real_params_g : [theta (weight 1 - x), theta (weight x), log sigma, logit p].
#   abkt          : 2-D indexable; row i is passed on as that cell's abkt_c.
#   y_g           : per-cell observed counts.
#   x_g           : per-cell weight of the second group (presumably a 0/1
#                   group indicator — confirm against the caller).
def neg_log_sum_marginal_likelihood_free_theta(real_params_g, abkt, y_g, x_g):
    cdef double total = 0
    for cell in range(len(y_g)):
        grp2 = x_g[cell]
        theta_cell = real_params_g[0] * (1-grp2) + real_params_g[1] * grp2
        cell_params = [theta_cell, exp(real_params_g[2]), expit(real_params_g[3])]
        total += neg_log_single_marginal_likelihood(cell_params, abkt[cell,:], y_g[cell])
    return total
282 |
# UMI counterpart of the free-theta objective: theta_g differs between groups
# while sigma_g and p_g are shared; each cell is scored with the UMI marginal
# likelihood.
#
#   real_params_g : [theta (weight 1 - x), theta (weight x), log sigma, logit p].
#   abkt          : 2-D indexable; row i is passed on as that cell's abkt_c.
#   y_g           : per-cell observed counts.
#   x_g           : per-cell weight of the second group (presumably a 0/1
#                   group indicator — confirm against the caller).
def neg_log_sum_marginal_likelihood_free_theta_umi(real_params_g, abkt, y_g, x_g):
    cdef double total = 0
    for cell in range(len(y_g)):
        grp2 = x_g[cell]
        theta_cell = real_params_g[0] * (1-grp2) + real_params_g[1] * grp2
        cell_params = [theta_cell, exp(real_params_g[2]), expit(real_params_g[3])]
        total += neg_log_single_marginal_likelihood_umi(cell_params, abkt[cell,:], y_g[cell])
    return total
289 |
# Objective function for the model in which both theta_g and p_g differ
# between groups while sigma_g is shared.
#
#   real_params_g : [theta (weight 1 - x), theta (weight x), log sigma,
#                    logit p (weight 1 - x), logit p (weight x)].
#   abkt          : 2-D indexable; row i is passed on as that cell's abkt_c.
#   y_g           : per-cell observed counts.
#   x_g           : per-cell weight of the second group (presumably a 0/1
#                   group indicator — confirm against the caller).
#
# Fix: the group-specific p_g used to be expit(real_params_g[3]) * (1 - x)
# with no second-group term, so every cell with x = 1 got p_g = 0 and a
# non-zero count there produced log(0).  The second-group term is now added
# from real_params_g[4], mirroring the mixing used by the free-p objective.
# For backward compatibility, a parameter vector with only four entries keeps
# the historical behaviour (no second-group term); supply five entries to fit
# a genuinely group-specific p_g.
def neg_log_sum_marginal_likelihood_free_both(real_params_g, abkt, y_g, x_g):
    cdef double sum_marginal_likelihood = 0
    has_group2_p = len(real_params_g) > 4
    for i in range(len(y_g)):
        p_g = expit(real_params_g[3]) * (1 - x_g[i])
        if has_group2_p:
            p_g = p_g + expit(real_params_g[4]) * x_g[i]
        params_g = [real_params_g[0] * (1-x_g[i]) + real_params_g[1] * x_g[i], exp(real_params_g[2]), p_g]
        sum_marginal_likelihood += neg_log_single_marginal_likelihood(params_g, abkt[i,:], y_g[i])
    return sum_marginal_likelihood
296 |
297 | # testing BETA parameters NOT completed
298 |
299 |
# Objective function for the (unfinished) Beta-distributed psi model with
# group-specific parameters.
#
#   real_params_e : log-scale Beta shape parameters; entries 0..1 carry
#                   weight (1 - x), entries 2..3 carry weight x.
#   est_params_g  : already-estimated values; entries 0..1 carry weight
#                   (1 - x), entries 2..3 carry weight x.
#   abkt          : 2-D indexable; row i is passed on as that cell's abkt_c.
#   y_g, y_e1, y_e0 : per-cell counts.
#   x_g           : per-cell weight of the second group (presumably a 0/1
#                   group indicator — confirm against the caller).
#
# Cells in which neither y_e1 nor y_e0 is positive contribute nothing.
def neg_log_sum_marginal_likelihood_psi_both(real_params_e, est_params_g, abkt, y_g, y_e1, y_e0, x_g):
    cdef double total = 0
    for cell in range(len(y_g)):
        if not (y_e1[cell] > 0 or y_e0[cell] > 0):
            continue
        grp2 = x_g[cell]
        shape_params = [exp(real_params_e[0]) * (1-grp2) + exp(real_params_e[2]) * grp2, exp(real_params_e[1]) * (1-grp2) + exp(real_params_e[3]) * grp2 ]
        gene_params = [est_params_g[0] * (1-grp2) + est_params_g[2] * grp2, est_params_g[1] * (1-grp2) + est_params_g[3] * grp2]
        total += neg_log_single_marginal_likelihood_nob_psi(shape_params, gene_params, abkt[cell,:], y_e1[cell], y_e0[cell], y_g[cell])
    return total
309 |
310 |
311 | # testing PSI
312 |
# Objective function for the psi test with ONE psi shared by both groups.
#
#   real_params_g : [logit psi, log sd for y_ce1, log sd for y_ce0].
#   abkt          : 2-D indexable; row i is passed on as that cell's abkt_c.
#   y_ce1, y_ce0  : per-cell counts for the two exon read classes.
#   x_g           : per-cell weight of the second group (presumably a 0/1
#                   group indicator — confirm against the caller).
#   theta_g1, theta_g2 : per-group means; each is split into psi * theta
#                   (for y_ce1) and theta - psi * theta (for y_ce0).
#   sigma_g1, sigma_g2 : accepted but not used.
#   group_status  : selects which counts contribute; see the flags below.
#                   Any other value yields 0.
def neg_log_sum_marginal_likelihood_psi_equal_variance(real_params_g, abkt, y_ce1, y_ce0, x_g, theta_g1, theta_g2, sigma_g1, sigma_g2, group_status):
    cdef double total = 0
    cdef double psi = expit(real_params_g[0])
    cdef double incl_mean_1 = theta_g1 * psi
    cdef double excl_mean_1 = theta_g1 - incl_mean_1
    cdef double incl_mean_2 = theta_g2 * psi
    cdef double excl_mean_2 = theta_g2 - incl_mean_2
    # The same standard deviation is used for both groups.
    cdef double incl_sd = exp(real_params_g[1])
    cdef double excl_sd = exp(real_params_g[2])

    use_incl = group_status == "+\-" or group_status == "+"
    use_excl = group_status == "+\-" or group_status == "-"
    if not (use_incl or use_excl):
        return total

    for cell in range(len(y_ce1)):
        grp2 = x_g[cell]
        incl_params = [incl_mean_1 * (1-grp2) + (incl_mean_2 * grp2), incl_sd * (1-grp2) + incl_sd * grp2]
        excl_params = [excl_mean_1 * (1-grp2) + (excl_mean_2 * grp2), excl_sd * (1-grp2) + excl_sd * grp2]
        if use_incl and use_excl:
            total += neg_log_single_marginal_likelihood_nob(incl_params, abkt[cell,:], y_ce1[cell]) + neg_log_single_marginal_likelihood_nob(excl_params, abkt[cell,:], y_ce0[cell])
        elif use_incl:
            total += neg_log_single_marginal_likelihood_nob(incl_params, abkt[cell,:], y_ce1[cell])
        else:
            total += neg_log_single_marginal_likelihood_nob(excl_params, abkt[cell,:], y_ce0[cell])

    return total
344 |
# Objective function for the psi test with a SEPARATE psi per group.
#
#   real_params_g : [logit psi (group 1), logit psi (group 2),
#                    log sd for y_ce1, log sd for y_ce0].
#   abkt          : 2-D indexable; row i is passed on as that cell's abkt_c.
#   y_ce1, y_ce0  : per-cell counts for the two exon read classes.
#   x_g           : per-cell weight of the second group (presumably a 0/1
#                   group indicator — confirm against the caller).
#   theta_g1, theta_g2 : per-group means; each is split with its own psi into
#                   psi * theta (for y_ce1) and theta - psi * theta (for y_ce0).
#   sigma_g1, sigma_g2 : accepted but not used.
#   group_status  : selects which counts contribute; see the flags below.
#                   Any other value yields 0.
def neg_log_sum_marginal_likelihood_psi_free_equal_variance(real_params_g, abkt, y_ce1, y_ce0, x_g, theta_g1, theta_g2, sigma_g1, sigma_g2, group_status):
    cdef double total = 0
    cdef double psi_1 = expit(real_params_g[0])
    cdef double psi_2 = expit(real_params_g[1])
    cdef double incl_mean_1 = theta_g1 * psi_1
    cdef double excl_mean_1 = theta_g1 - incl_mean_1
    cdef double incl_mean_2 = theta_g2 * psi_2
    cdef double excl_mean_2 = theta_g2 - incl_mean_2
    # The same standard deviation is used for both groups.
    cdef double incl_sd = exp(real_params_g[2])
    cdef double excl_sd = exp(real_params_g[3])

    use_incl = group_status == "+\-" or group_status == "+"
    use_excl = group_status == "+\-" or group_status == "-"
    if not (use_incl or use_excl):
        return total

    for cell in range(len(y_ce1)):
        grp2 = x_g[cell]
        incl_params = [incl_mean_1 * (1-grp2) + (incl_mean_2 * grp2), incl_sd * (1-grp2) + incl_sd * grp2]
        excl_params = [excl_mean_1 * (1-grp2) + (excl_mean_2 * grp2), excl_sd * (1-grp2) + excl_sd * grp2]
        if use_incl and use_excl:
            total += neg_log_single_marginal_likelihood_nob(incl_params, abkt[cell,:], y_ce1[cell]) + neg_log_single_marginal_likelihood_nob(excl_params, abkt[cell,:], y_ce0[cell])
        elif use_incl:
            total += neg_log_single_marginal_likelihood_nob(incl_params, abkt[cell,:], y_ce1[cell])
        else:
            total += neg_log_single_marginal_likelihood_nob(excl_params, abkt[cell,:], y_ce0[cell])

    return total
377 |
378 |
379 | ### testing psi for UMI data
380 |
# UMI objective function for the psi test with ONE psi shared by both groups.
#
#   real_params_g : [logit psi, log sd for y_ce1, log sd for y_ce0].
#   abkt          : 2-D indexable; row i is passed on as that cell's abkt_c.
#   y_ce1, y_ce0  : per-cell counts for the two exon read classes.
#   x_g           : per-cell weight of the second group (presumably a 0/1
#                   group indicator — confirm against the caller).
#   theta_g1, theta_g2 : per-group means; each is split into psi * theta
#                   (for y_ce1) and theta - psi * theta (for y_ce0).
#   sigma_g1, sigma_g2 : accepted but not used.
#   p_bursting    : appended as the third likelihood parameter when both
#                   counts are scored.
#   group_status  : selects which counts contribute; see the flags below.
#                   Any other value yields 0.
#
# NOTE(review): only the both-counts case uses the UMI likelihood with
# p_bursting; the single-count cases call the non-UMI *_nob likelihood with
# two-element parameter lists — confirm this asymmetry is intended.
def neg_log_sum_marginal_likelihood_psi_equal_variance_umi(real_params_g, abkt, y_ce1, y_ce0, x_g, theta_g1, theta_g2, sigma_g1, sigma_g2, p_bursting, group_status):
    cdef double total = 0
    cdef double psi = expit(real_params_g[0])
    cdef double incl_mean_1 = theta_g1 * psi
    cdef double excl_mean_1 = theta_g1 - incl_mean_1
    cdef double incl_mean_2 = theta_g2 * psi
    cdef double excl_mean_2 = theta_g2 - incl_mean_2
    # The same standard deviation is used for both groups.
    cdef double incl_sd = exp(real_params_g[1])
    cdef double excl_sd = exp(real_params_g[2])

    score_both = group_status == "+\-"
    score_incl_only = group_status == "+"
    score_excl_only = group_status == "-"
    if not (score_both or score_incl_only or score_excl_only):
        return total

    for cell in range(len(y_ce1)):
        grp2 = x_g[cell]
        incl_params = [incl_mean_1 * (1-grp2) + (incl_mean_2 * grp2), incl_sd * (1-grp2) + incl_sd * grp2]
        excl_params = [excl_mean_1 * (1-grp2) + (excl_mean_2 * grp2), excl_sd * (1-grp2) + excl_sd * grp2]
        if score_both:
            total += neg_log_single_marginal_likelihood_umi(incl_params + [p_bursting], abkt[cell,:], y_ce1[cell]) + neg_log_single_marginal_likelihood_umi(excl_params + [p_bursting], abkt[cell,:], y_ce0[cell])
        elif score_incl_only:
            total += neg_log_single_marginal_likelihood_nob(incl_params, abkt[cell,:], y_ce1[cell])
        else:
            total += neg_log_single_marginal_likelihood_nob(excl_params, abkt[cell,:], y_ce0[cell])

    return total
412 |
413 |
414 |
# UMI objective function for the psi test with a SEPARATE psi per group.
#
#   real_params_g : [logit psi (group 1), logit psi (group 2),
#                    log sd for y_ce1, log sd for y_ce0].
#   abkt          : 2-D indexable; row i is passed on as that cell's abkt_c.
#   y_ce1, y_ce0  : per-cell counts for the two exon read classes.
#   x_g           : per-cell weight of the second group (presumably a 0/1
#                   group indicator — confirm against the caller).
#   theta_g1, theta_g2 : per-group means; each is split with its own psi into
#                   psi * theta (for y_ce1) and theta - psi * theta (for y_ce0).
#   sigma_g1, sigma_g2 : accepted but not used.
#   p_bursting    : appended as the third likelihood parameter when both
#                   counts are scored.
#   group_status  : selects which counts contribute; see the flags below.
#                   Any other value yields 0.
#
# NOTE(review): only the both-counts case uses the UMI likelihood with
# p_bursting; the single-count cases call the non-UMI *_nob likelihood with
# two-element parameter lists — confirm this asymmetry is intended.
def neg_log_sum_marginal_likelihood_psi_free_equal_variance_umi(real_params_g, abkt, y_ce1, y_ce0, x_g, theta_g1, theta_g2, sigma_g1, sigma_g2, p_bursting, group_status):
    cdef double total = 0
    cdef double psi_1 = expit(real_params_g[0])
    cdef double psi_2 = expit(real_params_g[1])
    cdef double incl_mean_1 = theta_g1 * psi_1
    cdef double excl_mean_1 = theta_g1 - incl_mean_1
    cdef double incl_mean_2 = theta_g2 * psi_2
    cdef double excl_mean_2 = theta_g2 - incl_mean_2
    # The same standard deviation is used for both groups.
    cdef double incl_sd = exp(real_params_g[2])
    cdef double excl_sd = exp(real_params_g[3])

    score_both = group_status == "+\-"
    score_incl_only = group_status == "+"
    score_excl_only = group_status == "-"
    if not (score_both or score_incl_only or score_excl_only):
        return total

    for cell in range(len(y_ce1)):
        grp2 = x_g[cell]
        incl_params = [incl_mean_1 * (1-grp2) + (incl_mean_2 * grp2), incl_sd * (1-grp2) + incl_sd * grp2]
        excl_params = [excl_mean_1 * (1-grp2) + (excl_mean_2 * grp2), excl_sd * (1-grp2) + excl_sd * grp2]
        if score_both:
            total += neg_log_single_marginal_likelihood_umi(incl_params + [p_bursting], abkt[cell,:], y_ce1[cell]) + neg_log_single_marginal_likelihood_umi(excl_params + [p_bursting], abkt[cell,:], y_ce0[cell])
        elif score_incl_only:
            total += neg_log_single_marginal_likelihood_nob(incl_params, abkt[cell,:], y_ce1[cell])
        else:
            total += neg_log_single_marginal_likelihood_nob(excl_params, abkt[cell,:], y_ce0[cell])

    return total
448 |
--------------------------------------------------------------------------------
/bin/likelihoodumi.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/bin/likelihoodumi.so
--------------------------------------------------------------------------------
/bin/my_functions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | from __future__ import print_function
4 | from collections import defaultdict
5 | import math, sys, os, re, time
6 |
7 | # set up auto dictionary function
def auto_dict():
    """Return a dictionary whose missing keys spring into existence as nested auto-dicts.

    Any depth of keys can be assigned without creating the intermediate
    levels first, e.g. ``d["a"]["b"]["c"] = 1``.
    """
    nested = defaultdict(auto_dict)
    return nested
10 |
11 | # make a directory
def mk_dir(path):
    """Create directory *path* if it does not already exist.

    The directory is created with ``os.makedirs`` rather than by shelling out
    to ``mkdir``: a path containing spaces or shell metacharacters is handled
    correctly, missing parent directories are created as well, and a failure
    is reported instead of being silently ignored.

    Raises:
        OSError: if the directory cannot be created (for example because
            *path* exists but is not a directory).
    """
    if os.path.isdir(path):
        return
    try:
        os.makedirs(path)
    except OSError:
        # Another process may have created it between the check and the
        # call; only a directory that is still missing is a real failure.
        if not os.path.isdir(path):
            raise
    return
17 |
18 | # parse arguments
def parse_argument(validArgList, addAbsPath, warnMessage):
    """Extract the values of the given command-line flags from ``sys.argv``.

    Args:
        validArgList: flags to look for (e.g. ``["-i", "-o"]``).  Any other
            argument starting with ``-`` is rejected.
        addAbsPath: one code per flag, at the same position, saying how the
            value following the flag is interpreted:
            ``0`` plain value (the text after the last ``/`` is kept);
            ``1`` existing file, returned as an absolute path;
            ``2`` existing directory, returned as an absolute path;
            ``3`` file to be created, returned as an absolute path.
            Any other code yields the value split on ``/`` as a list.
        warnMessage: printed before exiting when a flag is missing, is given
            more than once, or is not followed by a value.

    Returns:
        List of parsed values, in the order of *validArgList*.

    Values containing ``~`` are expanded in place in ``sys.argv``.  Every
    error is reported on stdout and ends the program through ``sys.exit()``.
    A flag that is the last command-line argument used to raise IndexError;
    it is now reported with *warnMessage* like any other usage error.
    """
    argv = sys.argv

    for token in argv[1:]:
        # startswith() also copes with an empty argument.
        if token.startswith("-") and token not in validArgList:
            print("Argument \'"+token+"\' is invalid!")
            sys.exit()

    # assign arguments to a list
    outList = []
    for i, flag in enumerate(validArgList):
        for pos in range(1, len(argv)):
            if argv[pos] != flag:
                continue

            valuePos = pos + 1
            if valuePos >= len(argv):
                # The flag is the last argument, so it has no value.
                print(warnMessage)
                sys.exit()

            if "~" in argv[valuePos]:
                argv[valuePos] = os.path.expanduser(argv[valuePos])
            value = argv[valuePos]
            parentDir = os.path.dirname(os.path.abspath(value))
            pieces = value.split("/")
            mode = addAbsPath[i]

            if mode == 1:  # target file
                parsed = parentDir + "/" + pieces[-1]
                if not os.path.exists(parsed):
                    print(parsed+" does not exist!")
                    sys.exit()
            elif mode == 3:  # create target file
                parsed = parentDir + "/" + pieces[-1]
            elif mode == 0:  # value
                parsed = pieces[-1]
            elif mode == 2:  # target directory
                parsed = os.path.abspath(value)
                if not os.path.isdir(parsed):
                    print(parsed+" does not exist!")
                    sys.exit()
            else:
                parsed = pieces

            outList.append(parsed)

    if len(outList) != len(validArgList):
        print(warnMessage)
        sys.exit()
    return outList
58 |
# check modules
# The ``imp`` module was removed in Python 3.12, and importing it at module
# level made this whole file unimportable there.  importlib is preferred and
# ``imp`` is only used as a fallback on interpreters without importlib.util.
try:
    from importlib.util import find_spec as _find_module_spec
except ImportError:  # Python 2
    _find_module_spec = None


def check_module_exists(name):
    """Return True if the module called *name* can be found, else False.

    A top-level module is located without being imported.  For a dotted name
    the parent package has to be imported to resolve it; a missing parent
    counts as "not found".
    """
    if _find_module_spec is not None:
        try:
            return _find_module_spec(name) is not None
        except (ImportError, ValueError):
            return False

    import imp  # legacy interpreters only
    try:
        imp.find_module(name)
    except ImportError:
        return False
    return True
67 |
def check_module(module):
    """Print whether the Python module *module* is installed."""
    if check_module_exists(module):
        print("Module \'" + module + "\' is installed.")
    else:
        print("Module \'" + module + "\' is NOT installed!")
    return
75 |
76 | # check program
77 | from subprocess import Popen, PIPE
78 |
def check_program_exists(name):
    """Return True if an executable called *name* is found on the PATH.

    ``shutil.which`` is used when available, so the check no longer depends
    on a ``which`` binary living at exactly ``/usr/bin/which`` (a missing
    binary used to raise OSError).  The external command is kept only as a
    fallback for interpreters without ``shutil.which``.
    """
    try:
        from shutil import which
    except ImportError:  # Python 2
        which = None

    if which is not None:
        return which(name) is not None

    p = Popen(['/usr/bin/which', name], stdout=PIPE, stderr=PIPE)
    p.communicate()
    return p.returncode == 0
83 |
def check_program(program):
    """Print whether the external program *program* is installed."""
    if check_program_exists(program):
        print("Program \'" + program + "\' is installed.")
    else:
        print("Program \'" + program + "\' is NOT installed!")
    return
91 |
92 | # check file
def check_file(name, othermessage):
    """Exit the program with a message if the path ``name`` does not exist.

    Parameters:
        name: path to test with ``os.path.exists``.
        othermessage: text appended to the "does not exist!" message.
    """
    if os.path.exists(name):
        return
    print(name + " does not exist!" + othermessage)
    sys.exit()
98 |
--------------------------------------------------------------------------------
/bin/my_functions.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/bin/my_functions.pyc
--------------------------------------------------------------------------------
/bin/scats_functions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | from __future__ import print_function
4 | from collections import defaultdict
5 | import math, sys, os, re, time
6 |
7 | # check whether meta file is qualified
def check_meta(metaFile, umiRun, onebam):
    """Validate the tab-separated meta file, exiting on the first problem.

    Every line must provide at least three columns (cell barcode, condition,
    BAM path).  The BAM file and its ``.bai`` index must exist.  Depending on
    the run mode, extra tag columns are required:

    * ``umiRun == "yes"``: UMI tag in the 4th column, and additionally the
      cell tag in the 5th column when ``onebam == "yes"``.
    * ``umiRun == "no"`` and ``onebam == "yes"``: cell tag in the 4th column.

    A missing tag column is reported with the same message as an empty one
    instead of failing with an IndexError.

    Parameters:
        metaFile: path of the meta file.
        umiRun: "yes" or "no".
        onebam: "yes" or "no".

    Returns:
        None.  Calls ``sys.exit()`` after printing a message on any failure.
    """
    # (label used in the message, column index, ordinal used in the message)
    required_tags = []
    if umiRun == "yes":
        required_tags.append(("UMI", 3, "4th"))
        if onebam == "yes":
            required_tags.append(("cell", 4, "5th"))
    elif umiRun == "no" and onebam == "yes":
        required_tags.append(("cell", 3, "4th"))

    with open(metaFile, "r") as FP:
        for line in FP:
            line = line.strip("\n")
            tmpinf = line.split("\t")
            if len(tmpinf) < 3:
                print("Meta file line '" + line + "' must have at least 3 tab-separated columns (cell, condition, BAM file)!")
                sys.exit()
            bamfile = tmpinf[2]

            if not os.path.isfile(bamfile):
                print(bamfile+" does not exist!")
                sys.exit()
            if not os.path.isfile(bamfile+".bai"):
                print(bamfile+".bai does not exist! Please index BAM file.")
                sys.exit()

            for label, column, ordinal in required_tags:
                # Treat an absent column exactly like an empty one.
                if len(tmpinf) <= column or not tmpinf[column]:
                    print("Please specify " + label + " tag name for each cell at " + ordinal + " column of meta file!")
                    sys.exit()
    return
54 |
55 | # check count file
def check_count_file(metaFile, tmpDir):
    """Check that every cell in the meta file has a count file in ``tmpDir``.

    The meta file is first validated with ``check_meta`` (no UMI, one BAM per
    cell).  For each line the file ``<tmpDir>/count_<column 1>.out`` must
    exist; otherwise a message is printed and the program exits.

    Returns:
        Sorted list of the distinct condition labels (2nd column).
    """
    check_meta(metaFile, "no", "no")
    conditions = set()
    with open(metaFile, "r") as meta:
        for record in meta:
            fields = record.strip("\n").split("\t")
            countFile = tmpDir + "/count_" + fields[0] + ".out"
            if not os.path.exists(countFile):
                print(countFile+" does not exist! Please run SCATS.py -task count to obtain read count files.")
                sys.exit()
            conditions.add(fields[1])

    return sorted(conditions)
74 |
75 |
76 | ## write count sh file to tmp directory
## write count sh file to tmp directory
def write_count_sh(fileAbsPath, umiRun, onebam, metaFile, tmpDir, refgeneFile, gpinfoFile):
    """Write one ``count_<cell>.sh`` script per meta-file line into ``tmpDir``.

    Each script holds a single ``python .../bin/getCount*.py`` command whose
    script name and tag options depend on the run mode:

    * UMI + shared BAM : getCount_umi_cellid.py, -cellid/-celltag/-umitag
    * UMI only         : getCount_umi.py,        -umitag
    * shared BAM only  : getCount_cellid.py,     -cellid/-celltag
    * neither          : getCount.py

    The meta file is validated with ``check_meta`` first.  Any other
    combination of ``umiRun``/``onebam`` values writes nothing.

    Parameters:
        fileAbsPath: directory containing the ``bin`` folder with the scripts.
        umiRun, onebam: "yes" or "no".
        metaFile: tab-separated meta file (cell, condition, BAM, tags...).
        tmpDir: directory receiving the ``.sh`` files; also used for ``-out``.
        refgeneFile, gpinfoFile: passed through as ``-ref`` and ``-gpinfo``.
    """
    check_meta(metaFile, umiRun, onebam)

    count_scripts = {
        ("yes", "yes"): "getCount_umi_cellid.py",
        ("yes", "no"): "getCount_umi.py",
        ("no", "yes"): "getCount_cellid.py",
        ("no", "no"): "getCount.py",
    }
    script = count_scripts.get((umiRun, onebam))
    if script is None:
        return

    with open(metaFile, "r") as FP:
        for line in FP:
            tmpinf = line.strip("\n").split("\t")
            cellbc = tmpinf[0]
            bamfile = tmpinf[2]
            outPrefix = tmpDir + "/count_" + cellbc

            outwrite = "python " + fileAbsPath + "/bin/" + script + " -bam " + bamfile + " -ref " + refgeneFile + " -gpinfo " + gpinfoFile + " -out " + outPrefix + ".out"
            if umiRun == "yes" and onebam == "yes":
                outwrite += " -cellid " + cellbc + " -celltag " + tmpinf[4] + " -umitag " + tmpinf[3]
            elif umiRun == "yes":
                outwrite += " -umitag " + tmpinf[3]
            elif onebam == "yes":
                outwrite += " -cellid " + cellbc + " -celltag " + tmpinf[3]

            # ``with`` closes the script file even if the write fails; every
            # mode now ends the command with a newline.
            with open(outPrefix + ".sh", "w") as OUT:
                OUT.write(outwrite + "\n")

    return
141 |
--------------------------------------------------------------------------------
/bin/scats_functions.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/bin/scats_functions.pyc
--------------------------------------------------------------------------------
/bin/scats_isoform.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import numpy as np
4 | from mpi4py import MPI
5 | import argparse
6 | from time import strftime
7 | from numpy import log
8 | from scipy.optimize import minimize
9 | from scipy.special import expit
10 | from scipy.special import logit
11 | from numpy import exp
12 | from numpy import column_stack
13 | from numpy.random import uniform
14 | from scipy.stats import chi2
15 | import likelihoodumi
16 |
17 |
class logger():
    """Timestamped line logger writing through a file handle.

    The handle must provide ``Write_shared``, ``Sync`` and ``Close``.
    """

    def __init__(self, fh):
        self.mpi_fh = fh

    def log(self, string):
        """Write ``[MM/DD/YYYY HH:MM:SS]<TAB>string`` as one line and sync."""
        stamp = strftime("%m/%d/%Y %H:%M:%S")
        entry = ''.join(['[', stamp, ']\t', string, '\n'])
        self.mpi_fh.Write_shared(entry)
        self.mpi_fh.Sync()

    def close(self):
        """Close the underlying handle."""
        self.mpi_fh.Close()
28 |
29 |
class result_writer():
    """Tab-separated result writer on top of a file handle.

    The handle must provide ``Write_shared``, ``Sync`` and ``Close``.
    """

    def __init__(self, fh):
        self.mpi_fh = fh

    def write_header(self, type_comp):
        """Write the column header for comparison type 1-4; otherwise nothing."""
        reduced = ['theta_rd', 'sigma_rd', 'pg_rd']
        tail = ['lrt_stat', 'lrt_pval']
        columns_by_type = {
            1: ['theta_nob', 'sigma_nob', 'theta', 'sigma', 'pg'],
            2: reduced + ['theta_full', 'sigma_full', 'pg_full_0', 'pg_full_1'],
            3: reduced + ['theta_full_0', 'theta_full_1', 'sigma_full', 'pg_full'],
            4: reduced + ['theta_full_0', 'theta_full_1', 'sigma_full', 'pg_full_0', 'pg_full_1'],
        }
        model_columns = columns_by_type.get(type_comp)
        if model_columns is None:
            return
        header = ['gene', 'optim_scs'] + model_columns + tail
        self.mpi_fh.Write_shared('\t'.join(header) + "\n")

    def log(self, res):
        """Write the items of ``res`` as one tab-separated line and sync."""
        fields = [str(item) for item in res]
        self.mpi_fh.Write_shared('\t'.join(fields) + '\n')
        self.mpi_fh.Sync()

    def close(self):
        """Close the underlying handle."""
        self.mpi_fh.Close()
50 |
51 |
def get_non_zero(y):
    """Return how many elements of ``y`` are strictly greater than zero."""
    return sum(1 for value in y if value > 0)
58 |
def get_psi_range(y_ce1, y_ce0):
    """Return a (lower, upper) window of +/-0.25 around the observed psi.

    The observed psi is ``sum(y_ce1) / (sum(y_ce1) + sum(y_ce0))``; the
    window is clipped to [0, 1].

    The sums are taken as floats so the ratio is a true division even for
    integer count arrays under Python 2, where ``/`` on integer scalars
    floor-divides.

    Parameters:
        y_ce1, y_ce0: array-likes of counts.

    Returns:
        Tuple ``(psi_lower, psi_upper)``.
    """
    total_ce1 = np.sum(y_ce1, dtype=float)
    total_ce0 = np.sum(y_ce0, dtype=float)
    psi_obs = total_ce1 / (total_ce1 + total_ce0)
    psi_upper = min(psi_obs+0.25, 1)
    psi_lower = max(psi_obs-0.25, 0)

    return psi_lower, psi_upper
65 |
def get_psi_range_grp(y_ce1, y_ce0, x):
    """Return per-group +/-0.25 windows around the observed psi.

    Cells are split by ``x == 0`` (group 1) and ``x == 1`` (group 2).  For
    each group psi is ``sum(y_ce1) / (sum(y_ce1) + sum(y_ce0))`` over that
    group's cells, and the window is clipped to [0, 1].

    The ``y_ce0`` group slices are taken from ``y_ce0``; slicing them from
    ``y_ce1`` would pin every observed psi at 0.5.  Sums are taken as floats
    so the ratio is a true division on Python 2 as well.

    Parameters:
        y_ce1, y_ce0: numpy arrays of counts, one entry per cell.
        x: numpy array of group labels (0 or 1), aligned with the counts.

    Returns:
        Tuple ``(psi_lower_grp1, psi_upper_grp1, psi_lower_grp2, psi_upper_grp2)``.
    """
    in_grp1 = x == 0
    in_grp2 = x == 1

    ce1_grp1 = np.sum(y_ce1[in_grp1], dtype=float)
    ce1_grp2 = np.sum(y_ce1[in_grp2], dtype=float)
    ce0_grp1 = np.sum(y_ce0[in_grp1], dtype=float)
    ce0_grp2 = np.sum(y_ce0[in_grp2], dtype=float)

    psi_obs_grp1 = ce1_grp1 / (ce1_grp1 + ce0_grp1)
    psi_obs_grp2 = ce1_grp2 / (ce1_grp2 + ce0_grp2)
    psi_upper_grp1 = min(psi_obs_grp1+0.25, 1)
    psi_lower_grp1 = max(psi_obs_grp1-0.25, 0)
    psi_upper_grp2 = min(psi_obs_grp2+0.25, 1)
    psi_lower_grp2 = max(psi_obs_grp2-0.25, 0)

    return psi_lower_grp1, psi_upper_grp1, psi_lower_grp2, psi_upper_grp2
80 |
81 |
def get_rr_range(y):
    """Return sampling bounds for random-restart starting values.

    Uses the column means of the module-level ``abkt_params`` (column 0 as
    alpha, column 1 as beta) together with the counts ``y``.

    Returns:
        ``(theta_lower, theta_upper, p_lower, p_upper, std_lower, std_upper)``.
    """
    column_means = np.mean(abkt_params, axis=0)
    alpha = column_means[0]
    beta = column_means[1]

    expressed = y[y > 0]
    theta_upper = (np.mean(np.log(expressed)) - alpha) / beta
    theta_lower = (-1 - alpha) / beta

    # Fraction of entries with a positive count is the lower bound for p.
    p_lower = float(np.sum(y>0)) / len(y)
    p_upper = 0.9

    std_upper = np.std(np.log(y + 1))/beta/beta
    std_lower = np.std(np.log(expressed + 1))/beta/beta
    return theta_lower, theta_upper, p_lower, p_upper, std_lower, std_upper
95 |
def get_rr_range_umi(y):
    """Return sampling bounds for random-restart starting values (UMI variant).

    Same as the non-UMI computation except that beta is fixed to 1; alpha is
    the mean of column 0 of the module-level ``abkt_params``.

    Returns:
        ``(theta_lower, theta_upper, p_lower, p_upper, std_lower, std_upper)``.
    """
    column_means = np.mean(abkt_params, axis=0)
    alpha = column_means[0]
    beta = 1

    expressed = y[y > 0]
    theta_upper = (np.mean(np.log(expressed)) - alpha) / beta
    theta_lower = (-1 - alpha) / beta

    # Fraction of entries with a positive count is the lower bound for p.
    p_lower = float(np.sum(y>0)) / len(y)
    p_upper = 0.9

    std_upper = np.std(np.log(y + 1))/beta/beta
    std_lower = np.std(np.log(expressed + 1))/beta/beta
    return theta_lower, theta_upper, p_lower, p_upper, std_lower, std_upper
109 |
110 |
def get_rr_range_grp(y, x):
    """Return random-restart sampling bounds separately for the two groups.

    Group 0 is ``x == 0`` and group 1 is ``x == 1``.  Each group uses the
    column means of its own rows of the module-level ``abkt_params``
    (column 0 as alpha, column 1 as beta).

    Returns:
        12-tuple: ``(theta_lower, theta_upper, p_lower, p_upper, std_lower,
        std_upper)`` for group 0 followed by the same six values for group 1.
    """
    def _group_bounds(y_grp, abkt_mean):
        # Bounds for one group, in the order of the returned tuple.
        alpha = abkt_mean[0]
        beta = abkt_mean[1]
        expressed = y_grp[y_grp > 0]
        theta_upper = (np.mean(np.log(expressed)) - alpha)/beta
        theta_lower = (-1 - alpha)/beta
        p_lower = float(np.sum(y_grp > 0)) / len(y_grp)
        std_upper = np.std(np.log(y_grp + 1))/beta/beta
        std_lower = np.std(np.log(expressed + 1))/beta/beta
        return theta_lower, theta_upper, p_lower, 0.9, std_lower, std_upper

    abkt_mean0 = np.mean(abkt_params[x==0,:], axis=0)
    abkt_mean1 = np.mean(abkt_params[x==1,:], axis=0)
    return _group_bounds(y[x==0], abkt_mean0) + _group_bounds(y[x==1], abkt_mean1)
131 |
def get_rr_range_grp_umi(y, x):
    """Return per-group random-restart sampling bounds (UMI variant).

    Group 0 is ``x == 0`` and group 1 is ``x == 1``.  Alpha is the mean of
    column 0 of the group's rows of the module-level ``abkt_params``; beta
    (entry 1 of the column means) is overwritten with 1.

    Returns:
        12-tuple: ``(theta_lower, theta_upper, p_lower, p_upper, std_lower,
        std_upper)`` for group 0 followed by the same six values for group 1.
    """
    def _group_bounds(y_grp, abkt_mean):
        # Bounds for one group, in the order of the returned tuple.
        alpha = abkt_mean[0]
        beta = abkt_mean[1]
        expressed = y_grp[y_grp > 0]
        theta_upper = (np.mean(np.log(expressed)) - alpha)/beta
        theta_lower = (-1 - alpha)/beta
        p_lower = float(np.sum(y_grp > 0)) / len(y_grp)
        std_upper = np.std(np.log(y_grp + 1))/beta/beta
        std_lower = np.std(np.log(expressed + 1))/beta/beta
        return theta_lower, theta_upper, p_lower, 0.9, std_lower, std_upper

    abkt_mean0 = np.mean(abkt_params[x==0,:], axis=0)
    abkt_mean1 = np.mean(abkt_params[x==1,:], axis=0)
    # UMI variant: beta is fixed to 1 for both groups.
    abkt_mean0[1] = 1
    abkt_mean1[1] = 1
    return _group_bounds(y[x==0], abkt_mean0) + _group_bounds(y[x==1], abkt_mean1)
154 |
155 |
156 |
def get_parsed_options():
    """Parse the command line (``sys.argv``) and return the argparse namespace.

    Attributes of the returned namespace: ``y_filename`` (required),
    ``x_filename``, ``abkt_filename``, ``type_op``, ``out_filename``,
    ``minNR`` and ``maxNR``.

    The help strings of ``-r``/``-m`` state the defaults that are actually
    in effect (1 and 3).
    """
    parser=argparse.ArgumentParser(description='TASC-B, a quantifier for gene expression incorporating gene bursting.')
    parser.add_argument('-y', '--counts', required = True, type=str, dest='y_filename', action='store', default='y.tsv',
                        help='name of the file containing the counts')
    parser.add_argument('-x', '--group', type=str, dest='x_filename', action='store', default='x.tsv',
                        help='name of the file containing group info')
    parser.add_argument('-k', '--abkt', type=str, dest='abkt_filename', action='store', default='abkt.tsv',
                        help='name of the file containing given abkt values')
    # NOTE(review): the dispatcher in this file also accepts types 5 and 6 and
    # runs a UMI theta test for type 4 -- confirm and extend this help text.
    parser.add_argument('-t', '--type', type=int, dest='type_op', action='store', default=1,
                        help='type of operation: \n1 - test p < 1; \n2 - test p1 != p2, \n3 - test t1 != t2, \n4 - test 2 and 3 simultaneously')
    parser.add_argument('-o', '--outdest', type=str, dest='out_filename', action='store', default='tasc_out.tsv',
                        help='name of the output file')
    parser.add_argument('-r', '--minrestart', type=int, dest='minNR', action='store', default=1,
                        help='minimum number of restarts for optimization (default=1)')
    parser.add_argument('-m', '--maxrestart', type=int, dest='maxNR', action='store', default=3,
                        help='max number of restarts for optimization (default=3)')
    args=parser.parse_args()
    return args
175 |
176 |
def parse_filter_counts(y_filename, size):
    """Parse the counts file and deal the retained genes out to ``size`` buckets.

    Each line is tab-separated: name, three comma-separated integer count
    lists, one comma-separated float list, and a group-status string.  A
    gene is kept only when the first count list has at least 3 positive
    entries; kept genes are assigned round-robin to the buckets.

    Counts are parsed with ``int`` so the function runs on Python 3 as well
    as Python 2 (``long`` exists only on Python 2).

    Parameters:
        y_filename: path of the counts file.
        size: number of buckets (one per worker).

    Returns:
        List of ``size`` lists of tuples
        ``(name, counts, counts1, counts0, est_params_g, group_status)``.
    """
    genes=[[] for _ in range(size)]
    log_fh.log('parsing counts file: ' + y_filename)

    with open(y_filename) as f:
        idx=0
        total_num_genes = 0
        for line in f:
            tokens=line.rstrip('\n').split('\t')
            counts=np.array([int(x) for x in tokens[1].split(',')])
            counts1=np.array([int(x) for x in tokens[2].split(',')])
            counts0=np.array([int(x) for x in tokens[3].split(',')])
            est_params_g=np.array([float(x) for x in tokens[4].split(',')])
            group_status=tokens[5]
            if get_non_zero(counts) >= 3:
                genes[idx].append((tokens[0], counts, counts1, counts0, est_params_g, group_status))
                idx += 1
                total_num_genes += 1
                # Wrap around so genes are spread evenly over the buckets.
                if idx >= size:
                    idx=0
    log_fh.log('total number of genes parsed: ' + str(total_num_genes))
    return genes
201 |
202 |
def opt_neg_log_sum_marginal_likelihood(gene_name, abkt, y_g, num_random_restarts, minrr):
    """Fit the no-bursting and bursting models for one gene and write an LRT row.

    Both models are minimised with L-BFGS-B from random starting points drawn
    inside the bounds from ``get_rr_range``.  Up to ``num_random_restarts``
    starts are tried and the loop stops once ``minrr`` of them succeeded; the
    run with the smallest objective is kept.  One 9-field row is written
    through the module-level ``res_fh``: a row of NaNs with ``False`` when a
    fit or the statistic is unusable, otherwise the estimates, the LRT
    statistic and its chi-square (df=1) p-value.

    Parameters:
        gene_name: identifier written in the first output column.
        abkt: parameter array forwarded to the likelihood functions.
        y_g: numpy array of counts for this gene.
        num_random_restarts: maximum number of random starts per model.
        minrr: number of successful starts after which to stop.
    """
    theta_lower, theta_upper, p_lower, p_upper, std_lower, std_upper = get_rr_range(y_g)

    # Starting points for the no-bursting model: (theta, log sigma).
    real_params_g_rtimes = column_stack((uniform(theta_lower, theta_upper, num_random_restarts),
                                         log(uniform(std_lower, std_upper, num_random_restarts))))

    arg_min_x=[]
    val_min_x=[]
    for i in range(num_random_restarts):
        log_fh.log('tasc optimization #' + str(i) + ' for gene ' + gene_name)
        real_params_g=real_params_g_rtimes[i,:]
        optim_result_obj=minimize(likelihoodumi.neg_log_sum_marginal_likelihood_nob, x0=real_params_g, args=(abkt, y_g), method='L-BFGS-B')
        # Keep only converged runs with a finite, non-zero objective.
        if optim_result_obj.success and (not np.isnan(optim_result_obj.fun)) and (not optim_result_obj.fun == 0):
            arg_min_x.append(optim_result_obj)
            val_min_x.append(optim_result_obj.fun)
            if len(arg_min_x) >= minrr:
                break

    if len(arg_min_x) == 0:
        res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), float('nan')))
        return
    tasc_nob_res = arg_min_x[np.argmin(val_min_x)]

    # Starting points for the bursting model: (theta, log sigma, logit p).
    real_params_g_rtimes = column_stack((uniform(theta_lower, theta_upper, num_random_restarts),
                                         log(uniform(std_lower, std_upper, num_random_restarts)),
                                         logit(uniform(p_lower, p_upper, num_random_restarts))))
    arg_min_x=[]
    val_min_x=[]

    for i in range(num_random_restarts):
        log_fh.log('tasc-b optimization #' + str(i) + ' for gene ' + gene_name)
        real_params_g=real_params_g_rtimes[i,:]
        optim_result_obj=minimize(likelihoodumi.neg_log_sum_marginal_likelihood, x0=real_params_g, args=(abkt, y_g), method='L-BFGS-B')
        if optim_result_obj.success and (not np.isnan(optim_result_obj.fun)) and (not optim_result_obj.fun == 0):
            arg_min_x.append(optim_result_obj)
            val_min_x.append(optim_result_obj.fun)
            if len(arg_min_x) >= minrr:
                break

    if len(arg_min_x) == 0:
        res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), float('nan')))
        return
    tasc_b_res=arg_min_x[np.argmin(val_min_x)]

    # Objectives are negative log-likelihoods: 2 * (reduced - full).
    lrt_stat = 2 * (tasc_nob_res.fun - tasc_b_res.fun)

    if np.isnan(lrt_stat):
        res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), float('nan')))
    else:
        # sf() keeps precision for tiny p-values where 1 - cdf() rounds to 0.
        lrt_pval = chi2.sf(lrt_stat, df=1)
        res_fh.log((gene_name, True, tasc_nob_res.x[0], exp(tasc_nob_res.x[1]), tasc_b_res.x[0], exp(tasc_b_res.x[1]), expit(tasc_b_res.x[2]), lrt_stat, lrt_pval))
256 |
257 |
def lrt_free_p(gene_name, abkt, y_g, num_random_restarts, minrr):
    """Likelihood-ratio test of a shared p against group-specific p for one gene.

    The reduced model (one theta, sigma, p) and the full model (one theta and
    sigma, one p per group) are minimised with L-BFGS-B from random starts.
    Up to ``num_random_restarts`` starts are tried per model, stopping once
    ``minrr`` succeeded; the run with the smallest objective is kept.  Groups
    come from the module-level ``group_info``.  One 11-field row is written
    through the module-level ``res_fh``: NaNs with ``False`` on failure,
    otherwise the estimates, the LRT statistic and its chi-square (df=1)
    p-value.

    Parameters:
        gene_name: identifier written in the first output column.
        abkt: parameter array forwarded to the likelihood functions.
        y_g: numpy array of counts for this gene.
        num_random_restarts: maximum number of random starts per model.
        minrr: number of successful starts after which to stop.
    """
    theta_lower, theta_upper, p_lower, p_upper, std_lower, std_upper = get_rr_range(y_g)

    # Reduced-model starting points: (theta, log sigma, logit p).
    real_params_g_rtimes = column_stack((uniform(theta_lower, theta_upper, num_random_restarts),
                                         log(uniform(std_lower, std_upper, num_random_restarts)),
                                         logit(uniform(p_lower, p_upper, num_random_restarts))))
    arg_min_x = []
    val_min_x = []

    for i in range(num_random_restarts):
        log_fh.log('tasc-b optimization #' + str(i) + ' for gene ' + gene_name)
        real_params_g = real_params_g_rtimes[i, :]
        optim_result_obj = minimize(likelihoodumi.neg_log_sum_marginal_likelihood, x0=real_params_g, args=(abkt, y_g),
                                    method='L-BFGS-B')
        # Keep only converged runs with a finite, non-zero objective.
        if optim_result_obj.success and (not np.isnan(optim_result_obj.fun)) and (not optim_result_obj.fun == 0):
            arg_min_x.append(optim_result_obj)
            val_min_x.append(optim_result_obj.fun)
            if len(arg_min_x) >= minrr:
                break

    if len(arg_min_x) == 0:
        res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'),
                    float('nan'), float('nan'), float('nan'), float('nan'), float('nan')))
        return
    tasc_b_res = arg_min_x[np.argmin(val_min_x)]

    theta_lower0, theta_upper0, p_lower0, p_upper0, std_lower0, std_upper0, theta_lower1, theta_upper1, p_lower1, p_upper1, std_lower1, std_upper1 = get_rr_range_grp(
        y_g, group_info)
    # Full-model starting points: (theta, log sigma, logit p0, logit p1);
    # the shared parameters are drawn from the union of the group ranges.
    real_params_g_rtimes=column_stack((uniform(min(theta_lower0, theta_lower1), max(theta_upper0, theta_upper1), num_random_restarts),
                                       log(uniform(min(std_lower0, std_lower1), max(std_upper0, std_upper1), num_random_restarts)),
                                       logit(uniform(p_lower0, p_upper0, num_random_restarts)),
                                       logit(uniform(p_lower1, p_upper1, num_random_restarts))))
    arg_min_x=[]
    val_min_x=[]

    for i in range(num_random_restarts):
        log_fh.log('tasc free p optimization #' + str(i) + ' for gene ' + gene_name)
        real_params_g=real_params_g_rtimes[i,:]
        optim_result_obj=minimize(likelihoodumi.neg_log_sum_marginal_likelihood_free_p, x0=real_params_g, args=(abkt, y_g, group_info), method='L-BFGS-B')
        if optim_result_obj.success and (not np.isnan(optim_result_obj.fun)) and (optim_result_obj.fun != 0):
            arg_min_x.append(optim_result_obj)
            val_min_x.append(optim_result_obj.fun)
            if len(arg_min_x) >= minrr:
                break

    if len(arg_min_x) == 0:
        res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'),
                    float('nan'), float('nan'), float('nan'), float('nan'), float('nan')))
        return
    tasc_freep_res = arg_min_x[np.argmin(val_min_x)]

    # Objectives are negative log-likelihoods: 2 * (reduced - full).
    lrt_stat = 2 * (tasc_b_res.fun - tasc_freep_res.fun)

    if np.isnan(lrt_stat):
        res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'),
                    float('nan'), float('nan'), float('nan'), float('nan'), float('nan')))
    else:
        # sf() keeps precision for tiny p-values where 1 - cdf() rounds to 0.
        lrt_pval = chi2.sf(lrt_stat, df=1)
        res_fh.log(((gene_name, True, tasc_b_res.x[0], exp(tasc_b_res.x[1]), expit(tasc_b_res.x[2]),
                     tasc_freep_res.x[0], exp(tasc_freep_res.x[1]), expit(tasc_freep_res.x[2]),
                     expit(tasc_freep_res.x[3]), lrt_stat, lrt_pval)))
319 |
320 |
def lrt_free_theta(gene_name, abkt, y_g, num_random_restarts, minrr):
    """Likelihood-ratio test of a shared theta against group-specific theta.

    The reduced model (one theta, sigma, p) and the full model (one theta per
    group, shared sigma and p) are minimised with L-BFGS-B from random
    starts.  Up to ``num_random_restarts`` starts are tried per model,
    stopping once ``minrr`` succeeded; the run with the smallest objective is
    kept.  Groups come from the module-level ``group_info``.  One 11-field
    row is written through the module-level ``res_fh``: NaNs with ``False``
    on failure, otherwise the estimates, the LRT statistic and its
    chi-square (df=1) p-value.

    Parameters:
        gene_name: identifier written in the first output column.
        abkt: parameter array forwarded to the likelihood functions.
        y_g: numpy array of counts for this gene.
        num_random_restarts: maximum number of random starts per model.
        minrr: number of successful starts after which to stop.
    """
    theta_lower, theta_upper, p_lower, p_upper, std_lower, std_upper = get_rr_range(y_g)

    # Reduced-model starting points: (theta, log sigma, logit p).
    real_params_g_rtimes = column_stack((uniform(theta_lower, theta_upper, num_random_restarts),
                                         log(uniform(std_lower, std_upper, num_random_restarts)),
                                         logit(uniform(p_lower, p_upper, num_random_restarts))))
    arg_min_x=[]
    val_min_x=[]

    for i in range(num_random_restarts):
        log_fh.log('tasc-b optimization #' + str(i) + ' for gene ' + gene_name)
        real_params_g=real_params_g_rtimes[i,:]
        optim_result_obj=minimize(likelihoodumi.neg_log_sum_marginal_likelihood, x0=real_params_g, args=(abkt, y_g), method='L-BFGS-B')
        # Keep only converged runs with a finite, non-zero objective.
        if optim_result_obj.success and (not np.isnan(optim_result_obj.fun)) and (optim_result_obj.fun != 0):
            arg_min_x.append(optim_result_obj)
            val_min_x.append(optim_result_obj.fun)
            if len(arg_min_x) >= minrr:
                break

    if len(arg_min_x) == 0:
        res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'),
                    float('nan'), float('nan'), float('nan'), float('nan'), float('nan')))
        return
    tasc_b_res=arg_min_x[np.argmin(val_min_x)]

    theta_lower0, theta_upper0, p_lower0, p_upper0, std_lower0, std_upper0, theta_lower1, theta_upper1, p_lower1, p_upper1, std_lower1, std_upper1 = get_rr_range_grp(
        y_g, group_info)
    # Full-model starting points: (theta0, theta1, log sigma, logit p);
    # the shared parameters are drawn from the union of the group ranges.
    real_params_g_rtimes = column_stack((uniform(theta_lower0, theta_upper0, num_random_restarts),
                                         uniform(theta_lower1, theta_upper1, num_random_restarts),
                                         log(uniform(min(std_lower0, std_lower1), max(std_upper0, std_upper1), num_random_restarts)),
                                         logit(uniform(min(p_lower0, p_lower1), max(p_upper0, p_upper1), num_random_restarts))))
    arg_min_x=[]
    val_min_x=[]
    for i in range(num_random_restarts):
        log_fh.log('tasc free theta optimization #' + str(i) + ' for gene ' + gene_name)
        real_params_g=real_params_g_rtimes[i,:]
        optim_result_obj=minimize(likelihoodumi.neg_log_sum_marginal_likelihood_free_theta, x0=real_params_g, args=(abkt, y_g, group_info), method='L-BFGS-B')
        if optim_result_obj.success and (not np.isnan(optim_result_obj.fun)) and (optim_result_obj.fun != 0):
            arg_min_x.append(optim_result_obj)
            val_min_x.append(optim_result_obj.fun)
            if len(arg_min_x) >= minrr:
                break

    if len(arg_min_x) == 0:
        res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'),
                    float('nan'), float('nan'), float('nan'), float('nan'), float('nan')))
        return
    tasc_free_theta = arg_min_x[np.argmin(val_min_x)]

    # Objectives are negative log-likelihoods: 2 * (reduced - full).
    lrt_stat = 2 * (tasc_b_res.fun - tasc_free_theta.fun)

    if np.isnan(lrt_stat):
        res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'),
                    float('nan'), float('nan'), float('nan'), float('nan'), float('nan')))
    else:
        # sf() keeps precision for tiny p-values where 1 - cdf() rounds to 0.
        lrt_pval = chi2.sf(lrt_stat, df=1)
        res_fh.log((gene_name, True, tasc_b_res.x[0], exp(tasc_b_res.x[1]), expit(tasc_b_res.x[2]),
                    tasc_free_theta.x[0], tasc_free_theta.x[1], exp(tasc_free_theta.x[2]),
                    expit(tasc_free_theta.x[3]), lrt_stat, lrt_pval))
380 |
381 | ##############################################################################################
382 |
def lrt_free_theta_umi(gene_name, abkt, y_g, num_random_restarts, minrr):
    """UMI variant of the shared-theta vs group-specific-theta LRT.

    Identical in structure to the non-UMI test, but starting ranges come
    from ``get_rr_range_umi`` / ``get_rr_range_grp_umi`` and the objectives
    are the ``*_umi`` likelihood functions.  Up to ``num_random_restarts``
    L-BFGS-B starts are tried per model, stopping once ``minrr`` succeeded;
    the run with the smallest objective is kept.  Groups come from the
    module-level ``group_info``.  One 11-field row is written through the
    module-level ``res_fh``: NaNs with ``False`` on failure, otherwise the
    estimates, the LRT statistic and its chi-square (df=1) p-value.

    Parameters:
        gene_name: identifier written in the first output column.
        abkt: parameter array forwarded to the likelihood functions.
        y_g: numpy array of counts for this gene.
        num_random_restarts: maximum number of random starts per model.
        minrr: number of successful starts after which to stop.
    """
    theta_lower, theta_upper, p_lower, p_upper, std_lower, std_upper = get_rr_range_umi(y_g)

    # Reduced-model starting points: (theta, log sigma, logit p).
    real_params_g_rtimes = column_stack((uniform(theta_lower, theta_upper, num_random_restarts),
                                         log(uniform(std_lower, std_upper, num_random_restarts)),
                                         logit(uniform(p_lower, p_upper, num_random_restarts))))
    arg_min_x=[]
    val_min_x=[]

    for i in range(num_random_restarts):
        log_fh.log('tasc-b optimization #' + str(i) + ' for gene ' + gene_name)
        real_params_g=real_params_g_rtimes[i,:]
        optim_result_obj=minimize(likelihoodumi.neg_log_sum_marginal_likelihood_umi, x0=real_params_g, args=(abkt, y_g), method='L-BFGS-B')
        # Keep only converged runs with a finite, non-zero objective.
        if optim_result_obj.success and (not np.isnan(optim_result_obj.fun)) and (optim_result_obj.fun != 0):
            arg_min_x.append(optim_result_obj)
            val_min_x.append(optim_result_obj.fun)
            if len(arg_min_x) >= minrr:
                break

    if len(arg_min_x) == 0:
        res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'),
                    float('nan'), float('nan'), float('nan'), float('nan'), float('nan')))
        return
    tasc_b_res=arg_min_x[np.argmin(val_min_x)]

    theta_lower0, theta_upper0, p_lower0, p_upper0, std_lower0, std_upper0, theta_lower1, theta_upper1, p_lower1, p_upper1, std_lower1, std_upper1 = get_rr_range_grp_umi(
        y_g, group_info)
    # Full-model starting points: (theta0, theta1, log sigma, logit p);
    # the shared parameters are drawn from the union of the group ranges.
    real_params_g_rtimes = column_stack((uniform(theta_lower0, theta_upper0, num_random_restarts),
                                         uniform(theta_lower1, theta_upper1, num_random_restarts),
                                         log(uniform(min(std_lower0, std_lower1), max(std_upper0, std_upper1), num_random_restarts)),
                                         logit(uniform(min(p_lower0, p_lower1), max(p_upper0, p_upper1), num_random_restarts))))
    arg_min_x=[]
    val_min_x=[]
    for i in range(num_random_restarts):
        log_fh.log('tasc free theta optimization #' + str(i) + ' for gene ' + gene_name)
        real_params_g=real_params_g_rtimes[i,:]
        optim_result_obj=minimize(likelihoodumi.neg_log_sum_marginal_likelihood_free_theta_umi, x0=real_params_g, args=(abkt, y_g, group_info), method='L-BFGS-B')
        if optim_result_obj.success and (not np.isnan(optim_result_obj.fun)) and (optim_result_obj.fun != 0):
            arg_min_x.append(optim_result_obj)
            val_min_x.append(optim_result_obj.fun)
            if len(arg_min_x) >= minrr:
                break

    if len(arg_min_x) == 0:
        res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'),
                    float('nan'), float('nan'), float('nan'), float('nan'), float('nan')))
        return
    tasc_free_theta = arg_min_x[np.argmin(val_min_x)]

    # Objectives are negative log-likelihoods: 2 * (reduced - full).
    lrt_stat = 2 * (tasc_b_res.fun - tasc_free_theta.fun)

    if np.isnan(lrt_stat):
        res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'),
                    float('nan'), float('nan'), float('nan'), float('nan'), float('nan')))
    else:
        # sf() keeps precision for tiny p-values where 1 - cdf() rounds to 0.
        lrt_pval = chi2.sf(lrt_stat, df=1)
        res_fh.log((gene_name, True, tasc_b_res.x[0], exp(tasc_b_res.x[1]), expit(tasc_b_res.x[2]),
                    tasc_free_theta.x[0], tasc_free_theta.x[1], exp(tasc_free_theta.x[2]),
                    expit(tasc_free_theta.x[3]), lrt_stat, lrt_pval))
442 |
443 |
444 |
def lrt_free_p_and_theta(gene_name, abkt, y_g, num_random_restarts, minrr):
    """LRT of the shared-parameter model against the "free both" model.

    The reduced model (one theta, sigma, p) and the model minimised through
    ``likelihoodumi.neg_log_sum_marginal_likelihood_free_both`` are fitted
    with L-BFGS-B from random starts.  Up to ``num_random_restarts`` starts
    are tried per model, stopping once ``minrr`` succeeded; the run with the
    smallest objective is kept.  Groups come from the module-level
    ``group_info``.  One row is written through the module-level ``res_fh``.

    NOTE(review): failure rows carry 12 fields while the success row carries
    11, and the full model is started with 4 parameters and tested with
    df=1.  Confirm against the free_both likelihood whether a second p
    parameter, an extra output column and df=2 are intended.

    Parameters:
        gene_name: identifier written in the first output column.
        abkt: parameter array forwarded to the likelihood functions.
        y_g: numpy array of counts for this gene.
        num_random_restarts: maximum number of random starts per model.
        minrr: number of successful starts after which to stop.
    """
    theta_lower, theta_upper, p_lower, p_upper, std_lower, std_upper = get_rr_range(y_g)

    # Reduced-model starting points: (theta, log sigma, logit p).
    real_params_g_rtimes = column_stack((uniform(theta_lower, theta_upper, num_random_restarts),
                                         log(uniform(std_lower, std_upper, num_random_restarts)),
                                         logit(uniform(p_lower, p_upper, num_random_restarts))))
    arg_min_x=[]
    val_min_x=[]

    for i in range(num_random_restarts):
        log_fh.log('tasc-b optimization #' + str(i) + ' for gene ' + gene_name)
        real_params_g=real_params_g_rtimes[i,:]
        optim_result_obj=minimize(likelihoodumi.neg_log_sum_marginal_likelihood, x0=real_params_g, args=(abkt, y_g), method='L-BFGS-B')
        # Keep only converged runs with a finite, non-zero objective.
        if optim_result_obj.success and (not np.isnan(optim_result_obj.fun)) and (optim_result_obj.fun != 0):
            arg_min_x.append(optim_result_obj)
            val_min_x.append(optim_result_obj.fun)
            if len(arg_min_x) >= minrr:
                break

    if len(arg_min_x) == 0:
        # Debug marker printed when the reduced fit fails; kept as is.
        print("xx")
        res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'),
                    float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), float('nan')))
        return
    tasc_b_res=arg_min_x[np.argmin(val_min_x)]

    theta_lower0, theta_upper0, p_lower0, p_upper0, std_lower0, std_upper0, theta_lower1, theta_upper1, p_lower1, p_upper1, std_lower1, std_upper1 = get_rr_range_grp(
        y_g, group_info)
    # Full-model starting points: (theta0, theta1, log sigma, logit p0).
    real_params_g_rtimes = column_stack((uniform(theta_lower0, theta_upper0, num_random_restarts),
                                         uniform(theta_lower1, theta_upper1, num_random_restarts),
                                         log(uniform(min(std_lower0, std_lower1), max(std_upper0, std_upper1), num_random_restarts)),
                                         logit(uniform(p_lower0, p_upper0, num_random_restarts))))
    arg_min_x=[]
    val_min_x=[]
    for i in range(num_random_restarts):
        log_fh.log('tasc free both optimization #' + str(i) + ' for gene ' + gene_name)
        real_params_g=real_params_g_rtimes[i,:]
        optim_result_obj=minimize(likelihoodumi.neg_log_sum_marginal_likelihood_free_both, x0=real_params_g, args=(abkt, y_g, group_info), method='L-BFGS-B')
        if optim_result_obj.success and (not np.isnan(optim_result_obj.fun)) and (optim_result_obj.fun != 0):
            arg_min_x.append(optim_result_obj)
            val_min_x.append(optim_result_obj.fun)
            if len(arg_min_x) >= minrr:
                break

    if len(arg_min_x) == 0:
        # Debug marker printed when the full fit fails; kept as is.
        print("xxx")
        res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'),
                    float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), float('nan')))
        return
    tasc_freeboth_res = arg_min_x[np.argmin(val_min_x)]

    # Objectives are negative log-likelihoods: 2 * (reduced - full).
    lrt_stat = 2 * (tasc_b_res.fun - tasc_freeboth_res.fun)

    if np.isnan(lrt_stat):
        res_fh.log((gene_name, False, float('nan'), float('nan'), float('nan'), float('nan'),
                    float('nan'), float('nan'), float('nan'), float('nan'), float('nan'), float('nan')))
    else:
        # sf() keeps precision for tiny p-values where 1 - cdf() rounds to 0.
        lrt_pval = chi2.sf(lrt_stat, df=1)
        res_fh.log((gene_name, True, tasc_b_res.x[0], exp(tasc_b_res.x[1]), expit(tasc_b_res.x[2]),
                    tasc_freeboth_res.x[0], tasc_freeboth_res.x[1], exp(tasc_freeboth_res.x[2]),
                    expit(tasc_freeboth_res.x[3]), lrt_stat, lrt_pval))
506 |
507 |
508 |
509 | ##################################################################################################
510 |
def get_min_marginal(data):
    """Run the test selected by ``args.type_op`` on every gene tuple in ``data``.

    Types 1-4 log a progress line per gene and pass the gene name and its
    first count array to the matching test.  Types 5 and 6 pass tuple items
    2-5 to the psi tests and log nothing here.  Any other type does nothing.
    """
    mode = args.type_op
    count_based_tests = {
        1: opt_neg_log_sum_marginal_likelihood,
        2: lrt_free_p,
        3: lrt_free_theta,
        4: lrt_free_theta_umi,
    }

    if mode in count_based_tests:
        run_test = count_based_tests[mode]
        for gene in data:
            log_fh.log('now analyzing ' + gene[0] + ' on node #' + str(rank))
            run_test(gene[0], py_stan_input['abkt'], gene[1], args.maxNR, args.minNR)
    elif mode == 5:
        for gene in data:
            lrt_free_psi_equal_variance(gene[0], py_stan_input['abkt'], gene[2], gene[3], gene[4], gene[5], args.maxNR, args.minNR)
    elif mode == 6:
        for gene in data:
            lrt_free_psi_equal_variance_umi(gene[0], py_stan_input['abkt'], gene[2], gene[3], gene[4], gene[5], args.maxNR, args.minNR)
537 |
538 |
# ---------------------------------------------------------------------------
# Script body: every MPI rank executes the statements below.
#
# Rank 0 parses the input files and splits the genes into one chunk per rank;
# the chunks are scattered, the shared parameters are broadcast, and each rank
# then runs get_min_marginal() on its own chunk.
# ---------------------------------------------------------------------------

# Do not warn or raise on numpy floating-point errors.
np.seterr(all='ignore')

# parse args
args = get_parsed_options()

# init mpi env
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

# init logger / result file handles; both files are opened on `comm`
log_fh = logger(MPI.File.Open(comm, args.out_filename + '.log', MPI.MODE_CREATE | MPI.MODE_WRONLY))
res_fh = result_writer(MPI.File.Open(comm, args.out_filename, MPI.MODE_CREATE | MPI.MODE_WRONLY))

# all nodes init: placeholders that only rank 0 fills in before the
# scatter / bcast calls below
genes_grouped_by_worker = None
abkt_params = None
py_stan_input = None
tasc_sm = None
group_info = None

# master node init
if rank == 0:
    log_fh.log('opened MPI World with size ' + str(size))
    log_fh.log('input counts filename: ' + str(args.y_filename))
    log_fh.log('input abkt filename: ' + str(args.abkt_filename))
    log_fh.log('output filename: ' + str(args.out_filename))
    log_fh.log('max number of restarts: ' + str(args.maxNR))
    log_fh.log('min number of restarts: ' + str(args.minNR))

    log_fh.log('parsing abkt file: ' + args.abkt_filename)
    res_fh.write_header(args.type_op)
    abkt_params = np.genfromtxt(args.abkt_filename)

    log_fh.log('parsing x file: ' + args.x_filename)
    group_info = np.genfromtxt(args.x_filename, dtype=np.int8)

    py_stan_input = {
        'C': abkt_params.shape[0],
        'abkt': abkt_params
    }

    # called with the world size so the genes can be split across ranks
    genes_grouped_by_worker = parse_filter_counts(args.y_filename, size)

# hand each rank its element of the list built on rank 0
part_data = comm.scatter(genes_grouped_by_worker, root=0)

# A rank may receive an empty chunk; indexing part_data[0] unconditionally
# would raise IndexError on that rank before it reaches the bcast calls below.
if len(part_data) > 0:
    log_fh.log('rank ' + str(rank) + ' has ' + str(len(part_data)) + ' genes. the first gene is ' + part_data[0][0])
else:
    log_fh.log('rank ' + str(rank) + ' has 0 genes.')

# share the parameters parsed on rank 0 with every rank
py_stan_input = comm.bcast(py_stan_input, root=0)
abkt_params = comm.bcast(abkt_params, root=0)
group_info = comm.bcast(group_info, root=0)

# get_min_marginal() has no return statement, so this is always None;
# results are written through res_fh by the routines it calls.
opt_marg_results = get_min_marginal(part_data)

log_fh.close()
res_fh.close()
596 |
597 |
598 |
--------------------------------------------------------------------------------
/bin/summarizedas.pl:
--------------------------------------------------------------------------------
# summarizedas.pl -- merge per-comparison DAS test results into one table.
#
# Usage: perl summarizedas.pl <comparedir> <infofile> <outfile>
#
#   <comparedir>  directory containing the file `comparegroup` (one
#                 tab-separated group pair per line) and the result files
#                 `das_script/data/out_<gp1>_<gp2>`
#   <infofile>    tab-separated exon-group file; columns 1 and 2 joined by
#                 ":" form the event key, the last column is the exon list
#                 (or "NA")
#   <outfile>     tab-separated summary table to write
#
# A result row is kept when its 2nd column is "True" and its 6th and 7th
# columns are not "nan". The PSI values, test statistic and p-value are read
# relative to the END of the row (columns -6, -5, -2, -1).
use strict;
use warnings;

my ($comparedir, $infofile, $outfile) = @ARGV;
die "usage: $0 <comparedir> <infofile> <outfile>\n" unless defined $outfile;

my $datadir = "$comparedir/das_script/data";

# Map "gene:event" -> exon coordinates, skipping events with no exon ("NA").
my %event2exon;
open(my $info_fh, '<', $infofile) or die "cannot open $infofile: $!\n";
while (my $line = <$info_fh>) {
    chomp $line;
    my @fields = split /\t/, $line;
    next unless @fields >= 2;    # blank or malformed line
    $event2exon{"$fields[0]:$fields[1]"} = $fields[-1] if $fields[-1] ne "NA";
}
close $info_fh;

open(my $out_fh, '>', $outfile) or die "cannot write $outfile: $!\n";
print {$out_fh} "gp1\tgp2\tgene_name\tAS_exons\tPSI_gp1\tPSI_gp2\ttest_stat\tp_value\n";

my $groupfile = "$comparedir/comparegroup";
open(my $group_fh, '<', $groupfile) or die "cannot open $groupfile: $!\n";
while (my $pair_line = <$group_fh>) {
    chomp $pair_line;
    my @pair = split /\t/, $pair_line;
    next unless @pair >= 2;      # blank or malformed line

    my $resultfile = "$datadir/out_$pair[0]_$pair[1]";
    my $result_fh;
    unless (open($result_fh, '<', $resultfile)) {
        # A missing comparison contributes no rows; report it and move on.
        warn "skipping $resultfile: $!\n";
        next;
    }

    while (my $row = <$result_fh>) {
        chomp $row;
        my @cols = split /\t/, $row;
        next unless @cols >= 7;  # need columns 2, 6 and 7 for the filter
        next unless $cols[1] eq "True" && $cols[5] ne "nan" && $cols[6] ne "nan";

        # Gene name is the part of the event key before the first ":".
        my ($gene) = split /:/, $cols[0];
        $gene = '' unless defined $gene;

        # Events without a recorded exon get an empty AS_exons field.
        my $exons = defined $event2exon{$cols[0]} ? $event2exon{$cols[0]} : '';

        print {$out_fh} join("\t", $pair[0], $pair[1], $gene, $exons,
                             @cols[-6, -5, -2, -1]), "\n";
    }
    close $result_fh;
}
close $group_fh;
close $out_fh or die "error closing $outfile: $!\n";
47 |
--------------------------------------------------------------------------------
/doc/Clarity_step1.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/doc/Clarity_step1.JPG
--------------------------------------------------------------------------------
/doc/Clarity_step2.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/doc/Clarity_step2.JPG
--------------------------------------------------------------------------------
/doc/Clarity_step3.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/doc/Clarity_step3.JPG
--------------------------------------------------------------------------------
/doc/Fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huyustats/SCATS/bc44d9a3d0163cec0f3bbde922353e2cb0e1b041/doc/Fig1.png
--------------------------------------------------------------------------------
/doc/Install.md:
--------------------------------------------------------------------------------
1 | ## Prerequisites:
2 |
3 | #### Make sure you have the following libraries and packages installed on your system.
4 | ```
5 | cmake >= 2.6.4
6 | gcc >= 4.4
7 | Python 2.7
8 | python packages:
9 | pysam
10 | numpy
11 | scipy
12 | cython
13 | OpenMPI
14 | SAMTOOLS
15 | Perl
16 | ```
17 | ## Installation
18 |
19 | #### Download SCATS from github.
20 | ```
21 | git clone https://github.com/huyustats/SCATS.git
22 | ```
23 |
24 | #### Compile C functions using `Cython` and `gcc`
25 | ```
26 | cd SCATS/bin/
27 | bash complie_likelihoodumi.sh
28 | ```
29 |
30 | #### Check whether the required programs and Python packages are installed
31 | ```
32 | python check_software.py
33 | ```
34 |
35 |
36 |
--------------------------------------------------------------------------------
/doc/Usage.md:
--------------------------------------------------------------------------------
1 | # Instructions for using SCATS
2 |
3 | The inputs of SCATS are aligned single-cell RNA-seq data in BAM format and a reference isoform annotation file (Ensembl/RefSeq). The user specifies with `-task` which step to perform:
4 | ```
5 | SCATS.py -task <task name>:
6 |
7 | refgene: preprocess reference file
8 |
9 | group: group alternative splicing exon
10 |
11 | count: count informative reads from indexed BAM file
12 |
13 | gene: estimate mean gene expression for each single cell condition
14 |
15 | das: detect differential alternative splicing (DAS) for each exon group between conditions
16 |
17 | sum: summarize DAS test results
18 |
19 | ```
20 |
21 | ## Step 1: Group exons based on reference annotation file
22 | SCATS requires a reference annotation file `example.refFile` in the following format:
23 | ```
24 | 749 NM_001397 chr1 - 21543739 21616982 21546447 21616907 19 21543739,21548239,21551742,21553651,21554423,21560050,21562342,21563238,21564626,21571481,21573713,21582439,21584017,21585185,21586763,21599191,21605683,21616562,21616856, 21546624,21548335,21551933,21553719,21554534,21560154,21562420,21563337,21564737,21571596,21573856,21582631,21584083,21585332,21586885,21599404,21605825,21616649,21616982, 0 ECE1 cmpl cmpl 0,0,1,2,2,0,0,0,0,2,0,0,0,0,1,1,0,0,0,
25 | 93 NM_001113348 chr1 - 21543739 21672034 21546447 21671871 19 21543739,21548239,21551742,21553651,21554423,21560050,21562342,21563238,21564626,21571481,21573713,21582439,21584017,21585185,21586763,21599191,21605683,21616562,21671868, 21546624,21548335,21551933,21553719,21554534,21560154,21562420,21563337,21564737,21571596,21573856,21582631,21584083,21585332,21586885,21599404,21605825,21616649,21672034, 0 ECE1 cmpl cmpl 0,0,1,2,2,0,0,0,0,2,0,0,0,0,1,1,0,0,0,
26 | 749 NM_001113349 chr1 - 21543739 21616766 21546447 21616691 18 21543739,21548239,21551742,21553651,21554423,21560050,21562342,21563238,21564626,21571481,21573713,21582439,21584017,21585185,21586763,21599191,21605683,21616562, 21546624,21548335,21551933,21553719,21554534,21560154,21562420,21563337,21564737,21571596,21573856,21582631,21584083,21585332,21586885,21599404,21605825,21616766, 0 ECE1 cmpl cmpl 0,0,1,2,2,0,0,0,0,2,0,0,0,0,1,1,0,0,
27 | 749 NM_001113347 chr1 - 21543739 21606183 21546447 21605927 17 21543739,21548239,21551742,21553651,21554423,21560050,21562342,21563238,21564626,21571481,21573713,21582439,21584017,21585185,21586763,21599191,21605683, 21546624,21548335,21551933,21553719,21554534,21560154,21562420,21563337,21564737,21571596,21573856,21582631,21584083,21585332,21586885,21599404,21606183, 0 ECE1 cmpl cmpl 0,0,1,2,2,0,0,0,0,2,0,0,0,0,1,1,0,
33 | ```
34 | Reference file in this format can be downloaded at [UCSC](https://genome.ucsc.edu/cgi-bin/hgTables?command=start) by selecting "all fields from selected table" in output format.
35 |
36 | We preprocess `example.refFile` by using `python SCATS.py -task refgene`. An example is given below.
37 | ```
38 | python SCATS.py -task refgene -ref example.refFile -out example.refgene
39 | ```
40 | Next, the command for exon grouping is to run `python SCATS.py -task group`.
41 | ```
42 | python SCATS.py -task group -refgene example.refgene -out example.gpinfo
43 | ```
44 | `example.refgene` and `example.gpinfo` are two important calibrated annotation files for the following steps.
45 |
46 | ## Step 2: Extract informative read count for each exon group from alignment file
47 | SCATS requires a headerless `metafile` in this step that tells SCATS where to find the alignment BAM files from which the cell-specific informative read counts are extracted. The BAM files must be indexed. Here is an example of a `metafile`:
48 | ```
49 | AACACGTCACATAACC-1 A ~/1dot1/outs/possorted_genome_bam.bam UB CB
50 | GGACAAGTCTCCCTGA-1 A ~/1dot1/outs/possorted_genome_bam.bam UB CB
51 | CACAGGCAGATCCCGC-1 B ~/1dot1/outs/possorted_genome_bam.bam UB CB
52 | ATCTGCCGTCATCGGC-1 B ~/1dot1/outs/possorted_genome_bam.bam UB CB
53 | GGAAAGCGTTGCTCCT-1 C ~/1dot1/outs/possorted_genome_bam.bam UB CB
54 | CGAGCACGTGTTCTTT-1 C ~/1dot1/outs/possorted_genome_bam.bam UB CB
55 | CCTATTACAATGGATA-1 D ~/1dot1/outs/possorted_genome_bam.bam UB CB
56 | AAGGAGCAGCGTCAAG-1 D ~/1dot1/outs/possorted_genome_bam.bam UB CB
57 | ```
58 | where the 1st column contains the cell barcode/cell name, the 2nd column gives the condition group, and the 3rd column gives the location of the BAM file. The 4th and 5th columns give the tag names of the UMI barcode and the cell barcode in the BAM file. For example:
59 | ```
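60 | NS500497:57:H27CKBGX2:3:12506:1885:16376 272 1 3014861 1 98M * 0 0 TGGCGTTCCCCTGTACTGGGGCTTATAAAGTTTGCAAGTCCAATGGGCCTCTCTTTGCAGTGATGGCCGACTAGGCCATCTTTTGATACATATGCAGC //A/A/A/EEE...
61 | ```
62 |
63 | The command for this step is `python SCATS.py -task count`:
64 | ```
65 | python SCATS.py -task count -meta <metafile> -refgene <refgene> -gpinfo <gpinfo>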
68 |
69 | [count options] type 'python SCATS.py -task count' to check two important count options.
70 |
71 | -umi collect UMI count or not
72 |
73 | -onebam whether all aligned reads are merged in one BAM files
74 |
75 | OUTPUT:
76 |
77 | count_*.sh script files will be generated under directory `./tmp/count_script`.
78 | ```
79 | where '-umi' and '-onebam' are two important options:
80 | * `-umi yes -onebam yes`: UMI and cell barcode tag names have to be specified in the 4th and 5th columns of `metafile`.
81 | * `-umi yes -onebam no`: only UMI barcode tag name is needed. It has to be specified in the 4th column of `metafile`.
82 | * `-umi no -onebam yes`: only cell barcode tag name is needed. It has to be specified in the 4th column of `metafile`.
83 | * `-umi no -onebam no`: no tag name is needed.
84 |
85 | Outputs of `python SCATS.py -task count` are `count_*.sh` script files located at `./tmp/count_script`. User needs to run all of them to obtain informative read count for each single cell.
86 |
87 | ## Step 3: Quantify gene-level expression accounting for technical noises
88 | In this step, user needs to give `metafile` to SCATS and specify the number of cores to use for each pairwise comparison between conditions:
89 | ```
90 | python SCATS.py -task gene -ncore 20 -meta metafile
91 | ```
92 | Outputs of `python SCATS.py -task gene` are `gene_*.sh` script files located at `./tmp/gene_script`. User needs to run all of them to obtain accurate gene expression estimations for each cell condition group.
93 |
94 | ## Step 4: Detect differential alternative splicing (DAS) across cell conditions accounting for technical noises
95 | In this step, user needs to give `metafile` and `example.gpinfo` to SCATS and specify the number of cores to use for each pairwise comparison between conditions:
96 | ```
97 | python SCATS.py -task das -ncore 20 -meta metafile -gpinfo example.gpinfo
98 | ```
99 | Outputs of `python SCATS.py -task das` are `das_*.sh` script files located at `./tmp/das_script`. User needs to run all of them to obtain differential alternative splicing events at the exon-group level across cell conditions.
100 |
101 | ## Step 5: Summarize DAS test results
102 | ```
103 | python SCATS.py -task sum -gpinfo example.gpinfo
104 | ```
105 |
--------------------------------------------------------------------------------
/example/example.gpinfo:
--------------------------------------------------------------------------------
1 | ECE1 4 plus + 0.0290081639742922 0,1,3, chr1,21671868,21672034;
2 | ECE1 4 plus - 0.0290081639742922 2, NA
3 | ECE1 1 both + 0.0621851658850096 1,2,3, chr1,21605826,21606183;
4 | ECE1 1 both - 0.0152857391002258 0, chr1,21616562,21616649;
5 | ECE1 7 minus + 0.0220601007469168 3, NA
6 | ECE1 7 minus - 0.0220601007469168 0,1,2, chr1,21616856,21616982;
7 | ECE1 2 plus + 0.020323084940073 0,2,3, chr1,21616650,21616766;
8 | ECE1 2 plus - 0.020323084940073 1, NA
9 |
--------------------------------------------------------------------------------
/example/example.refFile:
--------------------------------------------------------------------------------
1 | 749 NM_001397 chr1 - 21543739 21616982 21546447 21616907 19 21543739,21548239,21551742,21553651,21554423,21560050,21562342,21563238,21564626,21571481,21573713,21582439,21584017,21585185,21586763,21599191,21605683,21616562,21616856, 21546624,21548335,21551933,21553719,21554534,21560154,21562420,21563337,21564737,21571596,21573856,21582631,21584083,21585332,21586885,21599404,21605825,21616649,21616982, 0 ECE1 cmpl cmpl 0,0,1,2,2,0,0,0,0,2,0,0,0,0,1,1,0,0,0,
2 | 93 NM_001113348 chr1 - 21543739 21672034 21546447 21671871 19 21543739,21548239,21551742,21553651,21554423,21560050,21562342,21563238,21564626,21571481,21573713,21582439,21584017,21585185,21586763,21599191,21605683,21616562,21671868, 21546624,21548335,21551933,21553719,21554534,21560154,21562420,21563337,21564737,21571596,21573856,21582631,21584083,21585332,21586885,21599404,21605825,21616649,21672034, 0 ECE1 cmpl cmpl 0,0,1,2,2,0,0,0,0,2,0,0,0,0,1,1,0,0,0,
3 | 749 NM_001113349 chr1 - 21543739 21616766 21546447 21616691 18 21543739,21548239,21551742,21553651,21554423,21560050,21562342,21563238,21564626,21571481,21573713,21582439,21584017,21585185,21586763,21599191,21605683,21616562, 21546624,21548335,21551933,21553719,21554534,21560154,21562420,21563337,21564737,21571596,21573856,21582631,21584083,21585332,21586885,21599404,21605825,21616766, 0 ECE1 cmpl cmpl 0,0,1,2,2,0,0,0,0,2,0,0,0,0,1,1,0,0,
4 | 749 NM_001113347 chr1 - 21543739 21606183 21546447 21605927 17 21543739,21548239,21551742,21553651,21554423,21560050,21562342,21563238,21564626,21571481,21573713,21582439,21584017,21585185,21586763,21599191,21605683, 21546624,21548335,21551933,21553719,21554534,21560154,21562420,21563337,21564737,21571596,21573856,21582631,21584083,21585332,21586885,21599404,21606183, 0 ECE1 cmpl cmpl 0,0,1,2,2,0,0,0,0,2,0,0,0,0,1,1,0,
5 |
--------------------------------------------------------------------------------
/example/example.refgene:
--------------------------------------------------------------------------------
1 | ECE1 chr1 - 21543739 21672034 NM_001113349,NM_001113348,NM_001113347,NM_001397,
2 | ECE1 chr1 - 21543739 21546624 1,1,1,1,
3 | ECE1 chr1 - 21548239 21548335 1,1,1,1,
4 | ECE1 chr1 - 21551742 21551933 1,1,1,1,
5 | ECE1 chr1 - 21553651 21553719 1,1,1,1,
6 | ECE1 chr1 - 21554423 21554534 1,1,1,1,
7 | ECE1 chr1 - 21560050 21560154 1,1,1,1,
8 | ECE1 chr1 - 21562342 21562420 1,1,1,1,
9 | ECE1 chr1 - 21563238 21563337 1,1,1,1,
10 | ECE1 chr1 - 21564626 21564737 1,1,1,1,
11 | ECE1 chr1 - 21571481 21571596 1,1,1,1,
12 | ECE1 chr1 - 21573713 21573856 1,1,1,1,
13 | ECE1 chr1 - 21582439 21582631 1,1,1,1,
14 | ECE1 chr1 - 21584017 21584083 1,1,1,1,
15 | ECE1 chr1 - 21585185 21585332 1,1,1,1,
16 | ECE1 chr1 - 21586763 21586885 1,1,1,1,
17 | ECE1 chr1 - 21599191 21599404 1,1,1,1,
18 | ECE1 chr1 - 21605683 21605825 1,1,1,1,
19 | ECE1 chr1 - 21605826 21606183 0,0,1,0,
20 | ECE1 chr1 - 21616562 21616649 1,1,0,1,
21 | ECE1 chr1 - 21616650 21616766 1,0,0,0,
22 | ECE1 chr1 - 21616856 21616982 0,0,0,1,
23 | ECE1 chr1 - 21671868 21672034 0,1,0,0,
24 |
--------------------------------------------------------------------------------
/example/metafile:
--------------------------------------------------------------------------------
1 | TTTGGTTGTACTCAAC-1 1 /home/huyu1/SCATS/data/TTTGGTTGTACTCAAC-1.bam UB CB
2 | TTTGGTTGTGCACCAC-1 2 /home/huyu1/SCATS/data/TTTGGTTGTGCACCAC-1.bam UB CB
--------------------------------------------------------------------------------
]