├── .gitignore ├── README.md ├── col2rows ├── vcf_add_chr.awk ├── sqgrep.sh ├── sqgrepc.sh ├── raf_viz.sh ├── contrafold_viz.sh ├── r ├── mi_parmigene.r ├── peak_bam_plot.r ├── te_bam_coverage_bam.r ├── te_diff_count.r ├── bam_heat_meta.r ├── diff_diff_ma.r ├── bam_heat_heat.r ├── plot_gff_cov_heat.r ├── fpkm_fpkm_scatter.r ├── diff_diff_scatter.r ├── annotation_pie_ratios.r ├── cuff_2d.r ├── annotation_pie_pie.r ├── cuff_rep_cor.r ├── te_diff.r ├── cuff_bar.r ├── cuff_scatter.r ├── diff_summary.r ├── peaks_diff_compare.r ├── cuff_heat.r └── plot_gff_cov_meta.r ├── template_gpu.sb ├── test_template.py ├── template.py ├── sym_matrix.py ├── template_sci.py ├── parallel_template.py ├── clear_slurm.py ├── pygene_utrs.py ├── gtf2utrs.py ├── vcf2vds.py ├── fasta_upper.py ├── vcf2bed.py ├── h5_sum.py ├── explore.ipynb ├── fasta_genome.py ├── rm_nonxs.py ├── bam_unique.py ├── make_fasta_genome.py ├── clean_csv.py ├── transid2geneid.py ├── possum2bed.py ├── mess2fasta.py ├── gff2bed.py ├── rm2bed.py ├── possum2gff.py ├── zarr_h5.py ├── bam_12.py ├── rm2gff.py ├── plot.py ├── bam_plus_minus.py ├── fastq_filter.py ├── gsea_ranks.py ├── fastq_trim.py ├── bam_quality.py ├── gtf2prom.py ├── bw_nan.py ├── h5_zarr.py ├── sciseq_collision.py ├── gtf_span.py ├── bim_vcf.py ├── cuff_fails.py ├── gtf_filter_csf.py ├── zarr_bw.py ├── bed2gff.py ├── bed2gtf.py ├── gaps_bed.py ├── stockholm2fasta.py ├── bed_clean.py ├── multiz_gff.py ├── fastq_quality_change.py ├── w5_bg.py ├── split_fragment_lengths.py ├── gtf_cut.py ├── gtf_filter_expr.py ├── fpkm_tracking.py ├── size.py ├── w5_bw.py ├── gtf2bed.py ├── reservoir_sample.py ├── bl2gff.py ├── bam_len_hist.py ├── isoforms_fpkm.py ├── vcf_tss.py ├── peaks_venn.py ├── multiz_lncrna.py ├── fpkm_hist.py ├── bg_w5.py ├── gtf_homologues.py ├── gsea_rnk.py ├── lnc_expression.py ├── nuc2gff.py ├── plot_fragment_lengths.py ├── bam_bedg.py ├── bgo_w5.py ├── make_ref_ml.py ├── vcf_splice.py ├── seq_logo.py ├── rmdup_iclip.py ├── fpkm_fpkm.py ├── gsea.py ├── gtf_multimaps.py ├── meme2possum.py ├── cutFasta.py ├── peaks3_venn.py ├── trf_mask.py ├── vcf_ld.py ├── quantile_normalization.py ├── ggplot.py ├── strand_specifity.py ├── geneid2transid.py ├── h5_h5z.py ├── transcripts_fasta.py ├── transmapbed2gtf.py ├── tss_bam_replot.py ├── attach_nh.py ├── citemelike.py ├── cuff_rep_cor.py └── bedtools.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py~ 2 | *.pyc 3 | r/*.r~ 4 | .gitignore~ 5 | ._* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | utility 2 | ======= 3 | 4 | Computational biology utility scripts -------------------------------------------------------------------------------- /col2rows: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | awk -F "\t" '{for (i=1; i <= NF; i++) print $i}' 4 | -------------------------------------------------------------------------------- /vcf_add_chr.awk: -------------------------------------------------------------------------------- 1 | #!/bin/awk -f 2 | 3 | {if (substr($0,1,1) == "#") print $0; else print "chr"$0} 4 | -------------------------------------------------------------------------------- /sqgrep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | squeue --format "%.10i %.9P %.20j %.8u %.2t %.10M %.6D %Q %R" -u drk | grep $1 | awk '{print $1}' 3 | -------------------------------------------------------------------------------- /sqgrepc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | scancel `squeue --format "%.10i %.9P %.20j %.8u %.2t %.10M %.6D %Q %R" -u drk | grep $1 | awk '{print $1}' | xargs` 4 | -------------------------------------------------------------------------------- /raf_viz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # raf predict ... 4 | raf2bpseq.py $1.raf > $1.bpseq 5 | make_coords $1.bpseq $1.coords 6 | plot_rna $1.bpseq $1.coords --png $1.png 7 | open $1.png 8 | -------------------------------------------------------------------------------- /contrafold_viz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | contrafold predict $1 --bpseq $1.bpseq --parens $1.parens 4 | make_coords $1.bpseq $1.coords 5 | plot_rna $1.bpseq $1.coords --png $1.png 6 | open $1.png 7 | -------------------------------------------------------------------------------- /r/mi_parmigene.r: -------------------------------------------------------------------------------- 1 | library(parmigene) 2 | 3 | ca = commandArgs(trailing=T) 4 | df.file = ca[1] 5 | 6 | df = read.table(df.file, header=T, quote="\"") 7 | 8 | mi = knnmi(df$A, df$B, k=5) 9 | 10 | cat(mi) 11 | -------------------------------------------------------------------------------- /template_gpu.sb: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -p gpu 4 | #SBATCH -n 1 5 | #SBATCH -c 2 6 | #SBATCH --gres=gpu:nvidia_geforce_gtx_1080_ti:1 7 | #SBATCH --mem 23000 8 | #SBATCH --time 2-0:0:0 9 | #SBATCH -J 3/5_name 10 | #SBATCH -o train_name.out 11 | #SBATCH -e train_name.err 12 | 13 | . /home/drk/anaconda3/etc/profile.d/conda.sh 14 | conda activate tf210 15 | 16 | basenji_train.py 17 | -------------------------------------------------------------------------------- /r/peak_bam_plot.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | ca = commandArgs(trailing=T) 4 | df.file = ca[1] 5 | output.pdf = ca[2] 6 | 7 | df = read.table(df.file, header=T, quote="\"") 8 | 9 | ggplot(df, aes(x=peak_i, y=cov)) + 10 | geom_point() + 11 | scale_x_continuous("Peak index") + 12 | scale_y_continuous("Coverage") + 13 | theme_bw() + 14 | theme(text=element_text(size=20)) 15 | 16 | ggsave(output.pdf) 17 | -------------------------------------------------------------------------------- /r/te_bam_coverage_bam.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | ca = commandArgs(trailing=T) 4 | df.file = ca[1] 5 | plot.title = ca[2] 6 | output.pdf = ca[3] 7 | 8 | df = read.table(df.file, header=T) 9 | 10 | ggplot(df, aes(x=indexes, y=coverage)) + 11 | geom_histogram(stat="identity") + 12 | scale_x_continuous("TE index") + 13 | scale_y_continuous("") + 14 | ggtitle(plot.title) + 15 | theme_bw() 16 | 17 | ggsave(output.pdf) 18 | -------------------------------------------------------------------------------- /r/te_diff_count.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(plyr) 3 | 4 | ca = commandArgs(trailing=T) 5 | df.file = ca[1] 6 | out.pdf = ca[2] 7 | scale = as.numeric(ca[3]) 8 | 9 | df = read.table(df.file, header=T) 10 | 11 | ggplot(df, aes(x=TEs, y=stat_mid, ymin=stat_low, ymax=stat_hi)) + 12 | geom_pointrange() + 13 | stat_smooth(se=FALSE, color="black", lty=2) + 14 | scale_y_continuous("log2 fRIP/input") + 15 | theme_bw() + 16 | theme(text=element_text(size=(sqrt(scale)*28))) 17 | 18 | ggsave(out.pdf, scale=scale) 19 | -------------------------------------------------------------------------------- /test_template.py: -------------------------------------------------------------------------------- 1 | #!/user/bin/env python 2 | 3 | ############################################################ 4 | # name 5 | # 6 | # 7 | ############################################################ 8 | import unittest 9 | 10 | class Test...(unittest.TestCase): 11 | def setUp(self): 12 | 13 | 14 | #def test...(self): 15 | #self.assert_() 16 | #self.assertEqual(,) 17 | 18 | ############################################################ 19 | # __main__ 20 | ############################################################ 21 | if __name__ == '__main__': 22 | unittest.main() 23 | -------------------------------------------------------------------------------- /r/bam_heat_meta.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | ca = commandArgs(trailing=T) 4 | df.file = ca[1] 5 | output.pdf = ca[2] 6 | 7 | df = read.table(df.file, header=T, quote="\"") 8 | 9 | if (ncol(df) == 2) { 10 | gp = ggplot(df, aes(x=Index, y=Coverage)) 11 | } else { 12 | gp = ggplot(df, aes(x=Index, y=Coverage, color=Type)) + 13 | scale_color_brewer(palette="Set1") 14 | } 15 | 16 | gp + 17 | geom_point() + 18 | geom_smooth() + 19 | theme_bw() + 20 | theme(text=element_text(size=16)) + 21 | theme(legend.justification=c(1,0), legend.position=c(1,0)) 22 | 23 | ggsave(output.pdf) 24 | -------------------------------------------------------------------------------- /r/diff_diff_ma.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | ca = commandArgs(trailing=T) 4 | df.file = ca[1] 5 | output.pdf = ca[2] 6 | 7 | df = read.table(df.file, header=T, quote="\"") 8 | 9 | x.min = quantile(df$avg, .002) 10 | x.max = quantile(df$avg, .998) 11 | 12 | y.min = quantile(df$minus, .002) 13 | y.max = quantile(df$minus, .998) 14 | 15 | ggplot(df, aes(x=avg, y=minus)) + 16 | geom_point(size=1.5, alpha=.3) + 17 | scale_x_continuous("Avg test stat", lim=c(x.min,x.max)) + 18 | scale_y_continuous("Test stat 1 - 2", lim=c(y.min,y.max)) + 19 | geom_hline(y=0) + 20 | theme_bw() + 21 | theme(text=element_text(size=16)) 22 | 23 | ggsave(output.pdf) 24 | -------------------------------------------------------------------------------- /r/bam_heat_heat.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | ca = commandArgs(trailing=T) 4 | df.file = ca[1] 5 | output.pdf = ca[2] 6 | control = ca[3] 7 | 8 | df = read.table(df.file, header=T, quote="\"") 9 | 10 | gp = ggplot(df, aes(x=Index, y=Feature, fill=Coverage)) + 11 | geom_tile() 12 | 13 | if(control == "True") { 14 | gp = gp + scale_fill_gradient2(low="#377eb8", high="#e41a1c") 15 | } else { 16 | gp = gp + scale_fill_gradient(low="white", high="#e41a1c") 17 | } 18 | 19 | gp + 20 | scale_y_discrete("") + 21 | theme_bw() + 22 | theme(text=element_text(size=16)) + 23 | theme(axis.ticks.y=element_blank(), axis.text.y=element_blank()) 24 | 25 | ggsave(output.pdf) 26 | -------------------------------------------------------------------------------- /r/plot_gff_cov_heat.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | ca = commandArgs(trailing=T) 4 | df.file = ca[1] 5 | output.pdf = ca[2] 6 | control = ca[3] 7 | 8 | df = read.table(df.file, header=T, quote="\"") 9 | 10 | gp = ggplot(df, aes(x=Index, y=Anchor, fill=Coverage)) + 11 | geom_tile() 12 | 13 | if(control == "True") { 14 | gp = gp + scale_fill_gradient2(low="#377eb8", high="#e41a1c") 15 | } else { 16 | gp = gp + scale_fill_gradient(low="white", high="#e41a1c") 17 | } 18 | 19 | gp + 20 | scale_y_discrete("") + 21 | theme_bw() + 22 | theme(text=element_text(size=16)) + 23 | theme(axis.ticks.y=element_blank(), axis.text.y=element_blank()) 24 | 25 | ggsave(output.pdf) 26 | -------------------------------------------------------------------------------- /r/fpkm_fpkm_scatter.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | ca = commandArgs(trailing=T) 4 | df.file = ca[1] 5 | output.pdf = ca[2] 6 | 7 | df = read.table(df.file, header=T, quote="\"") 8 | 9 | x.min = quantile(df$fpkm1, .002) 10 | x.max = quantile(df$fpkm1, .998) 11 | 12 | y.min = quantile(df$fpkm2, .002) 13 | y.max = quantile(df$fpkm2, .998) 14 | 15 | ggplot(df, aes(x=fpkm1, y=fpkm2)) + 16 | geom_point(size=1.5, alpha=.3) + 17 | stat_smooth() + 18 | scale_x_continuous("FPKM 1", lim=c(x.min,x.max)) + 19 | scale_y_continuous("FPKM 2", lim=c(y.min,y.max)) + 20 | geom_abline(intercept=0, slope=1, linetype=2) + 21 | theme_bw() + 22 | theme(text=element_text(size=18)) 23 | 24 | ggsave(output.pdf) 25 | -------------------------------------------------------------------------------- /template.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | ''' 5 | Name 6 | 7 | Description... 8 | ''' 9 | 10 | ################################################################################ 11 | # main 12 | ################################################################################ 13 | def main(): 14 | usage = 'usage: %prog [options] arg' 15 | parser = OptionParser(usage) 16 | #parser.add_option() 17 | (options,args) = parser.parse_args() 18 | 19 | 20 | ################################################################################ 21 | # __main__ 22 | ################################################################################ 23 | if __name__ == '__main__': 24 | main() 25 | -------------------------------------------------------------------------------- /sym_matrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import numpy as np 3 | 4 | ################################################################################ 5 | # sym_matrix.py 6 | # 7 | # Space-efficient symmetric matrix class. 8 | # 9 | # Indexing adapted from here: http://stackoverflow.com/a/24563079/4114434 10 | ################################################################################ 11 | 12 | class sym_matrix: 13 | def __init__(self, n): 14 | self.n = n 15 | self.M = np.zeros(self.n*self.n) 16 | 17 | def get(self, i, j): 18 | if i < j: 19 | i, j = j, i 20 | return self.M[j*self.n - (j+1)*j/2 + i] 21 | 22 | def set(self, i, j, v): 23 | if i < j: 24 | i, j = j, i 25 | self.M[j*self.n - (j+1)*j/2 + i] = v 26 | -------------------------------------------------------------------------------- /r/diff_diff_scatter.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | ca = commandArgs(trailing=T) 4 | df.file = ca[1] 5 | output.pdf = ca[2] 6 | 7 | df = read.table(df.file, header=T, quote="\"") 8 | 9 | x.min = quantile(df$diff1, .002) 10 | x.max = quantile(df$diff1, .998) 11 | 12 | y.min = quantile(df$diff2, .002) 13 | y.max = quantile(df$diff2, .998) 14 | 15 | ggplot(df, aes(x=diff1, y=diff2)) + 16 | geom_point(size=1.5, alpha=.3) + 17 | stat_smooth(method="lm") + 18 | scale_x_continuous("Test stat 1", lim=c(x.min,x.max)) + 19 | scale_y_continuous("Test stat 2", lim=c(y.min,y.max)) + 20 | geom_abline(intercept=0, slope=1, linetype=2) + 21 | theme_bw() + 22 | theme(text=element_text(size=25)) 23 | 24 | # stat_smooth(method="lm") + 25 | 26 | ggsave(output.pdf) 27 | -------------------------------------------------------------------------------- /r/annotation_pie_ratios.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | ca = commandArgs(trailing=T) 4 | df.file = ca[1] 5 | plot.title = ca[2] 6 | output.pdf = ca[3] 7 | 8 | df = read.table(df.file, header=T, quote="\"") 9 | 10 | annotation.order.all = c('Intergenic','Introns','3\'UTR','5\'UTR','CDS','lncRNA','Pseudogene','rRNA','smallRNA') 11 | annotation.order = annotation.order.all[annotation.order.all %in% df$annotation] 12 | df$annotation = factor(df$annotation, levels=annotation.order) 13 | 14 | ggplot(df, aes(x=annotation, y=ratio)) + 15 | geom_bar(stat="identity") + 16 | scale_x_discrete("Annotation") + 17 | scale_y_continuous("log2 feature% / length%") + 18 | ggtitle(plot.title) + 19 | theme_bw() + 20 | theme(panel.grid.minor=element_blank(), panel.grid.major=element_blank()) 21 | 22 | ggsave(output.pdf) 23 | -------------------------------------------------------------------------------- /template_sci.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | import h5py 5 | import numpy as np 6 | 7 | import matplotlib.pyplot as plt 8 | import seaborn as sns 9 | 10 | ''' 11 | Name 12 | 13 | Description... 14 | ''' 15 | 16 | ################################################################################ 17 | # main 18 | ################################################################################ 19 | def main(): 20 | usage = 'usage: %prog [options] arg' 21 | parser = OptionParser(usage) 22 | #parser.add_option() 23 | (options,args) = parser.parse_args() 24 | 25 | 26 | ################################################################################ 27 | # __main__ 28 | ################################################################################ 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /r/cuff_2d.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | ca = commandArgs(trailing=T) 4 | df.file = ca[1] 5 | output.pdf = ca[2] 6 | square = ca[3] 7 | 8 | df = read.table(df.file, header=T, quote="\"") 9 | 10 | if (square == "True") { 11 | d1.span = max(df$D1) - min(df$D1) 12 | d2.span = max(df$D2) - min(df$D2) 13 | plot.ratio = d1.span/d2.span 14 | } else { 15 | plot.ratio = 1 16 | } 17 | 18 | 19 | ggplot(df, aes(x=D1, y=D2, label=Label, color=Sample)) + 20 | geom_point(size=3, alpha=0.8) + 21 | scale_x_continuous("") + 22 | scale_y_continuous("") + 23 | scale_color_discrete("") + 24 | theme_bw() + 25 | theme(text=element_text(size=22)) + 26 | coord_fixed(ratio=plot.ratio) + 27 | theme(legend.justification=c(1,0), legend.position=c(1,0)) 28 | 29 | ggsave(output.pdf) 30 | 31 | # theme(legend.justification=c(1,0), legend.position=c(1,0)) 32 | # geom_text(size=5) + 33 | -------------------------------------------------------------------------------- /parallel_template.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os, sys 3 | 4 | ############################################################ 5 | # name 6 | # 7 | # description 8 | ############################################################ 9 | 10 | ############################################################ 11 | # main 12 | ############################################################ 13 | def main(cpu_id, num_cpus): 14 | 15 | 16 | ############################################################ 17 | # __main__ 18 | ############################################################ 19 | if __name__ == '__main__': 20 | if len(sys.argv) == 3 and sys.argv[1] == '--launch': 21 | n = int(sys.argv[2]) 22 | for i in range(n): 23 | os.system('./template_parallel.py %d %d &' % (i,n)) 24 | elif len(sys.argv) == 3: 25 | main(int(sys.argv[1]), int(sys.argv[2])) 26 | -------------------------------------------------------------------------------- /r/annotation_pie_pie.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | ca = commandArgs(trailing=T) 4 | df.file = ca[1] 5 | output.pdf = ca[2] 6 | 7 | df = read.table(df.file, header=T, quote="\"") 8 | 9 | annotation.order.all = c('Intergenic','Introns','Exons','3\'UTR','5\'UTR','CDS','lncRNA','Pseudogene','rRNA','smallRNA') 10 | annotation.order = annotation.order.all[annotation.order.all %in% df$annotation] 11 | df$annotation = factor(df$annotation, levels=annotation.order) 12 | 13 | ggplot(df, aes(x=dummy, y=count, fill=annotation)) + 14 | geom_bar(stat="identity", width=1) + 15 | coord_polar(theta="y") + 16 | scale_x_discrete("") + 17 | scale_y_continuous("") + 18 | scale_fill_discrete("Annotation") + 19 | theme_bw() + 20 | theme(axis.text.x=element_blank(), axis.text.y=element_blank(), axis.ticks=element_blank(), panel.grid.minor=element_blank(), panel.grid.major=element_blank()) 21 | 22 | ggsave(output.pdf) 23 | -------------------------------------------------------------------------------- /r/cuff_rep_cor.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(reshape2) 3 | 4 | ca = commandArgs(trailing=T) 5 | df.file = ca[1] 6 | output.pdf = ca[2] 7 | 8 | df = read.table(df.file, header=T, quote="\"") 9 | 10 | # this is broken 11 | 12 | df$dist = 1 - df$Correlation 13 | sample12.matrix = acast(df, Sample1 ~ Sample2, value.var="dist", fill=0) 14 | sample21.matrix = acast(df, Sample2 ~ Sample1, value.var="dist", fill=0) 15 | sample.dist = as.dist(sample12.matrix+sample21.matrix) 16 | sample.clust = hclust(sample.dist) 17 | sample.order = rownames(sample12.matrix)[sample.clust$order] 18 | sample.order = rownames(sample12.matrix) 19 | 20 | ggplot(df, aes(x=Sample1, y=Sample2, fill=Correlation)) + 21 | geom_tile() + 22 | scale_x_discrete("", limits=sample.order) + 23 | scale_y_discrete("", limits=sample.order) + 24 | scale_fill_gradient(low="white", high="darkred") + 25 | theme_bw() 26 | 27 | ggsave(output.pdf) 28 | -------------------------------------------------------------------------------- /r/te_diff.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | ca = commandArgs(trailing=T) 4 | df.file = ca[1] 5 | output.pdf = ca[2] 6 | scale = as.numeric(ca[3]) 7 | 8 | df = read.table(df.file, header=T, quote="\"") 9 | 10 | xmin = quantile(df$diff, .005, na.rm=T) 11 | xmax = quantile(df$diff, .995, na.rm=T) 12 | 13 | ggplot(df, aes(x=diff, color=class)) + 14 | stat_ecdf(size=(sqrt(scale)*1.5), alpha=0.8, na.rm=T) + 15 | scale_x_continuous("log2 fRIP/Input", limits=c(xmin,xmax)) + 16 | scale_y_continuous("") + 17 | scale_color_manual("", values=c("#F46D43", "#66BD63")) + 18 | theme_bw() + 19 | theme(text=element_text(size=(sqrt(scale)*28))) + 20 | theme(legend.justification=c(1,0), legend.position=c(1,0)) 21 | 22 | # scale_x_continuous("Differential expression test statistic", limits=c(xmin,xmax)) + 23 | # scale_color_brewer("", palette="Set1") + 24 | 25 | ggsave(output.pdf, scale=scale) 26 | -------------------------------------------------------------------------------- /r/cuff_bar.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(RColorBrewer) 3 | 4 | ca = commandArgs(trailing=T) 5 | df.file = ca[1] 6 | y.min = as.numeric(ca[2]) 7 | y.max = ca[3] 8 | output.pdf = ca[4] 9 | 10 | df = read.table(df.file, header=T, quote="\"") 11 | 12 | if (y.max == "None") { 13 | y.max = max(df$conf_hi) 14 | } else { 15 | y.max = as.numeric(y.max) 16 | } 17 | 18 | color.count = length(unique(df$Sample)) 19 | get.pal = colorRampPalette(brewer.pal(11, "Spectral")) 20 | 21 | ggplot(df, aes(x=Sample, y=FPKM, ymin=conf_lo, ymax=conf_hi, fill=Sample)) + 22 | geom_bar(stat="identity", fill=get.pal(color.count)) + 23 | geom_errorbar(width=0.5) + 24 | scale_x_discrete("", limits=df$Sample) + 25 | guides(fill=FALSE) + 26 | theme_bw() + 27 | theme(text=element_text(size=22)) + 28 | theme(axis.text.x=element_text(angle=315, hjust=0, vjust=1)) 29 | 30 | # scale_fill_brewer(palette="Spectral") + 31 | 32 | ggsave(output.pdf) 33 | -------------------------------------------------------------------------------- /r/cuff_scatter.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | ca = commandArgs(trailing=T) 4 | df.file = ca[1] 5 | output.pdf = ca[2] 6 | cond1 = ca[3] 7 | cond2 = ca[4] 8 | pseudocount = as.numeric(ca[5]) 9 | 10 | df = read.table(df.file, header=T, quote="\"") 11 | 12 | fpkm.min = log2(pseudocount) 13 | 14 | fpkm.max1 = quantile(df$fpkm1, .997) 15 | fpkm.max2 = quantile(df$fpkm2, .997) 16 | fpkm.max = max(fpkm.max1, fpkm.max2) 17 | 18 | qval.unique = unique(df$qval) 19 | if (length(qval.unique) == 1) { 20 | gp = ggplot(df, aes(x=fpkm1, y=fpkm2)) 21 | } else { 22 | gp = ggplot(df, aes(x=fpkm1, y=fpkm2, colour=qval)) 23 | } 24 | 25 | gp + 26 | geom_point(size=1.5, alpha=.3) + 27 | scale_x_continuous(paste(cond1, "log2 FPKM"), lim=c(fpkm.min,fpkm.max)) + 28 | scale_y_continuous(paste(cond2, "log2 FPKM"), lim=c(fpkm.min,fpkm.max)) + 29 | geom_abline(intercept=0, slope=1, linetype=2) + 30 | theme_bw() + 31 | theme(text=element_text(size=16)) 32 | 33 | # ggtitle(paste(cond1, "vs", cond2)) + 34 | 35 | ggsave(output.pdf) 36 | -------------------------------------------------------------------------------- /clear_slurm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os, glob 4 | 5 | ''' 6 | clear_slurm 7 | 8 | Helper script to clear out slurm log files. 9 | ''' 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] ' 16 | parser = OptionParser(usage) 17 | #parser.add_option() 18 | (options,args) = parser.parse_args() 19 | 20 | file_size = None 21 | if len(args) > 0: 22 | file_size = int(args[0]) 23 | 24 | for slurm_out in glob.glob('slurm*.out'): 25 | if file_size is None or os.path.getsize(slurm_out) == file_size: 26 | os.remove(slurm_out) 27 | 28 | 29 | ################################################################################ 30 | # __main__ 31 | ################################################################################ 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /pygene_utrs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | import pygene 5 | 6 | ''' 7 | Name 8 | 9 | Description... 10 | ''' 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | (options,args) = parser.parse_args() 19 | 20 | if len(args) != 2: 21 | parser.error('Must provide input and output GTFs') 22 | else: 23 | in_gtf_file = args[0] 24 | out_gtf_file = args[1] 25 | 26 | gtf = pygene.GTF(in_gtf_file) 27 | 28 | with open(out_gtf_file, 'w') as out_gtf_open: 29 | gtf.write_gtf(out_gtf_file, write_cds=True, write_utrs=True) 30 | 31 | 32 | ################################################################################ 33 | # __main__ 34 | ################################################################################ 35 | if __name__ == '__main__': 36 | main() 37 | -------------------------------------------------------------------------------- /gtf2utrs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | ################################################################################ 5 | # gtf2utrs.py 6 | # 7 | # Take a gtf file with exons and CDS annotated and return a gtf of the UTRs. 8 | ################################################################################ 9 | 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] ' 16 | parser = OptionParser(usage) 17 | #parser.add_option() 18 | (options,args) = parser.parse_args() 19 | 20 | if len(args) != 1: 21 | parser.error('Must provide gtf file') 22 | else: 23 | gtf_file = args[0] 24 | 25 | 26 | ################################################################################ 27 | # __main__ 28 | ################################################################################ 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /vcf2vds.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os 4 | import shutil 5 | from hail import * 6 | 7 | ''' 8 | Name 9 | 10 | Description... 11 | ''' 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] ' 18 | parser = OptionParser(usage) 19 | #parser.add_option() 20 | (options,args) = parser.parse_args() 21 | 22 | if len(args) != 2: 23 | parser.error('Must provide VCF and VDS files') 24 | else: 25 | vcf_file = args[0] 26 | vds_file = args[1] 27 | 28 | if os.path.isdir(vds_file): 29 | shutil.rmtree(vds_file) 30 | 31 | hc = HailContext() 32 | hc.import_vcf(vcf_file).write(vds_file) 33 | 34 | 35 | ################################################################################ 36 | # __main__ 37 | ################################################################################ 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- /fasta_upper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | ''' 5 | Name 6 | 7 | Description... 8 | ''' 9 | 10 | ################################################################################ 11 | # main 12 | ################################################################################ 13 | def main(): 14 | usage = 'usage: %prog [options] ' 15 | parser = OptionParser(usage) 16 | #parser.add_option() 17 | (options,args) = parser.parse_args() 18 | 19 | in_fasta_file = args[0] 20 | out_fasta_file = args[1] 21 | out_fasta_open = open(out_fasta_file, 'w') 22 | 23 | for line in open(in_fasta_file): 24 | if line[0] == '>': 25 | print(line, end='', file=out_fasta_open) 26 | else: 27 | print(line.upper(), end='', file=out_fasta_open) 28 | 29 | out_fasta_open.close() 30 | 31 | 32 | ################################################################################ 33 | # __main__ 34 | ################################################################################ 35 | if __name__ == '__main__': 36 | main() 37 | -------------------------------------------------------------------------------- /vcf2bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | ''' 5 | vcf2bed.py 6 | 7 | Simple VCF to BED converter. 8 | ''' 9 | 10 | ################################################################################ 11 | # main 12 | ################################################################################ 13 | def main(): 14 | usage = 'usage: %prog [options] arg' 15 | parser = OptionParser(usage) 16 | #parser.add_option() 17 | (options,args) = parser.parse_args() 18 | 19 | vcf_file = args[0] 20 | bed_file = args[1] 21 | bed_out = open(bed_file, 'w') 22 | 23 | for line in open(vcf_file): 24 | if not line.startswith('#'): 25 | a = line.split('\t') 26 | chrm = a[0] 27 | pos = int(a[1]) 28 | name = a[2] 29 | 30 | start = pos - 1 31 | end = start + 1 32 | print('%s\t%d\t%d\t%s' % (chrm,start,end,name), file=bed_out) 33 | 34 | bed_out.close() 35 | 36 | 37 | ################################################################################ 38 | # __main__ 39 | ################################################################################ 40 | if __name__ == '__main__': 41 | main() 42 | -------------------------------------------------------------------------------- /r/diff_summary.r: -------------------------------------------------------------------------------- 1 | library(cummeRbund) 2 | 3 | cuff = readCufflinks() 4 | 5 | ############################################ 6 | # density plot 7 | ############################################ 8 | csDensity(genes(cuff), rep=T, pseudocount=0.1) 9 | ggsave("density.pdf") 10 | 11 | ############################################ 12 | # dendrogram 13 | ############################################ 14 | pdf("dendro.pdf") 15 | csDendro(genes(cuff), rep=T, pseudocount=1) 16 | dev.off() 17 | 18 | ############################################ 19 | # MDS 20 | ############################################ 21 | MDSplot(genes(cuff), rep=T, pseudocount=1) + 22 | coord_fixed() + 23 | theme_bw() + 24 | theme(text=element_text(size=15)) 25 | 26 | ggsave("mds.pdf") 27 | 28 | ############################################ 29 | # PCA 30 | ############################################ 31 | PCAplot(genes(cuff), x="PC1", y="PC2", rep=T, pseudocount=1) + 32 | coord_fixed() + 33 | theme_bw() + 34 | theme(text=element_text(size=15)) 35 | 36 | ggsave("pca12.pdf") 37 | 38 | PCAplot(genes(cuff), x="PC2", y="PC3", rep=T, pseudocount=1) + 39 | coord_fixed() + 40 | theme_bw() + 41 | theme(text=element_text(size=15)) 42 | 43 | ggsave("pca23.pdf") 44 | -------------------------------------------------------------------------------- /h5_sum.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import h5py 4 | 5 | ''' 6 | Name 7 | 8 | Description... 9 | ''' 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] ' 16 | parser = OptionParser(usage) 17 | #parser.add_option() 18 | (options,args) = parser.parse_args() 19 | 20 | h5_in = h5py.File(args[0], 'r') 21 | print_h5_tree(h5_in) 22 | 23 | # h5_keys = sorted(list(h5_in.keys())) 24 | # for hkey in h5_keys: 25 | # print(h5_in[hkey]) 26 | 27 | h5_in.close() 28 | 29 | def print_h5_tree(h5_obj, depth=0): 30 | h5_keys = sorted(list(h5_obj.keys())) 31 | for hkey in h5_keys: 32 | print(''.join(['\t']*depth), h5_obj[hkey]) 33 | if type(h5_obj[hkey]) == h5py._hl.group.Group: 34 | print_h5_tree(h5_obj[hkey], depth+1) 35 | 36 | ################################################################################ 37 | # __main__ 38 | ################################################################################ 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /explore.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import numpy as np\n", 23 | "import pandas as pd\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import seaborn as sns" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [] 36 | } 37 | ], 38 | "metadata": { 39 | "kernelspec": { 40 | "display_name": "Python [default]", 41 | "language": "python", 42 | "name": "python3" 43 | }, 44 | "language_info": { 45 | "codemirror_mode": { 46 | "name": "ipython", 47 | "version": 3 48 | }, 49 | "file_extension": ".py", 50 | "mimetype": "text/x-python", 51 | "name": "python", 52 | "nbconvert_exporter": "python", 53 | "pygments_lexer": "ipython3", 54 | "version": "3.5.2" 55 | } 56 | }, 57 | "nbformat": 4, 58 | "nbformat_minor": 2 59 | } 60 | -------------------------------------------------------------------------------- /fasta_genome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | import pysam 5 | 6 | ''' 7 | Name 8 | 9 | Description... 10 | ''' 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | #parser.add_option() 19 | (options,args) = parser.parse_args() 20 | 21 | if len(args) != 2: 22 | parser.error('Must provide input FASTA and output genome files.') 23 | else: 24 | fasta_file = args[0] 25 | genome_file = args[1] 26 | 27 | fasta_open = pysam.Fastafile(fasta_file) 28 | genome_open = open(genome_file, 'w') 29 | 30 | for ref in fasta_open.references: 31 | ref_len = fasta_open.get_reference_length(ref) 32 | print('%s\t%d' % (ref, ref_len), file=genome_open) 33 | 34 | fasta_open.close() 35 | genome_open.close() 36 | 37 | 38 | ################################################################################ 39 | # __main__ 40 | ################################################################################ 41 | if __name__ == '__main__': 42 | main() 43 | -------------------------------------------------------------------------------- /r/peaks_diff_compare.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | ca = commandArgs(trailing=T) 4 | df.file = ca[1] 5 | out.pre = ca[2] 6 | rbp = ca[3] 7 | tstat = ca[4] 8 | 9 | df = read.table(df.file, header=T, quote="\"") 10 | 11 | x.min = quantile(df$RIP, .005, na.rm=T) 12 | x.max = quantile(df$RIP, .995, na.rm=T) 13 | 14 | if(tstat == "True") { 15 | x.lab = paste(rbp, "fRIP/input diff stat") 16 | } else { 17 | x.lab = paste(rbp, "log2 fRIP/input") 18 | } 19 | 20 | ggplot(df, aes(x=RIP, color=CLIP)) + 21 | geom_line(stat="density", size=1.5, alpha=0.8) + 22 | scale_x_continuous(x.lab, limits=c(x.min,x.max)) + 23 | scale_color_brewer(palette="Set1") + 24 | theme_bw() + 25 | theme(text=element_text(size=25)) + 26 | theme(legend.justification=c(1,1), legend.position=c(1,1)) 27 | 28 | out.pdf = paste(out.pre, "_dens.pdf", sep="") 29 | ggsave(out.pdf) 30 | 31 | 32 | ggplot(df, aes(x=RIP, color=CLIP)) + 33 | stat_ecdf(size=1.5, alpha=0.8) + 34 | scale_x_continuous(x.lab, limits=c(x.min,x.max)) + 35 | scale_y_continuous("") + 36 | scale_color_brewer(palette="Set1") + 37 | theme_bw() + 38 | theme(text=element_text(size=25)) + 39 | theme(legend.justification=c(1,0), legend.position=c(1,0)) 40 | 41 | out.pdf = paste(out.pre, "_cdf.pdf", sep="") 42 | ggsave(out.pdf) 43 | -------------------------------------------------------------------------------- /rm_nonxs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os, subprocess 4 | 5 | ################################################################################ 6 | # rm_nonxs.py 7 | # 8 | # Check for BAM files where I changed the XS tags, but left the original, 9 | # and delete the original. 10 | ################################################################################ 11 | 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] arg' 18 | parser = OptionParser(usage) 19 | #parser.add_option() 20 | (options,args) = parser.parse_args() 21 | 22 | xs_bams_str = subprocess.check_output('find . -name accepted_hits_xs.bam', shell=True).strip() 23 | xs_bams = xs_bams_str.split('\n') 24 | 25 | for xs_bam in xs_bams: 26 | bam = xs_bam.replace('_xs','') 27 | if os.path.isfile(bam): 28 | print 'rm %s' % bam 29 | os.remove(bam) 30 | 31 | 32 | ################################################################################ 33 | # __main__ 34 | ################################################################################ 35 | if __name__ == '__main__': 36 | main() 37 | -------------------------------------------------------------------------------- /r/cuff_heat.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(reshape2) 3 | library(seriation) 4 | library(RColorBrewer) 5 | 6 | ca = commandArgs(trailing=T) 7 | df.file = ca[1] 8 | output.pdf = ca[2] 9 | 10 | df = read.table(df.file, header=T, quote="\"") 11 | 12 | fpkm.matrix = acast(df, Gene ~ Sample, value.var="FPKM") 13 | 14 | gene.dist = dist(fpkm.matrix) 15 | #gene.clust = hclust(gene.dist) 16 | #gene.order = rownames(fpkm.matrix)[gene.clust$order] 17 | gene.ser = seriate(gene.dist, method="OLO") 18 | gene.order = rownames(fpkm.matrix)[get_order(gene.ser)] 19 | 20 | sample.dist = dist(t(fpkm.matrix)) 21 | #sample.clust = hclust(sample.dist) 22 | #sample.order = colnames(fpkm.matrix)[sample.clust$order] 23 | sample.ser = seriate(sample.dist, method="OLO") 24 | sample.order = colnames(fpkm.matrix)[get_order(sample.ser)] 25 | 26 | ggplot(df, aes(x=Sample, y=Gene, fill=FPKM)) + 27 | geom_tile() + 28 | scale_x_discrete("", limits=sample.order) + 29 | scale_y_discrete(limits=gene.order) + 30 | scale_fill_gradientn("FPKM", colours=c("white", brewer.pal(8, "YlOrRd"))) + 31 | theme_bw() + 32 | theme(text=element_text(size=20)) + 33 | theme(axis.text.x=element_text(angle=315, hjust=0, vjust=1), axis.ticks.y=element_blank(), axis.text.y=element_blank()) 34 | 35 | ggsave(output.pdf) 36 | 37 | # scale_fill_gradientn("FPKM", colours=c("white", brewer.pal(8, "YOrRd"))) + 38 | -------------------------------------------------------------------------------- /bam_unique.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import pysam 4 | 5 | ################################################################################ 6 | # bam_unique.py 7 | # 8 | # Remove multi-mapping alignments from a BAM file 9 | ################################################################################ 10 | 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | (options,args) = parser.parse_args() 19 | 20 | if len(args) != 2: 21 | parser.error('Must provide input and output BAMs') 22 | else: 23 | input_bam = args[0] 24 | output_bam = args[1] 25 | 26 | bam_in = pysam.Samfile(input_bam, 'rb') 27 | bam_out = pysam.Samfile(output_bam, 'wb', template=bam_in) 28 | 29 | for aligned_read in bam_in: 30 | if aligned_read.get_tag('NH') == 1: 31 | bam_out.write(aligned_read) 32 | 33 | bam_in.close() 34 | bam_out.close() 35 | 36 | ################################################################################ 37 | # __main__ 38 | ################################################################################ 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /make_fasta_genome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import pdb 4 | 5 | import numpy as np 6 | import pysam 7 | 8 | ''' 9 | make_fasta_genome.py 10 | 11 | Make a "genome" file, with chromosome names and lengths. 12 | ''' 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | #parser.add_option() 21 | (options,args) = parser.parse_args() 22 | 23 | if len(args) != 2: 24 | parser.error('Must provide input FASTA file and output genome file') 25 | else: 26 | fasta_file = args[0] 27 | genome_file = args[1] 28 | 29 | genome_out = open(genome_file, 'w') 30 | 31 | fasta_open = pysam.Fastafile(fasta_file) 32 | ref_indexes = np.argsort(fasta_open.lengths)[::-1] 33 | 34 | for i in ref_indexes: 35 | print('%s\t%d' % (fasta_open.references[i], fasta_open.lengths[i]), file=genome_out) 36 | fasta_open.close() 37 | 38 | genome_out.close() 39 | 40 | 41 | ################################################################################ 42 | # __main__ 43 | ################################################################################ 44 | if __name__ == '__main__': 45 | main() 46 | -------------------------------------------------------------------------------- /clean_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | ################################################################################ 5 | # clean_csv.py 6 | # 7 | # Clean up an excel-saved .csv file with \r's and commas. 8 | ################################################################################ 9 | 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] arg' 16 | parser = OptionParser(usage) 17 | #parser.add_option() 18 | (options,args) = parser.parse_args() 19 | 20 | file_in = open(args[0]) 21 | 22 | file_str = file_in.readline() 23 | 24 | if file_str.find('\r') != -1: 25 | for line in file_str.split('\r'): 26 | a = line.split(',') 27 | print '\t'.join(a) 28 | 29 | else: 30 | line = file_str 31 | while line: 32 | a = line.split(',') 33 | a[-1] = a[-1].rstrip() 34 | print '\t'.join(a) 35 | line = file_in.readline() 36 | 37 | file_in.close() 38 | 39 | 40 | 41 | ################################################################################ 42 | # __main__ 43 | ################################################################################ 44 | if __name__ == '__main__': 45 | main() 46 | -------------------------------------------------------------------------------- /transid2geneid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import gff 4 | 5 | ################################################################################ 6 | # transid2geneid.py 7 | # 8 | # Given a transcript id, produce a gene id to punch into the browser 9 | ################################################################################ 10 | 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | parser.add_option('-l', dest='lnc_file', default='/Users/dk/research/common/data/lncrna/lnc_catalog.gtf', help='lncRNA catalog file [Default: %default]') 19 | (options,args) = parser.parse_args() 20 | 21 | if len(args) != 1: 22 | parser.error('Must provide transcript id') 23 | else: 24 | trans_id = args[0] 25 | 26 | for line in open(options.lnc_file): 27 | a = line.split('\t') 28 | kv = gff.gtf_kv(a[8]) 29 | if kv['transcript_id'] == trans_id: 30 | print kv['gene_id'] 31 | break 32 | 33 | 34 | ################################################################################ 35 | # __main__ 36 | ################################################################################ 37 | if __name__ == '__main__': 38 | main() 39 | -------------------------------------------------------------------------------- /possum2bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | ################################################################################ 5 | # possum2bed.py 6 | # 7 | # 8 | ################################################################################ 9 | 10 | ################################################################################ 11 | # main 12 | ################################################################################ 13 | def main(): 14 | usage = 'usage: %prog [options] ' 15 | parser = OptionParser(usage) 16 | #parser.add_option() 17 | (options,args) = parser.parse_args() 18 | 19 | if len(args) != 1: 20 | parser.error('Must provide possum output file') 21 | else: 22 | possum_file = args[0] 23 | 24 | for line in open(possum_file): 25 | a = line.split('\t') 26 | 27 | tf_id = a[0] 28 | start = int(a[5])+1 29 | end = start+int(a[6])-1 30 | fnrc = a[7] 31 | score = a[9] 32 | seq_id = a[16][:a[16].find('.')] 33 | 34 | if fnrc == 'fn': 35 | strand = '+' 36 | else: 37 | strand = '-' 38 | 39 | out_a = [seq_id, str(start), str(end), tf_id, score, strand] 40 | print '\t'.join(out_a) 41 | 42 | 43 | ################################################################################ 44 | # __main__ 45 | ################################################################################ 46 | if __name__ == '__main__': 47 | main() 48 | -------------------------------------------------------------------------------- /mess2fasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | ################################################################################ 5 | # mess2fa.py 6 | # 7 | # Convert a mess of nt's and numbers and spaces into a neat fasta file 8 | ################################################################################ 9 | 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] ' 16 | parser = OptionParser(usage) 17 | parser.add_option('-d','--header', dest='header', default='mess', help='Fasta header [Default: %default]') 18 | parser.add_option('-u', '--upper', dest='upper', action='store_true', default=False, help='Uppercase all nucleotides [Default: %default]') 19 | (options,args) = parser.parse_args() 20 | 21 | allowed_nts = set(['A','C','G','T','N','a','c','g','t','n']) 22 | 23 | seq = '' 24 | for line in open(args[0]): 25 | seq += ''.join([nt for nt in line if nt in allowed_nts]) 26 | 27 | if options.upper: 28 | seq = ''.join([nt.upper() for nt in seq]) 29 | 30 | print '>%s\n%s' % (options.header,seq) 31 | 32 | 33 | ################################################################################ 34 | # __main__ 35 | ################################################################################ 36 | if __name__ == '__main__': 37 | main() 38 | -------------------------------------------------------------------------------- /gff2bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from optparse import OptionParser 4 | import gff, sys 5 | 6 | ################################################################################ 7 | # gff2bed.py 8 | # 9 | # Convert a gff file to a bed file. Each entry is converted independently, 10 | # so no blocks. 11 | ################################################################################ 12 | 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | #parser.add_option() 21 | (options,args) = parser.parse_args() 22 | 23 | if len(args) != 1: 24 | parser.error('Must provide gff file') 25 | else: 26 | if args[0] == '-': 27 | gff_open = sys.stdin 28 | else: 29 | gff_open = open(args[0]) 30 | 31 | for line in gff_open: 32 | if not line.startswith('##'): 33 | a = line.split('\t') 34 | 35 | cols = [a[0], str(int(a[3])-1), a[4], a[2], '0', a[6], '0', '0', '255,0,0', '1', str(int(a[4])-int(a[3])+1), '0'] 36 | print('\t'.join(cols)) 37 | 38 | 39 | ################################################################################ 40 | # __main__ 41 | ################################################################################ 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /rm2bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import gzip 4 | 5 | ''' 6 | rm2bed.py 7 | 8 | Convert RepeatMasker .out format to BED. 9 | ''' 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] ' 16 | parser = OptionParser(usage) 17 | #parser.add_option() 18 | (options,args) = parser.parse_args() 19 | 20 | if len(args) != 1: 21 | parser.error('Must provide RepeatMasker .out file') 22 | else: 23 | if args[0][-2:] == 'gz': 24 | rm_in = gzip.open(args[0], 'rt') 25 | else: 26 | rm_in = open(args[0]) 27 | 28 | for i in range(4): 29 | line = rm_in.readline() 30 | while line: 31 | a = line.split() 32 | 33 | chrm = a[4] 34 | start = str(int(a[5])-1) 35 | end = a[6] 36 | 37 | if a[8] == '+': 38 | strand = '+' 39 | else: 40 | strand = '-' 41 | 42 | repeat = a[9] 43 | family = a[10] 44 | 45 | cols = (chrm, start, end, '%s;%s' % (family,repeat), '.', strand) 46 | print('\t'.join(cols)) 47 | 48 | line = rm_in.readline() 49 | 50 | rm_in.close() 51 | 52 | 53 | ################################################################################ 54 | # __main__ 55 | ################################################################################ 56 | if __name__ == '__main__': 57 | main() 58 | -------------------------------------------------------------------------------- /possum2gff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os 4 | 5 | ################################################################################ 6 | # possum2gff.py 7 | # 8 | # Convert the motif annotations in Possum output to a gff file. 9 | ################################################################################ 10 | 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | #parser.add_option() 19 | (options,args) = parser.parse_args() 20 | 21 | if len(args) != 1: 22 | parser.error('Must provide possum output file') 23 | else: 24 | possum_file = args[0] 25 | 26 | for line in open(possum_file): 27 | a = line.split('\t') 28 | 29 | tf_id = a[0] 30 | start = int(a[5])+1 31 | end = start+int(a[6])-1 32 | fnrc = a[7] 33 | seq_id = a[16][:a[16].find('.')] 34 | 35 | if fnrc == 'fn': 36 | strand = '+' 37 | else: 38 | strand = '-' 39 | 40 | out_a = [seq_id, 'possum', 'motif', str(start), str(end), '.', strand, '.', tf_id] 41 | print '\t'.join(out_a) 42 | 43 | 44 | ################################################################################ 45 | # __main__ 46 | ################################################################################ 47 | if __name__ == '__main__': 48 | main() 49 | -------------------------------------------------------------------------------- /zarr_h5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | import h5py 5 | import numpy as np 6 | import zarr 7 | 8 | ''' 9 | zarr_h5.py 10 | 11 | Convert a coverage Zarr to HDF5. 12 | ''' 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | parser.add_option('-v', dest='verbose', default=False, action='store_true') 21 | (options,args) = parser.parse_args() 22 | 23 | if len(args) != 2: 24 | parser.error('Must provide input Zarr and output HDF5.') 25 | else: 26 | zarr_file = args[0] 27 | hdf5_file = args[1] 28 | 29 | # open files 30 | zarr_in = zarr.open_group(zarr_file, 'r') 31 | h5_out = h5py.File(hdf5_file, 'w') 32 | 33 | # foreach chromosome 34 | for chrom in sorted(zarr_in.keys()): 35 | if options.verbose: 36 | print(chrom) 37 | 38 | # read values 39 | x = np.array(zarr_in[chrom]) 40 | 41 | # write gzipped into HDF5 42 | h5_out.create_dataset(chrom, data=x, dtype='float16', chunks=True, compression='lzf', shuffle=True) 43 | 44 | # close files 45 | h5_out.close() 46 | 47 | 48 | ################################################################################ 49 | # __main__ 50 | ################################################################################ 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /bam_12.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import pdb, os 4 | import pysam 5 | 6 | ################################################################################ 7 | # bam_12.py 8 | # 9 | # Separate the alignments in a BAM file into two BAM files of the first and 10 | # second reads. 11 | ################################################################################ 12 | 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | (options,args) = parser.parse_args() 21 | 22 | if len(args) != 1: 23 | parser.error('Must provide bam file') 24 | else: 25 | bam_file = args[0] 26 | 27 | bam_pre = os.path.splitext(bam_file)[0] 28 | 29 | bam_in = pysam.Samfile(bam_file, 'rb') 30 | bam1_out = pysam.Samfile('%s_1.bam'%bam_pre, 'wb', header=bam_in.header) 31 | bam2_out = pysam.Samfile('%s_2.bam'%bam_pre, 'wb', header=bam_in.header) 32 | 33 | for read in bam_in: 34 | if read.is_read1: 35 | bam1_out.write(read) 36 | else: 37 | bam2_out.write(read) 38 | 39 | bam_in.close() 40 | bam1_out.close() 41 | bam2_out.close() 42 | 43 | 44 | ################################################################################ 45 | # __main__ 46 | ################################################################################ 47 | if __name__ == '__main__': 48 | main() 49 | #pdb.runcall(main) 50 | -------------------------------------------------------------------------------- /rm2gff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import gzip 4 | import gff 5 | 6 | ################################################################################ 7 | # rm2gff.py 8 | # 9 | # Convert RepeatMasker .out format to gff 10 | ################################################################################ 11 | 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] ' 18 | parser = OptionParser(usage) 19 | #parser.add_option() 20 | (options,args) = parser.parse_args() 21 | 22 | if len(args) != 1: 23 | parser.error('Must provide RepeatMasker .out file') 24 | else: 25 | if args[0][-2:] == 'gz': 26 | rm_in = gzip.open(args[0]) 27 | else: 28 | rm_in = open(args[0]) 29 | 30 | for i in range(4): 31 | line = rm_in.readline() 32 | while line: 33 | a = line.split() 34 | 35 | if a[8] == '+': 36 | strand = '+' 37 | else: 38 | strand = '-' 39 | 40 | cols = (a[4], 'RepeatMasker', 'repeat', a[5], a[6], '.', strand, '.', gff.kv_gtf({'repeat':a[9], 'family':a[10]})) 41 | print '\t'.join(cols) 42 | 43 | line = rm_in.readline() 44 | 45 | 46 | ################################################################################ 47 | # __main__ 48 | ################################################################################ 49 | if __name__ == '__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /plot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | import matplotlib.pyplot as plt 5 | 6 | ################################################################################ 7 | # plot.py 8 | # 9 | # matplotlib helper methods 10 | ################################################################################ 11 | 12 | ##################################################################### 13 | # limits 14 | # 15 | # Determine a nice buffered axis range from a list/array of numbers 16 | ##################################################################### 17 | def limits(nums, buf_pct=0.05): 18 | nmin = min(nums) 19 | nmax = max(nums) 20 | spread = nmax-nmin 21 | buf = buf_pct*spread 22 | return nmin-buf, nmax+buf 23 | 24 | ##################################################################### 25 | # scatter 26 | # 27 | # Example scatter plot with some reasonable parameter choices 28 | ##################################################################### 29 | def scatter(x, y, pdf, xlabel='', ylabel=''): 30 | f, ax = plt.subplots() 31 | 32 | # scatter 33 | plt.scatter(x, y, s=20, alpha=0.8, linewidths=0) 34 | 35 | # x-axis 36 | xmin, xmax = limits(x) 37 | plt.xlim(xmin, xmax) 38 | plt.xlabel(xlabel) 39 | ax.xaxis.label.set_fontsize(18) 40 | map(lambda xl: xl.set_fontsize(15), ax.get_xticklabels()) 41 | 42 | # y-axis 43 | ymin, ymax = limits(y) 44 | plt.ylim(ymin, ymax) 45 | plt.ylabel(ylabel) 46 | ax.yaxis.label.set_fontsize(18) 47 | map(lambda yl: yl.set_fontsize(15), ax.get_yticklabels()) 48 | 49 | # save 50 | plt.savefig(pdf) 51 | 52 | # close 53 | plt.close() 54 | -------------------------------------------------------------------------------- /bam_plus_minus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import pdb, os 4 | import pysam 5 | 6 | ################################################################################ 7 | # bam_plus_minus.py 8 | # 9 | # Separate the alignments in a BAM file into two BAM files of the plus and 10 | # minus strand reads. 11 | ################################################################################ 12 | 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | (options,args) = parser.parse_args() 21 | 22 | if len(args) != 1: 23 | parser.error('Must provide bam file') 24 | else: 25 | bam_file = args[0] 26 | 27 | chr_starts = {} 28 | 29 | bam_pre = os.path.splitext(bam_file)[0] 30 | 31 | bam_in = pysam.Samfile(bam_file, 'rb') 32 | bamp_out = pysam.Samfile('%s_p.bam'%bam_pre, 'wb', header=bam_in.header) 33 | bamm_out = pysam.Samfile('%s_m.bam'%bam_pre, 'wb', header=bam_in.header) 34 | 35 | for read in bam_in: 36 | if read.is_reverse: 37 | bamm_out.write(read) 38 | else: 39 | bamp_out.write(read) 40 | 41 | bam_in.close() 42 | bamp_out.close() 43 | bamm_out.close() 44 | 45 | 46 | ################################################################################ 47 | # __main__ 48 | ################################################################################ 49 | if __name__ == '__main__': 50 | main() 51 | #pdb.runcall(main) 52 | -------------------------------------------------------------------------------- /fastq_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import bz2 4 | import gzip 5 | 6 | ''' 7 | fastq_filter.py 8 | 9 | Filter a FASTQ file for various properties, like read length. 10 | ''' 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | parser.add_option('-l', dest='length_min', default=None, type='int', help='Minimum read length') 19 | (options,args) = parser.parse_args() 20 | 21 | if len(args) != 1: 22 | parser.error('Must provide FASTQ file') 23 | else: 24 | fastq_file = args[0] 25 | 26 | if fastq_file[-3:] == '.gz': 27 | fastq_in = gzip.open(fastq_file, 'rt') 28 | elif fastq_file[-4:] == '.bz2': 29 | fastq_in = bz2.open(fastq_file, 'rt') 30 | else: 31 | fastq_in = open(fastq_file) 32 | 33 | header = fastq_in.readline() 34 | while header: 35 | seq = fastq_in.readline() 36 | mid = fastq_in.readline() 37 | qual = fastq_in.readline() 38 | 39 | if options.length_min is not None: 40 | if len(seq)-1 >= options.length_min: 41 | print('%s%s%s%s' % (header,seq,mid,qual), end='') 42 | 43 | header = fastq_in.readline() 44 | 45 | fastq_in.close() 46 | 47 | 48 | ################################################################################ 49 | # __main__ 50 | ################################################################################ 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /gsea_ranks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import gsea 4 | 5 | ################################################################################ 6 | # gsea_ranks.py 7 | # 8 | # Print out the ranks for a given GO term. 9 | ################################################################################ 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] ' 16 | parser = OptionParser(usage) 17 | (options,args) = parser.parse_args() 18 | 19 | if len(args) != 2: 20 | parser.error('Must provide correlations file') 21 | else: 22 | cors_file = args[0] 23 | go_term = args[1] 24 | 25 | # get genes, correlations 26 | correlations_genes = [] 27 | genes = [] 28 | for line in open(cors_file): 29 | a = line.split() 30 | correlations_genes.append((abs(float(a[1])),a[0])) 31 | genes.append(a[0]) 32 | correlations_genes.sort(reverse=True) 33 | 34 | # GO 35 | go_map, go_descs = gsea.read_go(set(genes)) 36 | go_term_map = go_map[go_term] 37 | 38 | # print ranks, correlations 39 | i = 1 40 | for (cor,gene) in correlations_genes: 41 | if gene in go_term_map: 42 | print i, cor 43 | i += 1 44 | 45 | 46 | ################################################################################ 47 | # __main__ 48 | ################################################################################ 49 | if __name__ == '__main__': 50 | main() 51 | #pdb.runcall(main) 52 | -------------------------------------------------------------------------------- /fastq_trim.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import bz2 4 | import gzip 5 | 6 | ''' 7 | fastq_trim.py 8 | 9 | Filter a FASTQ file for various properties, like read length. 10 | ''' 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | (options,args) = parser.parse_args() 19 | 20 | if len(args) != 2: 21 | parser.error('Must provide trim length and FASTQ file') 22 | else: 23 | trim_length = int(args[0]) 24 | fastq_file = args[1] 25 | 26 | if fastq_file[-3:] == '.gz': 27 | fastq_in = gzip.open(fastq_file, 'rt') 28 | elif fastq_file[-4:] == '.bz2': 29 | fastq_in = bz2.open(fastq_file, 'rt') 30 | else: 31 | fastq_in = open(fastq_file) 32 | 33 | header = fastq_in.readline().rstrip() 34 | while header: 35 | seq = fastq_in.readline().rstrip() 36 | mid = fastq_in.readline().rstrip() 37 | qual = fastq_in.readline().rstrip() 38 | 39 | # trim 40 | seq = seq[:trim_length] 41 | qual = qual[:trim_length] 42 | 43 | print('%s\n%s\n%s\n%s' % (header,seq,mid,qual)) 44 | 45 | header = fastq_in.readline().rstrip() 46 | 47 | fastq_in.close() 48 | 49 | 50 | ################################################################################ 51 | # __main__ 52 | ################################################################################ 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /bam_quality.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import pysam 4 | 5 | ''' 6 | bam_quality.py 7 | 8 | Remove low quality alignments from a BAM file. 9 | ''' 10 | 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | parser.add_option('-q', dest='mapq_t', 19 | type='int', default=None, 20 | help='Keep alignments with mapping quality at or above [Default: %default]') 21 | parser.add_option('-s', dest='score_t', 22 | type='int', default=None, 23 | help='Keep alignments with alignment score at or above [Default: %default]') 24 | (options,args) = parser.parse_args() 25 | 26 | if len(args) != 2: 27 | parser.error('Must provide input and output BAMs') 28 | else: 29 | input_bam = args[0] 30 | output_bam = args[1] 31 | 32 | bam_in = pysam.AlignmentFile(input_bam, 'r') 33 | bam_out = pysam.AlignmentFile(output_bam, 'wb', template=bam_in) 34 | 35 | for align in bam_in: 36 | if options.mapq_t is None or align.mapping_quality >= options.mapq_t: 37 | if options.score_t is None or align.get_tag('AS') >= options.score_t: 38 | bam_out.write(align) 39 | 40 | bam_in.close() 41 | bam_out.close() 42 | 43 | ################################################################################ 44 | # __main__ 45 | ################################################################################ 46 | if __name__ == '__main__': 47 | main() 48 | -------------------------------------------------------------------------------- /gtf2prom.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import gff 4 | import os, subprocess 5 | 6 | ################################################################################ 7 | # gtf2prom.py 8 | # 9 | # Produce a GFF file and a fasta file corresponding to the promoter of the 10 | # genes in a gtf fle. 11 | ################################################################################ 12 | 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] arg' 19 | parser = OptionParser(usage) 20 | parser.add_option('-d', dest='downstream', type='int', default=0, help='Downstream promoter length [Default: %default]') 21 | parser.add_option('-u', dest='upstream', type='int', default=2000, help='Upstream promoter length [Default: %default]') 22 | parser.add_option('-o', dest='output_pre', default='promoter', help='Output file prefix [Default: %default]') 23 | (options,args) = parser.parse_args() 24 | 25 | if len(args) != 1: 26 | parser.error('Must provide gtf file') 27 | else: 28 | gtf_file = args[0] 29 | 30 | gff.promoters(gtf_file, options.upstream, options.downstream, '%s.gff'%options.output_pre) 31 | p = subprocess.Popen('gff2fa.py %s.gff > %s.fa' % (options.output_pre,options.output_pre), shell=True) 32 | os.waitpid(p.pid,0) 33 | 34 | 35 | ################################################################################ 36 | # __main__ 37 | ################################################################################ 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- /bw_nan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | import numpy as np 5 | import pyBigWig 6 | 7 | ''' 8 | bw_nan.py 9 | 10 | Compute NaN % in a BigWig. 11 | ''' 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] ' 18 | parser = OptionParser(usage) 19 | (options,args) = parser.parse_args() 20 | if len(args) != 1: 21 | parser.error('Must provide input BigWig.') 22 | else: 23 | bw_file = args[0] 24 | 25 | # open files 26 | bw_in = pyBigWig.open(bw_file) 27 | 28 | # process chromosomes in length order 29 | chrom_lengths = bw_in.chroms() 30 | chroms = sorted(chrom_lengths.keys()) 31 | length_chroms = [(chrom_lengths[chrm],chrm) for chrm in chroms] 32 | length_chroms = sorted(length_chroms)[::-1] 33 | mode_factor = None 34 | 35 | total_nt = 0 36 | nan_nt = 0 37 | 38 | # for each chromosome 39 | for clength, chrom in length_chroms: 40 | # read values 41 | x = bw_in.values(chrom, 0, chrom_lengths[chrom], numpy=True) 42 | 43 | # find NaN 44 | x_nan = np.isnan(x) 45 | 46 | total_nt += len(x) 47 | nan_nt += x_nan.sum() 48 | 49 | # close files 50 | bw_in.close() 51 | 52 | nan_pct = nan_nt / total_nt 53 | print('%.6f' % nan_pct) 54 | 55 | 56 | ################################################################################ 57 | # __main__ 58 | ################################################################################ 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /h5_zarr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | import h5py 5 | import numpy as np 6 | import zarr 7 | 8 | ''' 9 | h5_zarr.py 10 | 11 | Convert a coverage HDF5 to BigWig. 12 | ''' 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | parser.add_option('-c', dest='chunk_size', default=None, type='int') 21 | parser.add_option('-v', dest='verbose', default=False, action='store_true') 22 | (options,args) = parser.parse_args() 23 | 24 | if len(args) != 2: 25 | parser.error('Must provide input HDF5 and output BigWig.') 26 | else: 27 | hdf5_file = args[0] 28 | zarr_file = args[1] 29 | 30 | # open files 31 | h5_in = h5py.File(hdf5_file) 32 | zarr_out = zarr.open_group(zarr_file, 'w') 33 | 34 | # foreach chromosome 35 | for chrom in h5_in.keys(): 36 | if options.verbose: 37 | print(chrom) 38 | 39 | # read values 40 | x = np.array(h5_in[chrom], dtype='float16') 41 | 42 | # write gzipped into HDF5 43 | z = zarr_out.create_dataset(chrom, data=x, shape=x.shape, dtype='float16', chunks=options.chunk_size) 44 | if options.verbose: 45 | print(z) 46 | 47 | # close files 48 | h5_in.close() 49 | 50 | 51 | ################################################################################ 52 | # __main__ 53 | ################################################################################ 54 | if __name__ == '__main__': 55 | main() 56 | -------------------------------------------------------------------------------- /sciseq_collision.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import numpy as np 4 | 5 | ''' 6 | sciseq_collision.py 7 | 8 | Estimate the collision rate for a set of sci-seq barcode parameters. 9 | ''' 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] arg' 16 | parser = OptionParser(usage) 17 | parser.add_option('-b', dest='barcode1', 18 | default=None, type='int', 19 | help='Number of barcodes introduced in the first RT stage') 20 | parser.add_option('-c', dest='cells', 21 | default=None, type='int', 22 | help='Number of cells sorted per well in the second PCR stage') 23 | parser.add_option('-n', dest='num_samples', 24 | default=10000, type='int', 25 | help='Number of simulation samples [Default: %default]') 26 | (options,args) = parser.parse_args() 27 | 28 | collisions = 0 29 | 30 | for i in range(options.num_samples): 31 | cell_barcodes = np.random.randint(0, options.barcode1, size=options.cells) 32 | unique_cell_barcodes = len(set(cell_barcodes)) 33 | cell_collisions = options.cells - unique_cell_barcodes 34 | collisions += cell_collisions 35 | 36 | collision_rate = collisions / (options.num_samples*options.cells) 37 | print('Collision rate: %.4f' % collision_rate) 38 | 39 | 40 | ################################################################################ 41 | # __main__ 42 | ################################################################################ 43 | if __name__ == '__main__': 44 | main() 45 | -------------------------------------------------------------------------------- /gtf_span.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import gff 4 | 5 | ################################################################################ 6 | # gtf_span.py 7 | # 8 | # Merge all of the transcripts in one gene into a single spanning gtf entry. 9 | ################################################################################ 10 | 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | #parser.add_option() 19 | (options,args) = parser.parse_args() 20 | 21 | gtf_file = args[0] 22 | 23 | genes = {} 24 | 25 | for line in open(gtf_file): 26 | a = line.split() 27 | gene_id = a[9][1:-2] 28 | genes.setdefault(gene_id,[]).append(line) 29 | 30 | for gene_id in genes: 31 | start = min([int(line.split()[3]) for line in genes[gene_id]]) 32 | end = max([int(line.split()[4]) for line in genes[gene_id]]) 33 | 34 | a = genes[gene_id][0].split('\t') 35 | kv = gff.gtf_kv(a[8]) 36 | succinct_kv = {'gene_id':kv['gene_id']} 37 | succinct_kv['transcript_id'] = ','.join(list(set([line.split()[11][1:-2] for line in genes[gene_id]]))) 38 | 39 | d = [a[0], 'gtf', 'gene', str(start), str(end), '.', a[6], '.', gff.kv_gtf(succinct_kv)] 40 | print '\t'.join(d) 41 | 42 | ################################################################################ 43 | # __main__ 44 | ################################################################################ 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /bim_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import subprocess 4 | 5 | ''' 6 | bim_vcf.py 7 | 8 | Convert variants in a Plink .bim file to .vcf 9 | ''' 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] ' 16 | parser = OptionParser(usage) 17 | parser.add_option('-z', dest='zip', default=False, action='store_true') 18 | (options,args) = parser.parse_args() 19 | 20 | if len(args) != 2: 21 | parser.error('Must provide input BIM and output VCF.') 22 | else: 23 | in_bim_file = args[0] 24 | out_vcf_file = args[1] 25 | 26 | # open out VCF 27 | out_vcf_open = open(out_vcf_file, 'w') 28 | 29 | # print header 30 | print('##fileformat=VCFv4.2', file=out_vcf_open) 31 | cols = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO'] 32 | print('\t'.join(cols), file=out_vcf_open) 33 | 34 | # parse BIM 35 | for line in open(in_bim_file): 36 | a = line.split() 37 | chrom = a[0] 38 | snp_id = a[1] 39 | pos = a[3] 40 | a1 = a[4] 41 | a2 = a[5] 42 | 43 | cols = [chrom, pos, snp_id, a2, a1, '.', '.', '.'] 44 | print('\t'.join(cols), file=out_vcf_open) 45 | 46 | out_vcf_open.close() 47 | 48 | if options.zip: 49 | subprocess.call('gzip -f %s' % out_vcf_file, shell=True) 50 | 51 | 52 | ################################################################################ 53 | # __main__ 54 | ################################################################################ 55 | if __name__ == '__main__': 56 | main() 57 | -------------------------------------------------------------------------------- /cuff_fails.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | ################################################################################ 5 | # cuff_fails.py 6 | # 7 | # Print a table of the number of genes for which cufflinks failed. 8 | ################################################################################ 9 | 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] ' 16 | parser = OptionParser(usage) 17 | #parser.add_option() 18 | (options,args) = parser.parse_args() 19 | 20 | if len(args) != 1: 21 | parser.error('Must provide FPKM tracking file') 22 | else: 23 | fpkm_file = args[0] 24 | 25 | # get headers 26 | fpkm_in = open(fpkm_file) 27 | headers = fpkm_in.readline().split() 28 | 29 | # initialize fail counts 30 | fails = {} 31 | 32 | for line in fpkm_in: 33 | a = line.split('\t') 34 | a[-1] = a[-1].rstrip() 35 | 36 | gene_id = a[0] 37 | 38 | for i in range(len(a)): 39 | if headers[i][-7:] == '_status': 40 | if a[i] != 'OK': 41 | sample = headers[i][:-7] 42 | fails[sample] = fails.get(sample,0) + 1 43 | 44 | fpkm_in.close() 45 | 46 | for sample in fails: 47 | cols = (sample, fails[sample]) 48 | print '%-18s %5d' % cols 49 | 50 | 51 | ################################################################################ 52 | # __main__ 53 | ################################################################################ 54 | if __name__ == '__main__': 55 | main() 56 | -------------------------------------------------------------------------------- /gtf_filter_csf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import sys 4 | import gff 5 | 6 | ################################################################################ 7 | # gtf_filter_csf.py 8 | # 9 | # Filter the lnc catalog gtf file by CSF value. 10 | ################################################################################ 11 | 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] ' 18 | parser = OptionParser(usage) 19 | parser.add_option('-g', dest='greater', action='store_true', default=False, help='Keep genes w/ CSF value greater than the one given [Default: %default]') 20 | parser.add_option('-l', dest='less', action='store_true', default=True, help='Keep genes w/ CSF value less than the one given [Default: %default]') 21 | parser.add_option('-t', dest='csf_t', type='float', default=100.0, help='CSF threshold [Default: %default]') 22 | (options,args) = parser.parse_args() 23 | 24 | if len(args) == 1: 25 | gtf_open = open(args[0]) 26 | else: 27 | gtf_open = sys.stdin 28 | 29 | line = gtf_open.readline() 30 | while line: 31 | a = line.split('\t') 32 | csf = float(gff.gtf_kv(a[8])['csf']) 33 | if (options.less and csf <= options.csf_t) or (options.greater and csf >= options.csf_t): 34 | print line, 35 | line = gtf_open.readline() 36 | 37 | 38 | ################################################################################ 39 | # __main__ 40 | ################################################################################ 41 | if __name__ == '__main__': 42 | main() 43 | -------------------------------------------------------------------------------- /zarr_bw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | import zarr 5 | import numpy as np 6 | import pyBigWig 7 | 8 | ''' 9 | zarr_bw.py 10 | 11 | Convert a coverage Zarr to BigWig. 12 | ''' 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | parser.add_option('-v', dest='verbose', default=False, action='store_true') 21 | (options,args) = parser.parse_args() 22 | 23 | if len(args) != 2: 24 | parser.error('Must provide input HDF5 and output BigWig.') 25 | else: 26 | zarr_file = args[0] 27 | bw_file = args[1] 28 | 29 | # open files 30 | zarr_in = zarr.open_group(zarr_file, 'r') 31 | bw_out = pyBigWig.open(bw_file, 'w') 32 | 33 | # construct header 34 | header = [] 35 | chroms = sorted(zarr_in.keys()) 36 | for chrom in chroms: 37 | # chromosome and length 38 | header.append((chrom,len(zarr_in[chrom]))) 39 | 40 | # write header 41 | bw_out.addHeader(header) 42 | 43 | for chrom, length in header: 44 | if options.verbose: 45 | print(chrom) 46 | 47 | # read values 48 | x = np.array(zarr_in[chrom]) 49 | 50 | # write gzipped into HDF5 51 | bw_out.addEntries(chrom, 0, values=x, span=1, step=1) 52 | 53 | # close files 54 | bw_out.close() 55 | 56 | 57 | ################################################################################ 58 | # __main__ 59 | ################################################################################ 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /bed2gff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import gff 4 | 5 | ################################################################################ 6 | # bed2gff.py 7 | # 8 | # Convert a bed file to a gff file. No blocks. 9 | ################################################################################ 10 | 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | parser.add_option('--source', dest='source', default='bed', help='Gff format "source" [Default: %default]') 19 | parser.add_option('--feature', dest='feature', default='feature', help='Gff format "feature" [Default: %default]') 20 | (options,args) = parser.parse_args() 21 | 22 | if len(args) != 1: 23 | parser.error('Must provide bed file') 24 | else: 25 | bed_file = args[0] 26 | 27 | group_num = 0 28 | for line in open(bed_file): 29 | a = line.split('\t') 30 | a[-1] = a[-1].rstrip() 31 | 32 | if len(a) >= 5: 33 | score = a[4] 34 | else: 35 | score = '.' 36 | if len(a) >= 6: 37 | strand = a[5] 38 | else: 39 | strand = '+' 40 | group_num += 1 41 | 42 | cols = [a[0], options.source, options.feature, str(int(a[1])+1), a[2], score, strand, '.', str(group_num)] 43 | print '\t'.join(cols) 44 | 45 | 46 | ################################################################################ 47 | # __main__ 48 | ################################################################################ 49 | if __name__ == '__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /bed2gtf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | ################################################################################ 5 | # bed2gtf.py 6 | # 7 | # Convert a bed file to a gtf file. 8 | ################################################################################ 9 | 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] ' 16 | parser = OptionParser(usage) 17 | #parser.add_option() 18 | (options,args) = parser.parse_args() 19 | 20 | if len(args) != 1: 21 | parser.error('Must provide gtf file') 22 | else: 23 | bed_file = args[0] 24 | 25 | for line in open(bed_file): 26 | a = line.split('\t') 27 | a[-1] = a[-1].rstrip() 28 | 29 | tid = a[3] 30 | 31 | gene_start = int(a[1]) 32 | gene_end = int(a[2]) 33 | 34 | block_sizes = [int(x) for x in a[10].split(',') if x] 35 | block_starts = [int(x) for x in a[11].split(',') if x] 36 | 37 | exon_num = 1 38 | for i in range(len(block_starts)): 39 | exon_start = gene_start+1+block_starts[i] 40 | exon_end = gene_start+1+block_starts[i]+block_sizes[i]-1 41 | 42 | cols = [a[0], 'BED', 'exon', str(exon_start), str(exon_end), '.', a[5], '.', 'transcript_id "%s"; exon_number "%d"' % (tid,exon_num)] 43 | print '\t'.join(cols) 44 | exon_num += 1 45 | 46 | 47 | ################################################################################ 48 | # __main__ 49 | ################################################################################ 50 | if __name__ == '__main__': 51 | main() 52 | -------------------------------------------------------------------------------- /gaps_bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import pdb 4 | 5 | ################################################################################ 6 | # gaps_bed.py 7 | # 8 | # Print a bed file of the gaps in a fasta file. 9 | ################################################################################ 10 | 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | parser.add_option('-g', dest='gap_size', default=50, type='int', help='Minimum gap size to print a bed entry [Default: %default]') 19 | (options,args) = parser.parse_args() 20 | 21 | if len(args) != 1: 22 | parser.error('Must provide fasta file') 23 | else: 24 | fasta_file = args[0] 25 | 26 | for line in open(fasta_file): 27 | if line[0] == '>': 28 | chrom = line[1:].rstrip() 29 | seq_i = 0 30 | gap_start = None 31 | else: 32 | for nt in line.rstrip(): 33 | if nt == 'N': 34 | if gap_start == None: 35 | gap_start = seq_i 36 | else: 37 | if gap_start != None and seq_i-gap_start >= options.gap_size: 38 | print '\t'.join([chrom,str(gap_start),str(seq_i)]) 39 | gap_start = None 40 | 41 | seq_i += 1 42 | 43 | 44 | ################################################################################ 45 | # __main__ 46 | ################################################################################ 47 | if __name__ == '__main__': 48 | main() 49 | #pdb.runcall(main) 50 | -------------------------------------------------------------------------------- /stockholm2fasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | ################################################################################ 5 | # stockholm2fasta.py 6 | # 7 | # Convert Stockholm MSA format from HMMer to FASTA for viewing. 8 | ################################################################################ 9 | 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] ' 16 | parser = OptionParser(usage) 17 | parser.add_option('-c', dest='consensus_only', default=False, action='store_true', help='Print consensus columns only [Default: %default]') 18 | (options,args) = parser.parse_args() 19 | 20 | if len(args) != 1: 21 | parser.error('Must provide input Stockholm format file') 22 | else: 23 | stockholm_file = args[0] 24 | 25 | seqs = {} 26 | consensus = '' 27 | for line in open(stockholm_file): 28 | if line.rstrip() not in ['','//'] and line[0] != '#': 29 | header, msa = line.split() 30 | seqs[header] = seqs.get(header,'') + msa 31 | elif line.startswith('#=GC RF'): 32 | consensus += line.split()[-1] 33 | 34 | if options.consensus_only: 35 | for header in seqs: 36 | hseq = seqs[header] 37 | seqs[header] = ''.join([hseq[i] for i in range(len(hseq)) if consensus[i] == 'x']) 38 | 39 | for header in seqs: 40 | print '>%s\n%s' % (header,seqs[header]) 41 | 42 | 43 | ################################################################################ 44 | # __main__ 45 | ################################################################################ 46 | if __name__ == '__main__': 47 | main() 48 | -------------------------------------------------------------------------------- /bed_clean.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | ################################################################################ 5 | # bed_clean.py 6 | # 7 | # Detect and correct BED regions extending beyond chromosome ends 8 | ################################################################################ 9 | 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] ' 16 | parser = OptionParser(usage) 17 | parser.add_option('-d', dest='delete', default=False, action='store_true', help='Delete entries beyond boundaries [Default: %default]') 18 | (options,args) = parser.parse_args() 19 | 20 | if len(args) != 2: 21 | parser.error('Must provide BED file and chrom sizes file') 22 | else: 23 | csizes_file = args[0] 24 | bed_file = args[1] 25 | 26 | # read in chromosome sizes 27 | chrom_sizes = {} 28 | for line in open(csizes_file): 29 | a = line.split() 30 | chrom_sizes[a[0]] = int(a[1]) 31 | 32 | # clean BED file 33 | for line in open(bed_file): 34 | a = line.split() 35 | chrom = a[0] 36 | start = int(a[1]) 37 | end = int(a[2]) 38 | 39 | if end <= chrom_sizes[chrom]: 40 | print line, 41 | 42 | else: 43 | if not options.delete: 44 | end = chrom_sizes[chrom] 45 | a[2] = str(end) 46 | print '\t'.join(a) 47 | 48 | 49 | ################################################################################ 50 | # __main__ 51 | ################################################################################ 52 | if __name__ == '__main__': 53 | main() 54 | -------------------------------------------------------------------------------- /multiz_gff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | from pygr import worldbase 4 | import gff 5 | 6 | ################################################################################ 7 | # multiz_gff.py 8 | # 9 | # Return hg19 46-way multiz alignments of the entries in a gff file. 10 | ################################################################################ 11 | 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] ' 18 | parser = OptionParser(usage) 19 | (options,args) = parser.parse_args() 20 | 21 | if len(args) != 1: 22 | parser.error('Must provide gff file.') 23 | else: 24 | gff_file = args[0] 25 | 26 | # get human genome 27 | hg19 = worldbase.Bio.Seq.Genome.HUMAN.hg19() 28 | 29 | # get feature intervals 30 | feat_ivals = [] 31 | for line in open(gff_file): 32 | a = line.split('\t') 33 | 34 | chrom = a[0] 35 | start = int(a[3]) 36 | end = int(a[4]) 37 | # ignoring orientation at the moment 38 | 39 | feat_ivals.append(hg19[chrom][start:end]) 40 | 41 | # get hg19 msa 42 | msa = worldbase.Bio.MSA.UCSC.hg19_multiz46way() 43 | 44 | # map returned sequences back to genome name 45 | idDict = ~(msa.seqDict) 46 | 47 | # print alignments 48 | for gi in feat_ivals: 49 | for src, dest, edg in msa[gi].edges(): 50 | print repr(gi), repr(src), repr(dest), idDict[dest], edg.length() 51 | 52 | 53 | ################################################################################ 54 | # __main__ 55 | ################################################################################ 56 | if __name__ == '__main__': 57 | main() 58 | -------------------------------------------------------------------------------- /fastq_quality_change.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | ################################################################################ 5 | # fastq_quality_change.py 6 | # 7 | # Change the quality value ascii index for a fastq file. 8 | # 9 | # Author: David Kelley dakelley@umiacs.umd.edu 10 | ################################################################################ 11 | 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] ' 18 | parser = OptionParser(usage) 19 | parser.add_option('-a', dest='after', type='int', help='Desired fastq quality ascii index') 20 | parser.add_option('-b', dest='before', type='int', help='Current fastq quality ascii index') 21 | (options,args) = parser.parse_args() 22 | 23 | if options.after == None or options.before == None: 24 | parser.error('Must provide before and after ascii indexes') 25 | if len(args) != 1: 26 | parser.error('Must provide fastq file') 27 | else: 28 | fastq_file = args[0] 29 | 30 | fq_in = open(fastq_file) 31 | header = fq_in.readline() 32 | while header: 33 | seq = fq_in.readline() 34 | mid = fq_in.readline() 35 | qual = fq_in.readline() 36 | 37 | print header, 38 | print seq, 39 | print mid, 40 | print ''.join([chr(ord(q)-options.before+options.after) for q in qual]) 41 | 42 | header = fq_in.readline() 43 | fq_in.close() 44 | 45 | 46 | ################################################################################ 47 | # __main__ 48 | ################################################################################ 49 | if __name__ == '__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /w5_bg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | import h5py 5 | import numpy as np 6 | 7 | ''' 8 | w5_bg.py 9 | 10 | Convert a Wiggle HDF5 to BedGraph. 11 | ''' 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] ' 18 | parser = OptionParser(usage) 19 | parser.add_option('-v', dest='verbose', default=False, action='store_true') 20 | (options,args) = parser.parse_args() 21 | 22 | if len(args) != 2: 23 | parser.error('Must provide input Wig5 and output BedGraph.') 24 | else: 25 | in_w5_file = args[0] 26 | out_bg_file = args[1] 27 | 28 | # open files 29 | in_w5_open = h5py.File(in_w5_file) 30 | out_bg_open = open(out_bg_file, 'w') 31 | 32 | header = 'track type=bedGraph' 33 | print(header, file=out_bg_open) 34 | 35 | for chrm in sorted(in_w5_open.keys()): 36 | if options.verbose: 37 | print(chrm, flush=True) 38 | 39 | # read values 40 | x = np.array(in_w5_open[chrm]) 41 | 42 | # write to bedgraph 43 | i = 0 44 | while i < len(x): 45 | start = i 46 | end = i+1 47 | while end < len(x) and x[start] == x[end]: 48 | end += 1 49 | 50 | cols = [chrm, str(start), str(end), '%.4f'%x[start]] 51 | print('\t'.join(cols), file=out_bg_open) 52 | 53 | i = end 54 | 55 | in_w5_open.close() 56 | out_bg_open.close() 57 | 58 | 59 | ################################################################################ 60 | # __main__ 61 | ################################################################################ 62 | if __name__ == '__main__': 63 | main() 64 | -------------------------------------------------------------------------------- /split_fragment_lengths.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import pysam 4 | 5 | ################################################################################ 6 | # split_fragment_lengths.py 7 | # 8 | # Split a BAM file based on a fragment length threshold. 9 | ################################################################################ 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] ' 16 | parser = OptionParser(usage) 17 | parser.add_option('-m', dest='max_length', default=1000, help='Threshold length beyond which we ignore the read [Default: %default]') 18 | (options,args) = parser.parse_args() 19 | 20 | if len(args) != 3: 21 | parser.error('Must provide BAM file, split length, and output prefix') 22 | else: 23 | bam_file = args[0] 24 | split_len = int(args[1]) 25 | out_pre = args[2] 26 | 27 | bam_in = pysam.Samfile(bam_file, 'rb') 28 | 29 | minus_out = pysam.Samfile('%s_%d-.bam' % (out_pre,split_len), 'wb', template=bam_in) 30 | plus_out = pysam.Samfile('%s_%d+.bam' % (out_pre,split_len), 'wb', template=bam_in) 31 | 32 | for alignment in bam_in: 33 | tl = abs(alignment.template_length) 34 | if tl == 0: 35 | pass 36 | elif tl < split_len: 37 | minus_out.write(alignment) 38 | elif tl <= options.max_length: 39 | plus_out.write(alignment) 40 | else: 41 | pass 42 | 43 | minus_out.close() 44 | plus_out.close() 45 | 46 | ################################################################################ 47 | # __main__ 48 | ################################################################################ 49 | if __name__ == '__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /gtf_cut.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import sys 4 | import gff 5 | 6 | ################################################################################ 7 | # gtf_cut.py 8 | # 9 | # Cut a gtf key:value pair out of a gtf file. 10 | ################################################################################ 11 | 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] -k ' 18 | parser = OptionParser(usage) 19 | parser.add_option('-c', dest='column', default=8, type='int') 20 | parser.add_option('-k', dest='key', help='Key to extract') 21 | parser.add_option('-l', dest='line_too', action='store_true', default=False, help='Print the line too [Default: %default]') 22 | (options,args) = parser.parse_args() 23 | 24 | if len(args) == 1: 25 | if args[0] == '-': 26 | gtf_open = sys.stdin 27 | else: 28 | gtf_open = open(args[0]) 29 | else: 30 | parser.error(usage) 31 | 32 | if not options.key: 33 | parser.error('Must provide key') 34 | else: 35 | keys = options.key.split(',') 36 | 37 | for line in gtf_open: 38 | if not line.startswith('#'): 39 | a = line.split('\t') 40 | kv = gff.gtf_kv(a[options.column]) 41 | 42 | if options.line_too: 43 | key_str = '\t'.join([kv.get(key,'-') for key in keys]) 44 | print('%s\t%s' % (key_str,line)) 45 | else: 46 | print('\t'.join([kv.get(key,'-') for key in keys])) 47 | 48 | 49 | ################################################################################ 50 | # __main__ 51 | ################################################################################ 52 | if __name__ == '__main__': 53 | main() 54 | -------------------------------------------------------------------------------- /gtf_filter_expr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import cufflinks, gff 4 | 5 | ################################################################################ 6 | # gtf_filter_expr.py 7 | # 8 | # Filter a gtf file to only leave genes that are expressed over a specified 9 | # threshold in a specified tissue/cell type. 10 | ################################################################################ 11 | 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] ' 18 | parser = OptionParser(usage) 19 | parser.add_option('-t', dest='expr_t', type='float', default=.1, help='Minimum allowed fpkm value') 20 | (options,args) = parser.parse_args() 21 | 22 | if len(args) != 2: 23 | parser.error('Must provide gtf file and cell type') 24 | else: 25 | gtf_file = args[0] 26 | cell_type = args[1] 27 | 28 | # get expression data 29 | cuff = cufflinks.fpkm_tracking() 30 | 31 | # find cell type experiment index 32 | cell_indexes = [i for i in range(len(cuff.experiments)) if cuff.experiments[i]==cell_type] 33 | if len(cell_indexes) == 0: 34 | parser.error('Cell type %s does not match any quantified experiments' % cell_type) 35 | else: 36 | cell_i = cell_indexes[0] 37 | 38 | # parser gtf file 39 | for line in open(gtf_file): 40 | a = line.split('\t') 41 | gene_id = gff.gtf_kv(a[8])['gene_id'] 42 | expr_vec = cuff.gene_expr(gene_id) 43 | if expr_vec[cell_i] > options.expr_t: 44 | print line, 45 | 46 | 47 | ################################################################################ 48 | # __main__ 49 | ################################################################################ 50 | if __name__ == '__main__': 51 | main() 52 | -------------------------------------------------------------------------------- /fpkm_tracking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import pandas as pd 4 | 5 | ################################################################################ 6 | # fpkm_tracking 7 | # 8 | # Print a table of FPKM abundance estimates with one gene/sample per row. 9 | # 10 | # Using Pandas for this is stupid because we have to read the whole thing 11 | # into memory, and when you iterate over rows, it has to create a Series object. 12 | ################################################################################ 13 | 14 | 15 | ################################################################################ 16 | # main 17 | ################################################################################ 18 | def main(): 19 | usage = 'usage: %prog [options] ' 20 | parser = OptionParser(usage) 21 | parser.add_option('-g', dest='gene_id', help='This gene only') 22 | (options,args) = parser.parse_args() 23 | 24 | if len(args) != 1: 25 | parser.error('Must provide .fpkm_tracking file') 26 | else: 27 | fpkm_tracking_file = args[0] 28 | 29 | cuff = pd.read_csv(fpkm_tracking_file, sep='\t') 30 | 31 | fpkm_indexes = [i for i in range(cuff.shape[1]) if cuff.columns[i][-5:] == '_FPKM'] 32 | 33 | for gene_i, gene_series in cuff.iterrows(): 34 | gene_id = gene_series['gene_id'] 35 | if options.gene_id == None or gene_id == options.gene_id: 36 | for i in fpkm_indexes: 37 | sample = cuff.columns[i][:-5] 38 | fpkm = str(gene_series[i]) 39 | status = gene_series[i+3] 40 | 41 | if status == 'OK': 42 | cols = [gene_series['tracking_id'], sample, fpkm] 43 | print '\t'.join(cols) 44 | 45 | 46 | ################################################################################ 47 | # __main__ 48 | ################################################################################ 49 | if __name__ == '__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /size.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from sys import getsizeof, stderr 3 | from itertools import chain 4 | from collections import deque 5 | try: 6 | from reprlib import repr 7 | except ImportError: 8 | pass 9 | 10 | def total_size(o, handlers={}, verbose=False): 11 | """ Returns the approximate memory footprint an object and all of its contents. 12 | 13 | Automatically finds the contents of the following builtin containers and 14 | their subclasses: tuple, list, deque, dict, set and frozenset. 15 | To search other containers, add handlers to iterate over their contents: 16 | 17 | handlers = {SomeContainerClass: iter, 18 | OtherContainerClass: OtherContainerClass.get_elements} 19 | 20 | """ 21 | dict_handler = lambda d: chain.from_iterable(d.items()) 22 | all_handlers = {tuple: iter, 23 | list: iter, 24 | deque: iter, 25 | dict: dict_handler, 26 | set: iter, 27 | frozenset: iter, 28 | } 29 | all_handlers.update(handlers) # user handlers take precedence 30 | seen = set() # track which object id's have already been seen 31 | default_size = getsizeof(0) # estimate sizeof object without __sizeof__ 32 | 33 | def sizeof(o): 34 | if id(o) in seen: # do not double count the same object 35 | return 0 36 | seen.add(id(o)) 37 | s = getsizeof(o, default_size) 38 | 39 | if verbose: 40 | print(s, type(o), repr(o), file=stderr) 41 | 42 | for typ, handler in all_handlers.items(): 43 | if isinstance(o, typ): 44 | s += sum(map(sizeof, handler(o))) 45 | break 46 | return s 47 | 48 | return sizeof(o) 49 | 50 | 51 | ##### Example call ##### 52 | 53 | if __name__ == '__main__': 54 | d = dict(a=1, b=2, c=3, d=[4,5,6,7], e='a string of chars') 55 | print(total_size(d, verbose=True)) 56 | -------------------------------------------------------------------------------- /w5_bw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | import h5py 5 | import numpy as np 6 | import pyBigWig 7 | 8 | ''' 9 | w5_bw.py 10 | 11 | Convert a coverage wiggle HDF5 to BigWig. 12 | ''' 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | parser.add_option('-c', dest='chr', 21 | default=None, help='Comma-separated chromosomes') 22 | parser.add_option('-v', dest='verbose', 23 | default=False, action='store_true') 24 | (options,args) = parser.parse_args() 25 | 26 | if len(args) != 2: 27 | parser.error('Must provide input HDF5 and output BigWig.') 28 | else: 29 | hdf5_file = args[0] 30 | bw_file = args[1] 31 | 32 | # open files 33 | h5_in = h5py.File(hdf5_file, 'r') 34 | bw_out = pyBigWig.open(bw_file, 'w') 35 | 36 | # construct header 37 | if options.chr is not None: 38 | chroms = options.chr.split(',') 39 | else: 40 | chroms = sorted(h5_in.keys()) 41 | 42 | header = [] 43 | for chrom in chroms: 44 | # chromosome and length 45 | header.append((chrom,len(h5_in[chrom]))) 46 | 47 | # write header 48 | bw_out.addHeader(header) 49 | 50 | for chrom, length in header: 51 | if options.verbose: 52 | print(chrom) 53 | 54 | # read values 55 | x = np.array(h5_in[chrom]) 56 | 57 | # write gzipped into HDF5 58 | bw_out.addEntries(chrom, 0, values=x, span=1, step=1) 59 | 60 | # close files 61 | h5_in.close() 62 | bw_out.close() 63 | 64 | 65 | ################################################################################ 66 | # __main__ 67 | ################################################################################ 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /gtf2bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import gff 4 | 5 | ################################################################################ 6 | # gtf2bed.py 7 | # 8 | # Convert a gtf file to a bed file. 9 | ################################################################################ 10 | 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | parser.add_option('-c', dest='cds', action='store_true', default=False, help='Use CDS, not exons [Default: %default]') 19 | (options,args) = parser.parse_args() 20 | 21 | if len(args) != 1: 22 | parser.error('Must provide gtf file') 23 | else: 24 | gtf_file = args[0] 25 | 26 | genes = gff.read_genes(gtf_file) 27 | 28 | for transcript_id in genes: 29 | g = genes[transcript_id] 30 | 31 | if options.cds: 32 | block_sizes = ','.join([str(ex.end-ex.start+1) for ex in g.cds]) 33 | block_starts = ','.join([str(ex.start-g.cds[0].start) for ex in g.cds]) 34 | 35 | cols = [g.chrom, str(g.cds[0].start-1), str(g.cds[-1].end), transcript_id, '0', g.strand, '0', '0', '255,0,0', str(len(g.cds)), block_sizes, block_starts] 36 | 37 | else: 38 | block_sizes = ','.join([str(ex.end-ex.start+1) for ex in g.exons]) 39 | block_starts = ','.join([str(ex.start-g.exons[0].start) for ex in g.exons]) 40 | 41 | cols = [g.chrom, str(g.exons[0].start-1), str(g.exons[-1].end), transcript_id, '0', g.strand, '0', '0', '255,0,0', str(len(g.exons)), block_sizes, block_starts] 42 | 43 | print '\t'.join(cols) 44 | 45 | 46 | ################################################################################ 47 | # __main__ 48 | ################################################################################ 49 | if __name__ == '__main__': 50 | main() 51 | -------------------------------------------------------------------------------- /reservoir_sample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | 4 | from optparse import OptionParser 5 | import gzip 6 | import random 7 | import sys 8 | 9 | ################################################################################ 10 | # reservoir_sample.py 11 | # 12 | # Randomly choose a subset of lines in a file using single pass 13 | # reservoir sampling. 14 | ################################################################################ 15 | 16 | 17 | ################################################################################ 18 | # main 19 | ################################################################################ 20 | def main(): 21 | usage = 'usage: %prog [options] ' 22 | parser = OptionParser(usage) 23 | parser.add_option('-d', dest='header', 24 | default=False, action='store_true') 25 | parser.add_option('-z', dest='gzip', 26 | default=False, action='store_true') 27 | (options,args) = parser.parse_args() 28 | 29 | if len(args) != 2: 30 | parser.error('Must provide file and sample number') 31 | else: 32 | sample_num = int(args[0]) 33 | input_file = args[1] 34 | 35 | reservoir = ['']*sample_num 36 | 37 | if input_file in ['-','stdin']: 38 | input_in = sys.stdin 39 | else: 40 | if options.gzip: 41 | input_in = gzip.open(input_file, 'rt') 42 | else: 43 | input_in = open(input_file) 44 | 45 | if options.header: 46 | print(input_in.readline(), end='') 47 | 48 | # fill 49 | i = 0 50 | while i < sample_num: 51 | reservoir[i] = input_in.readline() 52 | i += 1 53 | 54 | # sample 55 | for line in input_in: 56 | j = random.randint(0, i+1) 57 | if j < sample_num: 58 | reservoir[j] = line 59 | i += 1 60 | 61 | # print 62 | print(''.join(reservoir), end='') 63 | 64 | ################################################################################ 65 | # __main__ 66 | ################################################################################ 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /bl2gff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import glob, sys, pdb 4 | 5 | ################################################################################ 6 | # bl2gff.py 7 | # 8 | # Convert alignments from my Blast output to features in a .gff file. 9 | ################################################################################ 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] ' 16 | parser = OptionParser(usage) 17 | parser.add_option('-f', dest='feature_name', default='domain', help='Feature name [Default: %default]') 18 | parser.add_option('-p', dest='pct_t', type='float', default=0, help='Percentage of the 2nd sequence that must be covered by the alignment [Default: %default]') 19 | parser.add_option('-i', dest='idy_t', type='float', default=0, help='% identity that must be exceeded by the alignment [Default: %default]') 20 | (options,args) = parser.parse_args() 21 | 22 | if len(args) != 1: 23 | parser.error('Must provide blast output file') 24 | exit(1) 25 | else: 26 | blast_file = args[0] 27 | 28 | for line in open(blast_file): 29 | a = line.split() 30 | 31 | header1 = a[-2] 32 | header2 = a[-1] 33 | 34 | start1 = int(a[0]) 35 | end1 = int(a[1]) 36 | 37 | alen2 = int(a[7]) 38 | len2 = int(a[10]) 39 | idy = float(a[12]) 40 | if int(a[3]) < int(a[4]): 41 | strand = '+' 42 | else: 43 | strand = '-' 44 | 45 | if idy > options.idy_t and alen2 > len2*options.pct_t: 46 | gff_a = [header1, 'blast', options.feature_name, str(start1), str(end1), '.', strand, '.', header2] 47 | print '\t'.join(gff_a) 48 | 49 | 50 | ################################################################################ 51 | # __main__ 52 | ################################################################################ 53 | if __name__ == '__main__': 54 | main() 55 | #pdb.runcall(main) 56 | -------------------------------------------------------------------------------- /r/plot_gff_cov_meta.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(plyr) 3 | 4 | ca = commandArgs(trailing=T) 5 | df.file = ca[1] 6 | out.pre = ca[2] 7 | smooth.span = as.numeric(ca[3]) 8 | label.primary = ca[4] 9 | label.control = ca[5] 10 | 11 | df = read.table(df.file, header=T, quote="\"") 12 | 13 | # unnormalized 14 | if (ncol(df) == 2) { 15 | gp = ggplot(df, aes(x=Index, y=Coverage)) 16 | } else { 17 | gp = ggplot(df, aes(x=Index, y=Coverage, color=Type)) + 18 | scale_x_continuous("% in transcript") + 19 | scale_color_manual("", values=c("#F46D43", "#66BD63"), breaks=c("Primary","Control"), labels=c(label.primary, label.control)) 20 | 21 | #scale_color_brewer(palette="Set1") 22 | } 23 | 24 | gp + 25 | geom_point() + 26 | stat_smooth(method="loess", span=smooth.span) + 27 | theme_bw() + 28 | theme(text=element_text(size=25)) + 29 | theme(legend.justification=c(1,0), legend.position=c(1,0)) 30 | 31 | ggsave(paste(out.pre,"_raw.pdf",sep="")) 32 | 33 | 34 | # normalized 35 | if (ncol(df) > 2) { 36 | # the values are so low, I want to boost them up 37 | fudge=10 38 | 39 | control.sum = sum(df[df$Type=="Control",]$Coverage) 40 | primary.sum = sum(df[df$Type!="Control",]$Coverage) 41 | 42 | df$Coverage.Norm = df$Coverage 43 | for (i in 1:nrow(df)) { 44 | if (df[i,"Type"] == "Control") { 45 | df[i,"Coverage.Norm"] = fudge * df[i,"Coverage"] / control.sum 46 | } else { 47 | df[i,"Coverage.Norm"] = fudge * df[i,"Coverage"] / primary.sum 48 | } 49 | } 50 | 51 | ggplot(df, aes(x=Index, y=Coverage.Norm, color=Type)) + 52 | scale_x_continuous("% in transcript") + 53 | scale_color_manual("", values=c("#F46D43", "#66BD63"), breaks=c("Primary","Control"), labels=c(label.primary, label.control)) + 54 | geom_point() + 55 | stat_smooth(method="loess", span=smooth.span) + 56 | scale_y_continuous("Normalized coverage") + 57 | theme_bw() + 58 | theme(text=element_text(size=25)) + 59 | theme(legend.justification=c(1,0), legend.position=c(1,0)) 60 | 61 | # scale_color_brewer(palette="Set1") 62 | 63 | ggsave(paste(out.pre,"_norm.pdf",sep="")) 64 | } 65 | -------------------------------------------------------------------------------- /bam_len_hist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | from rpy2.robjects.packages import importr 4 | import rpy2.robjects as ro 5 | import rpy2.robjects.lib.ggplot2 as ggplot2 6 | import pdb 7 | import pysam 8 | 9 | grdevices = importr('grDevices') 10 | 11 | ################################################################################ 12 | # bam_len_hist.py 13 | # 14 | # Plot a histogram of the length of alignments in a BAM file. 15 | ################################################################################ 16 | 17 | 18 | ################################################################################ 19 | # main 20 | ################################################################################ 21 | def main(): 22 | usage = 'usage: %prog [options] arg' 23 | parser = OptionParser(usage) 24 | #parser.add_option() 25 | (options,args) = parser.parse_args() 26 | 27 | if len(args) != 1: 28 | parser.error('Must provide BAM file') 29 | else: 30 | bam_file = args[0] 31 | 32 | align_lengths = {} 33 | for aligned_read in pysam.Samfile(bam_file, 'rb'): 34 | align_lengths[aligned_read.qlen] = align_lengths.get(aligned_read.qlen,0) + 1 35 | 36 | min_len = min(align_lengths.keys()) 37 | max_len = max(align_lengths.keys()) 38 | 39 | # construct data frame 40 | len_r = ro.IntVector(range(min_len,max_len+1)) 41 | counts_r = ro.IntVector([align_lengths.get(l,0) for l in range(min_len,max_len+1)]) 42 | 43 | df = ro.DataFrame({'length':len_r, 'counts':counts_r}) 44 | 45 | # construct full plot 46 | gp = ggplot2.ggplot(df) + \ 47 | ggplot2.aes_string(x='length', y='counts') + \ 48 | ggplot2.geom_bar(stat='identity') + \ 49 | ggplot2.scale_x_continuous('Alignment length') + \ 50 | ggplot2.scale_y_continuous('') 51 | 52 | # plot to file 53 | grdevices.pdf(file='align_lengths.pdf') 54 | gp.plot() 55 | grdevices.dev_off() 56 | 57 | 58 | 59 | ################################################################################ 60 | # __main__ 61 | ################################################################################ 62 | if __name__ == '__main__': 63 | main() 64 | #pdb.runcall(main) 65 | -------------------------------------------------------------------------------- /isoforms_fpkm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | ################################################################################ 5 | # isoforms_fpkm.py 6 | # 7 | # Print the FPKM values for all isoforms of the given gene. 8 | ################################################################################ 9 | 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] ' 16 | parser = OptionParser(usage) 17 | #parser.add_option() 18 | (options,args) = parser.parse_args() 19 | 20 | if len(args) != 2: 21 | parser.error('Must provide a gene_id and isoforms.fpkm_tracking file') 22 | else: 23 | gene_id = args[0] 24 | iso_ft = args[1] 25 | 26 | # get headers 27 | fpkm_in = open(iso_ft) 28 | headers = fpkm_in.readline().split() 29 | 30 | # determine sample table length 31 | sample_len = 0 32 | for i in range(len(headers)): 33 | if headers[i][-5:] == '_FPKM': 34 | sample = headers[i][:-5] 35 | if len(sample) > sample_len: 36 | sample_len = len(sample) 37 | 38 | for line in fpkm_in: 39 | a = line.split('\t') 40 | a[-1] = a[-1].rstrip() 41 | 42 | tracking_id = a[0] 43 | line_gene_id = a[3] 44 | 45 | if line_gene_id == gene_id: 46 | i = 9 47 | while i < len(a): 48 | sample = headers[i][:-5] 49 | 50 | if a[i+3] in ['FAIL','HIDATA']: 51 | cols = (tracking_id, sample_len, sample, a[i+3]) 52 | print '%-18s %*s %11s' % cols 53 | else: 54 | fpkm = float(a[i]) 55 | cols = (tracking_id, sample_len, sample, fpkm) 56 | print '%-18s %*s %11.3f' % cols 57 | 58 | i += 4 59 | 60 | fpkm_in.close() 61 | 62 | 63 | ################################################################################ 64 | # __main__ 65 | ################################################################################ 66 | if __name__ == '__main__': 67 | main() 68 | -------------------------------------------------------------------------------- /vcf_tss.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os 4 | import pdb 5 | 6 | import pybedtools 7 | 8 | ''' 9 | vcf_tss.py 10 | 11 | Add TSS distance INFO column to a VCF file. 12 | ''' 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | parser.add_option('-g', dest='tss_gff_file', default='%s/genes/gencode28/gencode.v28.basic.annotation.tss.gff' % os.environ['HG38']) 21 | # parser.add_option('-g', dest='tss_gff_file', default='%s/genes/gencode28/gencode_basic_tss.gff' % os.environ['HG19']) 22 | (options,args) = parser.parse_args() 23 | 24 | if len(args) != 2: 25 | parser.error('Must provide input and output VCF files') 26 | else: 27 | in_vcf_file = args[0] 28 | out_vcf_file = args[1] 29 | 30 | # open files 31 | in_vcf_open = open(in_vcf_file) 32 | out_vcf_open = open(out_vcf_file, 'w') 33 | 34 | # print header 35 | line = in_vcf_open.readline() 36 | while line.startswith('#'): 37 | if line.startswith('#CHROM'): 38 | # add new INFO description first 39 | print('##FORMAT=', file=out_vcf_open) 40 | print(line, end='', file=out_vcf_open) 41 | line = in_vcf_open.readline() 42 | in_vcf_open.close() 43 | 44 | # intersect 45 | in_vcf_bedtool = pybedtools.BedTool(in_vcf_file) 46 | tss_bedtool = pybedtools.BedTool(options.tss_gff_file) 47 | 48 | for closest_a in in_vcf_bedtool.closest(tss_bedtool, d=True, t='first'): 49 | a = closest_a[:8] 50 | if a[-1] == '.': 51 | a[-1] = 'TS=%s' % closest_a[-1] 52 | else: 53 | a[-1] += ';TS=%s' % closest_a[-1] 54 | print('\t'.join(a), file=out_vcf_open) 55 | 56 | # close 57 | out_vcf_open.close() 58 | 59 | 60 | ################################################################################ 61 | # __main__ 62 | ################################################################################ 63 | if __name__ == '__main__': 64 | main() 65 | -------------------------------------------------------------------------------- /peaks_venn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os, subprocess 4 | import math, os, stats, subprocess 5 | 6 | import matplotlib 7 | matplotlib.use('Agg') 8 | import matplotlib.pyplot as plt 9 | from matplotlib_venn import venn2 10 | 11 | ################################################################################ 12 | # peaks_venn.py 13 | # 14 | # Make a venn diagram comparing two sets of peak calls. 15 | ################################################################################ 16 | 17 | 18 | ################################################################################ 19 | # main 20 | ################################################################################ 21 | def main(): 22 | usage = 'usage: %prog [options] ' 23 | parser = OptionParser(usage) 24 | parser.add_option('--l1', dest='label1', default='peaks1', help='Label for peak set 1') 25 | parser.add_option('--l2', dest='label2', default='peaks2', help='Label for peak set 2') 26 | (options,args) = parser.parse_args() 27 | 28 | if len(args) != 3: 29 | parser.error('Must provide two peaks BED files and output PDF') 30 | else: 31 | peaks1_bed = args[0] 32 | peaks2_bed = args[1] 33 | out_pdf = args[2] 34 | 35 | # count individual 36 | peaks1_count = count_peaks(peaks1_bed) 37 | peaks2_count = count_peaks(peaks2_bed) 38 | 39 | # count overlap 40 | copeaks_count = 0 41 | p = subprocess.Popen('intersectBed -u -a %s -b %s' % (peaks1_bed, peaks2_bed), stdout=subprocess.PIPE, shell=True) 42 | for line in p.stdout: 43 | copeaks_count += 1 44 | p.communicate() 45 | 46 | plt.figure() 47 | venn_diag = venn2(subsets=(peaks1_count-copeaks_count, peaks2_count-copeaks_count, copeaks_count), set_labels=[options.label1, options.label2], set_colors=['#e41a1c', '#A1A838']) 48 | plt.savefig(out_pdf) 49 | plt.close() 50 | 51 | 52 | def count_peaks(bed_file): 53 | peak_counts = 0 54 | for line in open(bed_file): 55 | peak_counts += 1 56 | return peak_counts 57 | 58 | ################################################################################ 59 | # __main__ 60 | ################################################################################ 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /multiz_lncrna.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | from pygr import worldbase 4 | import gff 5 | 6 | ################################################################################ 7 | # multiz_lncrna.py 8 | # 9 | # Return hg19 46-way multiz alignments of a specified lncRNA gene. By default, 10 | # just do the exons, but as an option do the entire span. 11 | ################################################################################ 12 | 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | parser.add_option('-l', dest='lncrna_gtf', default='/Users/dk/research/common/data/lncrna/lnc_catalog.gtf', help='lncRNA gtf file [Default: %default]') 21 | parser.add_option('-s', dest='span', action='store_true', default=False, help='Map the gene\'s entire span, i.e. introns too [Default: %default]') 22 | (options,args) = parser.parse_args() 23 | 24 | if len(args) != 1: 25 | parser.error('Must provide gene id') 26 | else: 27 | gene_id = args[0] 28 | 29 | # get human genome 30 | hg19 = worldbase.Bio.Seq.Genome.HUMAN.hg19() 31 | 32 | # get gene exon intervals 33 | gene_ivals = [] 34 | for line in open(options.lncrna_gtf): 35 | a = line.split('\t') 36 | if gff.gtf_kv(a[8])['gene_id'] == gene_id: 37 | chrom = a[0] 38 | start = int(a[3]) 39 | end = int(a[4]) 40 | # ignoring orientation at the moment 41 | 42 | gene_ivals.append(hg19[chrom][start:end]) 43 | 44 | # get hg19 msa 45 | msa = worldbase.Bio.MSA.UCSC.hg19_multiz46way() 46 | 47 | # map returned sequences back to genome name 48 | idDict = ~(msa.seqDict) 49 | 50 | # print alignments 51 | for gi in gene_ivals: 52 | for src, dest, edg in msa[gi].edges(): 53 | print repr(gi), repr(src), repr(dest), idDict[dest], edg.length() 54 | 55 | 56 | ################################################################################ 57 | # __main__ 58 | ################################################################################ 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /fpkm_hist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import math, os, pdb, random 4 | import cufflinks, gff 5 | 6 | from rpy2.robjects.packages import importr 7 | import rpy2.robjects as ro 8 | import rpy2.robjects.lib.ggplot2 as ggplot2 9 | grdevices = importr('grDevices') 10 | 11 | ################################################################################ 12 | # fpkm_hist.py 13 | # 14 | # Plot a histogram of the max log2 FPKM values for the genes in a gtf file. 15 | ################################################################################ 16 | 17 | ################################################################################ 18 | # main 19 | ################################################################################ 20 | def main(): 21 | usage = 'usage: %prog [options] ' 22 | parser = OptionParser(usage) 23 | #parser.add_option('-m', dest='fpkm_min', type='float', default=0.25, help='Minimum FPKM [Default: %default]') 24 | (options,args) = parser.parse_args() 25 | 26 | if len(args) != 2: 27 | parser.error(usage) 28 | else: 29 | gtf_file = args[0] 30 | fpkm_tracking_file = args[1] 31 | 32 | # get genes 33 | genes = set() 34 | for line in open(gtf_file): 35 | a = line.split('\t') 36 | genes.add(gff.gtf_kv(a[8])['gene_id']) 37 | 38 | # get expression 39 | cuff = cufflinks.fpkm_tracking(fpkm_tracking_file) 40 | log_fpkms = [] 41 | for gene_id in genes: 42 | max_fpkm = max(cuff.gene_expr(gene_id)) 43 | if max_fpkm > 0: 44 | log_fpkms.append(math.log(max_fpkm,2)) 45 | 46 | # construct R data objects 47 | fpkms_r = ro.FloatVector(log_fpkms) 48 | df = ro.DataFrame({'fpkm':fpkms_r}) 49 | 50 | # construct plot 51 | gp = ggplot2.ggplot(df) + \ 52 | ggplot2.aes_string(x='fpkm') + \ 53 | ggplot2.geom_histogram(binwidth=0.2) 54 | 55 | # save to file 56 | gtf_pre = os.path.splitext(gtf_file)[0] 57 | grdevices.pdf(file='%s_fpkmhist.pdf' % gtf_pre) 58 | gp.plot() 59 | grdevices.dev_off() 60 | 61 | 62 | ################################################################################ 63 | # __main__ 64 | ################################################################################ 65 | if __name__ == '__main__': 66 | main() 67 | #pdb.runcall(main) 68 | -------------------------------------------------------------------------------- /bg_w5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import gzip 4 | 5 | import h5py 6 | import numpy as np 7 | 8 | ''' 9 | bg_w5.py 10 | 11 | Convert a BedGraph w/o overlapping entries to wig5. 12 | ''' 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | parser.add_option('-l', dest='norm_len', 21 | default=False, action='store_true', 22 | help='Normalize values by site length [Default: %default]') 23 | (options,args) = parser.parse_args() 24 | 25 | if len(args) != 3: 26 | parser.error('Must provide input BigWig, genome file, output HDF5.') 27 | else: 28 | bg_file = args[0] 29 | genome_file = args[1] 30 | hdf5_file = args[2] 31 | 32 | # initialize chromosome arrays 33 | chrm_values = {} 34 | for line in open(genome_file): 35 | a = line.split() 36 | chrm = a[0] 37 | chrm_len = int(a[1]) 38 | chrm_values[chrm] = np.zeros(chrm_len, dtype='float16') 39 | 40 | # write bedgraph entries 41 | if bg_file[-3:] == '.gz': 42 | bg_open = gzip.open(bg_file, 'rt') 43 | else: 44 | bg_open = open(bg_file) 45 | 46 | for line in bg_open: 47 | if not line.startswith('#'): 48 | a = line.split() 49 | if len(a) >= 4: 50 | chrm = a[0] 51 | start = int(a[1]) 52 | end = int(a[2]) 53 | v = float(a[3]) 54 | if options.norm_len: 55 | v /= (end-start) 56 | chrm_values[chrm][start:end] = v 57 | 58 | bg_open.close() 59 | 60 | # write gzipped into HDF5 61 | h5_out = h5py.File(hdf5_file, 'w') 62 | for chrm in chrm_values: 63 | h5_out.create_dataset(chrm, data=np.nan_to_num(chrm_values[chrm]), 64 | dtype='float16', compression='gzip', shuffle=True) 65 | h5_out.close() 66 | 67 | 68 | 69 | ################################################################################ 70 | # __main__ 71 | ################################################################################ 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /gtf_homologues.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os, subprocess, tempfile 4 | import gff 5 | 6 | ################################################################################ 7 | # gtf_homologues.py 8 | # 9 | # Make a table describing candidate homologue genes as determined by a 10 | # transmap from one genome to another. 11 | ################################################################################ 12 | 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | #parser.add_option() 21 | (options,args) = parser.parse_args() 22 | 23 | if len(args) != 4: 24 | parser.error('Must provide chain file and two GTF files') 25 | else: 26 | chain_file = args[0] 27 | net_file = args[1] 28 | gtf_from = args[2] 29 | gtf_to = args[3] 30 | 31 | # transmap to new genome 32 | from_map_gtf_fd, from_map_gtf_file = tempfile.mkstemp() 33 | subprocess.call('chain_map.py -k gene_id -n %s %s %s > %s' % (net_file,chain_file,gtf_from,from_map_gtf_file), shell=True) 34 | 35 | # intersect w/ gtf_to 36 | homologues = {} 37 | p = subprocess.Popen('intersectBed -wo -s -a %s -b %s' % (from_map_gtf_file,gtf_to), shell=True, stdout=subprocess.PIPE) 38 | for line in p.stdout: 39 | a = line.split('\t') 40 | 41 | kv_to = gff.gtf_kv(a[17]) 42 | 43 | gid_from = a[8].split(';')[1].strip() 44 | gid_to = kv_to['gene_id'] 45 | 46 | homologues.setdefault(gid_from,set()).add(gid_to) 47 | p.communicate() 48 | 49 | # find all genes 50 | genes = set() 51 | for line in open(gtf_from): 52 | a = line.split('\t') 53 | genes.add(gff.gtf_kv(a[8])['gene_id']) 54 | 55 | # print table 56 | for g in genes: 57 | print '%s\t%s' % (g,' '.join(homologues.get(g,['-']))) 58 | 59 | os.close(from_map_gtf_fd) 60 | os.remove(from_map_gtf_file) 61 | 62 | 63 | ################################################################################ 64 | # __main__ 65 | ################################################################################ 66 | if __name__ == '__main__': 67 | main() 68 | -------------------------------------------------------------------------------- /gsea_rnk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import math, os 4 | 5 | ################################################################################ 6 | # gsea_rnk.py 7 | # 8 | # Output a set of .rnk files for GSEA from a cuffdiff .diff file. 9 | ################################################################################ 10 | 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | parser.add_option('-m', dest='min_fpkm', type='float') 19 | parser.add_option('-o', dest='out_dir', default='.') 20 | (options,args) = parser.parse_args() 21 | 22 | if len(args) != 1: 23 | parser.error('Must provide .diff') 24 | else: 25 | diff_file = args[0] 26 | 27 | if not os.path.isdir(options.out_dir): 28 | os.mkdir(options.out_dir) 29 | 30 | comparison_out = {} 31 | 32 | diff_in = open(diff_file) 33 | diff_in.readline() 34 | for line in diff_in: 35 | a = line.split('\t') 36 | 37 | gene_id = a[0] 38 | gene_name = a[2] 39 | sample1 = a[4].replace('-','_') # cmd line gsea cannot handle hyphens 40 | sample2 = a[5].replace('-','_') 41 | status = a[6] 42 | fpkm1 = float(a[7]) 43 | fpkm2 = float(a[8]) 44 | fold_change = float(a[9]) 45 | tstat = float(a[10]) 46 | qval = float(a[11]) 47 | sig = a[-1].rstrip() 48 | 49 | if status == 'OK' and not math.isnan(tstat): 50 | if options.min_fpkm == None or fpkm1 > options.min_fpkm or fpkm2 > options.min_fpkm: 51 | if not (sample1,sample2) in comparison_out: 52 | comparison_out[(sample1,sample2)] = open('%s/%s_%s.rnk' % (options.out_dir, sample1, sample2), 'w') 53 | 54 | print >> comparison_out[(sample1,sample2)], '%s\t%f' % (gene_name, fold_change) 55 | 56 | diff_in.close() 57 | 58 | for ckey in comparison_out: 59 | comparison_out[ckey].close() 60 | 61 | 62 | ################################################################################ 63 | # __main__ 64 | ################################################################################ 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /lnc_expression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import cufflinks, gff 4 | import os 5 | 6 | ################################################################################ 7 | # lnc_expession.py 8 | # 9 | # Print a summary of the lncrna gene's expression. 10 | ################################################################################ 11 | 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] ' 18 | parser = OptionParser(usage) 19 | parser.add_option('-c', dest='cuff_dir', default='%s/research/common/data/lncrna'%os.environ['HOME'], help='Cufflinks output directory with .fpkm_tracking files [Default: %default]') 20 | parser.add_option('-l', dest='lnc_gtf', default='%s/research/common/data/lncrna/lnc_catalog.gtf'%os.environ['HOME'], help='lncRNA catalog gtf file [Default: %default]') 21 | parser.add_option('-t', dest='transcript_expr', default=False, action='store_true', help='Return transcript expression rather than gene [Default: %default]') 22 | (options,args) = parser.parse_args() 23 | 24 | if options.transcript_expr: 25 | cuff = cufflinks.fpkm_tracking('%s/isoforms.fpkm_tracking' % options.cuff_dir) 26 | 27 | if args[0].find('XLOC') != -1: 28 | trans_ids = set() 29 | for line in open(options.lnc_gtf): 30 | a = line.split('\t') 31 | kv = gff.gtf_kv(a[8]) 32 | if kv['gene_id'] == args[0]: 33 | trans_ids.add(kv['transcript_id']) 34 | else: 35 | trans_ids = [args[0]] 36 | 37 | for trans_id in trans_ids: 38 | print '%s:' % trans_id 39 | cuff.gene_expr_print(trans_id) 40 | 41 | else: 42 | cuff = cufflinks.fpkm_tracking('%s/genes.fpkm_tracking' % options.cuff_dir) 43 | 44 | if args[0].find('XLOC') != -1: 45 | gene_id = args[0] 46 | else: 47 | t2g = gff.t2g(options.lnc_gtf) 48 | gene_id = t2g[args[0]] 49 | 50 | cuff.gene_expr_print(gene_id) 51 | 52 | 53 | ################################################################################ 54 | # __main__ 55 | ################################################################################ 56 | if __name__ == '__main__': 57 | main() 58 | -------------------------------------------------------------------------------- /nuc2gff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import glob, sys, pdb 4 | 5 | ################################################################################ 6 | # nuc2gff.py 7 | # 8 | # Convert alignments from a nucmer coords file to features in a .gff file. 9 | ################################################################################ 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] ' 16 | parser = OptionParser(usage) 17 | parser.add_option('-f', dest='feature_name', default='domain', help='Feature name [Default: %default]') 18 | parser.add_option('-p', dest='pct_t', type='float', default=0.9, help='Percentage of the 2nd sequence that must be covered by the alignment [Default: %default]') 19 | parser.add_option('-i', dest='idy_t', type='float', default=0.8, help='% identity that must be exceeded by the alignment [Default: %default]') 20 | (options,args) = parser.parse_args() 21 | 22 | if len(args) != 1: 23 | parser.error('Must provide nucmer output coords file') 24 | else: 25 | coords_file = args[0] 26 | 27 | # get header 28 | cf = open(coords_file) 29 | for i in range(5): 30 | cf.readline() 31 | 32 | line = cf.readline() 33 | while line: 34 | a = line.split() 35 | 36 | header1 = a[-2] 37 | header2 = a[-1] 38 | 39 | start1 = int(a[0]) 40 | end1 = int(a[1]) 41 | 42 | idy = float(a[9])/100.0 43 | len2 = int(a[12]) 44 | if int(a[3]) < int(a[4]): 45 | strand = '+' 46 | start2 = int(a[3]) 47 | end2 = int(a[4]) 48 | else: 49 | strand = '-' 50 | start2 = int(a[4]) 51 | end2 = int(a[3]) 52 | 53 | if idy > options.idy_t and end2-start2+1 > len2*options.pct_t: 54 | gff_a = [header1, 'nucmer', options.feature_name, str(start1), str(end1), '.', strand, '.', header2] 55 | print '\t'.join(gff_a) 56 | 57 | line = cf.readline() 58 | 59 | 60 | ################################################################################ 61 | # __main__ 62 | ################################################################################ 63 | if __name__ == '__main__': 64 | main() 65 | #pdb.runcall(main) 66 | -------------------------------------------------------------------------------- /plot_fragment_lengths.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import pysam 4 | 5 | import matplotlib 6 | matplotlib.use('Agg') 7 | import matplotlib.pyplot as plt 8 | 9 | import seaborn as sns 10 | sns.set_style('ticks') 11 | 12 | ################################################################################ 13 | # plot_fragment_lengths.py 14 | # 15 | # Plot the distribution of fragment lengths 16 | ################################################################################ 17 | 18 | 19 | ################################################################################ 20 | # main 21 | ################################################################################ 22 | def main(): 23 | usage = 'usage: %prog [options] ' 24 | parser = OptionParser(usage) 25 | parser.add_option('-m', dest='max_length', type='int') 26 | (options,args) = parser.parse_args() 27 | 28 | if len(args) != 2: 29 | parser.error('Must provide BAM file and output PDF') 30 | else: 31 | bam_file = args[0] 32 | out_pdf = args[1] 33 | 34 | tlens = {} 35 | for alignment in pysam.Samfile(bam_file): 36 | tl = abs(alignment.template_length) 37 | tlens[tl] = tlens.get(tl,0) + 1 38 | 39 | # not sure what 0 means 40 | tlens[0] = 0 41 | 42 | if options.max_length is None: 43 | num_fragments = sum([tlens.get(i,0) for i in range(10000)]) 44 | max_length_fragments = 0.99*num_fragments 45 | 46 | length = 1 47 | length_fragments = tlens.get(length,0) 48 | while length_fragments < max_length_fragments and length < 1000: 49 | print length, length_fragments, max_length_fragments 50 | length += 1 51 | length_fragments += tlens.get(length,0) 52 | 53 | options.max_length = length 54 | 55 | #for tl in range(max(tlens.keys())+1): 56 | # print '%4d %d' % (tl,tlens.get(tl,0)) 57 | 58 | length_counts = [tlens.get(length,0) for length in range(options.max_length)] 59 | 60 | plt.figure() 61 | plt.plot(length_counts) 62 | plt.xlabel('Fragment length') 63 | plt.xlim(0,options.max_length+1) 64 | sns.despine() 65 | plt.savefig(out_pdf) 66 | plt.close() 67 | 68 | 69 | ################################################################################ 70 | # __main__ 71 | ################################################################################ 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /bam_bedg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import pysam 4 | 5 | ################################################################################ 6 | # bam_bedg 7 | # 8 | # Map a BAM file of aligned reads from a ChIP-seq or ATAC-seq to a BEDGRAPH 9 | # file, counting only the events relevant to that experiment. 10 | ################################################################################ 11 | 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] ' 18 | parser = OptionParser(usage) 19 | parser.add_option('-l', dest='frag_len', default=200, type='int', action='store_true') 20 | (options,args) = parser.parse_args() 21 | 22 | if len(args) != 2: 23 | parser.error('Must provide input BAM and output BEDGRAPH files') 24 | else: 25 | bam_file = args[0] 26 | bedg_file = args[1] 27 | 28 | chrom_events = {} 29 | 30 | bam_in = pysam.Samfile(bam_file, 'rb') 31 | for align in bam_in: 32 | # get chrom 33 | chrom = bam_in.references[align.tid] 34 | 35 | # weight multi-mappers 36 | multi_weight = weight_multi(align) 37 | 38 | # determine fragment length 39 | if align.is_proper_pair: 40 | frag_len = abs(align.tlen) 41 | else: 42 | frag_len = options.frag_len 43 | 44 | # map to event position 45 | event_pos = align.reference_start + frag_len/2 46 | 47 | # save 48 | if chrom not in chrom_events: 49 | chrom_events[chrom] = {} 50 | chrom_events[chrom][event_pos] = chrom_events[chrom].get(event_pos,0) + multi_weight 51 | bam_in.close() 52 | 53 | # output BEDGRAPH 54 | 55 | 56 | def weight_multi(align): 57 | ''' Weight the alignment by its multimap properties 58 | 59 | I'm making this a separate function, because I might 60 | want to use more sophisticated weights later. 61 | ''' 62 | try: 63 | nh_tag = align_read.opt('NH') 64 | except: 65 | nh_tag = 1 66 | 67 | multi_weight = 1.0 / nh_tag 68 | 69 | return multi_weight 70 | 71 | 72 | 73 | ################################################################################ 74 | # __main__ 75 | ################################################################################ 76 | if __name__ == '__main__': 77 | main() 78 | -------------------------------------------------------------------------------- /bgo_w5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import gzip 4 | 5 | import h5py 6 | import numpy as np 7 | 8 | ''' 9 | bg_w5.py 10 | 11 | Convert a BedGraph w/ overlapping entries to Wig5. 12 | ''' 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | # parser.add_option('-v', dest='verbose', 21 | # default=False, action='store_true') 22 | (options,args) = parser.parse_args() 23 | 24 | if len(args) != 3: 25 | parser.error('Must provide input BigWig, genome file, output HDF5.') 26 | else: 27 | bg_file = args[0] 28 | genome_file = args[1] 29 | hdf5_file = args[2] 30 | 31 | # initialize chromosome arrays 32 | chrm_values = {} 33 | chrm_counts = {} 34 | for line in open(genome_file): 35 | a = line.split() 36 | chrm = a[0] 37 | chrm_len = int(a[1]) 38 | chrm_values[chrm] = np.zeros(chrm_len, dtype='float16') 39 | chrm_counts[chrm] = np.zeros(chrm_len, dtype='uint8') 40 | 41 | # write bedgraph entries 42 | if bg_file[-3:] == '.gz': 43 | bg_open = gzip.open(bg_file, 'rt') 44 | else: 45 | bg_open = open(bg_file) 46 | 47 | for line in bg_open: 48 | if not line.startswith('#'): 49 | a = line.split() 50 | if len(a) >= 4: 51 | chrm = a[0] 52 | start = int(a[1]) 53 | end = int(a[2]) 54 | v = float(a[3]) 55 | chrm_values[chrm][start:end] += v 56 | chrm_counts[chrm][start:end] += 1 57 | 58 | bg_open.close() 59 | 60 | # take mean 61 | for chrm in chrm_values: 62 | chrm_values[chrm] = np.divide(chrm_values[chrm], chrm_counts[chrm]) 63 | chrm_values[chrm] = np.nan_to_num(chrm_values[chrm]) 64 | 65 | # write gzipped into HDF5 66 | h5_out = h5py.File(hdf5_file, 'w') 67 | for chrm in chrm_values: 68 | h5_out.create_dataset(chrm, data=chrm_values[chrm], dtype='float16', 69 | compression='gzip', shuffle=True) 70 | h5_out.close() 71 | 72 | 73 | 74 | ################################################################################ 75 | # __main__ 76 | ################################################################################ 77 | if __name__ == '__main__': 78 | main() 79 | -------------------------------------------------------------------------------- /make_ref_ml.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import subprocess 4 | 5 | ''' 6 | make_ref_ml.py 7 | 8 | Make machine learning friendly genome files, removing unplaced contigs, 9 | and chrY. 10 | ''' 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | #parser.add_option() 19 | (options,args) = parser.parse_args() 20 | 21 | if len(args) != 2: 22 | parser.error('Must provide FASTA and genome files') 23 | else: 24 | fasta_file = args[0] 25 | genome_file = args[1] 26 | 27 | fasta_ml_file = fasta_file.replace('.fa', '.ml.fa') 28 | fasta_ml_out = open(fasta_ml_file, 'w') 29 | 30 | for line in open(fasta_file): 31 | if line[0] == '>': 32 | keep_chr = True 33 | header = line[1:] 34 | keep_chr = filter_chr(header) 35 | if keep_chr: 36 | print(line, file=fasta_ml_out, end='') 37 | 38 | fasta_ml_out.close() 39 | 40 | subprocess.call('samtools faidx %s' % fasta_ml_file, shell=True) 41 | 42 | 43 | genome_ml_file = genome_file.replace('.genome', '.ml.genome') 44 | genome_ml_file = open(genome_ml_file, 'w') 45 | 46 | for line in open(genome_file): 47 | header = line.split()[0] 48 | keep_chr = filter_chr(header) 49 | if keep_chr: 50 | print(line, file=genome_ml_file, end='') 51 | 52 | genome_ml_file.close() 53 | 54 | 55 | def filter_chr(header): 56 | keep_chr = True 57 | if header.find('chrUn') != -1: 58 | keep_chr = False 59 | elif header.find('random') != -1: 60 | keep_chr = False 61 | elif header.find('hap') != -1: 62 | keep_chr = False 63 | elif header.find('alt') != -1: 64 | keep_chr = False 65 | elif header.find('KI270') != -1: 66 | keep_chr = False 67 | elif header.find('GL000') != -1: 68 | keep_chr = False 69 | elif header.find('JH584') != -1: 70 | keep_chr = False 71 | elif header.find('GL456') != -1: 72 | keep_chr = False 73 | elif header.rstrip() in ['chrM','chrMT','chrY']: 74 | keep_chr = False 75 | return keep_chr 76 | 77 | ################################################################################ 78 | # __main__ 79 | ################################################################################ 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /vcf_splice.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os 4 | import pdb 5 | 6 | import pybedtools 7 | 8 | ''' 9 | vcf_splice.py 10 | 11 | Add splice site distance INFO column to a VCF file. 12 | ''' 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | parser.add_option('-g', dest='splice_gff_file', default='%s/genes/gencode28/gencode.v28.basic.annotation.splice.gff' % os.environ['HG38']) 21 | # parser.add_option('-g', dest='splice_gff_file', default='%s/genes/gencode28/gencode_basic_splice.gff' % os.environ['HG19']) 22 | parser.add_option('-t', dest='filter_t', 23 | default=None, type='int', 24 | help='Filter out variants less than the given distance threshold [Default: %default]') 25 | (options,args) = parser.parse_args() 26 | 27 | if len(args) != 2: 28 | parser.error('Must provide input and output VCF files') 29 | else: 30 | in_vcf_file = args[0] 31 | out_vcf_file = args[1] 32 | 33 | # open files 34 | in_vcf_open = open(in_vcf_file) 35 | out_vcf_open = open(out_vcf_file, 'w') 36 | 37 | # print header 38 | line = in_vcf_open.readline() 39 | while line.startswith('#'): 40 | if line.startswith('#CHROM'): 41 | # add new INFO description first 42 | print('##FORMAT=', file=out_vcf_open) 43 | print(line, end='', file=out_vcf_open) 44 | line = in_vcf_open.readline() 45 | in_vcf_open.close() 46 | 47 | # intersect 48 | in_vcf_bedtool = pybedtools.BedTool(in_vcf_file) 49 | splice_bedtool = pybedtools.BedTool(options.splice_gff_file) 50 | 51 | for closest_a in in_vcf_bedtool.closest(splice_bedtool, d=True, t='first'): 52 | a = closest_a[:8] 53 | splice_distance = int(closest_a[-1]) 54 | if a[-1] == '.': 55 | a[-1] = 'SS=%s' % str(splice_distance) 56 | else: 57 | 58 | a[-1] += ';SS=%s' % str(splice_distance) 59 | 60 | if options.filter_t is None or splice_distance >= options.filter_t: 61 | print('\t'.join(a), file=out_vcf_open) 62 | 63 | # close 64 | out_vcf_open.close() 65 | 66 | 67 | ################################################################################ 68 | # __main__ 69 | ################################################################################ 70 | if __name__ == '__main__': 71 | main() 72 | -------------------------------------------------------------------------------- /seq_logo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os, re, subprocess, tempfile 4 | 5 | ################################################################################ 6 | # name 7 | # 8 | # 9 | ################################################################################ 10 | 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] arg' 17 | parser = OptionParser(usage) 18 | #parser.add_option() 19 | (options,args) = parser.parse_args() 20 | 21 | seq = 'ACGTACGT' 22 | heights = [1, 1, 2, 2, 1, 1, 1, 1] 23 | out_eps = 'test_logo.eps' 24 | seq_logo(seq, heights, out_eps) 25 | 26 | 27 | def seq_logo(seq, heights, out_eps, weblogo_args=''): 28 | # print the sequence to a temp fasta file 29 | fasta_fd, fasta_file = tempfile.mkstemp() 30 | fasta_out = open(fasta_file, 'w') 31 | print >> fasta_out, '>seq\n%s' % seq 32 | fasta_out.close() 33 | 34 | # print figure to a temp eps file 35 | eps_fd, eps_file = tempfile.mkstemp() 36 | weblogo_cmd = 'weblogo --errorbars NO --show-xaxis NO --show-yaxis NO --fineprint "" -c classic -n %d %s < %s > %s' % (len(seq), weblogo_args, fasta_file, eps_file) 37 | subprocess.call(weblogo_cmd, shell=True) 38 | 39 | # copy eps file over and write in my own heights 40 | start_stack_re = re.compile('^\(\d*\) StartStack') 41 | out_eps_open = open(out_eps, 'w') 42 | weblogo_eps_in = open(eps_file) 43 | line = weblogo_eps_in.readline() 44 | si = 0 45 | while line: 46 | start_stack_match = start_stack_re.search(line) 47 | 48 | # nt column begins 49 | if start_stack_match: 50 | print >> out_eps_open, line, 51 | 52 | # loop over 4 nt's 53 | for i in range(4): 54 | line = weblogo_eps_in.readline() 55 | a = line.split() 56 | 57 | nt = a[2][1:-1] 58 | if nt != seq[si]: 59 | print >> out_eps_open, line, 60 | else: 61 | # change the nt of seq 62 | a[1] = '%.6f' % heights[si] 63 | print >> out_eps_open, ' %s' % ' '.join(a) 64 | 65 | # move to next nucleotide 66 | si += 1 67 | else: 68 | print >> out_eps_open, line, 69 | 70 | # advance to next line 71 | line = weblogo_eps_in.readline() 72 | 73 | # clean 74 | os.close(fasta_fd) 75 | os.remove(fasta_file) 76 | os.close(eps_fd) 77 | os.remove(eps_file) 78 | 79 | 80 | ################################################################################ 81 | # __main__ 82 | ################################################################################ 83 | if __name__ == '__main__': 84 | main() 85 | -------------------------------------------------------------------------------- /rmdup_iclip.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import gzip, pdb 4 | import pysam 5 | 6 | ################################################################################ 7 | # rmdup_iclip.py 8 | # 9 | # Remove duplicates in Tollervey and Zamack et al's CLIP-Seq data, where the 10 | # reads have barcodes at varying positions. 11 | ################################################################################ 12 | 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ...' 19 | parser = OptionParser(usage) 20 | #parser.add_option() 21 | (options,args) = parser.parse_args() 22 | 23 | if len(args) < 3: 24 | parser.error('Must provide barcode indexes, BAM file, and FASTQ files') 25 | else: 26 | barcode_indexes = [int(bi) for bi in args[0].split(',')] 27 | bam_file = args[1] 28 | fastq_files = args[2:] 29 | 30 | # map headers to barcodes 31 | header_barcodes = {} 32 | for fastq_file in fastq_files: 33 | if fastq_file[-2:] == 'gz': 34 | fastq_in = gzip.open(fastq_file) 35 | else: 36 | fastq_in = open(fastq_file) 37 | 38 | header = fastq_in.readline() 39 | while header: 40 | seq = fastq_in.readline() 41 | mid = fastq_in.readline() 42 | qual = fastq_in.readline() 43 | 44 | align_header = header[1:].split()[0] 45 | barcode = ''.join([seq[bi] for bi in barcode_indexes]) 46 | header_barcodes[align_header] = barcode 47 | 48 | header = fastq_in.readline() 49 | 50 | # open BAM 51 | bam_in = pysam.Samfile(bam_file, 'rb') 52 | bam_out = pysam.Samfile(bam_file[:-4] + '_rmdup.bam', 'wb', template=bam_in) 53 | 54 | alignment_hash = set() 55 | 56 | for aligned_read in bam_in: 57 | # hash by chrom, start, strand, barcode 58 | align_key = (aligned_read.tid, aligned_read.pos, aligned_read.is_reverse, header_barcodes[aligned_read.qname]) 59 | 60 | # if alignment not yet printed 61 | if not align_key in alignment_hash: 62 | bam_out.write(aligned_read) 63 | alignment_hash.add(align_key) 64 | 65 | bam_in.close() 66 | bam_out.close() 67 | 68 | ################################################################################ 69 | # __main__ 70 | ################################################################################ 71 | if __name__ == '__main__': 72 | main() 73 | #pdb.runcall(main) 74 | -------------------------------------------------------------------------------- /fpkm_fpkm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os, math, sys 4 | from scipy.stats import spearmanr 5 | import gff, ggplot, cufflinks 6 | 7 | ################################################################################ 8 | # fpkm_fpkm.py 9 | # 10 | # Compare two cufflinks runs. 11 | ################################################################################ 12 | 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | parser.add_option('-g', dest='gtf') 21 | parser.add_option('-o', dest='out_dir', default='.') 22 | parser.add_option('-p', dest='pseudocount', default=0.125, type='float') 23 | (options,args) = parser.parse_args() 24 | 25 | if len(args) != 2: 26 | parser.error('Must provide two diff files') 27 | else: 28 | fpkm1_file = args[0] 29 | fpkm2_file = args[1] 30 | 31 | cuff1 = cufflinks.fpkm_tracking(fpkm1_file) 32 | cuff2 = cufflinks.fpkm_tracking(fpkm2_file) 33 | 34 | gtf_genes = set() 35 | if options.gtf: 36 | gtf_genes = gff.gtf_gene_set(options.gtf) 37 | 38 | if not os.path.isdir(options.out_dir): 39 | os.mkdir(options.out_dir) 40 | 41 | for sample in cuff1.experiments: 42 | # scatter plot fpkm 43 | df = {'fpkm1':[], 'fpkm2':[]} 44 | for i in range(len(cuff1.genes)): 45 | if len(gtf_genes) == 0 or cuff1.genes[i] in gtf_genes: 46 | fpkm1 = cuff1.gene_expr_exp(i, sample) 47 | fpkm2 = cuff2.gene_expr_exp(i, sample) 48 | 49 | if not math.isnan(fpkm1) and not math.isnan(fpkm2): 50 | df['fpkm1'].append(math.log(options.pseudocount+fpkm1,2)) 51 | df['fpkm2'].append(math.log(options.pseudocount+fpkm2,2)) 52 | 53 | r_script = '%s/fpkm_fpkm_scatter.r' % os.environ['RDIR'] 54 | out_pdf = '%s/%s_scatter.pdf' % (options.out_dir, sample) 55 | ggplot.plot(r_script, df, [out_pdf]) 56 | 57 | # compute correlation 58 | cor, p = spearmanr(df['fpkm1'], df['fpkm2']) 59 | 60 | report_out = open('%s/%s_report.txt' % (options.out_dir,sample), 'w') 61 | print >> report_out, 'Spearman correlation: %f (%e)' % (cor, p) 62 | report_out.close() 63 | 64 | 65 | ################################################################################ 66 | # __main__ 67 | ################################################################################ 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /gsea.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import glob, os, subprocess, sys 4 | 5 | ################################################################################ 6 | # gsea.py 7 | # 8 | # Helper script to run GSEA from CuffDiff output. 9 | ################################################################################ 10 | 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | parser.add_option('-c', dest='scheme', default='weighted', help='weighted or classic [Default: %default]') 19 | parser.add_option('-o', dest='out_dir', default='.') 20 | parser.add_option('-s', dest='gene_set', default='go', help='Gene sets [Default: %default]') 21 | (options,args) = parser.parse_args() 22 | 23 | if len(args) != 1: 24 | parser.error('Must provide .diff') 25 | else: 26 | diff_file = args[0] 27 | 28 | if not os.path.isdir(options.out_dir): 29 | os.mkdir(options.out_dir) 30 | 31 | # choose chip 32 | gsea_jar = glob.glob('%s/gsea*.jar' % os.environ['GSEA'])[0] 33 | chip_file = '%s/GENE_SYMBOL.chip' % os.environ['GSEA'] 34 | 35 | # choose sets 36 | if options.gene_set.lower() in ['c5', 'go']: 37 | sets_file = '%s/sets/c5.all.v5.0.symbols.gmt' % os.environ['GSEA'] 38 | else: 39 | print >> sys.stderr, 'Unrecognized gene set: %s' % options.gene_set 40 | exit(1) 41 | 42 | # make rank files 43 | rank_cmd = 'gsea_rnk.py -o %s %s' % (options.out_dir, diff_file) 44 | subprocess.call(rank_cmd, shell=True) 45 | 46 | for rank_file in glob.glob('%s/*.rnk' % options.out_dir): 47 | rank_name = rank_file.split('/')[-1][:-4] 48 | 49 | # run GSEA 50 | gsea_cmd = 'java -cp %s -Xmx4000m xtools.gsea.GseaPreranked -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -rnk %s -scoring_scheme %s -rpt_label %s -chip %s -include_only_symbols true -make_sets true -plot_top_x 50 -rnd_seed timestamp -set_max 1000 -set_min 10 -zip_report false -out %s -gui false' % (gsea_jar, sets_file, rank_file, options.scheme, rank_name, chip_file, options.out_dir) 51 | subprocess.call(gsea_cmd, shell=True) 52 | 53 | # consider making a new excel file from the gsea_report_for_na_neg*.xls file 54 | # where I strip out the redundant col 1 and stupid col 2. 55 | 56 | 57 | ################################################################################ 58 | # __main__ 59 | ################################################################################ 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /gtf_multimaps.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import gff, util 4 | import os, subprocess, sys 5 | 6 | ################################################################################ 7 | # gtf_multimaps.py 8 | # 9 | # Print a summary table about multimapping reads for the transcripts in a gtf 10 | # file. 11 | ################################################################################ 12 | 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | parser.add_option('-i', dest='intersect_done', default=False, action='store_true', help='intersectBed is already done [Default: %default]') 21 | parser.add_option('-o', dest='output_prefix', help='Prefix for the intersectBed intermediate file [Default: %default]') 22 | (options,args) = parser.parse_args() 23 | 24 | if len(args) != 2: 25 | parser.error('Must provide gtf file and bam file') 26 | else: 27 | gtf_file = args[0] 28 | bam_file = args[1] 29 | 30 | if options.output_prefix: 31 | ib_file = '%s_reads_genes.gff' % options.output_prefix 32 | else: 33 | ib_file = 'reads_genes.gff' 34 | 35 | if not options.intersect_done: 36 | # overlap genes w/ aligned reads 37 | p = subprocess.Popen('intersectBed -s -wo -abam -bed -a %s -b %s > %s' % (bam_file,gtf_file,ib_file), shell=True) 38 | os.waitpid(p.pid,0) 39 | 40 | # count transcriptome alignments per read 41 | read_aligns = {} 42 | for line in open(ib_file): 43 | a = line.split('\t') 44 | chrom = a[0] 45 | start = int(a[1]) 46 | read_id = a[3] 47 | 48 | read_aligns.setdefault(read_id,set()).add((chrom,start)) 49 | 50 | # hash reads by gene 51 | gene_reads = {} 52 | for line in open(ib_file): 53 | a = line.split('\t') 54 | read_id = a[3] 55 | gene_id = gff.gtf_kv(a[14])['transcript_id'] 56 | gene_reads.setdefault(gene_id,[]).append(read_id) 57 | 58 | # print gene stats 59 | for gene_id in gene_reads: 60 | align_counts = [len(read_aligns[read_id]) for read_id in gene_reads[gene_id]] 61 | multi_count = float(len([ac for ac in align_counts if ac > 1])) 62 | cols = (gene_id, len(align_counts), util.mean(align_counts), multi_count/float(len(align_counts))) 63 | print '%-15s %7d %7.2f %7.2f' % cols 64 | 65 | 66 | ################################################################################ 67 | # __main__ 68 | ################################################################################ 69 | if __name__ == '__main__': 70 | main() 71 | -------------------------------------------------------------------------------- /meme2possum.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | import numpy as np 5 | 6 | ################################################################################ 7 | # meme2possum.py 8 | # 9 | # Convert a file of MEME PWMs to Possum's input format. 10 | ################################################################################ 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | # parser.add_option() 19 | (options,args) = parser.parse_args() 20 | 21 | if len(args) != 2: 22 | parser.error('Must provide input MEME file and output Possum file') 23 | else: 24 | meme_file = args[0] 25 | possum_file = args[1] 26 | 27 | ####################################################### 28 | # input MEME motifs 29 | ####################################################### 30 | motif_pwms = {} 31 | in_motif = False 32 | for line in open(meme_file): 33 | if line.startswith('MOTIF'): 34 | motif_id = line.split()[1] 35 | in_motif = True 36 | pwm_cols = [] 37 | elif in_motif: 38 | if line.startswith('letter-probability matrix'): 39 | pass 40 | elif line.strip() == '': 41 | in_motif = False 42 | motif_pwms[motif_id] = np.array(pwm_cols) 43 | else: 44 | pwm_cols.append([float(p) for p in line.split()]) 45 | 46 | if in_motif: 47 | motif_pwms[motif_id] = np.array(pwm_cols) 48 | 49 | ####################################################### 50 | # output Possum 51 | ####################################################### 52 | possum_out = open(possum_file, 'w') 53 | print >> possum_out, 'BEGIN GROUP' 54 | 55 | for motif_id in motif_pwms: 56 | mpwm = motif_pwms[motif_id] 57 | motif_len = mpwm.shape[0] 58 | 59 | print >> possum_out, 'BEGIN FLOAT' 60 | print >> possum_out, 'ID %s' % motif_id 61 | print >> possum_out, 'AP DNA' 62 | print >> possum_out, 'LE %d' % motif_len 63 | for ci in range(motif_len): 64 | print >> possum_out, 'MA %s' % ' '.join([str(n) for n in mpwm[ci]]) 65 | print >> possum_out, 'END' 66 | print >> possum_out, '' 67 | 68 | print >> possum_out, 'END' 69 | 70 | possum_out.close() 71 | 72 | 73 | ################################################################################ 74 | # __main__ 75 | ################################################################################ 76 | if __name__ == '__main__': 77 | main() 78 | -------------------------------------------------------------------------------- /cutFasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import gzip, sys 4 | 5 | ############################################################ 6 | # cutFasta 7 | # 8 | # Extract a sequence from a fasta file, using 1-based 9 | # indexing 10 | ############################################################ 11 | 12 | 13 | ############################################################ 14 | # main 15 | ############################################################ 16 | def main(): 17 | parser = OptionParser() 18 | parser.add_option('-x', dest='start', type='int', help='Cut start') 19 | parser.add_option('-y', dest='end', type='int', help='Cut end') 20 | parser.add_option('-s', dest='header', help='Sequence header') 21 | parser.add_option('-c', dest='contain', action='store_true', default=False, help='Grab all sequences that contain the header pattern') 22 | (options,args) = parser.parse_args() 23 | 24 | if len(args) != 1: 25 | parser.error('Please provide a single fasta file') 26 | 27 | print cf(options.start, options.end, options.header, options.contain, args[0]) 28 | 29 | ############################################################ 30 | # cf 31 | # 32 | # Pull out the sequence from start to end in the entry 33 | # header in the file fasta_file 34 | ############################################################ 35 | def cf(start, end, header, contain, fasta_file): 36 | # collect sequence up to end 37 | seq = '' 38 | get_seq = False 39 | 40 | if fasta_file[-3:] == '.gz': 41 | ff = gzip.open(fasta_file) 42 | else: 43 | ff = open(fasta_file) 44 | line = ff.readline() 45 | while line: 46 | if line[0] == '>': 47 | if get_seq: 48 | # already found, stop 49 | break 50 | else: 51 | # check header 52 | h = line[1:].rstrip() 53 | if not header: 54 | get_seq = True 55 | header = h 56 | elif h == header or (contain and h.find(header) != -1): 57 | get_seq = True 58 | 59 | elif get_seq: 60 | seq += line.rstrip() 61 | 62 | # if past end, stop 63 | if end and len(seq) > end: 64 | break 65 | 66 | line = ff.readline() 67 | 68 | # print seq 69 | if start and end: 70 | return '>%s_(%d-%d)\n%s' % (header,start,end,seq[start-1:end]) 71 | elif start: 72 | return '>%s_(%d-)\n%s' % (header,start,seq[start-1:]) 73 | elif end: 74 | return '>%s_(-%d)\n%s' % (header,start,seq[:end]) 75 | else: 76 | return '>%s\n%s' % (header,seq) 77 | 78 | ############################################################ 79 | # __main__ 80 | ############################################################ 81 | if __name__ == '__main__': 82 | main() 83 | -------------------------------------------------------------------------------- /peaks3_venn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import math, os, stats, subprocess, tempfile 4 | 5 | import matplotlib.pyplot as plt 6 | from matplotlib_venn import venn3 7 | import seaborn as sns 8 | 9 | ################################################################################ 10 | # peaks3_venn.py 11 | # 12 | # Make a venn diagram comparing three sets of peak calls. 13 | ################################################################################ 14 | 15 | 16 | ################################################################################ 17 | # main 18 | ################################################################################ 19 | def main(): 20 | usage = 'usage: %prog [options] ' 21 | parser = OptionParser(usage) 22 | parser.add_option('--l1', dest='label1', default='peaks1', help='Label for peak set 1') 23 | parser.add_option('--l2', dest='label2', default='peaks2', help='Label for peak set 2') 24 | parser.add_option('--l3', dest='label3', default='peaks3', help='Label for peak set 3') 25 | (options,args) = parser.parse_args() 26 | 27 | if len(args) != 4: 28 | parser.error('Must provide three peaks BED files and output PDF') 29 | else: 30 | peak_beds = args[:3] 31 | out_pdf = args[3] 32 | 33 | merge_fd, merge_bed = tempfile.mkstemp() 34 | 35 | # merge peaks 36 | cmd = 'cat %s %s %s | awk \'{OFS="\t"} {print $1, $2, $3}\' | bedtools sort -i stdin | bedtools merge -i stdin > %s' % (peak_beds[0], peak_beds[1], peak_beds[2], merge_bed) 37 | subprocess.call(cmd, shell=True) 38 | 39 | # annotate merged peaks with each individual set 40 | num_peaks = count_peaks(merge_bed) 41 | peak_overlaps = [set(), set(), set()] 42 | 43 | for bi in range(3): 44 | cmd = 'bedtools intersect -c -a %s -b %s' % (merge_bed, peak_beds[bi]) 45 | p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) 46 | pi = 0 47 | for line in p.stdout: 48 | a = line.split() 49 | if int(a[-1]) > 0: 50 | peak_overlaps[bi].add(pi) 51 | pi += 1 52 | 53 | # plot 54 | plt.figure() 55 | venn_diag = venn3(peak_overlaps, set_labels=[options.label1, options.label2, options.label3]) # , set_colors=['#e41a1c', '#A1A838', '']) 56 | plt.savefig(out_pdf) 57 | plt.close() 58 | 59 | # clean up 60 | os.close(merge_fd) 61 | os.remove(merge_bed) 62 | 63 | 64 | def count_peaks(bed_file): 65 | peak_counts = 0 66 | for line in open(bed_file): 67 | peak_counts += 1 68 | return peak_counts 69 | 70 | ################################################################################ 71 | # __main__ 72 | ################################################################################ 73 | if __name__ == '__main__': 74 | main() 75 | -------------------------------------------------------------------------------- /trf_mask.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import re 4 | 5 | ################################################################################ 6 | # trf_mask.py 7 | # 8 | # Mask the tandem repeats found by Tandem Repeat Finder. 9 | ################################################################################ 10 | 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] ... ' 17 | parser = OptionParser(usage) 18 | #parser.add_option() 19 | (options,args) = parser.parse_args() 20 | 21 | if len(args) < 2: 22 | parser.error('Please provide sequence file and TRF output file') 23 | else: 24 | seq_file = args[0] 25 | trf_files = args[1:] 26 | 27 | repeats = {} 28 | for trf_file in trf_files: 29 | get_repeats(trf_file, repeats) 30 | 31 | header = '' 32 | for line in open(seq_file): 33 | if line[0] == '>': 34 | if header: 35 | mseq = mask_seq(seq, repeats.get(header,[])) 36 | print '>%s\n%s' % (header,mseq) 37 | 38 | header = line[1:].rstrip() 39 | seq = '' 40 | 41 | else: 42 | seq += line.rstrip() 43 | 44 | if header: 45 | mseq = mask_seq(seq, repeats.get(header,[])) 46 | print '>%s\n%s' % (header,mseq) 47 | 48 | 49 | ################################################################################ 50 | # get_repeats 51 | # 52 | # Save the repeats in a dict keyed by the header 53 | ################################################################################ 54 | def get_repeats(trf_file, repeats): 55 | indices_re = re.compile('Indices: (\d+)\-\-(\d+)\s*Score') 56 | for line in open(trf_file): 57 | if line.startswith('Sequence:'): 58 | header = line[10:].rstrip() 59 | else: 60 | m = indices_re.search(line) 61 | if m: 62 | (start,end) = m.group(1,2) 63 | repeats.setdefault(header,[]).append((int(start)-1,int(end))) 64 | 65 | 66 | ################################################################################ 67 | # mask_seq 68 | # 69 | # Mask the sequence using the list of repeats 70 | ################################################################################ 71 | def mask_seq(seq, seq_repeats): 72 | mseq = list(seq) 73 | for rep in seq_repeats: 74 | for i in range(rep[0],rep[1]): 75 | mseq[i] = 'N' 76 | return ''.join(mseq) 77 | 78 | 79 | ################################################################################ 80 | # __main__ 81 | ################################################################################ 82 | if __name__ == '__main__': 83 | main() 84 | -------------------------------------------------------------------------------- /vcf_ld.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import pdb 4 | import os 5 | 6 | import pandas as pd 7 | 8 | from basenji.emerald import EmeraldVCF 9 | 10 | ''' 11 | vcf_ld.py 12 | 13 | Transform an input VCF to add all linked variants above some threshold. 14 | Makes use of Emerald for LD queries. 15 | ''' 16 | 17 | ################################################################################ 18 | # main 19 | ################################################################################ 20 | def main(): 21 | usage = 'usage: %prog [options] ' 22 | parser = OptionParser(usage) 23 | parser.add_option('-l','--ld', dest='ld_t', 24 | default=0.8, type='float', 25 | help='LD threshold to include SNP [Default: %default]') 26 | parser.add_option('-r', dest='refpanel_stem', 27 | default='%s/popgen/1000G/phase3/eur/1000G.EUR.QC' % os.environ['HG19'], 28 | help='Reference panel chromosome VCF stem [Default: %default]') 29 | (options,args) = parser.parse_args() 30 | 31 | if len(args) != 2: 32 | parser.error('Must provide input and output VCF files') 33 | else: 34 | in_vcf_file = args[0] 35 | out_vcf_file = args[1] 36 | 37 | # initialize reference panel 38 | refp_em = EmeraldVCF(options.refpanel_stem) 39 | 40 | # retrieve all SNPs in LD 41 | all_snps_df = [] 42 | 43 | # initialize VCFs 44 | in_vcf_open = open(in_vcf_file) 45 | out_vcf_open = open(out_vcf_file, 'w') 46 | 47 | # hash SNPs by chromosome 48 | for line in in_vcf_open: 49 | if line[0] == '#': 50 | # print header 51 | print(line, end='', file=out_vcf_open) 52 | 53 | else: 54 | a = line.split() 55 | chrm = a[0] 56 | pos = int(a[1]) 57 | rsid = a[2] 58 | 59 | # query LD SNPs 60 | snp_df = refp_em.query_ld(rsid, chrm, pos, 61 | options.ld_t, return_pos=True) 62 | 63 | if snp_df.shape[0] == 0: 64 | print('WARNING: %s not found in reference panel.' % rsid) 65 | else: 66 | # set SNP id index 67 | snp_df.set_index('snp', inplace=True) 68 | 69 | # fetch VCF lines 70 | pos_start = snp_df.pos.iloc[0] 71 | pos_end = snp_df.pos.iloc[-1] 72 | for snp_rec in refp_em.fetch(chrm, pos_start-1, pos_end): 73 | if snp_rec.id in snp_df.index: 74 | snp_str = snp_rec.__str__().rstrip() 75 | snp_str += '=%s;LD=%.2f' % (rsid, snp_df.loc[snp_rec.id].r) 76 | print(snp_str, file=out_vcf_open) 77 | 78 | out_vcf_open.close() 79 | in_vcf_open.close() 80 | 81 | 82 | ################################################################################ 83 | # __main__ 84 | ################################################################################ 85 | if __name__ == '__main__': 86 | main() 87 | -------------------------------------------------------------------------------- /quantile_normalization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import copy 4 | 5 | import numpy as np 6 | 7 | ''' 8 | quantile_normalization.py 9 | ''' 10 | 11 | ################################################################################ 12 | # main 13 | ################################################################################ 14 | def main(): 15 | usage = 'usage: %prog [options] arg' 16 | parser = OptionParser(usage) 17 | #parser.add_option() 18 | (options,args) = parser.parse_args() 19 | 20 | x = np.random.randn(10,5) 21 | for ti in range(x.shape[1]): 22 | x[:,ti] = (ti+1)*x[:,ti] 23 | 24 | print(x, end='\n\n') 25 | 26 | xn = quantile_normalize(x) 27 | print(xn, end='\n\n') 28 | 29 | print(x.mean(axis=0)) 30 | print(xn.mean(axis=0)) 31 | 32 | 33 | def quantile_normalize_expr(gene_expr, quantile_stat='median'): 34 | ''' Quantile normalize across targets. The version below 35 | just labels the variables more generally, but should 36 | return the same answer. ''' 37 | 38 | # make a copy 39 | gene_expr_qn = copy.copy(gene_expr) 40 | 41 | # sort values within each column 42 | for ti in range(gene_expr.shape[1]): 43 | gene_expr_qn[:,ti].sort() 44 | 45 | # compute the mean/median in each row 46 | if quantile_stat == 'median': 47 | sorted_index_stats = np.median(gene_expr_qn, axis=1) 48 | elif quantile_stat == 'mean': 49 | sorted_index_stats = np.mean(gene_expr_qn, axis=1) 50 | else: 51 | print('Unrecognized quantile statistic %s' % quantile_stat, file=sys.stderr) 52 | exit() 53 | 54 | # set new values 55 | for ti in range(gene_expr.shape[1]): 56 | sorted_indexes = np.argsort(gene_expr[:,ti]) 57 | for gi in range(gene_expr.shape[0]): 58 | gene_expr_qn[sorted_indexes[gi],ti] = sorted_index_stats[gi] 59 | 60 | return gene_expr_qn 61 | 62 | def quantile_normalize(X, quantile_stat='median'): 63 | ''' Quantile normalize features across samples. ''' 64 | 65 | # make a copy 66 | Xq = copy.copy(X) 67 | 68 | # sort values within each column 69 | for fi in range(X.shape[1]): 70 | Xq[:,fi].sort() 71 | 72 | # compute the mean/median in each row 73 | if quantile_stat == 'median': 74 | sorted_index_stats = np.median(Xq, axis=1) 75 | elif quantile_stat == 'mean': 76 | sorted_index_stats = np.mean(Xq, axis=1) 77 | else: 78 | print('Unrecognized quantile statistic %s' % quantile_stat, file=sys.stderr) 79 | exit() 80 | 81 | # set new values 82 | for fi in range(X.shape[1]): 83 | sorted_indexes = np.argsort(X[:,fi]) 84 | for si in range(X.shape[0]): 85 | Xq[sorted_indexes[si],fi] = sorted_index_stats[si] 86 | 87 | return Xq 88 | 89 | 90 | ################################################################################ 91 | # __main__ 92 | ################################################################################ 93 | if __name__ == '__main__': 94 | main() 95 | -------------------------------------------------------------------------------- /ggplot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os, subprocess, sys, tempfile 4 | 5 | ################################################################################ 6 | # ggplot.py 7 | # 8 | # Make a plot given an R script, dict data frame, and arguments. 9 | ################################################################################ 10 | 11 | 12 | ################################################################################ 13 | # plot 14 | ################################################################################ 15 | def plot(r_script, df_dict, args, df_file=None, print_cmd=False, sep=' '): 16 | # open temp file 17 | if df_file == None: 18 | df_fd, df_file = tempfile.mkstemp() 19 | else: 20 | df_fd = None 21 | df_out = open(df_file, 'w') 22 | 23 | # get headers 24 | headers = sorted(df_dict.keys()) 25 | print >> df_out, sep.join([str(head) for head in headers]) 26 | 27 | # check list lengths 28 | length = len(df_dict[headers[0]]) 29 | for i in range(1,len(headers)): 30 | if length != len(df_dict[headers[i]]): 31 | print >> sys.stderr, 'Lists in dict vary in length.' 32 | exit(1) 33 | 34 | # print data frame 35 | for i in range(length): 36 | print >> df_out, sep.join([str(df_dict[head][i]) for head in headers]) 37 | df_out.close() 38 | 39 | # convert args to one string 40 | args_str = sep.join([str(a) for a in args]) 41 | 42 | # plot in R 43 | cmd = 'R --slave --args %s %s < %s' % (df_file, args_str, r_script) 44 | 45 | if print_cmd: 46 | print >> sys.stderr, cmd 47 | 48 | subprocess.call(cmd, shell=True) 49 | 50 | # clean 51 | if df_fd != None: 52 | os.close(df_fd) 53 | os.remove(df_file) 54 | 55 | 56 | ################################################################################ 57 | # print_df 58 | # 59 | # Just print the given data frame dictionary to the output file given. 60 | ################################################################################ 61 | def print_df(df_dict, out_file=None): 62 | # open 63 | if out_file == None: 64 | df_fd, df_file = tempfile.mkstemp() 65 | else: 66 | df_file = out_file 67 | df_out = open(df_file, 'w') 68 | 69 | # get headers 70 | headers = sorted(df_dict.keys()) 71 | print >> df_out, ' '.join([str(head) for head in headers]) 72 | 73 | # check list lengths 74 | length = len(df_dict[headers[0]]) 75 | for i in range(1,len(headers)): 76 | if length != len(df_dict[headers[i]]): 77 | print >> sys.stderr, 'Lists in dict vary in length:' 78 | for j in range(len(headers)): 79 | print >> sys.stderr, headers[j], len(df_dict[headers[j]]) 80 | exit(1) 81 | 82 | # print data frame 83 | for i in range(length): 84 | print >> df_out, ' '.join([str(df_dict[head][i]) for head in headers]) 85 | df_out.close() 86 | 87 | if out_file == None: 88 | return df_fd, df_file 89 | else: 90 | return None 91 | -------------------------------------------------------------------------------- /strand_specifity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os, subprocess 4 | import pysam 5 | 6 | ################################################################################ 7 | # strand_specificity.py 8 | # 9 | # Print information relevant to determining the strand specificity of the 10 | # sequencing in a BAM file using a TopHat junctions.bed file. 11 | ################################################################################ 12 | 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] ' 19 | parser = OptionParser(usage) 20 | parser.add_option('-s', dest='single', default=False, action='store_true', help='Single-stranded [Default: %default]') 21 | (options,args) = parser.parse_args() 22 | 23 | if len(args) != 2: 24 | parser.error('Must provide BAM file and junctions.bed file.') 25 | else: 26 | bam_file = args[0] 27 | juncs_bed_file = args[1] 28 | 29 | # filter junctions for forward only 30 | subprocess.call('awk \'$6 == "+"\' %s > junctions_fwd.bed' % juncs_bed_file, shell=True) 31 | 32 | if options.single: 33 | subprocess.call('intersectBed -abam %s -b junctions_fwd.bed > fwd.bam' % bam_file, shell=True) 34 | 35 | # count forward/reverse reads 36 | forward = 0 37 | reverse = 0 38 | for aligned_read in pysam.Samfile('fwd.bam'): 39 | if aligned_read.is_reverse: 40 | reverse += 1 41 | else: 42 | forward += 1 43 | 44 | print 'Read\'s aligning + and intersecting + junctions: %9d' % forward 45 | print 'Read\'s aligning - and intersecting + junctions: %9d' % reverse 46 | 47 | else: 48 | # intersect BAM with forward junctions 49 | subprocess.call('intersectBed -s -abam %s -b junctions_fwd.bed > fwd.bam' % bam_file, shell=True) 50 | 51 | # count first/second reads 52 | first = 0 53 | second = 0 54 | for aligned_read in pysam.Samfile('fwd.bam'): 55 | if aligned_read.is_proper_pair: 56 | spliced = False 57 | for (code,size) in aligned_read.cigar: 58 | if code == 3: 59 | spliced = True 60 | if spliced: 61 | if aligned_read.is_read1: 62 | first += 1 63 | else: 64 | second += 1 65 | 66 | print 'Read1\'s aligning + and intersecting + junctions: %9d' % first 67 | print 'Read2\'s aligning + and intersecting + junctions: %9d' % second 68 | 69 | os.remove('junctions_fwd.bed') 70 | os.remove('fwd.bam') 71 | 72 | 73 | ################################################################################ 74 | # __main__ 75 | ################################################################################ 76 | if __name__ == '__main__': 77 | main() 78 | -------------------------------------------------------------------------------- /geneid2transid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | ################################################################################ 5 | # geneid2transid.py 6 | # 7 | # Given a gene id, produce a transcript id to punch into the browser 8 | ################################################################################ 9 | 10 | lnc_catalog = '/Users/dk/research/common/data/lncrna/lnc_catalog.gtf' 11 | 12 | ################################################################################ 13 | # main 14 | ################################################################################ 15 | def main(): 16 | usage = 'usage: %prog [options] \nUsage: %prog [options] ' 17 | parser = OptionParser(usage) 18 | #parser.add_option() 19 | (options,args) = parser.parse_args() 20 | 21 | # parse input 22 | if len(args) == 0: 23 | parser.error('Must provide gene id or file of gene ids') 24 | else: 25 | if args[0].startswith('XLOC'): 26 | print find_longest_transcript(args[0]) 27 | else: 28 | for line in open(args[0]): 29 | print find_longest_transcript(line.rstrip()) 30 | 31 | 32 | 33 | ################################################################################ 34 | # find_longest_transcript 35 | # 36 | # Return the longest transcript for this gene 37 | ################################################################################ 38 | def find_longest_transcript(gene_id): 39 | # find all transcripts and sum lengths 40 | transcripts = {} 41 | for line in open(lnc_catalog): 42 | a = line.split('\t') 43 | kv = gtf_kv(a[8]) 44 | 45 | if gene_id == kv['gene_id']: 46 | tx = kv['transcript_id'] 47 | transcripts[tx] = transcripts.get(tx,0) + int(a[4])-int(a[3])+1 48 | 49 | # return longest 50 | tx_len = max([l for l in transcripts.values()]) 51 | for tx in transcripts: 52 | if transcripts[tx] == tx_len: 53 | return tx 54 | 55 | 56 | ################################################################################ 57 | # gtf_kv 58 | # 59 | # Convert the last gtf section of key/value pairs into a dict. 60 | ################################################################################ 61 | def gtf_kv(s): 62 | d = {} 63 | 64 | a = s.split(';') 65 | for key_val in a: 66 | if key_val.strip(): 67 | if key_val.find('=') != -1: 68 | kvs = key_val.split('=') 69 | else: 70 | kvs = key_val.split() 71 | 72 | if len(kvs) == 2: 73 | key = kvs[0] 74 | if kvs[1][0] == '"' and kvs[1][-1] == '"': 75 | val = kvs[1].strip()[1:-1] 76 | else: 77 | val = kvs[1].strip() 78 | d[key] = val 79 | 80 | return d 81 | 82 | 83 | ################################################################################ 84 | # __main__ 85 | ################################################################################ 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /h5_h5z.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | 4 | import h5py 5 | import numpy as np 6 | 7 | from struct import pack, unpack 8 | 9 | ''' 10 | h5_bw.py 11 | 12 | Convert a coverage HDF5 to lossy compressed HDF5. 13 | ''' 14 | 15 | ################################################################################ 16 | # main 17 | ################################################################################ 18 | def main(): 19 | usage = 'usage: %prog [options] ' 20 | parser = OptionParser(usage) 21 | parser.add_option('-v', dest='verbose', default=False, action='store_true') 22 | (options,args) = parser.parse_args() 23 | 24 | if len(args) != 2: 25 | parser.error('Must provide input HDF5 and output BigWig.') 26 | else: 27 | in_h5_file = args[0] 28 | out_h5_file = args[1] 29 | 30 | # open files 31 | h5_in = h5py.File(in_h5_file) 32 | h5_out = h5py.File(out_h5_file, 'w') 33 | 34 | # construct header 35 | header = [] 36 | chroms = sorted(h5_in.keys()) 37 | for chrom in chroms: 38 | # chromosome and length 39 | header.append((chrom,len(h5_in[chrom]))) 40 | 41 | for chrom, length in header: 42 | if options.verbose: 43 | print(chrom) 44 | 45 | # read values 46 | x = np.array(h5_in[chrom], dtype='float16') 47 | 48 | # write gzipped into HDF5 49 | h5_out.create_dataset(chrom, data=x, chunks=True, compression=32013, compression_opts=None, shuffle=False) 50 | 51 | # close files 52 | h5_in.close() 53 | h5_out.close() 54 | 55 | def zfp_rate_opts(rate): 56 | """Create compression options for ZFP in fixed-rate mode 57 | 58 | The float rate parameter is the number of compressed bits per value. 59 | """ 60 | ZFP_MODE_RATE = 1 61 | rate = pack('': 41 | if chrom: 42 | process_chrom(transcripts_gtf, chrom, seq, transcript_seqs, transcript_genes) 43 | 44 | chrom = line[1:].rstrip() 45 | seq = '' 46 | else: 47 | seq += line.rstrip() 48 | line = genome_open.readline() 49 | process_chrom(transcripts_gtf, chrom, seq, transcript_seqs, transcript_genes) 50 | 51 | # print fasta 52 | for tid in transcript_seqs: 53 | print '>%s gene=%s\n%s' % (tid,transcript_genes[tid],transcript_seqs[tid]) 54 | 55 | 56 | ################################################################################ 57 | # process_chrom 58 | # 59 | # Build up transcript_seqs and transcript_genes hashes for the chromosome 60 | # specified. 61 | ################################################################################ 62 | def process_chrom(transcripts_gtf, chrom, seq, transcript_seqs, transcript_genes): 63 | # find chr transcripts 64 | for line in open(transcripts_gtf): 65 | a = line.split('\t') 66 | if a[0] == chrom: 67 | kv = gff.gtf_kv(a[8]) 68 | tid = kv['transcript_id'] 69 | gid = kv['gene_id'] 70 | 71 | exon_start = int(a[3]) 72 | exon_end = int(a[4]) 73 | 74 | exon_seq = seq[exon_start-1:exon_end] 75 | if a[6] == '+': 76 | transcript_seqs[tid] = transcript_seqs.get(tid,'') + exon_seq 77 | else: 78 | transcript_seqs[tid] = dna.rc(exon_seq) + transcript_seqs.get(tid,'') 79 | 80 | transcript_genes[tid] = gid 81 | 82 | 83 | ################################################################################ 84 | # __main__ 85 | ################################################################################ 86 | if __name__ == '__main__': 87 | main() 88 | #pdb.runcall(main) 89 | -------------------------------------------------------------------------------- /transmapbed2gtf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import gff 4 | 5 | ################################################################################ 6 | # transmapbed2gtf.py 7 | # 8 | # Convert the bed file that you get from the TransMap pipeline to a gtf file 9 | # where adjacent blocks are merged. 10 | ################################################################################ 11 | 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] ' 18 | parser = OptionParser(usage) 19 | parser.add_option('-g', dest='orig_gtf', help='The original gtf file of the TransMap\'d genes to be used to transfer gene id\'s') 20 | parser.add_option('-m', dest='merge_dist', type='int', default=30, help='Minimum distance two exons can be apart for them to be merged [Default: %default]') 21 | (options,args) = parser.parse_args() 22 | 23 | if len(args) != 1: 24 | parser.error('Must provide bed file') 25 | else: 26 | bed_file = args[0] 27 | 28 | # map transcript id's to gene id's if possible 29 | t2g = {} 30 | if options.orig_gtf: 31 | for line in open(options.orig_gtf): 32 | a = line.split('\t') 33 | kv = gff.gtf_kv(a[8]) 34 | t2g[kv['transcript_id']] = kv['gene_id'] 35 | 36 | # hash to disambiguate multi-mapping transcripts 37 | transcript_maps = {} 38 | 39 | for line in open(bed_file): 40 | a = line.split('\t') 41 | a[-1] = a[-1].rstrip() 42 | 43 | tid = a[3] 44 | gid = t2g.get(a[3],a[3]) 45 | 46 | transcript_maps[tid] = transcript_maps.get(tid,0) + 1 47 | if transcript_maps[tid] > 1: 48 | gid += '_v%d' % transcript_maps[tid] 49 | tid += '_v%d' % transcript_maps[tid] 50 | 51 | gene_start = int(a[1]) 52 | gene_end = int(a[2]) 53 | 54 | block_sizes = [int(x) for x in a[10].split(',') if x] 55 | block_starts = [int(x) for x in a[11].split(',') if x] 56 | 57 | exon_cols = [] 58 | last_end = None 59 | exon_num = 1 60 | for i in range(len(block_starts)): 61 | exon_start = gene_start+1+block_starts[i] 62 | exon_end = gene_start+1+block_starts[i]+block_sizes[i]-1 63 | 64 | if last_end and last_end+options.merge_dist >= exon_start: 65 | # merge w/ last 66 | exon_cols[-1][4] = str(exon_end) 67 | else: 68 | exon_cols.append([a[0], 'TransMap', 'exon', str(exon_start), str(exon_end), '.', a[5], '.', 'gene_id "%s"; transcript_id "%s"; exon_number "%d"' % (gid,tid,exon_num)]) 69 | exon_num += 1 70 | 71 | last_end = exon_end 72 | 73 | for cols in exon_cols: 74 | print '\t'.join(cols) 75 | 76 | 77 | ################################################################################ 78 | # __main__ 79 | ################################################################################ 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /tss_bam_replot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | from rpy2.robjects.packages import importr 4 | import rpy2.robjects as ro 5 | import rpy2.robjects.lib.ggplot2 as ggplot2 6 | 7 | grdevices = importr('grDevices') 8 | 9 | ################################################################################ 10 | # tss_bam_replot.py 11 | # 12 | # Make a new plot from the raw data generated by tss_bam_plot.py 13 | ################################################################################ 14 | 15 | 16 | ################################################################################ 17 | # main 18 | ################################################################################ 19 | def main(): 20 | usage = 'usage: %prog [options] ' 21 | parser = OptionParser(usage) 22 | parser.add_option('-d', dest='downstream', default=2000, type='int', help='TSS downstream [Default: %default]') 23 | parser.add_option('-o', dest='out_prefix', default='tss', help='Output prefix [Default: %default]') 24 | parser.add_option('-u', dest='upstream', default=5000, type='int', help='TSS upstream [Default: %default]') 25 | parser.add_option('--ymax', dest='ymax', default=None, type='float', help='Y-coordinate limit [Default: %default]') 26 | (options,args) = parser.parse_args() 27 | 28 | if len(args) != 1: 29 | parser.error('Must provide raw file') 30 | else: 31 | raw_file = args[0] 32 | 33 | # collect data 34 | coords = [] 35 | main_cov = [] 36 | control_cov = [] 37 | for line in open(raw_file): 38 | a = line.split() 39 | coords.append(int(a[0])) 40 | main_cov.append(float(a[1])) 41 | control_cov.append(float(a[2])) 42 | 43 | # data structures 44 | tss_i = ro.IntVector(range(-options.upstream,options.downstream+1)) 45 | labels = ro.StrVector(['Main']*(options.upstream+options.downstream+1)+['Control']*(options.upstream+options.downstream+1)) 46 | cov = ro.FloatVector(main_cov + control_cov) 47 | 48 | df = ro.DataFrame({'tss_i':tss_i, 'cov':cov, 'label':labels}) 49 | 50 | # plot 51 | ''' 52 | gp = ggplot2.ggplot(df) + \ 53 | ggplot2.aes_string(x='tss_i', y='cov', colour='label') + \ 54 | ggplot2.geom_point() + \ 55 | ggplot2.scale_x_continuous('TSS index') + \ 56 | ggplot2.scale_colour_discrete('') 57 | ''' 58 | gp = ggplot2.ggplot(df) + \ 59 | ggplot2.aes_string(x='tss_i', y='cov', colour='label') + \ 60 | ggplot2.geom_smooth(method='loess', size=1, span=0.2, se=False) + \ 61 | ggplot2.scale_x_continuous('TSS Position') + \ 62 | ggplot2.scale_colour_discrete('') + \ 63 | ggplot2.theme_bw() 64 | 65 | if options.ymax == None: 66 | gp += ggplot2.scale_y_continuous('Coverage') 67 | else: 68 | gp += ggplot2.scale_y_continuous('Coverage', limits=ro.FloatVector([0,options.ymax])) 69 | 70 | # save to file 71 | grdevices.pdf(file='%s_and.pdf' % options.out_prefix) 72 | gp.plot() 73 | grdevices.dev_off() 74 | 75 | 76 | 77 | ################################################################################ 78 | # __main__ 79 | ################################################################################ 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /attach_nh.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import sys 4 | import pysam 5 | 6 | ################################################################################ 7 | # attach_nh.py 8 | # 9 | # Attach NH tags to a stream of SAM alignments from Bowtie2. 10 | # 11 | # Note: I'm not sure how paired end reads will stream in. 12 | ################################################################################ 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] arg' 19 | parser = OptionParser(usage) 20 | #parser.add_option() 21 | (options,args) = parser.parse_args() 22 | 23 | sam_in = pysam.AlignmentFile('-', 'r') 24 | 25 | # previously required, but apparently not anymore 26 | # write_header(sam_in.header) 27 | 28 | sam_out = pysam.AlignmentFile('-', 'w', template=sam_in) 29 | 30 | last_id = 'not a header' 31 | read1_aligns = [] 32 | read2_aligns = [] 33 | 34 | for align in sam_in: 35 | if align.is_unmapped: 36 | # read stream concludes 37 | output_read(sam_out, read1_aligns) 38 | output_read(sam_out, read2_aligns) 39 | read1_aligns = [] 40 | read2_aligns = [] 41 | 42 | else: 43 | read_id = align.query_name 44 | 45 | if not match_id(read_id, last_id, align.is_paired): 46 | # read stream concludes 47 | 48 | # output 49 | output_read(sam_out, read1_aligns) 50 | output_read(sam_out, read2_aligns) 51 | 52 | # reset 53 | read1_aligns = [] 54 | read2_aligns = [] 55 | 56 | # read stream continues 57 | if align.is_read1: 58 | read1_aligns.append(align) 59 | else: 60 | read2_aligns.append(align) 61 | 62 | # update read id 63 | last_id = read_id 64 | 65 | sam_in.close() 66 | sam_out.close() 67 | 68 | 69 | def match_id(id1, id2, paired): 70 | ''' Match read_id's. 71 | 72 | First case handles most datasets. 73 | Second case handles paired end datasets where they got fancy. 74 | ''' 75 | 76 | return id1 == id2 or (paired and id1[:-1] == id2[:-1] and id1[-1] in '12' and id2[-1] in '12') 77 | 78 | 79 | def output_read(sam_out, read_aligns): 80 | nh_tag = len(read_aligns) 81 | for align in read_aligns: 82 | align.set_tag('NH',nh_tag) 83 | sam_out.write(align) 84 | 85 | 86 | def write_header(header): 87 | hd = header['HD'] 88 | print('@HD\tVN:%s\tSO:%s' % (hd['VN'], hd['SO'])) 89 | 90 | for sq in header['SQ']: 91 | print('@SQ\tSN:%s\tLN:%d' % (sq['SN'],sq['LN'])) 92 | 93 | pg = header['PG'][0] 94 | print('@PG\tID:%s\tPN:%s\tVN:%s\tCL:%s' % (pg['ID'],pg['PN'],pg['VN'],pg['CL']), flush=True) 95 | 96 | 97 | ################################################################################ 98 | # __main__ 99 | ################################################################################ 100 | if __name__ == '__main__': 101 | main() 102 | -------------------------------------------------------------------------------- /citemelike.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import urllib2 4 | import pdb, re, math, random 5 | 6 | ################################################################################ 7 | # citemelike.py 8 | # 9 | # Choose a random paper from my citeulike account to read. 10 | ################################################################################ 11 | 12 | user = 'dakelley' 13 | star_factor = 10 14 | 15 | url_re = re.compile('href="(/user/dakelley/article/\d+)"') 16 | star_re = re.compile('src="/static/img/star(\d).png"') 17 | 18 | ################################################################################ 19 | # main 20 | ################################################################################ 21 | def main(): 22 | usage = 'usage: %prog [options] arg' 23 | parser = OptionParser(usage) 24 | parser.add_option('--tag', dest='tag', help='Choose a paper with the given tag') 25 | (options,args) = parser.parse_args() 26 | 27 | if options.tag: 28 | citeulike_url = 'http://www.citeulike.org/user/%s/tag/%s/order/to_read' % (user,options.tag) 29 | else: 30 | citeulike_url = 'http://www.citeulike.org/user/%s/order/to_read' % user 31 | 32 | # get papers 33 | papers = get_papers(citeulike_url) 34 | 35 | if len(papers) == 0: 36 | parser.error('No papers with the tag %s' % options.tag) 37 | 38 | # re-score stars 39 | papers = [(math.pow(stars/5.0,star_factor),paper) for (stars,paper) in papers] 40 | 41 | # choose random paper 42 | max_rand = sum([score for (score,paper) in papers]) 43 | rand_score = random.uniform(0,max_rand) 44 | rand_tmp = 0.0 45 | for (score,paper) in papers: 46 | rand_tmp += score 47 | if rand_tmp > rand_score: 48 | print 'http://www.citeulike.org%s' % paper 49 | break 50 | 51 | ################################################################################ 52 | # get_papers 53 | # 54 | # Get all papers from the following base url 55 | ################################################################################ 56 | def get_papers(citeulike_url): 57 | papers = [] 58 | 59 | page_num = 1 60 | no_read = True 61 | unread_found = True 62 | while no_read and unread_found: 63 | unread_found = False 64 | 65 | #f = urllib2.urlopen('%s/page/%d' % (citeulike_url,page_num)) 66 | req = urllib2.Request('%s/page/%d' % (citeulike_url,page_num), headers={'User-Agent':"Magic Broswer"}) 67 | f = urllib2.urlopen(req) 68 | cul_read = f.read() 69 | cul_text = ''.join(cul_read) 70 | cul_lines = cul_text.split('\n') 71 | 72 | for line in cul_lines: 73 | if line.find('class="title"') != -1: 74 | url_match = url_re.search(line) 75 | paper_url = url_match.group(1) 76 | unread_found = True 77 | 78 | elif line.find('/static/img/star') != -1: 79 | star_match = star_re.search(line) 80 | stars = int(star_match.group(1)) 81 | papers.append((stars,paper_url)) 82 | 83 | elif line.find('radio') == -1 and line.find('already read') != -1: 84 | papers = papers[:-1] 85 | unread = False 86 | break 87 | 88 | page_num += 1 89 | 90 | return papers 91 | 92 | ################################################################################ 93 | # __main__ 94 | ################################################################################ 95 | if __name__ == '__main__': 96 | main() 97 | #pdb.runcall(main) 98 | -------------------------------------------------------------------------------- /cuff_rep_cor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os, pdb, sys 4 | from numpy import array 5 | from scipy.stats import spearmanr 6 | import gff, ggplot 7 | 8 | ################################################################################ 9 | # cuff_rep_cor.py 10 | # 11 | # Compute correlations between replicates in a cufflinks run. 12 | ################################################################################ 13 | 14 | ################################################################################ 15 | # main 16 | ################################################################################ 17 | def main(): 18 | usage = 'usage: %prog [options] <.read_group_tracking>' 19 | parser = OptionParser(usage) 20 | parser.add_option('-g', dest='genes_gtf', help='Print only genes in the given GTF file') 21 | #parser.add_option('-p', dest='pseudocount', type='float', default=0.125, help='FPKM pseudocount for taking logs [Default: %default]') 22 | parser.add_option('-o', dest='out_pdf', default='cor_heat.pdf', help='Output heatmap pdf [Default: %default]') 23 | (options,args) = parser.parse_args() 24 | 25 | if len(args) != 1: 26 | parser.error(usage) 27 | else: 28 | read_group_tracking = args[0] 29 | 30 | # get gene_ids 31 | gene_set = set() 32 | if options.genes_gtf: 33 | for line in open(options.genes_gtf): 34 | a = line.split('\t') 35 | gid = gff.gtf_kv(a[8])['gene_id'] 36 | gene_set.add(gid) 37 | 38 | # initialize diff data structures 39 | cond_rep_gene_fpkm = {} 40 | 41 | # read read group tracking file 42 | rgt_in = open(read_group_tracking) 43 | headers = rgt_in.readline() 44 | line = rgt_in.readline() 45 | while line: 46 | a = line.split('\t') 47 | 48 | gene_id = a[0] 49 | cond = a[1] 50 | rep = int(a[2]) 51 | fpkm = float(a[6]) 52 | status = a[8].rstrip() 53 | 54 | if status == 'OK' and (len(gene_set) == 0 or gene_id in gene_set): 55 | if not (cond,rep) in cond_rep_gene_fpkm: 56 | cond_rep_gene_fpkm[(cond,rep)] = {} 57 | 58 | cond_rep_gene_fpkm[(cond,rep)][gene_id] = fpkm 59 | 60 | line = rgt_in.readline() 61 | rgt_in.close() 62 | 63 | df_dict = {'Sample1':[], 'Sample2':[], 'Correlation':[]} 64 | cond_reps = cond_rep_gene_fpkm.keys() 65 | 66 | for i in range(len(cond_reps)): 67 | cond1, rep1 = cond_reps[i] 68 | 69 | for j in range(i+1,len(cond_reps)): 70 | cond2, rep2 = cond_reps[j] 71 | 72 | genes12 = set(cond_rep_gene_fpkm[(cond1,rep1)].keys()) & set(cond_rep_gene_fpkm[(cond2,rep2)].keys()) 73 | 74 | fpkms1 = array([cond_rep_gene_fpkm[(cond1,rep1)][gene_id] for gene_id in genes12]) 75 | fpkms2 = array([cond_rep_gene_fpkm[(cond2,rep2)][gene_id] for gene_id in genes12]) 76 | 77 | rho, pval = spearmanr(fpkms1, fpkms2) 78 | 79 | cols = (cond1,rep1,cond2,rep2,rho) 80 | print '%-15s %1d %-15s %1d %.4f' % cols 81 | 82 | df_dict['Sample1'].append('%s_%d' % (cond1,rep1)) 83 | df_dict['Sample2'].append('%s_%d' % (cond2,rep2)) 84 | df_dict['Correlation'].append(rho) 85 | 86 | # this is broken 87 | ggplot.plot('%s/cuff_rep_cor.r' % os.environ['RDIR'], df_dict, [options.out_pdf], debug=True) 88 | 89 | 90 | ################################################################################ 91 | # __main__ 92 | ################################################################################ 93 | if __name__ == '__main__': 94 | main() 95 | #pdb.runcall(main) 96 | -------------------------------------------------------------------------------- /bedtools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from optparse import OptionParser 3 | import os, subprocess, tempfile 4 | import pysam 5 | 6 | ################################################################################ 7 | # bedtools.py 8 | # 9 | # 10 | ################################################################################ 11 | 12 | 13 | ################################################################################ 14 | # main 15 | ################################################################################ 16 | def main(): 17 | usage = 'usage: %prog [options] arg' 18 | parser = OptionParser(usage) 19 | #parser.add_option() 20 | (options,args) = parser.parse_args() 21 | 22 | 23 | ################################################################################ 24 | # abam_f1 25 | # 26 | # Intersect the BAM file with the BED file using the "-f 1" option, but correct 27 | # for the loss of spliced reads. 28 | ################################################################################ 29 | def abam_f1(bam_file, bed_file, out_file): 30 | ############################################ 31 | # divide BAM by splicing 32 | ############################################ 33 | spliced_bam_fd, spliced_bam_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME']) 34 | unspliced_bam_fd, unspliced_bam_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME']) 35 | 36 | # open BAMs 37 | bam_in = pysam.Samfile(bam_file, 'rb') 38 | spliced_bam_out = pysam.Samfile(spliced_bam_file, 'wb', template=bam_in) 39 | unspliced_bam_out = pysam.Samfile(unspliced_bam_file, 'wb', template=bam_in) 40 | 41 | # divide 42 | for aligned_read in bam_in: 43 | if spliced(aligned_read): 44 | spliced_bam_out.write(aligned_read) 45 | else: 46 | unspliced_bam_out.write(aligned_read) 47 | 48 | # close BAMs 49 | bam_in.close() 50 | spliced_bam_out.close() 51 | unspliced_bam_out.close() 52 | 53 | ############################################ 54 | # intersect and merge 55 | ############################################ 56 | spliced_is_bam_fd, spliced_is_bam_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME']) 57 | unspliced_is_bam_fd, unspliced_is_bam_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME']) 58 | 59 | subprocess.call('intersectBed -f 1 -abam %s -b %s > %s' % (unspliced_bam_file, bed_file, unspliced_is_bam_file), shell=True) 60 | subprocess.call('intersectBed -abam %s -b %s > %s' % (spliced_bam_file, bed_file, spliced_is_bam_file), shell=True) 61 | 62 | subprocess.call('samtools merge -f %s %s %s' % (out_file, unspliced_is_bam_file, spliced_is_bam_file), shell=True) 63 | 64 | ############################################ 65 | # clean 66 | ############################################ 67 | os.close(spliced_bam_fd) 68 | os.remove(spliced_bam_file) 69 | os.close(unspliced_bam_fd) 70 | os.remove(unspliced_bam_file) 71 | os.close(spliced_is_bam_fd) 72 | os.remove(spliced_is_bam_file) 73 | os.close(unspliced_is_bam_fd) 74 | os.remove(unspliced_is_bam_file) 75 | 76 | 77 | ################################################################################ 78 | # spliced 79 | # 80 | # Return true if the read is spliced. 81 | ################################################################################ 82 | def spliced(aligned_read): 83 | spliced = False 84 | for code,size in aligned_read.cigar: 85 | if code == 3: 86 | spliced = True 87 | return spliced 88 | 89 | ################################################################################ 90 | # __main__ 91 | ################################################################################ 92 | if __name__ == '__main__': 93 | main() 94 | --------------------------------------------------------------------------------