├── .gitignore
├── README.md
├── col2rows
├── vcf_add_chr.awk
├── sqgrep.sh
├── sqgrepc.sh
├── raf_viz.sh
├── contrafold_viz.sh
├── r
    ├── mi_parmigene.r
    ├── peak_bam_plot.r
    ├── te_bam_coverage_bam.r
    ├── te_diff_count.r
    ├── bam_heat_meta.r
    ├── diff_diff_ma.r
    ├── bam_heat_heat.r
    ├── plot_gff_cov_heat.r
    ├── fpkm_fpkm_scatter.r
    ├── diff_diff_scatter.r
    ├── annotation_pie_ratios.r
    ├── cuff_2d.r
    ├── annotation_pie_pie.r
    ├── cuff_rep_cor.r
    ├── te_diff.r
    ├── cuff_bar.r
    ├── cuff_scatter.r
    ├── diff_summary.r
    ├── peaks_diff_compare.r
    ├── cuff_heat.r
    └── plot_gff_cov_meta.r
├── template_gpu.sb
├── test_template.py
├── template.py
├── sym_matrix.py
├── template_sci.py
├── parallel_template.py
├── clear_slurm.py
├── pygene_utrs.py
├── gtf2utrs.py
├── vcf2vds.py
├── fasta_upper.py
├── vcf2bed.py
├── h5_sum.py
├── explore.ipynb
├── fasta_genome.py
├── rm_nonxs.py
├── bam_unique.py
├── make_fasta_genome.py
├── clean_csv.py
├── transid2geneid.py
├── possum2bed.py
├── mess2fasta.py
├── gff2bed.py
├── rm2bed.py
├── possum2gff.py
├── zarr_h5.py
├── bam_12.py
├── rm2gff.py
├── plot.py
├── bam_plus_minus.py
├── fastq_filter.py
├── gsea_ranks.py
├── fastq_trim.py
├── bam_quality.py
├── gtf2prom.py
├── bw_nan.py
├── h5_zarr.py
├── sciseq_collision.py
├── gtf_span.py
├── bim_vcf.py
├── cuff_fails.py
├── gtf_filter_csf.py
├── zarr_bw.py
├── bed2gff.py
├── bed2gtf.py
├── gaps_bed.py
├── stockholm2fasta.py
├── bed_clean.py
├── multiz_gff.py
├── fastq_quality_change.py
├── w5_bg.py
├── split_fragment_lengths.py
├── gtf_cut.py
├── gtf_filter_expr.py
├── fpkm_tracking.py
├── size.py
├── w5_bw.py
├── gtf2bed.py
├── reservoir_sample.py
├── bl2gff.py
├── bam_len_hist.py
├── isoforms_fpkm.py
├── vcf_tss.py
├── peaks_venn.py
├── multiz_lncrna.py
├── fpkm_hist.py
├── bg_w5.py
├── gtf_homologues.py
├── gsea_rnk.py
├── lnc_expression.py
├── nuc2gff.py
├── plot_fragment_lengths.py
├── bam_bedg.py
├── bgo_w5.py
├── make_ref_ml.py
├── vcf_splice.py
├── seq_logo.py
├── rmdup_iclip.py
├── fpkm_fpkm.py
├── gsea.py
├── gtf_multimaps.py
├── meme2possum.py
├── cutFasta.py
├── peaks3_venn.py
├── trf_mask.py
├── vcf_ld.py
├── quantile_normalization.py
├── ggplot.py
├── strand_specifity.py
├── geneid2transid.py
├── h5_h5z.py
├── transcripts_fasta.py
├── transmapbed2gtf.py
├── tss_bam_replot.py
├── attach_nh.py
├── citemelike.py
├── cuff_rep_cor.py
└── bedtools.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.py~
2 | *.pyc
3 | r/*.r~
4 | .gitignore~
5 | ._*


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | utility
2 | =======
3 | 
4 | Computational biology utility scripts


--------------------------------------------------------------------------------
/col2rows:
--------------------------------------------------------------------------------
#!/bin/sh

# Read tab-separated text on stdin and print every field on its own line.
awk 'BEGIN { FS = "\t" }
{
    for (field = 1; field <= NF; field++)
        print $field
}'
4 | 


--------------------------------------------------------------------------------
/vcf_add_chr.awk:
--------------------------------------------------------------------------------
#!/bin/awk -f

# Lines starting with "#" pass through unchanged;
# every other line gets "chr" prepended.
/^#/ { print $0; next }
{ print "chr" $0 }
4 | 


--------------------------------------------------------------------------------
/sqgrep.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Print the job ids of queued SLURM jobs whose squeue line matches a pattern.
#
# usage: sqgrep.sh <pattern>
#
# The queue is listed for user "drk" unless SQUEUE_USER is set.

if [ "$#" -ne 1 ]; then
    echo "usage: $(basename "$0") <pattern>" >&2
    exit 1
fi

# -e and the quotes keep patterns with spaces or a leading "-" intact
squeue --format "%.10i %.9P %.20j %.8u %.2t %.10M %.6D %Q %R" -u "${SQUEUE_USER:-drk}" | grep -e "$1" | awk '{print $1}'
3 | 


--------------------------------------------------------------------------------
/sqgrepc.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Cancel every queued SLURM job whose squeue line matches a pattern.
#
# usage: sqgrepc.sh <pattern>
#
# The queue is listed for user "drk" unless SQUEUE_USER is set.

if [ "$#" -ne 1 ]; then
    echo "usage: $(basename "$0") <pattern>" >&2
    exit 1
fi

# --noheader keeps the column-title line from matching the pattern;
# xargs -r skips scancel entirely when no job matched.
squeue --noheader --format "%.10i %.9P %.20j %.8u %.2t %.10M %.6D %Q %R" -u "${SQUEUE_USER:-drk}" \
    | grep -e "$1" \
    | awk '{print $1}' \
    | xargs -r scancel
4 | 


--------------------------------------------------------------------------------
/raf_viz.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Render the structure in <prefix>.raf as <prefix>.png and open the image.
#
# usage: raf_viz.sh <prefix>

# stop at the first failing step so a stale image is never opened
set -e

if [ "$#" -ne 1 ]; then
    echo "usage: $(basename "$0") <prefix>" >&2
    exit 1
fi

# raf predict ...
raf2bpseq.py "$1.raf" > "$1.bpseq"
make_coords "$1.bpseq" "$1.coords"
plot_rna "$1.bpseq" "$1.coords" --png "$1.png"
open "$1.png"
8 | 


--------------------------------------------------------------------------------
/contrafold_viz.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Run contrafold on a sequence file, render the prediction as <file>.png
# and open the image.
#
# usage: contrafold_viz.sh <sequence_file>

# stop at the first failing step so a stale image is never opened
set -e

if [ "$#" -ne 1 ]; then
    echo "usage: $(basename "$0") <sequence_file>" >&2
    exit 1
fi

contrafold predict "$1" --bpseq "$1.bpseq" --parens "$1.parens"
make_coords "$1.bpseq" "$1.coords"
plot_rna "$1.bpseq" "$1.coords" --png "$1.png"
open "$1.png"
7 | 


--------------------------------------------------------------------------------
/r/mi_parmigene.r:
--------------------------------------------------------------------------------
 1 | library(parmigene)
 2 | 
 3 | ca = commandArgs(trailing=T)
 4 | df.file = ca[1]
 5 | 
 6 | df = read.table(df.file, header=T, quote="\"")
 7 | 
 8 | mi = knnmi(df$A, df$B, k=5)
 9 | 
10 | cat(mi)
11 | 


--------------------------------------------------------------------------------
/template_gpu.sb:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH -p gpu
 4 | #SBATCH -n 1
 5 | #SBATCH -c 2
 6 | #SBATCH --gres=gpu:nvidia_geforce_gtx_1080_ti:1
 7 | #SBATCH --mem 23000
 8 | #SBATCH --time 2-0:0:0
 9 | #SBATCH -J 3/5_name
10 | #SBATCH -o train_name.out
11 | #SBATCH -e train_name.err
12 | 
13 | . /home/drk/anaconda3/etc/profile.d/conda.sh
14 | conda activate tf210
15 | 
16 | basenji_train.py
17 | 


--------------------------------------------------------------------------------
/r/peak_bam_plot.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | ca = commandArgs(trailing=T)
 4 | df.file = ca[1]
 5 | output.pdf = ca[2]
 6 | 
 7 | df = read.table(df.file, header=T, quote="\"")
 8 | 
 9 | ggplot(df, aes(x=peak_i, y=cov)) +
10 |     geom_point() +
11 |     scale_x_continuous("Peak index") +
12 |     scale_y_continuous("Coverage") +
13 |     theme_bw() +
14 |     theme(text=element_text(size=20))
15 | 
16 | ggsave(output.pdf)
17 | 


--------------------------------------------------------------------------------
/r/te_bam_coverage_bam.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | ca = commandArgs(trailing=T)
 4 | df.file = ca[1]
 5 | plot.title = ca[2]
 6 | output.pdf = ca[3]
 7 | 
 8 | df = read.table(df.file, header=T)
 9 | 
10 | ggplot(df, aes(x=indexes, y=coverage)) +
11 |  geom_histogram(stat="identity") +
12 |  scale_x_continuous("TE index") +
13 |  scale_y_continuous("") +
14 |  ggtitle(plot.title) +
15 |  theme_bw()
16 | 
17 | ggsave(output.pdf)
18 | 


--------------------------------------------------------------------------------
/r/te_diff_count.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | library(plyr)
 3 | 
 4 | ca = commandArgs(trailing=T)
 5 | df.file = ca[1]
 6 | out.pdf = ca[2]
 7 | scale = as.numeric(ca[3])
 8 | 
 9 | df = read.table(df.file, header=T)
10 | 
11 | ggplot(df, aes(x=TEs, y=stat_mid, ymin=stat_low, ymax=stat_hi)) +
12 |     geom_pointrange() +
13 |     stat_smooth(se=FALSE, color="black", lty=2) +
14 |     scale_y_continuous("log2 fRIP/input") +
15 |     theme_bw() +
16 |     theme(text=element_text(size=(sqrt(scale)*28)))
17 | 
18 | ggsave(out.pdf, scale=scale)
19 | 


--------------------------------------------------------------------------------
/test_template.py:
--------------------------------------------------------------------------------
 1 | #!/user/bin/env python
 2 | 
 3 | ############################################################
 4 | # name
 5 | #
 6 | #
 7 | ############################################################
 8 | import unittest
 9 | 
10 | class Test...(unittest.TestCase):
11 |     def setUp(self):
12 |     	
13 | 
14 |     #def test...(self):
15 |         #self.assert_()
16 |         #self.assertEqual(,)
17 | 
18 | ############################################################
19 | # __main__
20 | ############################################################
21 | if __name__ == '__main__':
22 |     unittest.main()
23 | 


--------------------------------------------------------------------------------
/r/bam_heat_meta.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | ca = commandArgs(trailing=T)
 4 | df.file = ca[1]
 5 | output.pdf = ca[2]
 6 | 
 7 | df = read.table(df.file, header=T, quote="\"")
 8 | 
 9 | if (ncol(df) == 2) {
10 |     gp = ggplot(df, aes(x=Index, y=Coverage))
11 | } else {
12 |     gp = ggplot(df, aes(x=Index, y=Coverage, color=Type)) +
13 |         scale_color_brewer(palette="Set1")
14 | }
15 | 
16 | gp +
17 |     geom_point() +
18 |     geom_smooth() +
19 |     theme_bw() +
20 |     theme(text=element_text(size=16)) +
21 |     theme(legend.justification=c(1,0), legend.position=c(1,0))
22 | 
23 | ggsave(output.pdf)
24 | 


--------------------------------------------------------------------------------
/r/diff_diff_ma.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | ca = commandArgs(trailing=T)
 4 | df.file = ca[1]
 5 | output.pdf = ca[2]
 6 | 
 7 | df = read.table(df.file, header=T, quote="\"")
 8 | 
 9 | x.min = quantile(df$avg, .002)
10 | x.max = quantile(df$avg, .998)
11 | 
12 | y.min = quantile(df$minus, .002)
13 | y.max = quantile(df$minus, .998)
14 | 
15 | ggplot(df, aes(x=avg, y=minus)) +
16 |     geom_point(size=1.5, alpha=.3) +
17 |     scale_x_continuous("Avg test stat", lim=c(x.min,x.max)) +
18 |     scale_y_continuous("Test stat 1 - 2", lim=c(y.min,y.max)) +
19 |     geom_hline(y=0) +
20 |     theme_bw() +
21 |     theme(text=element_text(size=16))
22 | 
23 | ggsave(output.pdf)
24 | 


--------------------------------------------------------------------------------
/r/bam_heat_heat.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | ca = commandArgs(trailing=T)
 4 | df.file = ca[1]
 5 | output.pdf = ca[2]
 6 | control = ca[3]
 7 | 
 8 | df = read.table(df.file, header=T, quote="\"")
 9 | 
10 | gp = ggplot(df, aes(x=Index, y=Feature, fill=Coverage)) +
11 |     geom_tile()
12 | 
13 | if(control == "True") {
14 |     gp = gp + scale_fill_gradient2(low="#377eb8", high="#e41a1c")
15 | } else {
16 |     gp = gp + scale_fill_gradient(low="white", high="#e41a1c")
17 | }
18 | 
19 | gp +
20 |     scale_y_discrete("") +
21 |     theme_bw() +
22 |     theme(text=element_text(size=16)) +
23 |     theme(axis.ticks.y=element_blank(), axis.text.y=element_blank())
24 | 
25 | ggsave(output.pdf)
26 | 


--------------------------------------------------------------------------------
/r/plot_gff_cov_heat.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | ca = commandArgs(trailing=T)
 4 | df.file = ca[1]
 5 | output.pdf = ca[2]
 6 | control = ca[3]
 7 | 
 8 | df = read.table(df.file, header=T, quote="\"")
 9 | 
10 | gp = ggplot(df, aes(x=Index, y=Anchor, fill=Coverage)) +
11 |     geom_tile()
12 | 
13 | if(control == "True") {
14 |     gp = gp + scale_fill_gradient2(low="#377eb8", high="#e41a1c")
15 | } else {
16 |     gp = gp + scale_fill_gradient(low="white", high="#e41a1c")
17 | }
18 | 
19 | gp +
20 |     scale_y_discrete("") +
21 |     theme_bw() +
22 |     theme(text=element_text(size=16)) +
23 |     theme(axis.ticks.y=element_blank(), axis.text.y=element_blank())
24 | 
25 | ggsave(output.pdf)
26 | 


--------------------------------------------------------------------------------
/r/fpkm_fpkm_scatter.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | ca = commandArgs(trailing=T)
 4 | df.file = ca[1]
 5 | output.pdf = ca[2]
 6 | 
 7 | df = read.table(df.file, header=T, quote="\"")
 8 | 
 9 | x.min = quantile(df$fpkm1, .002)
10 | x.max = quantile(df$fpkm1, .998)
11 | 
12 | y.min = quantile(df$fpkm2, .002)
13 | y.max = quantile(df$fpkm2, .998)
14 | 
15 | ggplot(df, aes(x=fpkm1, y=fpkm2)) +
16 |     geom_point(size=1.5, alpha=.3) +
17 |     stat_smooth() +
18 |     scale_x_continuous("FPKM 1", lim=c(x.min,x.max)) +
19 |     scale_y_continuous("FPKM 2", lim=c(y.min,y.max)) +
20 |     geom_abline(intercept=0, slope=1, linetype=2) +
21 |     theme_bw() +
22 |     theme(text=element_text(size=18))
23 | 
24 | ggsave(output.pdf)
25 | 


--------------------------------------------------------------------------------
/template.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | '''
 5 | Name
 6 | 
 7 | Description...
 8 | '''
 9 | 
10 | ################################################################################
11 | # main
12 | ################################################################################
def main():
    """Parse the command line; starting point for a new script."""
    parser = OptionParser('usage: %prog [options] arg')
    # parser.add_option()
    options, args = parser.parse_args()
18 | 
19 | 
20 | ################################################################################
21 | # __main__
22 | ################################################################################
23 | if __name__ == '__main__':
24 |     main()
25 | 


--------------------------------------------------------------------------------
/sym_matrix.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import numpy as np
 3 | 
 4 | ################################################################################
 5 | # sym_matrix.py
 6 | #
 7 | # Space-efficient symmetric matrix class.
 8 | #
 9 | # Indexing adapted from here: http://stackoverflow.com/a/24563079/4114434
10 | ################################################################################
11 | 
class sym_matrix:
    """Symmetric n x n matrix that stores a single triangle.

    Entry (i, j) and entry (j, i) share one cell of the flat array M,
    so only n*(n+1)/2 values are kept.
    """

    def __init__(self, n):
        """Create an all-zero symmetric matrix with n rows and columns."""
        self.n = n
        # one triangle including the diagonal: n*(n+1)/2 cells
        self.M = np.zeros(self.n*(self.n+1)//2)

    def _index(self, i, j):
        """Return the flat position in M shared by (i, j) and (j, i).

        Raises IndexError when either index is outside [0, n); without
        this check such indices would silently alias another cell.
        """
        if not (0 <= i < self.n and 0 <= j < self.n):
            raise IndexError('index (%s,%s) out of range for size %d' % (i, j, self.n))
        if i < j:
            i, j = j, i
        # integer division: (j+1)*j is always even, and a float result
        # cannot be used as an array index
        return j*self.n - (j+1)*j//2 + i

    def get(self, i, j):
        """Return the value stored for the unordered pair (i, j)."""
        return self.M[self._index(i, j)]

    def set(self, i, j, v):
        """Store v for the unordered pair (i, j)."""
        self.M[self._index(i, j)] = v
26 | 


--------------------------------------------------------------------------------
/r/diff_diff_scatter.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | ca = commandArgs(trailing=T)
 4 | df.file = ca[1]
 5 | output.pdf = ca[2]
 6 | 
 7 | df = read.table(df.file, header=T, quote="\"")
 8 | 
 9 | x.min = quantile(df$diff1, .002)
10 | x.max = quantile(df$diff1, .998)
11 | 
12 | y.min = quantile(df$diff2, .002)
13 | y.max = quantile(df$diff2, .998)
14 | 
15 | ggplot(df, aes(x=diff1, y=diff2)) +
16 |     geom_point(size=1.5, alpha=.3) +
17 |     stat_smooth(method="lm") +
18 |     scale_x_continuous("Test stat 1", lim=c(x.min,x.max)) +
19 |     scale_y_continuous("Test stat 2", lim=c(y.min,y.max)) +
20 |     geom_abline(intercept=0, slope=1, linetype=2) +
21 |     theme_bw() +
22 |     theme(text=element_text(size=25))
23 | 
24 | # stat_smooth(method="lm") +
25 | 
26 | ggsave(output.pdf)
27 | 


--------------------------------------------------------------------------------
/r/annotation_pie_ratios.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | ca = commandArgs(trailing=T)
 4 | df.file = ca[1]
 5 | plot.title = ca[2]
 6 | output.pdf = ca[3]
 7 | 
 8 | df = read.table(df.file, header=T, quote="\"")
 9 | 
10 | annotation.order.all = c('Intergenic','Introns','3\'UTR','5\'UTR','CDS','lncRNA','Pseudogene','rRNA','smallRNA')
11 | annotation.order = annotation.order.all[annotation.order.all %in% df$annotation]
12 | df$annotation = factor(df$annotation, levels=annotation.order)
13 | 
14 | ggplot(df, aes(x=annotation, y=ratio)) +
15 |  geom_bar(stat="identity") +
16 |  scale_x_discrete("Annotation") +
17 |  scale_y_continuous("log2 feature% / length%") +
18 |  ggtitle(plot.title) +
19 |  theme_bw() +
20 |  theme(panel.grid.minor=element_blank(), panel.grid.major=element_blank())
21 | 
22 | ggsave(output.pdf)
23 | 


--------------------------------------------------------------------------------
/template_sci.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | import h5py
 5 | import numpy as np
 6 | 
 7 | import matplotlib.pyplot as plt
 8 | import seaborn as sns
 9 | 
10 | '''
11 | Name
12 | 
13 | Description...
14 | '''
15 | 
16 | ################################################################################
17 | # main
18 | ################################################################################
def main():
    """Parse the command line; starting point for a new analysis script."""
    parser = OptionParser('usage: %prog [options] arg')
    # parser.add_option()
    options, args = parser.parse_args()
24 | 
25 | 
26 | ################################################################################
27 | # __main__
28 | ################################################################################
29 | if __name__ == '__main__':
30 |     main()
31 | 


--------------------------------------------------------------------------------
/r/cuff_2d.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | ca = commandArgs(trailing=T)
 4 | df.file = ca[1]
 5 | output.pdf = ca[2]
 6 | square = ca[3]
 7 | 
 8 | df = read.table(df.file, header=T, quote="\"")
 9 | 
10 | if (square == "True") {
11 | 	d1.span = max(df$D1) - min(df$D1)
12 | 	d2.span = max(df$D2) - min(df$D2)
13 | 	plot.ratio = d1.span/d2.span
14 | } else {
15 | 	plot.ratio = 1
16 | }
17 | 
18 | 
19 | ggplot(df, aes(x=D1, y=D2, label=Label, color=Sample)) +
20 | 	geom_point(size=3, alpha=0.8) +
21 | 	scale_x_continuous("") +
22 | 	scale_y_continuous("") +
23 | 	scale_color_discrete("") +
24 | 	theme_bw() +
25 |     theme(text=element_text(size=22)) +
26 |     coord_fixed(ratio=plot.ratio) +
27 |     theme(legend.justification=c(1,0), legend.position=c(1,0))
28 |     
29 | ggsave(output.pdf)
30 | 
31 | # theme(legend.justification=c(1,0), legend.position=c(1,0))
32 | # geom_text(size=5) +
33 | 


--------------------------------------------------------------------------------
/parallel_template.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import os, sys
 3 | 
 4 | ############################################################
 5 | # name
 6 | #
 7 | # description
 8 | ############################################################
 9 | 
10 | ############################################################
11 | # main
12 | ############################################################
def main(cpu_id, num_cpus):
    """Do this worker's share of the job.

    cpu_id is the index of this worker and num_cpus the total number of
    workers launched, as passed on the command line.
    """
    pass


############################################################
# __main__
############################################################
if __name__ == '__main__':
    if len(sys.argv) == 3 and sys.argv[1] == '--launch':
        # local import: the file-level import line is outside this block
        import subprocess

        n = int(sys.argv[2])
        # re-run this very script once per worker, whatever the file has
        # been renamed to; the children are not waited for, so they run
        # in the background
        script = os.path.abspath(sys.argv[0])
        for i in range(n):
            subprocess.Popen([sys.executable, script, str(i), str(n)])
    elif len(sys.argv) == 3:
        main(int(sys.argv[1]), int(sys.argv[2]))
    else:
        sys.stderr.write('usage: %s --launch <num_cpus> | <cpu_id> <num_cpus>\n' % sys.argv[0])
        sys.exit(1)
26 | 


--------------------------------------------------------------------------------
/r/annotation_pie_pie.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | ca = commandArgs(trailing=T)
 4 | df.file = ca[1]
 5 | output.pdf = ca[2]
 6 | 
 7 | df = read.table(df.file, header=T, quote="\"")
 8 | 
 9 | annotation.order.all = c('Intergenic','Introns','Exons','3\'UTR','5\'UTR','CDS','lncRNA','Pseudogene','rRNA','smallRNA')
10 | annotation.order = annotation.order.all[annotation.order.all %in% df$annotation]
11 | df$annotation = factor(df$annotation, levels=annotation.order)
12 | 
13 | ggplot(df, aes(x=dummy, y=count, fill=annotation)) +
14 |  geom_bar(stat="identity", width=1) +
15 |  coord_polar(theta="y") +
16 |  scale_x_discrete("") +
17 |  scale_y_continuous("") +
18 |  scale_fill_discrete("Annotation") +
19 |  theme_bw() +  
20 |  theme(axis.text.x=element_blank(), axis.text.y=element_blank(), axis.ticks=element_blank(), panel.grid.minor=element_blank(), panel.grid.major=element_blank())
21 | 
22 | ggsave(output.pdf)
23 | 


--------------------------------------------------------------------------------
/r/cuff_rep_cor.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | library(reshape2)
 3 | 
 4 | ca = commandArgs(trailing=T)
 5 | df.file = ca[1]
 6 | output.pdf = ca[2]
 7 | 
 8 | df = read.table(df.file, header=T, quote="\"")
 9 | 
10 | # this is broken
11 | 
12 | df$dist = 1 - df$Correlation
13 | sample12.matrix = acast(df, Sample1 ~ Sample2, value.var="dist", fill=0)
14 | sample21.matrix = acast(df, Sample2 ~ Sample1, value.var="dist", fill=0)
15 | sample.dist = as.dist(sample12.matrix+sample21.matrix)
16 | sample.clust = hclust(sample.dist)
17 | sample.order = rownames(sample12.matrix)[sample.clust$order]
18 | sample.order = rownames(sample12.matrix)
19 | 
20 | ggplot(df, aes(x=Sample1, y=Sample2, fill=Correlation)) +
21 |  geom_tile() +
22 |  scale_x_discrete("", limits=sample.order) + 
23 |  scale_y_discrete("", limits=sample.order) +
24 |  scale_fill_gradient(low="white", high="darkred") +
25 |  theme_bw()
26 | 
27 | ggsave(output.pdf)
28 | 


--------------------------------------------------------------------------------
/r/te_diff.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | ca = commandArgs(trailing=T)
 4 | df.file = ca[1]
 5 | output.pdf = ca[2]
 6 | scale = as.numeric(ca[3])
 7 | 
 8 | df = read.table(df.file, header=T, quote="\"")
 9 | 
10 | xmin = quantile(df$diff, .005, na.rm=T)
11 | xmax = quantile(df$diff, .995, na.rm=T)
12 | 
13 | ggplot(df, aes(x=diff, color=class)) +
14 |     stat_ecdf(size=(sqrt(scale)*1.5), alpha=0.8, na.rm=T) +
15 |     scale_x_continuous("log2 fRIP/Input", limits=c(xmin,xmax)) +
16 |     scale_y_continuous("") +
17 |     scale_color_manual("", values=c("#F46D43", "#66BD63")) +
18 |     theme_bw() +
19 |     theme(text=element_text(size=(sqrt(scale)*28))) +
20 |     theme(legend.justification=c(1,0), legend.position=c(1,0))
21 | 
22 | #     scale_x_continuous("Differential expression test statistic", limits=c(xmin,xmax)) +
23 | #     scale_color_brewer("", palette="Set1") +
24 | 
25 | ggsave(output.pdf, scale=scale)
26 | 


--------------------------------------------------------------------------------
/r/cuff_bar.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | library(RColorBrewer)
 3 | 
 4 | ca = commandArgs(trailing=T)
 5 | df.file = ca[1]
 6 | y.min = as.numeric(ca[2])
 7 | y.max = ca[3]
 8 | output.pdf = ca[4]
 9 | 
10 | df = read.table(df.file, header=T, quote="\"")
11 | 
12 | if (y.max == "None") {
13 | 	y.max = max(df$conf_hi)
14 | } else {
15 | 	y.max = as.numeric(y.max)
16 | }
17 | 
18 | color.count = length(unique(df$Sample))
19 | get.pal = colorRampPalette(brewer.pal(11, "Spectral"))
20 | 
21 | ggplot(df, aes(x=Sample, y=FPKM, ymin=conf_lo, ymax=conf_hi, fill=Sample)) +
22 | 	geom_bar(stat="identity", fill=get.pal(color.count)) +
23 | 	geom_errorbar(width=0.5) +
24 | 	scale_x_discrete("", limits=df$Sample) +
25 | 	guides(fill=FALSE) +
26 | 	theme_bw() +
27 | 	theme(text=element_text(size=22)) +
28 | 	theme(axis.text.x=element_text(angle=315, hjust=0, vjust=1))
29 | 
30 | # 	scale_fill_brewer(palette="Spectral") +
31 | 
32 | ggsave(output.pdf)
33 | 


--------------------------------------------------------------------------------
/r/cuff_scatter.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | ca = commandArgs(trailing=T)
 4 | df.file = ca[1]
 5 | output.pdf = ca[2]
 6 | cond1 = ca[3]
 7 | cond2 = ca[4]
 8 | pseudocount = as.numeric(ca[5])
 9 | 
10 | df = read.table(df.file, header=T, quote="\"")
11 | 
12 | fpkm.min = log2(pseudocount)
13 | 
14 | fpkm.max1 = quantile(df$fpkm1, .997)
15 | fpkm.max2 = quantile(df$fpkm2, .997)
16 | fpkm.max = max(fpkm.max1, fpkm.max2)
17 | 
18 | qval.unique = unique(df$qval)
19 | if (length(qval.unique) == 1) {
20 |     gp = ggplot(df, aes(x=fpkm1, y=fpkm2))
21 | } else {
22 |     gp = ggplot(df, aes(x=fpkm1, y=fpkm2, colour=qval))
23 | }
24 | 
25 | gp +
26 |     geom_point(size=1.5, alpha=.3) +
27 |     scale_x_continuous(paste(cond1, "log2 FPKM"), lim=c(fpkm.min,fpkm.max)) +
28 |     scale_y_continuous(paste(cond2, "log2 FPKM"), lim=c(fpkm.min,fpkm.max)) +
29 |     geom_abline(intercept=0, slope=1, linetype=2) +
30 |     theme_bw() +
31 |     theme(text=element_text(size=16))
32 | 
33 | #    ggtitle(paste(cond1, "vs", cond2)) +
34 | 
35 | ggsave(output.pdf)
36 | 


--------------------------------------------------------------------------------
/clear_slurm.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import os, glob
 4 | 
 5 | '''
 6 | clear_slurm
 7 | 
 8 | Helper script to clear out slurm log files.
 9 | '''
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Delete slurm*.out files in the current directory.

    With a <file_size> argument only files of exactly that many bytes
    are removed; without it every matching file is removed.
    """
    parser = OptionParser('usage: %prog [options] <file_size>')
    # parser.add_option()
    options, args = parser.parse_args()

    # no size argument means no size filter
    file_size = int(args[0]) if len(args) > 0 else None

    for log_path in glob.glob('slurm*.out'):
        if file_size is not None and os.path.getsize(log_path) != file_size:
            continue
        os.remove(log_path)
27 |     
28 | 
29 | ################################################################################
30 | # __main__
31 | ################################################################################
32 | if __name__ == '__main__':
33 |     main()
34 | 


--------------------------------------------------------------------------------
/pygene_utrs.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | import pygene
 5 | 
 6 | '''
 7 | Name
 8 | 
 9 | Description...
10 | '''
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
  """Read a GTF with pygene and write it back with CDS and UTR output enabled.

  Command line: <in_gtf_file> <out_gtf_file>.
  """
  usage = 'usage: %prog [options] <in_gtf_file> <out_gtf_file>'
  parser = OptionParser(usage)
  (options,args) = parser.parse_args()

  if len(args) != 2:
    parser.error('Must provide input and output GTFs')
  in_gtf_file, out_gtf_file = args

  gtf = pygene.GTF(in_gtf_file)

  # write_gtf is handed the output path, as before. The extra open() that
  # used to wrap this call only truncated the file through a handle that
  # was never used, so it is gone.
  # NOTE(review): assumes write_gtf opens the path itself - confirm in pygene.
  gtf.write_gtf(out_gtf_file, write_cds=True, write_utrs=True)
30 | 
31 | 
32 | ################################################################################
33 | # __main__
34 | ################################################################################
35 | if __name__ == '__main__':
36 |   main()
37 | 


--------------------------------------------------------------------------------
/gtf2utrs.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | ################################################################################
 5 | # gtf2utrs.py
 6 | #
 7 | # Take a gtf file with exons and CDS annotated and return a gtf of the UTRs.
 8 | ################################################################################
 9 | 
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Validate the command line: exactly one GTF path is required.

    Nothing is done with the file after the argument check.
    """
    parser = OptionParser('usage: %prog [options] <gtf file>')
    # parser.add_option()
    options, args = parser.parse_args()

    # parser.error prints the message and exits
    if len(args) != 1:
        parser.error('Must provide gtf file')
    gtf_file = args[0]
24 |     
25 | 
26 | ################################################################################
27 | # __main__
28 | ################################################################################
29 | if __name__ == '__main__':
30 |     main()
31 | 


--------------------------------------------------------------------------------
/vcf2vds.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import os
 4 | import shutil
 5 | from hail import *
 6 | 
 7 | '''
 8 | Name
 9 | 
10 | Description...
11 | '''
12 | 
13 | ################################################################################
14 | # main
15 | ################################################################################
def main():
    """Import a VCF through a HailContext and write it to the VDS path.

    Command line: <vcf_file> <vds_file>. A directory already present at
    the VDS path is deleted first.
    """
    parser = OptionParser('usage: %prog [options] <vcf_file> <vds_file>')
    # parser.add_option()
    options, args = parser.parse_args()

    # parser.error prints the message and exits
    if len(args) != 2:
        parser.error('Must provide VCF and VDS files')
    vcf_file, vds_file = args

    # clear out a previous output directory
    if os.path.isdir(vds_file):
        shutil.rmtree(vds_file)

    hail_context = HailContext()
    imported = hail_context.import_vcf(vcf_file)
    imported.write(vds_file)
33 | 
34 | 
35 | ################################################################################
36 | # __main__
37 | ################################################################################
38 | if __name__ == '__main__':
39 |     main()
40 | 


--------------------------------------------------------------------------------
/fasta_upper.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | '''
 5 | Name
 6 | 
 7 | Description...
 8 | '''
 9 | 
10 | ################################################################################
11 | # main
12 | ################################################################################
def main():
    """Copy a FASTA file, upper-casing every line that is not a header.

    Command line: <in_fasta_file> <out_fasta_file>. Lines starting with
    '>' are copied unchanged; all other lines are written upper-cased.
    """
    usage = 'usage: %prog [options] <in_fasta_file> <out_fasta_file>'
    parser = OptionParser(usage)
    #parser.add_option()
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input and output FASTA files')
    in_fasta_file, out_fasta_file = args

    # context managers close both files even if reading or writing fails
    with open(in_fasta_file) as in_fasta_open, open(out_fasta_file, 'w') as out_fasta_open:
        for line in in_fasta_open:
            if line.startswith('>'):
                out_fasta_open.write(line)
            else:
                out_fasta_open.write(line.upper())
30 | 
31 | 
32 | ################################################################################
33 | # __main__
34 | ################################################################################
35 | if __name__ == '__main__':
36 |     main()
37 | 


--------------------------------------------------------------------------------
/vcf2bed.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | '''
 5 | vcf2bed.py
 6 | 
 7 | Simple VCF to BED converter.
 8 | '''
 9 | 
10 | ################################################################################
11 | # main
12 | ################################################################################
def main():
    """Convert a VCF file to a four-column BED file of variant positions.

    Every non-header VCF record becomes one BED line: chromosome, 0-based
    start (POS - 1), end (start + 1) and the VCF ID column as the name.
    Each record is emitted as a single-base interval regardless of the
    length of its REF allele.
    """
    usage = 'usage: %prog [options] <vcf_file> <bed_file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input VCF and output BED files')
    vcf_file, bed_file = args

    with open(vcf_file) as vcf_in, open(bed_file, 'w') as bed_out:
        for line in vcf_in:
            # skip meta-information/header lines and blank lines
            if line.startswith('#') or not line.strip():
                continue

            a = line.rstrip('\n').split('\t')
            chrm = a[0]
            pos = int(a[1])
            name = a[2]

            # VCF positions are 1-based; BED starts are 0-based
            start = pos - 1
            end = start + 1
            print('%s\t%d\t%d\t%s' % (chrm, start, end, name), file=bed_out)
35 | 
36 | 
37 | ################################################################################
38 | # __main__
39 | ################################################################################
40 | if __name__ == '__main__':
41 |     main()
42 | 


--------------------------------------------------------------------------------
/r/diff_summary.r:
--------------------------------------------------------------------------------
 1 | library(cummeRbund)
 2 | 
 3 | cuff = readCufflinks()
 4 | 
 5 | ############################################
 6 | # density plot
 7 | ############################################
 8 | csDensity(genes(cuff), rep=T, pseudocount=0.1)
 9 | ggsave("density.pdf")
10 | 
11 | ############################################
12 | # dendrogram
13 | ############################################
14 | pdf("dendro.pdf")
15 | csDendro(genes(cuff), rep=T, pseudocount=1)
16 | dev.off()
17 | 
18 | ############################################
19 | # MDS
20 | ############################################
21 | MDSplot(genes(cuff), rep=T, pseudocount=1) +
22 | 	coord_fixed() +
23 | 	theme_bw() +
24 |     theme(text=element_text(size=15))
25 | 
26 | ggsave("mds.pdf")
27 | 
28 | ############################################
29 | # PCA
30 | ############################################
31 | PCAplot(genes(cuff), x="PC1", y="PC2", rep=T, pseudocount=1) +
32 | 	coord_fixed() +
33 | 	theme_bw() +
34 |     theme(text=element_text(size=15))
35 | 
36 | ggsave("pca12.pdf")
37 | 
38 | PCAplot(genes(cuff), x="PC2", y="PC3", rep=T, pseudocount=1) +
39 | 	coord_fixed() +
40 | 	theme_bw() +
41 |     theme(text=element_text(size=15))
42 | 
43 | ggsave("pca23.pdf")
44 | 


--------------------------------------------------------------------------------
/h5_sum.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import h5py
 4 | 
 5 | '''
h5_sum.py

Print a summary tree of the groups and datasets in an HDF5 file.
 9 | '''
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Print the member hierarchy of the HDF5 file named on the command line.

    Exits with a usage error unless exactly one positional argument is given.
    """
    usage = 'usage: %prog [options] <h5_file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide HDF5 file')

    # the context manager closes the file even if the traversal raises
    with h5py.File(args[0], 'r') as h5_in:
        print_h5_tree(h5_in)
28 | 
def print_h5_tree(h5_obj, depth=0):
    """Recursively print every member of an HDF5 group, one per line.

    Members are visited in sorted key order. Each printed member is preceded
    by `depth` tab characters; sub-groups are descended into with depth + 1.

    Args:
      h5_obj: open h5py File or Group whose members are listed.
      depth: current nesting level, used only for indentation.
    """
    indent = '\t' * depth
    for hkey in sorted(h5_obj.keys()):
        # look the member up once rather than on every use
        member = h5_obj[hkey]
        print(indent, member)

        # public h5py.Group instead of the private h5py._hl module path
        if isinstance(member, h5py.Group):
            print_h5_tree(member, depth+1)
35 |     
36 | ################################################################################
37 | # __main__
38 | ################################################################################
39 | if __name__ == '__main__':
40 |     main()
41 | 


--------------------------------------------------------------------------------
/explore.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 1,
 6 |    "metadata": {
 7 |     "collapsed": true
 8 |    },
 9 |    "outputs": [],
10 |    "source": [
11 |     "%matplotlib inline"
12 |    ]
13 |   },
14 |   {
15 |    "cell_type": "code",
16 |    "execution_count": 2,
17 |    "metadata": {
18 |     "collapsed": true
19 |    },
20 |    "outputs": [],
21 |    "source": [
22 |     "import numpy as np\n",
23 |     "import pandas as pd\n",
24 |     "import matplotlib.pyplot as plt\n",
25 |     "import seaborn as sns"
26 |    ]
27 |   },
28 |   {
29 |    "cell_type": "code",
30 |    "execution_count": null,
31 |    "metadata": {
32 |     "collapsed": true
33 |    },
34 |    "outputs": [],
35 |    "source": []
36 |   }
37 |  ],
38 |  "metadata": {
39 |   "kernelspec": {
40 |    "display_name": "Python [default]",
41 |    "language": "python",
42 |    "name": "python3"
43 |   },
44 |   "language_info": {
45 |    "codemirror_mode": {
46 |     "name": "ipython",
47 |     "version": 3
48 |    },
49 |    "file_extension": ".py",
50 |    "mimetype": "text/x-python",
51 |    "name": "python",
52 |    "nbconvert_exporter": "python",
53 |    "pygments_lexer": "ipython3",
54 |    "version": "3.5.2"
55 |   }
56 |  },
57 |  "nbformat": 4,
58 |  "nbformat_minor": 2
59 | }
60 | 


--------------------------------------------------------------------------------
/fasta_genome.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | import pysam
 5 | 
 6 | '''
fasta_genome.py

Write a "genome" file of sequence names and lengths for a FASTA file.
10 | '''
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Write one 'name<TAB>length' line per FASTA reference to a genome file."""
    usage = 'usage: %prog [options] <fasta> <genome>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    # parser.error exits, so everything below runs only with two arguments
    if len(args) != 2:
        parser.error('Must provide input FASTA and output genome files.')

    fasta_file, genome_file = args

    fasta_open = pysam.Fastafile(fasta_file)
    genome_open = open(genome_file, 'w')

    # references are written in the order pysam reports them
    for ref_name in fasta_open.references:
        record = '%s\t%d' % (ref_name, fasta_open.get_reference_length(ref_name))
        genome_open.write(record + '\n')

    fasta_open.close()
    genome_open.close()
36 | 
37 | 
38 | ################################################################################
39 | # __main__
40 | ################################################################################
41 | if __name__ == '__main__':
42 |     main()
43 | 


--------------------------------------------------------------------------------
/r/peaks_diff_compare.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | ca = commandArgs(trailing=T)
 4 | df.file = ca[1]
 5 | out.pre = ca[2]
 6 | rbp = ca[3]
 7 | tstat = ca[4]
 8 | 
 9 | df = read.table(df.file, header=T, quote="\"")
10 | 
11 | x.min = quantile(df$RIP, .005, na.rm=T)
12 | x.max = quantile(df$RIP, .995, na.rm=T)
13 | 
14 | if(tstat == "True") {
15 |     x.lab = paste(rbp, "fRIP/input diff stat")
16 | } else {
17 |     x.lab = paste(rbp, "log2 fRIP/input")
18 | }
19 | 
20 | ggplot(df, aes(x=RIP, color=CLIP)) +
21 |     geom_line(stat="density", size=1.5, alpha=0.8) +
22 |     scale_x_continuous(x.lab, limits=c(x.min,x.max)) +
23 |     scale_color_brewer(palette="Set1") +
24 |     theme_bw() +
25 |     theme(text=element_text(size=25)) +
26 |     theme(legend.justification=c(1,1), legend.position=c(1,1))
27 | 
28 | out.pdf = paste(out.pre, "_dens.pdf", sep="")
29 | ggsave(out.pdf)
30 | 
31 | 
32 | ggplot(df, aes(x=RIP, color=CLIP)) +
33 |     stat_ecdf(size=1.5, alpha=0.8) +
34 |     scale_x_continuous(x.lab, limits=c(x.min,x.max)) +
35 |     scale_y_continuous("") +
36 |     scale_color_brewer(palette="Set1") +
37 |     theme_bw() +
38 |     theme(text=element_text(size=25)) +
39 |     theme(legend.justification=c(1,0), legend.position=c(1,0))
40 | 
41 | out.pdf = paste(out.pre, "_cdf.pdf", sep="")
42 | ggsave(out.pdf)
43 | 


--------------------------------------------------------------------------------
/rm_nonxs.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import os, subprocess
 4 | 
 5 | ################################################################################
 6 | # rm_nonxs.py
 7 | #
 8 | # Check for BAM files where I changed the XS tags, but left the original, 
 9 | # and delete the original.
10 | ################################################################################
11 | 
12 | 
13 | ################################################################################
14 | # main
15 | ################################################################################
def main():
    """Delete each accepted_hits.bam that sits beside an accepted_hits_xs.bam.

    Walks the directory tree rooted at the current working directory. For
    every directory containing a file named accepted_hits_xs.bam, the sibling
    accepted_hits.bam is removed if it exists, and an 'rm <path>' line is
    printed for each removal.
    """
    usage = 'usage: %prog [options]'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    for dir_path, _dir_names, file_names in os.walk('.'):
        if 'accepted_hits_xs.bam' not in file_names:
            continue

        # build the sibling path from the file name only, so directory
        # names that happen to contain '_xs' are left intact
        bam = os.path.join(dir_path, 'accepted_hits.bam')
        if os.path.isfile(bam):
            print('rm %s' % bam)
            os.remove(bam)
30 |     
31 | 
32 | ################################################################################
33 | # __main__
34 | ################################################################################
35 | if __name__ == '__main__':
36 |     main()
37 | 


--------------------------------------------------------------------------------
/r/cuff_heat.r:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | library(reshape2)
 3 | library(seriation)
 4 | library(RColorBrewer)
 5 | 
 6 | ca = commandArgs(trailing=T)
 7 | df.file = ca[1]
 8 | output.pdf = ca[2]
 9 | 
10 | df = read.table(df.file, header=T, quote="\"")
11 | 
12 | fpkm.matrix = acast(df, Gene ~ Sample, value.var="FPKM")
13 | 
14 | gene.dist = dist(fpkm.matrix)
15 | #gene.clust = hclust(gene.dist)
16 | #gene.order = rownames(fpkm.matrix)[gene.clust$order]
17 | gene.ser = seriate(gene.dist, method="OLO")
18 | gene.order = rownames(fpkm.matrix)[get_order(gene.ser)]
19 | 
20 | sample.dist = dist(t(fpkm.matrix))
21 | #sample.clust = hclust(sample.dist)
22 | #sample.order = colnames(fpkm.matrix)[sample.clust$order]
23 | sample.ser = seriate(sample.dist, method="OLO")
24 | sample.order = colnames(fpkm.matrix)[get_order(sample.ser)]
25 | 
26 | ggplot(df, aes(x=Sample, y=Gene, fill=FPKM)) +
27 |     geom_tile() +
28 |     scale_x_discrete("", limits=sample.order) + 
29 |     scale_y_discrete(limits=gene.order) +
30 |     scale_fill_gradientn("FPKM", colours=c("white", brewer.pal(8, "YlOrRd"))) +
31 |     theme_bw() +
32 |     theme(text=element_text(size=20)) +
33 |     theme(axis.text.x=element_text(angle=315, hjust=0, vjust=1), axis.ticks.y=element_blank(), axis.text.y=element_blank())
34 | 
35 | ggsave(output.pdf)
36 | 
37 | # scale_fill_gradientn("FPKM", colours=c("white", brewer.pal(8, "YOrRd"))) +
38 | 


--------------------------------------------------------------------------------
/bam_unique.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import pysam
 4 | 
 5 | ################################################################################
 6 | # bam_unique.py
 7 | #
 8 | # Remove multi-mapping alignments from a BAM file
 9 | ################################################################################
10 | 
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Copy the alignments whose NH tag equals 1 into a new BAM file.

    The output BAM reuses the input BAM as its header template.
    """
    usage = 'usage: %prog [options] <input_bam> <output_bam>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    # parser.error exits, so the unpacking below only sees two arguments
    if len(args) != 2:
        parser.error('Must provide input and output BAMs')

    input_bam, output_bam = args

    bam_in = pysam.Samfile(input_bam, 'rb')
    bam_out = pysam.Samfile(output_bam, 'wb', template=bam_in)

    for alignment in bam_in:
        # NH other than 1 marks the alignment as not unique; drop it
        if alignment.get_tag('NH') != 1:
            continue
        bam_out.write(alignment)

    bam_in.close()
    bam_out.close()
35 | 
36 | ################################################################################
37 | # __main__
38 | ################################################################################
39 | if __name__ == '__main__':
40 |     main()
41 | 


--------------------------------------------------------------------------------
/make_fasta_genome.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import pdb
 4 | 
 5 | import numpy as np
 6 | import pysam
 7 | 
 8 | '''
 9 | make_fasta_genome.py
10 | 
11 | Make a "genome" file, with chromosome names and lengths.
12 | '''
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Write a genome file of FASTA sequence names and lengths, longest first."""
    usage = 'usage: %prog [options] <fasta_file> <genome_file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    # parser.error exits, so the unpacking below only sees two arguments
    if len(args) != 2:
        parser.error('Must provide input FASTA file and output genome file')

    fasta_file, genome_file = args

    genome_out = open(genome_file, 'w')
    fasta_open = pysam.Fastafile(fasta_file)

    # argsort is ascending; reversing it puts the longest sequence first
    longest_first = np.argsort(fasta_open.lengths)[::-1]

    for ref_i in longest_first:
        record = '%s\t%d' % (fasta_open.references[ref_i], fasta_open.lengths[ref_i])
        genome_out.write(record + '\n')

    fasta_open.close()
    genome_out.close()
39 | 
40 | 
41 | ################################################################################
42 | # __main__
43 | ################################################################################
44 | if __name__ == '__main__':
45 |     main()
46 | 


--------------------------------------------------------------------------------
/clean_csv.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | ################################################################################
 5 | # clean_csv.py
 6 | #
 7 | # Clean up an excel-saved .csv file with \r's and commas.
 8 | ################################################################################
 9 | 
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Print a comma-separated file to stdout as tab-separated text.

    Every line is split on commas (no quote handling), trailing whitespace is
    stripped from the last field, and the fields are re-joined with tabs.
    Carriage-return, line-feed and CR+LF line endings are all accepted.
    """
    import io

    usage = 'usage: %prog [options] <csv_file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide csv file')

    # newline=None turns on universal newlines, so '\r', '\r\n' and '\n'
    # endings are each seen as a line break and every row is processed
    with io.open(args[0], newline=None) as file_in:
        for line in file_in:
            fields = line.split(',')
            fields[-1] = fields[-1].rstrip()
            print('\t'.join(fields))
38 |         
39 |     
40 | 
41 | ################################################################################
42 | # __main__
43 | ################################################################################
44 | if __name__ == '__main__':
45 |     main()
46 | 


--------------------------------------------------------------------------------
/transid2geneid.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import gff
 4 | 
 5 | ################################################################################
 6 | # transid2geneid.py
 7 | #
 8 | # Given a transcript id, produce a gene id to punch into the browser
 9 | ################################################################################
10 | 
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Print the gene_id of the first catalog entry with the given transcript id.

    Scans the GTF named by -l line by line, parses column nine with
    gff.gtf_kv, and stops at the first record whose transcript_id matches.
    Nothing is printed when no record matches.
    """
    usage = 'usage: %prog [options] <trans id>'
    parser = OptionParser(usage)
    parser.add_option('-l', dest='lnc_file', default='/Users/dk/research/common/data/lncrna/lnc_catalog.gtf', help='lncRNA catalog file [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide transcript id')
    trans_id = args[0]

    with open(options.lnc_file) as lnc_in:
        for line in lnc_in:
            a = line.split('\t')

            # comment, blank or truncated lines have no attribute column
            if line.startswith('#') or len(a) < 9:
                continue

            kv = gff.gtf_kv(a[8])

            # records without a transcript_id cannot match; skip them
            # instead of failing on the missing key
            if 'transcript_id' in kv and kv['transcript_id'] == trans_id:
                print(kv['gene_id'])
                break
32 | 
33 | 
34 | ################################################################################
35 | # __main__
36 | ################################################################################
37 | if __name__ == '__main__':
38 |     main()
39 | 


--------------------------------------------------------------------------------
/possum2bed.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | ################################################################################
 5 | # possum2bed.py
 6 | #
 7 | #
 8 | ################################################################################
 9 | 
10 | ################################################################################
11 | # main
12 | ################################################################################
def main():
    """Print the motif hits of a Possum output file as six-column BED lines.

    Columns read from each tab-separated input line: 0 (motif id), 5 (start),
    6 (length), 7 (orientation; 'fn' maps to '+', anything else to '-'),
    9 (score) and 16 (sequence name, truncated at its first '.').
    """
    usage = 'usage: %prog [options] <possum_file>'
    parser = OptionParser(usage)
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide possum output file')
    possum_file = args[0]

    with open(possum_file) as possum_in:
        for line in possum_in:
            if not line.strip():
                continue

            a = line.rstrip('\n').split('\t')

            tf_id = a[0]
            # NOTE(review): start is shifted by +1, yet BED starts are
            # conventionally 0-based; confirm Possum's coordinate convention
            start = int(a[5])+1
            end = start+int(a[6])-1
            fnrc = a[7]
            score = a[9]

            # partition keeps the whole name when it contains no '.'
            seq_id = a[16].partition('.')[0]

            strand = '+' if fnrc == 'fn' else '-'

            out_a = [seq_id, str(start), str(end), tf_id, score, strand]
            print('\t'.join(out_a))
41 | 
42 | 
43 | ################################################################################
44 | # __main__
45 | ################################################################################
46 | if __name__ == '__main__':
47 |     main()
48 | 


--------------------------------------------------------------------------------
/mess2fasta.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | ################################################################################
# mess2fasta.py
 6 | #
 7 | # Convert a mess of nt's and numbers and spaces into a neat fasta file
 8 | ################################################################################
 9 | 
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Print a single FASTA record built from the nucleotides in a messy file.

    Every character of the input that is not one of A, C, G, T, N (either
    case) is dropped; the remaining characters are concatenated into one
    sequence and printed under the header given by -d/--header. With
    -u/--upper the sequence is upper-cased.
    """
    usage = 'usage: %prog [options] <mess file>'
    parser = OptionParser(usage)
    parser.add_option('-d','--header', dest='header', default='mess', help='Fasta header [Default: %default]')
    parser.add_option('-u', '--upper', dest='upper', action='store_true', default=False, help='Uppercase all nucleotides [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide mess file')

    allowed_nts = set('ACGTNacgtn')

    # a single join avoids repeated string concatenation on large inputs
    with open(args[0]) as mess_in:
        seq = ''.join(nt for line in mess_in for nt in line if nt in allowed_nts)

    if options.upper:
        seq = seq.upper()

    print('>%s\n%s' % (options.header, seq))
31 |     
32 | 
33 | ################################################################################
34 | # __main__
35 | ################################################################################
36 | if __name__ == '__main__':
37 |     main()
38 | 


--------------------------------------------------------------------------------
/gff2bed.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from __future__ import print_function
 3 | from optparse import OptionParser
 4 | import gff, sys
 5 | 
 6 | ################################################################################
 7 | # gff2bed.py
 8 | #
 9 | # Convert a gff file to a bed file. Each entry is converted independently,
10 | # so no blocks.
11 | ################################################################################
12 | 
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Print each GFF record as an independent single-block BED12 line.

    Reads the GFF from the given path, or from stdin when the argument is
    '-'. The BED name is the GFF feature type (column three), the start is
    shifted to 0-based, and the single block spans the whole feature.
    """
    usage = 'usage: %prog [options] <gff file>'
    parser = OptionParser(usage)
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide gff file')

    gff_open = sys.stdin if args[0] == '-' else open(args[0])

    try:
        for line in gff_open:
            # '#' covers '##' directives as well as ordinary comment lines
            if line.startswith('#') or not line.strip():
                continue

            a = line.split('\t')
            start = int(a[3])
            end = int(a[4])

            cols = [a[0], str(start-1), a[4], a[2], '0', a[6], '0', '0', '255,0,0', '1', str(end-start+1), '0']
            print('\t'.join(cols))
    finally:
        # never close stdin; only the file this function opened
        if gff_open is not sys.stdin:
            gff_open.close()
37 |     
38 | 
39 | ################################################################################
40 | # __main__
41 | ################################################################################
42 | if __name__ == '__main__':
43 |     main()
44 | 


--------------------------------------------------------------------------------
/rm2bed.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import gzip
 4 | 
 5 | '''
 6 | rm2bed.py
 7 | 
 8 | Convert RepeatMasker .out format to BED.
 9 | '''
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Print the records of a RepeatMasker .out file as six-column BED lines.

    The first three lines of the input are discarded (treated as the header).
    For each remaining non-blank line the whitespace-separated fields 4, 5
    and 6 give the sequence, start (shifted to 0-based) and end; field 8
    gives the strand ('+' stays '+', anything else becomes '-'); the BED
    name is 'family;repeat' from fields 10 and 9. Paths ending in 'gz' are
    read through gzip.
    """
    usage = 'usage: %prog [options] <rm out>'
    parser = OptionParser(usage)
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide RepeatMasker .out file')

    rm_file = args[0]
    if rm_file.endswith('gz'):
        rm_in = gzip.open(rm_file, 'rt')
    else:
        rm_in = open(rm_file)

    with rm_in:
        # discard the three leading header lines
        for _ in range(3):
            rm_in.readline()

        for line in rm_in:
            a = line.split()

            # tolerate blank lines instead of failing on a missing field
            if not a:
                continue

            chrm = a[4]
            start = str(int(a[5])-1)
            end = a[6]
            strand = '+' if a[8] == '+' else '-'
            repeat = a[9]
            family = a[10]

            cols = (chrm, start, end, '%s;%s' % (family,repeat), '.', strand)
            print('\t'.join(cols))
51 | 
52 | 
53 | ################################################################################
54 | # __main__
55 | ################################################################################
56 | if __name__ == '__main__':
57 |     main()
58 | 


--------------------------------------------------------------------------------
/possum2gff.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import os
 4 | 
 5 | ################################################################################
 6 | # possum2gff.py
 7 | #
 8 | # Convert the motif annotations in Possum output to a gff file.
 9 | ################################################################################
10 | 
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Print the motif hits of a Possum output file as GFF 'motif' records.

    Columns read from each tab-separated input line: 0 (motif id, written as
    the GFF attribute field), 5 (start, shifted by +1), 6 (length),
    7 (orientation; 'fn' maps to '+', anything else to '-') and 16 (sequence
    name, truncated at its first '.').
    """
    usage = 'usage: %prog [options] <possum_file>'
    parser = OptionParser(usage)
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide possum output file')
    possum_file = args[0]

    with open(possum_file) as possum_in:
        for line in possum_in:
            if not line.strip():
                continue

            a = line.rstrip('\n').split('\t')

            tf_id = a[0]
            start = int(a[5])+1
            end = start+int(a[6])-1
            fnrc = a[7]

            # partition keeps the whole name when it contains no '.'
            seq_id = a[16].partition('.')[0]

            strand = '+' if fnrc == 'fn' else '-'

            out_a = [seq_id, 'possum', 'motif', str(start), str(end), '.', strand, '.', tf_id]
            print('\t'.join(out_a))
42 | 
43 | 
44 | ################################################################################
45 | # __main__
46 | ################################################################################
47 | if __name__ == '__main__':
48 |     main()
49 | 


--------------------------------------------------------------------------------
/zarr_h5.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | import h5py
 5 | import numpy as np
 6 | import zarr
 7 | 
 8 | '''
 9 | zarr_h5.py
10 | 
11 | Convert a coverage Zarr to HDF5.
12 | '''
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Copy every array of a Zarr group into an HDF5 file as float16 datasets.

    Each top-level key of the Zarr group (iterated in sorted order) becomes
    one HDF5 dataset of the same name. With -v the key is printed as it is
    processed.
    """
    usage = 'usage: %prog [options] <in_zarr_file> <out_h5_file>'
    parser = OptionParser(usage)
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input Zarr and output HDF5.')
    zarr_file, hdf5_file = args

    # open files
    zarr_in = zarr.open_group(zarr_file, 'r')

    # the context manager closes the HDF5 file even if a conversion fails
    with h5py.File(hdf5_file, 'w') as h5_out:
        # foreach chromosome
        for chrom in sorted(zarr_in.keys()):
            if options.verbose:
                print(chrom)

            # read values fully into memory
            x = np.array(zarr_in[chrom])

            # write as float16 with LZF compression, shuffled and auto-chunked
            h5_out.create_dataset(chrom, data=x, dtype='float16', chunks=True, compression='lzf', shuffle=True)
46 | 
47 | 
48 | ################################################################################
49 | # __main__
50 | ################################################################################
51 | if __name__ == '__main__':
52 |     main()
53 | 


--------------------------------------------------------------------------------
/bam_12.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import pdb, os
 4 | import pysam
 5 | 
 6 | ################################################################################
 7 | # bam_12.py
 8 | #
 9 | # Separate the alignments in a BAM file into two BAM files of the first and
10 | # second reads.
11 | ################################################################################
12 | 
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Split a BAM file into '<prefix>_1.bam' and '<prefix>_2.bam'.

    Alignments flagged as read1 go to the first file; all others go to the
    second. The prefix is the input path without its extension, and both
    outputs reuse the input header.
    """
    usage = 'usage: %prog [options] <bam file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    # parser.error exits, so args[0] below is always present
    if len(args) != 1:
        parser.error('Must provide bam file')

    bam_file = args[0]
    out_prefix = os.path.splitext(bam_file)[0]

    bam_in = pysam.Samfile(bam_file, 'rb')
    bam1_out = pysam.Samfile('%s_1.bam' % out_prefix, 'wb', header=bam_in.header)
    bam2_out = pysam.Samfile('%s_2.bam' % out_prefix, 'wb', header=bam_in.header)

    for alignment in bam_in:
        destination = bam1_out if alignment.is_read1 else bam2_out
        destination.write(alignment)

    bam_in.close()
    bam1_out.close()
    bam2_out.close()
42 | 
43 | 
44 | ################################################################################
45 | # __main__
46 | ################################################################################
47 | if __name__ == '__main__':
48 |     main()
49 |     #pdb.runcall(main)
50 | 


--------------------------------------------------------------------------------
/rm2gff.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import gzip
 4 | import gff
 5 | 
 6 | ################################################################################
 7 | # rm2gff.py
 8 | #
 9 | # Convert RepeatMasker .out format to gff
10 | ################################################################################
11 | 
12 | 
13 | ################################################################################
14 | # main
15 | ################################################################################
def main():
    """Print the records of a RepeatMasker .out file as GFF 'repeat' lines.

    The first three lines of the input are discarded (treated as the header).
    For each remaining non-blank line the whitespace-separated fields 4, 5
    and 6 give the sequence, start and end; field 8 gives the strand ('+'
    stays '+', anything else becomes '-'); fields 9 and 10 are passed to
    gff.kv_gtf as the 'repeat' and 'family' attributes. Paths ending in 'gz'
    are read through gzip.
    """
    usage = 'usage: %prog [options] <rm out>'
    parser = OptionParser(usage)
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide RepeatMasker .out file')

    rm_file = args[0]
    if rm_file.endswith('gz'):
        # text mode, so lines are str rather than bytes on Python 3
        rm_in = gzip.open(rm_file, 'rt')
    else:
        rm_in = open(rm_file)

    with rm_in:
        # discard the three leading header lines
        for _ in range(3):
            rm_in.readline()

        for line in rm_in:
            a = line.split()

            # tolerate blank lines instead of failing on a missing field
            if not a:
                continue

            strand = '+' if a[8] == '+' else '-'

            cols = (a[4], 'RepeatMasker', 'repeat', a[5], a[6], '.', strand, '.', gff.kv_gtf({'repeat':a[9], 'family':a[10]}))
            print('\t'.join(cols))
44 |     
45 | 
46 | ################################################################################
47 | # __main__
48 | ################################################################################
49 | if __name__ == '__main__':
50 |     main()
51 | 


--------------------------------------------------------------------------------
/plot.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | import matplotlib.pyplot as plt
 5 | 
 6 | ################################################################################
 7 | # plot.py
 8 | #
 9 | # matplotlib helper methods
10 | ################################################################################
11 | 
12 | #####################################################################
13 | # limits
14 | #
15 | # Determine a nice buffered axis range from a list/array of numbers
16 | #####################################################################
def limits(nums, buf_pct=0.05):
    """Return a (low, high) axis range padded around the data.

    The padding on each side is buf_pct times the spread (max - min) of
    nums.
    """
    lo, hi = min(nums), max(nums)
    pad = buf_pct * (hi - lo)
    return lo - pad, hi + pad
23 | 
24 | #####################################################################
25 | # scatter
26 | #
27 | # Example scatter plot with some reasonable parameter choices
28 | #####################################################################
def scatter(x, y, pdf, xlabel='', ylabel=''):
    """Draw a scatter plot of y against x and save it to pdf.

    Axis ranges come from limits(); axis labels are set to size 18 and
    tick labels to size 15. The figure is closed after saving.
    """
    fig, ax = plt.subplots()

    # scatter
    plt.scatter(x, y, s=20, alpha=0.8, linewidths=0)

    # x-axis
    xmin, xmax = limits(x)
    plt.xlim(xmin, xmax)
    plt.xlabel(xlabel)
    ax.xaxis.label.set_fontsize(18)
    # explicit loop: map() is lazy in Python 3, so a bare map() call
    # would never invoke set_fontsize
    for tick_label in ax.get_xticklabels():
        tick_label.set_fontsize(15)

    # y-axis
    ymin, ymax = limits(y)
    plt.ylim(ymin, ymax)
    plt.ylabel(ylabel)
    ax.yaxis.label.set_fontsize(18)
    for tick_label in ax.get_yticklabels():
        tick_label.set_fontsize(15)

    # save
    plt.savefig(pdf)

    # close
    plt.close()
54 | 


--------------------------------------------------------------------------------
/bam_plus_minus.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import pdb, os
 4 | import pysam
 5 | 
 6 | ################################################################################
 7 | # bam_plus_minus.py
 8 | #
 9 | # Separate the alignments in a BAM file into two BAM files of the plus and
10 | # minus strand reads.
11 | ################################################################################
12 | 
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Split the alignments of a BAM file by strand into two BAM files.

    The outputs are named <prefix>_p.bam and <prefix>_m.bam, where <prefix>
    is the input path without its extension. Alignments flagged as reverse
    go to the minus file; every other alignment goes to the plus file.
    """
    usage = 'usage: %prog [options] <bam file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide bam file')
    bam_file = args[0]

    bam_pre = os.path.splitext(bam_file)[0]

    bam_in = pysam.Samfile(bam_file, 'rb')
    try:
        bamp_out = pysam.Samfile('%s_p.bam' % bam_pre, 'wb', header=bam_in.header)
        try:
            bamm_out = pysam.Samfile('%s_m.bam' % bam_pre, 'wb', header=bam_in.header)
            try:
                for read in bam_in:
                    if read.is_reverse:
                        bamm_out.write(read)
                    else:
                        bamp_out.write(read)
            finally:
                bamm_out.close()
        finally:
            bamp_out.close()
    finally:
        # close even when reading or writing fails part-way
        bam_in.close()
44 | 
45 | 
46 | ################################################################################
47 | # __main__
48 | ################################################################################
49 | if __name__ == '__main__':
50 |     main()
51 |     #pdb.runcall(main)
52 | 


--------------------------------------------------------------------------------
/fastq_filter.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import bz2
 4 | import gzip
 5 | 
 6 | '''
 7 | fastq_filter.py
 8 | 
 9 | Filter a FASTQ file for various properties, like read length.
10 | '''
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Filter a FASTQ file and print the surviving records to stdout.

    Takes one positional argument, the FASTQ file; names ending in '.gz'
    or '.bz2' are decompressed on the fly. With -l, only records whose
    sequence is at least that long are printed. With no filter options,
    every record is passed through unchanged.
    """
    usage = 'usage: %prog [options] <fastq_file>'
    parser = OptionParser(usage)
    parser.add_option('-l', dest='length_min', default=None, type='int', help='Minimum read length')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide FASTQ file')
    fastq_file = args[0]

    if fastq_file.endswith('.gz'):
        fastq_in = gzip.open(fastq_file, 'rt')
    elif fastq_file.endswith('.bz2'):
        fastq_in = bz2.open(fastq_file, 'rt')
    else:
        fastq_in = open(fastq_file)

    with fastq_in:
        header = fastq_in.readline()
        while header:
            seq = fastq_in.readline()
            mid = fastq_in.readline()
            qual = fastq_in.readline()

            # measure the read without its line terminator, which may be
            # absent on the last line of the file
            read_length = len(seq.rstrip('\r\n'))

            # an unset filter must let the record through
            if options.length_min is None or read_length >= options.length_min:
                print('%s%s%s%s' % (header, seq, mid, qual), end='')

            header = fastq_in.readline()
46 | 
47 | 
48 | ################################################################################
49 | # __main__
50 | ################################################################################
51 | if __name__ == '__main__':
52 |     main()
53 | 


--------------------------------------------------------------------------------
/gsea_ranks.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import gsea
 4 | 
 5 | ################################################################################
 6 | # gsea_ranks.py
 7 | #
 8 | # Print out the ranks for a given GO term.
 9 | ################################################################################
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Print the rank and correlation of each gene annotated with a GO term.

    Takes two positional arguments: a correlations file whose lines hold a
    gene and a correlation, whitespace separated, and a GO term. Genes are
    ranked from 1 by descending absolute correlation. For every gene that
    gsea.read_go maps to the GO term, one line "<rank> <|correlation|>" is
    printed.
    """
    usage = 'usage: %prog [options] <cors file> <GO term>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide correlations file and GO term')
    cors_file, go_term = args

    # get genes, correlations
    correlations_genes = []
    genes = []
    with open(cors_file) as cors_in:
        for line in cors_in:
            a = line.split()
            if not a:
                # blank line
                continue
            correlations_genes.append((abs(float(a[1])), a[0]))
            genes.append(a[0])
    correlations_genes.sort(reverse=True)

    # GO
    go_map, go_descs = gsea.read_go(set(genes))
    go_term_map = go_map[go_term]

    # print ranks, correlations
    for rank, (cor, gene) in enumerate(correlations_genes, 1):
        if gene in go_term_map:
            print('%d %s' % (rank, cor))
44 | 
45 | 
46 | ################################################################################
47 | # __main__
48 | ################################################################################
49 | if __name__ == '__main__':
50 |     main()
51 |     #pdb.runcall(main)
52 | 


--------------------------------------------------------------------------------
/fastq_trim.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import bz2
 4 | import gzip
 5 | 
 6 | '''
 7 | fastq_trim.py
 8 | 
 9 | Trim the reads in a FASTQ file to a given maximum length.
10 | '''
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Trim every read in a FASTQ file and print the records to stdout.

    Takes two positional arguments: the trim length and the FASTQ file.
    Names ending in '.gz' or '.bz2' are decompressed on the fly. The
    sequence and quality lines are cut to the first trim_length
    characters. Reading stops at the first empty header line.
    """
    parser = OptionParser('usage: %prog [options] <trim_length> <fastq_file>')
    (options, args) = parser.parse_args()

    # parser.error() exits, so the happy path needs no else branch
    if len(args) != 2:
        parser.error('Must provide trim length and FASTQ file')
    trim_length = int(args[0])
    fastq_file = args[1]

    if fastq_file.endswith('.gz'):
        fastq_in = gzip.open(fastq_file, 'rt')
    elif fastq_file.endswith('.bz2'):
        fastq_in = bz2.open(fastq_file, 'rt')
    else:
        fastq_in = open(fastq_file)

    def next_line():
        # next line of the input with trailing whitespace removed
        return fastq_in.readline().rstrip()

    header = next_line()
    while header:
        # records are four lines: header, sequence, separator, quality
        seq = next_line()[:trim_length]
        mid = next_line()
        qual = next_line()[:trim_length]

        print('\n'.join((header, seq, mid, qual)))

        header = next_line()

    fastq_in.close()
48 | 
49 | 
50 | ################################################################################
51 | # __main__
52 | ################################################################################
53 | if __name__ == '__main__':
54 |     main()
55 | 


--------------------------------------------------------------------------------
/bam_quality.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import pysam
 4 | 
 5 | '''
 6 | bam_quality.py
 7 | 
 8 | Remove low quality alignments from a BAM file.
 9 | '''
10 | 
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Copy a BAM file, dropping alignments below the given thresholds.

    Takes two positional arguments: the input and the output BAM. With -q,
    alignments whose mapping quality is below the threshold are dropped.
    With -s, alignments whose 'AS' tag is below the threshold, or that
    have no 'AS' tag at all, are dropped. With neither option every
    alignment is copied.
    """
    usage = 'usage: %prog [options] <input_bam> <output_bam>'
    parser = OptionParser(usage)
    parser.add_option('-q', dest='mapq_t',
        type='int', default=None,
        help='Keep alignments with mapping quality at or above [Default: %default]')
    parser.add_option('-s', dest='score_t',
        type='int', default=None,
        help='Keep alignments with alignment score at or above [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input and output BAMs')
    input_bam, output_bam = args

    bam_in = pysam.AlignmentFile(input_bam, 'r')
    try:
        bam_out = pysam.AlignmentFile(output_bam, 'wb', template=bam_in)
        try:
            for align in bam_in:
                if options.mapq_t is not None and align.mapping_quality < options.mapq_t:
                    continue

                if options.score_t is not None:
                    # get_tag raises KeyError when the tag is absent, so
                    # an alignment without a score cannot pass the filter
                    if not align.has_tag('AS') or align.get_tag('AS') < options.score_t:
                        continue

                bam_out.write(align)
        finally:
            bam_out.close()
    finally:
        bam_in.close()
42 | 
43 | ################################################################################
44 | # __main__
45 | ################################################################################
46 | if __name__ == '__main__':
47 |     main()
48 | 


--------------------------------------------------------------------------------
/gtf2prom.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import gff
 4 | import os, subprocess
 5 | 
 6 | ################################################################################
 7 | # gtf2prom.py
 8 | #
 9 | # Produce a GFF file and a fasta file corresponding to the promoter of the
10 | # genes in a gtf file.
11 | ################################################################################
12 | 
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Write promoter GFF and FASTA files for the genes in a GTF file.

    Takes one positional argument, the GTF file. gff.promoters writes
    <output_pre>.gff using the -u/-d promoter lengths, then gff2fa.py is
    run on that file with its stdout captured in <output_pre>.fa. Exits
    with an error message if gff2fa.py returns a non-zero status.
    """
    usage = 'usage: %prog [options] <gtf file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', type='int', default=0, help='Downstream promoter length [Default: %default]')
    parser.add_option('-u', dest='upstream', type='int', default=2000, help='Upstream promoter length [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='promoter', help='Output file prefix [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide gtf file')
    gtf_file = args[0]

    gff_file = '%s.gff' % options.output_pre
    fa_file = '%s.fa' % options.output_pre

    gff.promoters(gtf_file, options.upstream, options.downstream, gff_file)

    # argument list instead of a shell string, so a prefix containing
    # spaces or shell metacharacters is passed through verbatim
    with open(fa_file, 'w') as fa_out:
        status = subprocess.call(['gff2fa.py', gff_file], stdout=fa_out)

    if status != 0:
        raise SystemExit('gff2fa.py exited with status %d' % status)
33 | 
34 | 
35 | ################################################################################
36 | # __main__
37 | ################################################################################
38 | if __name__ == '__main__':
39 |     main()
40 | 


--------------------------------------------------------------------------------
/bw_nan.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | import numpy as np
 5 | import pyBigWig
 6 | 
 7 | '''
 8 | bw_nan.py
 9 | 
10 | Compute NaN % in a BigWig.
11 | '''
12 | 
13 | ################################################################################
14 | # main
15 | ################################################################################
def main():
    """Print the fraction of positions in a BigWig whose value is NaN.

    Takes one positional argument, the BigWig file. The values of every
    chromosome are read in full and the NaN count is divided by the total
    number of positions. The result is printed with six decimals as a
    fraction between 0 and 1.
    """
    usage = 'usage: %prog [options] <in_bw_file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error('Must provide input BigWig.')
    bw_file = args[0]

    total_nt = 0
    nan_nt = 0

    bw_in = pyBigWig.open(bw_file)
    try:
        # the totals do not depend on the order chromosomes are visited in
        for chrom, clength in bw_in.chroms().items():
            # read values
            x = bw_in.values(chrom, 0, clength, numpy=True)

            total_nt += len(x)
            nan_nt += int(np.isnan(x).sum())
    finally:
        bw_in.close()

    if total_nt == 0:
        parser.error('BigWig contains no positions.')

    nan_pct = float(nan_nt) / total_nt
    print('%.6f' % nan_pct)
54 | 
55 | 
56 | ################################################################################
57 | # __main__
58 | ################################################################################
59 | if __name__ == '__main__':
60 |     main()
61 | 


--------------------------------------------------------------------------------
/h5_zarr.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | import h5py
 5 | import numpy as np
 6 | import zarr
 7 | 
 8 | '''
 9 | h5_zarr.py
10 | 
11 | Convert a coverage HDF5 to Zarr.
12 | '''
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Copy every dataset of an HDF5 file into a Zarr group as float16.

    Takes two positional arguments: the input HDF5 file and the output
    Zarr path, which is opened in 'w' mode. Each top-level key of the
    HDF5 file becomes a dataset of the same name. -c sets the chunk size
    passed to Zarr and -v prints each key and the created array.
    """
    usage = 'usage: %prog [options] <in_h5_file> <out_zarr_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='chunk_size', default=None, type='int')
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input HDF5 and output Zarr.')
    hdf5_file, zarr_file = args

    # open files; the input is only read, so say so explicitly rather
    # than rely on the h5py default mode
    h5_in = h5py.File(hdf5_file, 'r')
    try:
        zarr_out = zarr.open_group(zarr_file, 'w')

        # foreach chromosome
        for chrom in h5_in.keys():
            if options.verbose:
                print(chrom)

            # read values
            x = np.array(h5_in[chrom], dtype='float16')

            # write into the Zarr group
            z = zarr_out.create_dataset(chrom, data=x, shape=x.shape, dtype='float16', chunks=options.chunk_size)
            if options.verbose:
                print(z)
    finally:
        # close files
        h5_in.close()
49 | 
50 | 
51 | ################################################################################
52 | # __main__
53 | ################################################################################
54 | if __name__ == '__main__':
55 |     main()
56 | 


--------------------------------------------------------------------------------
/sciseq_collision.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import numpy as np
 4 | 
 5 | '''
 6 | sciseq_collision.py
 7 | 
 8 | Estimate the collision rate for a set of sci-seq barcode parameters.
 9 | '''
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Estimate a barcode collision rate by simulation and print it.

    Each of the -n samples draws -c barcodes uniformly from -b possible
    values and counts the draws that repeat an earlier barcode (cells
    minus distinct barcodes). The printed rate is the total of those
    counts divided by the total number of draws. -b and -c are required,
    and all three values must be positive.
    """
    usage = 'usage: %prog [options]'
    parser = OptionParser(usage)
    parser.add_option('-b', dest='barcode1',
            default=None, type='int',
            help='Number of barcodes introduced in the first RT stage')
    parser.add_option('-c', dest='cells',
            default=None, type='int',
            help='Number of cells sorted per well in the second PCR stage')
    parser.add_option('-n', dest='num_samples',
            default=10000, type='int',
            help='Number of simulation samples [Default: %default]')
    (options, args) = parser.parse_args()

    # both have no default; without them randint() fails with a TypeError
    if options.barcode1 is None or options.cells is None:
        parser.error('Must provide the number of barcodes (-b) and cells (-c)')
    if options.barcode1 < 1 or options.cells < 1 or options.num_samples < 1:
        parser.error('Barcodes (-b), cells (-c), and samples (-n) must be positive')

    collisions = 0

    for _ in range(options.num_samples):
        cell_barcodes = np.random.randint(0, options.barcode1, size=options.cells)
        unique_cell_barcodes = len(np.unique(cell_barcodes))
        collisions += options.cells - unique_cell_barcodes

    collision_rate = float(collisions) / (options.num_samples*options.cells)
    print('Collision rate: %.4f' % collision_rate)
38 | 
39 | 
40 | ################################################################################
41 | # __main__
42 | ################################################################################
43 | if __name__ == '__main__':
44 |     main()
45 | 


--------------------------------------------------------------------------------
/gtf_span.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import gff
 4 | 
 5 | ################################################################################
 6 | # gtf_span.py
 7 | #
 8 | # Merge all of the transcripts in one gene into a single spanning gtf entry.
 9 | ################################################################################
10 | 
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Print one spanning 'gene' GTF line per gene in a GTF file.

    Takes one positional argument, the GTF file. Lines are grouped by the
    value of their 10th whitespace-separated field (with the surrounding
    quote and trailing '";' stripped). For each group the smallest start
    and largest end are combined with the chromosome and strand of the
    group's first line. The attributes are the gene_id and a
    comma-separated, sorted list of the distinct transcript ids taken from
    the 12th whitespace-separated field.
    """
    usage = 'usage: %prog [options] <gtf_file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide gtf file')
    gtf_file = args[0]

    genes = {}

    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            a = line.split()
            if not a or line.startswith('#'):
                # blank or comment line
                continue
            gene_id = a[9][1:-2]
            genes.setdefault(gene_id, []).append(line)

    for gene_id in genes:
        gene_lines = genes[gene_id]
        fields = [line.split() for line in gene_lines]

        start = min(int(f[3]) for f in fields)
        end = max(int(f[4]) for f in fields)

        a = gene_lines[0].split('\t')
        kv = gff.gtf_kv(a[8])

        # sorted so the output does not depend on set iteration order
        transcript_ids = sorted(set(f[11][1:-2] for f in fields))
        succinct_kv = {'gene_id': kv['gene_id']}
        succinct_kv['transcript_id'] = ','.join(transcript_ids)

        d = [a[0], 'gtf', 'gene', str(start), str(end), '.', a[6], '.', gff.kv_gtf(succinct_kv)]
        print('\t'.join(d))
41 | 
42 | ################################################################################
43 | # __main__
44 | ################################################################################
45 | if __name__ == '__main__':
46 |     main()
47 | 


--------------------------------------------------------------------------------
/bim_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import subprocess
 4 | 
 5 | '''
 6 | bim_vcf.py
 7 | 
 8 | Convert variants in a Plink .bim file to .vcf
 9 | '''
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Convert the variants in a Plink .bim file to a sites-only VCF.

    Takes two positional arguments: the input BIM and the output VCF.
    Each non-blank BIM line is split on whitespace. Field 1 is written as
    CHROM, field 4 as POS, field 2 as ID, field 6 as REF and field 5 as
    ALT; QUAL, FILTER and INFO are '.'. With -z the finished VCF is
    compressed in place with 'gzip -f'.
    """
    usage = 'usage: %prog [options] <in_bim> <out_vcf>'
    parser = OptionParser(usage)
    parser.add_option('-z', dest='zip', default=False, action='store_true')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input BIM and output VCF.')
    in_bim_file, out_vcf_file = args

    with open(out_vcf_file, 'w') as out_vcf_open:
        # print header
        print('##fileformat=VCFv4.2', file=out_vcf_open)
        cols = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
        print('\t'.join(cols), file=out_vcf_open)

        # parse BIM
        with open(in_bim_file) as in_bim_open:
            for line in in_bim_open:
                a = line.split()
                if not a:
                    # blank line
                    continue
                chrom = a[0]
                snp_id = a[1]
                pos = a[3]
                a1 = a[4]
                a2 = a[5]

                cols = [chrom, pos, snp_id, a2, a1, '.', '.', '.']
                print('\t'.join(cols), file=out_vcf_open)

    if options.zip:
        # argument list instead of a shell string, so a filename with
        # spaces or shell metacharacters is passed through verbatim
        subprocess.call(['gzip', '-f', out_vcf_file])
50 | 
51 | 
52 | ################################################################################
53 | # __main__
54 | ################################################################################
55 | if __name__ == '__main__':
56 |     main()
57 | 


--------------------------------------------------------------------------------
/cuff_fails.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | ################################################################################
 5 | # cuff_fails.py
 6 | #
 7 | # Print a table of the number of genes for which cufflinks failed.
 8 | ################################################################################
 9 | 
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Print, per sample, how many rows of an FPKM tracking file are not OK.

    Takes one positional argument, the FPKM tracking file. Every header
    column ending in '_status' names a sample (the text before the
    suffix). A row counts as a failure for that sample when its value in
    the column is anything other than 'OK'. Samples with no failures are
    not listed; the rest are printed sorted by name.
    """
    usage = 'usage: %prog [options] <fpkm_tracking>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide FPKM tracking file')
    fpkm_file = args[0]

    # initialize fail counts
    fails = {}

    with open(fpkm_file) as fpkm_in:
        # split the header the same way as the rows so columns line up
        headers = fpkm_in.readline().rstrip('\r\n').split('\t')

        for line in fpkm_in:
            a = line.rstrip('\r\n').split('\t')

            # zip stops at the shorter of the two, so a row with extra
            # fields cannot index past the header
            for header, value in zip(headers, a):
                if header.endswith('_status') and value != 'OK':
                    sample = header[:-7]
                    fails[sample] = fails.get(sample, 0) + 1

    for sample in sorted(fails):
        cols = (sample, fails[sample])
        print('%-18s  %5d' % cols)
49 | 
50 | 
51 | ################################################################################
52 | # __main__
53 | ################################################################################
54 | if __name__ == '__main__':
55 |     main()
56 | 


--------------------------------------------------------------------------------
/gtf_filter_csf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import sys
 4 | import gff
 5 | 
 6 | ################################################################################
 7 | # gtf_filter_csf.py
 8 | #
 9 | # Filter the lnc catalog gtf file by CSF value.
10 | ################################################################################
11 | 
12 | 
13 | ################################################################################
14 | # main
15 | ################################################################################
def main():
    """Print the GTF lines whose 'csf' attribute passes a threshold.

    Reads the GTF file given as the single positional argument, or stdin
    otherwise. By default a line is kept when its csf value is at or below
    the -t threshold. With -g a line is kept when its value is at or above
    the threshold instead. With both -g and -l, lines passing either test
    are kept.
    """
    usage = 'usage: %prog [options] <gtf file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='greater', action='store_true', default=False, help='Keep genes w/ CSF value greater than the one given [Default: %default]')
    parser.add_option('-l', dest='less', action='store_true', default=False, help='Keep genes w/ CSF value less than the one given [Default unless -g is given]')
    parser.add_option('-t', dest='csf_t', type='float', default=100.0, help='CSF threshold [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) == 1:
        gtf_open = open(args[0])
    else:
        gtf_open = sys.stdin

    # "less" is the fallback only when -g was not requested; if it were
    # always on, -g would let every line through
    keep_greater = options.greater
    keep_less = options.less or not options.greater

    try:
        for line in gtf_open:
            a = line.split('\t')
            csf = float(gff.gtf_kv(a[8])['csf'])
            if (keep_less and csf <= options.csf_t) or (keep_greater and csf >= options.csf_t):
                # the line already ends with its own newline
                sys.stdout.write(line)
    finally:
        if gtf_open is not sys.stdin:
            gtf_open.close()
36 | 
37 | 
38 | ################################################################################
39 | # __main__
40 | ################################################################################
41 | if __name__ == '__main__':
42 |     main()
43 | 


--------------------------------------------------------------------------------
/zarr_bw.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | import zarr
 5 | import numpy as np
 6 | import pyBigWig
 7 | 
 8 | '''
 9 | zarr_bw.py
10 | 
11 | Convert a coverage Zarr to BigWig.
12 | '''
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Write every array of a Zarr group into a BigWig file.

    Takes two positional arguments: the input Zarr path and the output
    BigWig. The BigWig header lists the group's keys in sorted order with
    the length of each array. Each array is then added as fixed-step
    entries starting at position 0 with span 1 and step 1. -v prints each
    key as it is written.
    """
    usage = 'usage: %prog [options] <in_zarr_file> <out_bw_file>'
    parser = OptionParser(usage)
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input Zarr and output BigWig.')
    zarr_file, bw_file = args

    # open files
    zarr_in = zarr.open_group(zarr_file, 'r')
    bw_out = pyBigWig.open(bw_file, 'w')

    try:
        # construct header of (chromosome, length) pairs
        header = [(chrom, len(zarr_in[chrom])) for chrom in sorted(zarr_in.keys())]

        # write header
        bw_out.addHeader(header)

        for chrom, length in header:
            if options.verbose:
                print(chrom)

            # read values
            x = np.array(zarr_in[chrom])

            # write one value per position into the BigWig
            bw_out.addEntries(chrom, 0, values=x, span=1, step=1)
    finally:
        # close files
        bw_out.close()
55 | 
56 | 
57 | ################################################################################
58 | # __main__
59 | ################################################################################
60 | if __name__ == '__main__':
61 |     main()
62 | 


--------------------------------------------------------------------------------
/bed2gff.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import gff
 4 | 
 5 | ################################################################################
 6 | # bed2gff.py
 7 | #
 8 | # Convert a bed file to a gff file. No blocks.
 9 | ################################################################################
10 | 
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Convert a BED file to GFF lines on stdout, one feature per line.

    Takes one positional argument, the BED file. The BED start is shifted
    by +1 and the end is kept as is. The score is column 5, or '.' when
    absent; the strand is column 6, or '+' when absent. The last GFF
    column is a running feature number starting at 1. Blank lines and
    lines starting with '#', 'track' or 'browser' are skipped. --source
    and --feature set the second and third GFF columns.
    """
    usage = 'usage: %prog [options] <bed file>'
    parser = OptionParser(usage)
    parser.add_option('--source', dest='source', default='bed', help='Gff format "source" [Default: %default]')
    parser.add_option('--feature', dest='feature', default='feature', help='Gff format "feature" [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide bed file')
    bed_file = args[0]

    group_num = 0
    with open(bed_file) as bed_in:
        for line in bed_in:
            # header and blank lines carry no interval to convert
            if not line.strip() or line.startswith(('#', 'track', 'browser')):
                continue

            a = line.rstrip('\r\n').split('\t')

            score = a[4] if len(a) >= 5 else '.'
            strand = a[5] if len(a) >= 6 else '+'
            group_num += 1

            cols = [a[0], options.source, options.feature, str(int(a[1])+1), a[2], score, strand, '.', str(group_num)]
            print('\t'.join(cols))
44 |     
45 | 
46 | ################################################################################
47 | # __main__
48 | ################################################################################
49 | if __name__ == '__main__':
50 |     main()
51 | 


--------------------------------------------------------------------------------
/bed2gtf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | ################################################################################
 5 | # bed2gtf.py
 6 | #
 7 | # Convert a bed file to a gtf file.
 8 | ################################################################################
 9 | 
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Convert a BED12 file to GTF exon records printed to stdout.

    Every block of a BED entry becomes one 'exon' line whose attributes hold
    the BED name as transcript_id and a 1-based exon_number. Blank lines and
    '#' comment lines are skipped.
    """
    # the input is a BED file; the GTF is what gets written
    usage = 'usage: %prog [options] <bed file>'
    parser = OptionParser(usage)
    #parser.add_option()
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide bed file')
    bed_file = args[0]

    with open(bed_file) as bed_in:
        for line in bed_in:
            if not line.strip() or line.startswith('#'):
                continue

            a = line.split('\t')
            a[-1] = a[-1].rstrip()

            tid = a[3]
            gene_start = int(a[1])

            # blockSizes / blockStarts are comma lists, usually with a trailing comma
            block_sizes = [int(x) for x in a[10].split(',') if x]
            block_starts = [int(x) for x in a[11].split(',') if x]

            for exon_num, (block_start, block_size) in enumerate(zip(block_starts, block_sizes), 1):
                # block starts are offsets from the 0-based BED start;
                # GTF coordinates are 1-based and inclusive
                exon_start = gene_start+1+block_start
                exon_end = exon_start+block_size-1

                cols = [a[0], 'BED', 'exon', str(exon_start), str(exon_end), '.', a[5], '.', 'transcript_id "%s"; exon_number "%d"' % (tid,exon_num)]
                print('\t'.join(cols))
45 |     
46 | 
47 | ################################################################################
48 | # __main__
49 | ################################################################################
50 | if __name__ == '__main__':
51 |     main()
52 | 


--------------------------------------------------------------------------------
/gaps_bed.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import pdb
 4 | 
 5 | ################################################################################
 6 | # gaps_bed.py
 7 | #
 8 | # Print a bed file of the gaps in a fasta file.
 9 | ################################################################################
10 | 
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Print a BED file of the runs of 'N' found in a FASTA file.

    Runs are tracked across line breaks and reported as 0-based, half-open
    intervals when they are at least the -g gap size long. A run that reaches
    the end of a sequence is reported too, both before the next header and at
    the end of the file.
    """
    import re

    usage = 'usage: %prog [options] <fasta file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gap_size', default=50, type='int', help='Minimum gap size to print a bed entry [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide fasta file')
    fasta_file = args[0]

    n_run_re = re.compile('N+')

    def report(chrom, gap_start, gap_end):
        # print the pending gap, if there is one and it is long enough
        if gap_start is not None and gap_end-gap_start >= options.gap_size:
            print('\t'.join([chrom,str(gap_start),str(gap_end)]))

    chrom = None
    seq_i = 0           # sequence offset of the current line's first character
    gap_start = None    # pending gap, not yet printed because it may still grow
    gap_end = None

    with open(fasta_file) as fasta_in:
        for line in fasta_in:
            if line.startswith('>'):
                # a gap touching the end of the previous sequence ends here
                report(chrom, gap_start, gap_end)
                chrom = line[1:].rstrip()
                seq_i = 0
                gap_start = None
                gap_end = None
                continue

            seq = line.rstrip()
            if not seq:
                continue
            if chrom is None:
                parser.error('Sequence found before the first FASTA header')

            for n_run in n_run_re.finditer(seq):
                run_start = seq_i + n_run.start()
                run_end = seq_i + n_run.end()
                if gap_start is not None and run_start == gap_end:
                    # continues the gap that ended the previous line
                    gap_end = run_end
                else:
                    report(chrom, gap_start, gap_end)
                    gap_start = run_start
                    gap_end = run_end

            seq_i += len(seq)

    # gap touching the end of the last sequence
    report(chrom, gap_start, gap_end)
42 | 
43 | 
44 | ################################################################################
45 | # __main__
46 | ################################################################################
47 | if __name__ == '__main__':
48 |     main()
49 |     #pdb.runcall(main)
50 | 


--------------------------------------------------------------------------------
/stockholm2fasta.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | ################################################################################
 5 | # stockholm2fasta.py
 6 | #
 7 | # Convert Stockholm MSA format from HMMer to FASTA for viewing.
 8 | ################################################################################
 9 | 
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Convert a Stockholm alignment to FASTA printed to stdout.

    Sequence blocks that share a name are concatenated in file order. With
    -c, only the columns marked 'x' in the '#=GC RF' annotation are kept.
    Records are printed in the order their names first appear.
    """
    # OrderedDict keeps the output in input order on every Python version
    from collections import OrderedDict

    usage = 'usage: %prog [options] <stockholm>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='consensus_only', default=False, action='store_true', help='Print consensus columns only [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide input Stockholm format file')
    stockholm_file = args[0]

    seqs = OrderedDict()
    consensus = ''
    with open(stockholm_file) as stockholm_in:
        for line in stockholm_in:
            if line.rstrip() not in ['','//'] and line[0] != '#':
                header, msa = line.split()
                seqs[header] = seqs.get(header,'') + msa
            elif line.startswith('#=GC RF'):
                consensus += line.split()[-1]

    if options.consensus_only:
        for header in list(seqs):
            hseq = seqs[header]
            # every alignment column needs an RF character to be classified
            if len(consensus) < len(hseq):
                parser.error('#=GC RF annotation is missing or shorter than sequence %s' % header)
            seqs[header] = ''.join([nt for nt, rf in zip(hseq, consensus) if rf == 'x'])

    for header in seqs:
        print('>%s\n%s' % (header,seqs[header]))
41 | 
42 | 
43 | ################################################################################
44 | # __main__
45 | ################################################################################
46 | if __name__ == '__main__':
47 |     main()
48 | 


--------------------------------------------------------------------------------
/bed_clean.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | ################################################################################
 5 | # bed_clean.py
 6 | #
 7 | # Detect and correct BED regions extending beyond chromosome ends
 8 | ################################################################################
 9 | 
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Detect and correct BED entries that extend beyond chromosome bounds.

    Entries fully inside [0, chromosome size] are echoed unchanged. Entries
    that stick out are dropped with -d; otherwise they are clipped to the
    chromosome and reprinted tab-separated. An entry left empty by clipping
    (it lies entirely outside the chromosome) is dropped, since it cannot be
    corrected into a valid interval.
    """
    import sys

    usage = 'usage: %prog [options] <csizes_file> <bed_file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='delete', default=False, action='store_true', help='Delete entries beyond boundaries [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide chrom sizes file and BED file')
    csizes_file = args[0]
    bed_file = args[1]

    # read in chromosome sizes
    chrom_sizes = {}
    with open(csizes_file) as csizes_in:
        for line in csizes_in:
            a = line.split()
            if a:
                chrom_sizes[a[0]] = int(a[1])

    # clean BED file
    with open(bed_file) as bed_in:
        for line in bed_in:
            a = line.split()
            if not a:
                continue

            chrom = a[0]
            if chrom not in chrom_sizes:
                parser.error('Chromosome %s is missing from the chrom sizes file' % chrom)
            chrom_len = chrom_sizes[chrom]
            start = int(a[1])
            end = int(a[2])

            if start >= 0 and end <= chrom_len:
                # in bounds: echo the line exactly as it was read
                sys.stdout.write(line)

            elif not options.delete:
                start = max(start, 0)
                end = min(end, chrom_len)
                if start < end:
                    a[1] = str(start)
                    a[2] = str(end)
                    print('\t'.join(a))
47 |     
48 | 
49 | ################################################################################
50 | # __main__
51 | ################################################################################
52 | if __name__ == '__main__':
53 |     main()
54 | 


--------------------------------------------------------------------------------
/multiz_gff.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | from pygr import worldbase
 4 | import gff
 5 | 
 6 | ################################################################################
 7 | # multiz_gff.py
 8 | #
 9 | # Return hg19 46-way multiz alignments of the entries in a gff file.
10 | ################################################################################
11 | 
12 | 
13 | ################################################################################
14 | # main
15 | ################################################################################
def main():
    """Print hg19 46-way multiz alignment edges for each entry of a GFF file.

    For every GFF feature the matching hg19 interval is looked up in the UCSC
    multiz46way alignment, and one line is printed per alignment edge: the
    query interval, the source and destination intervals, the destination
    genome name and the edge length. Strand is ignored.
    """
    usage = 'usage: %prog [options] <gff file>'
    parser = OptionParser(usage)
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide gff file.')
    gff_file = args[0]

    # get human genome
    hg19 = worldbase.Bio.Seq.Genome.HUMAN.hg19()

    # get feature intervals
    feat_ivals = []
    with open(gff_file) as gff_in:
        for line in gff_in:
            if not line.strip() or line.startswith('#'):
                continue

            a = line.split('\t')

            chrom = a[0]
            start = int(a[3])
            end = int(a[4])
            # ignoring orientation at the moment

            # GFF is 1-based and inclusive while sequence slices are 0-based
            # and half-open, so the slice has to start at start-1
            # NOTE(review): assumes pygr follows Python slice conventions - confirm
            feat_ivals.append(hg19[chrom][start-1:end])

    # get hg19 msa
    msa = worldbase.Bio.MSA.UCSC.hg19_multiz46way()

    # map returned sequences back to genome name
    idDict = ~(msa.seqDict)

    # print alignments
    for gi in feat_ivals:
        for src, dest, edg in msa[gi].edges():
            fields = [repr(gi), repr(src), repr(dest), idDict[dest], edg.length()]
            # space-separated, as the Python 2 print statement produced
            print(' '.join([str(field) for field in fields]))
51 | 
52 | 
53 | ################################################################################
54 | # __main__
55 | ################################################################################
56 | if __name__ == '__main__':
57 |     main()
58 | 


--------------------------------------------------------------------------------
/fastq_quality_change.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | ################################################################################
 5 | # fastq_quality_change.py
 6 | #
 7 | # Change the quality value ascii index for a fastq file.
 8 | #
 9 | # Author: David Kelley dakelley@umiacs.umd.edu
10 | ################################################################################
11 | 
12 | 
13 | ################################################################################
14 | # main
15 | ################################################################################
def main():
    """Shift the quality offset of a FASTQ file and print the result.

    Every quality character is moved from the -b (before) ASCII offset to the
    -a (after) offset; header, sequence and separator lines are echoed
    unchanged. Only the quality characters are shifted, never the line
    terminator. Blank lines between records are skipped.
    """
    import sys

    usage = 'usage: %prog [options] <fastq file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='after', type='int', help='Desired fastq quality ascii index')
    parser.add_option('-b', dest='before', type='int', help='Current fastq quality ascii index')
    (options,args) = parser.parse_args()

    if options.after is None or options.before is None:
        parser.error('Must provide before and after ascii indexes')
    if len(args) != 1:
        parser.error('Must provide fastq file')
    fastq_file = args[0]

    shift = options.after - options.before

    with open(fastq_file) as fq_in:
        header = fq_in.readline()
        while header:
            if not header.strip():
                # stray blank line (e.g. at the end of the file)
                header = fq_in.readline()
                continue

            seq = fq_in.readline()
            mid = fq_in.readline()
            # drop the newline first: shifting it too would corrupt the line
            # or make chr() fail on a negative code point
            qual = fq_in.readline().rstrip('\r\n')

            sys.stdout.write(header)
            sys.stdout.write(seq)
            sys.stdout.write(mid)
            print(''.join([chr(ord(q)+shift) for q in qual]))

            header = fq_in.readline()
44 | 
45 | 
46 | ################################################################################
47 | # __main__
48 | ################################################################################
49 | if __name__ == '__main__':
50 |     main()
51 | 


--------------------------------------------------------------------------------
/w5_bg.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | import h5py
 5 | import numpy as np
 6 | 
 7 | '''
 8 | w5_bg.py
 9 | 
10 | Convert a Wiggle HDF5 to BedGraph.
11 | '''
12 | 
13 | ################################################################################
14 | # main
15 | ################################################################################
def main():
    """Convert a Wiggle HDF5 file to BedGraph.

    Every key of the HDF5 file is read as one array and written as runs of
    equal consecutive values: one BedGraph line per run, with 0-based
    half-open coordinates and the value formatted to four decimals.
    """
    usage = 'usage: %prog [options] <in_w5> <out_bg>'
    parser = OptionParser(usage)
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input Wig5 and output BedGraph.')
    in_w5_file = args[0]
    out_bg_file = args[1]

    # open the input read-only; the context managers close both files even
    # when the conversion fails part-way
    with h5py.File(in_w5_file, 'r') as in_w5_open, open(out_bg_file, 'w') as out_bg_open:
        header = 'track type=bedGraph'
        print(header, file=out_bg_open)

        for chrm in sorted(in_w5_open.keys()):
            if options.verbose:
                print(chrm, flush=True)

            # read values
            # NOTE(review): assumes each dataset is 1-dimensional - confirm
            x = np.array(in_w5_open[chrm])
            if len(x) == 0:
                continue

            # positions where the value differs from its predecessor; NaN
            # never equals NaN, so each NaN stays its own run
            run_starts = np.flatnonzero(x[1:] != x[:-1]) + 1

            # write to bedgraph
            start = 0
            for end in np.append(run_starts, len(x)):
                cols = [chrm, str(start), str(end), '%.4f'%x[start]]
                print('\t'.join(cols), file=out_bg_open)
                start = end
57 | 
58 | 
59 | ################################################################################
60 | # __main__
61 | ################################################################################
62 | if __name__ == '__main__':
63 |     main()
64 | 


--------------------------------------------------------------------------------
/split_fragment_lengths.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import pysam
 4 | 
 5 | ################################################################################
 6 | # split_fragment_lengths.py
 7 | #
 8 | # Split a BAM file based on a fragment length threshold.
 9 | ################################################################################
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Split a BAM file in two by absolute template (fragment) length.

    Alignments shorter than <split_len> go to '<out_pre>_<split_len>-.bam';
    those from <split_len> up to and including the -m maximum go to
    '<out_pre>_<split_len>+.bam'. Alignments with template length 0 or beyond
    the maximum are dropped.
    """
    usage = 'usage: %prog [options] <bam> <split_len> <out_pre>'
    parser = OptionParser(usage)
    # type='int' is required: without it a user-supplied value stays a string
    # and the length comparison below is meaningless (or a TypeError)
    parser.add_option('-m', dest='max_length', default=1000, type='int', help='Threshold length beyond which we ignore the read [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide BAM file, split length, and output prefix')
    bam_file = args[0]
    split_len = int(args[1])
    out_pre = args[2]

    bam_in = pysam.Samfile(bam_file, 'rb')

    minus_out = pysam.Samfile('%s_%d-.bam' % (out_pre,split_len), 'wb', template=bam_in)
    plus_out = pysam.Samfile('%s_%d+.bam' % (out_pre,split_len), 'wb', template=bam_in)

    try:
        for alignment in bam_in:
            tl = abs(alignment.template_length)
            if tl == 0:
                # no template length recorded for this alignment
                continue

            if tl < split_len:
                minus_out.write(alignment)
            elif tl <= options.max_length:
                plus_out.write(alignment)
    finally:
        minus_out.close()
        plus_out.close()
        bam_in.close()
45 | 
46 | ################################################################################
47 | # __main__
48 | ################################################################################
49 | if __name__ == '__main__':
50 |     main()
51 | 


--------------------------------------------------------------------------------
/gtf_cut.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import sys
 4 | import gff
 5 | 
 6 | ################################################################################
 7 | # gtf_cut.py
 8 | #
 9 | # Cut a gtf key:value pair out of a gtf file.
10 | ################################################################################
11 | 
12 | 
13 | ################################################################################
14 | # main
15 | ################################################################################
def main():
    """Print the values of selected attribute keys for each line of a GTF file.

    -k takes a comma-separated list of keys; their values are printed
    tab-separated, with '-' for keys a line lacks. With -l the original line
    follows the values on the same output line. '-' as the file name reads
    from stdin. Comment and blank lines are skipped.
    """
    usage = 'usage: %prog [options] -k <key> <gtf file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='column', default=8, type='int')
    parser.add_option('-k', dest='key', help='Key to extract')
    parser.add_option('-l', dest='line_too', action='store_true', default=False, help='Print the line too [Default: %default]')
    (options,args) = parser.parse_args()

    # validate everything before opening the input
    if len(args) != 1:
        parser.error('Must provide gtf file (or - for stdin)')
    if not options.key:
        parser.error('Must provide key')
    keys = options.key.split(',')

    if args[0] == '-':
        gtf_open = sys.stdin
    else:
        gtf_open = open(args[0])

    try:
        for line in gtf_open:
            if line.startswith('#') or not line.strip():
                continue

            a = line.split('\t')
            kv = gff.gtf_kv(a[options.column])
            key_str = '\t'.join([kv.get(key,'-') for key in keys])

            if options.line_too:
                # strip the line's own newline; print() adds one already
                print('%s\t%s' % (key_str,line.rstrip('\r\n')))
            else:
                print(key_str)
    finally:
        if gtf_open is not sys.stdin:
            gtf_open.close()
47 | 
48 | 
49 | ################################################################################
50 | # __main__
51 | ################################################################################
52 | if __name__ == '__main__':
53 |     main()
54 | 


--------------------------------------------------------------------------------
/gtf_filter_expr.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import cufflinks, gff
 4 | 
 5 | ################################################################################
 6 | # gtf_filter_expr.py
 7 | #
 8 | # Filter a gtf file to only leave genes that are expressed over a specified
 9 | # threshold in a specified tissue/cell type.
10 | ################################################################################
11 | 
12 | 
13 | ################################################################################
14 | # main
15 | ################################################################################
def main():
    """Filter a GTF file down to genes expressed in one cell type.

    A line is echoed to stdout when the FPKM of its gene_id in the
    experiment named <cell type> exceeds the -t threshold. Comment and blank
    lines are skipped.
    """
    import sys

    usage = 'usage: %prog [options] <gtf file> <cell type>'
    parser = OptionParser(usage)
    parser.add_option('-t', dest='expr_t', type='float', default=.1, help='Minimum allowed fpkm value')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and cell type')
    gtf_file = args[0]
    cell_type = args[1]

    # get expression data
    cuff = cufflinks.fpkm_tracking()

    # find cell type experiment index (the first match wins)
    cell_indexes = [i for i, experiment in enumerate(cuff.experiments) if experiment == cell_type]
    if not cell_indexes:
        parser.error('Cell type %s does not match any quantified experiments' % cell_type)
    cell_i = cell_indexes[0]

    # parse gtf file
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            if line.startswith('#') or not line.strip():
                continue

            a = line.split('\t')
            gene_id = gff.gtf_kv(a[8])['gene_id']
            expr_vec = cuff.gene_expr(gene_id)
            if expr_vec[cell_i] > options.expr_t:
                # the line keeps its own newline
                sys.stdout.write(line)
45 |     
46 | 
47 | ################################################################################
48 | # __main__
49 | ################################################################################
50 | if __name__ == '__main__':
51 |     main()
52 | 


--------------------------------------------------------------------------------
/fpkm_tracking.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import pandas as pd
 4 | 
 5 | ################################################################################
 6 | # fpkm_tracking
 7 | #
 8 | # Print a table of FPKM abundance estimates with one gene/sample per row.
 9 | #
10 | # Using Pandas for this is stupid because we have to read the whole thing
11 | # into memory, and when you iterate over rows, it has to create a Series object.
12 | ################################################################################
13 | 
14 | 
15 | ################################################################################
16 | # main
17 | ################################################################################
def main():
    """Print FPKM estimates from a .fpkm_tracking table, one gene/sample per row.

    For every '<sample>_FPKM' column whose status is 'OK', a line
    'tracking_id<TAB>sample<TAB>fpkm' is printed. -g restricts the output to
    one gene_id.
    """
    usage = 'usage: %prog [options] <fpkm_tracking>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gene_id', help='This gene only')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide .fpkm_tracking file')
    fpkm_tracking_file = args[0]

    cuff = pd.read_csv(fpkm_tracking_file, sep='\t')
    columns = list(cuff.columns)

    # (sample, FPKM column position, status column position) per sample
    sample_cols = []
    for i, column in enumerate(columns):
        if column[-5:] == '_FPKM':
            sample = column[:-5]
            status_column = sample + '_status'
            if status_column in columns:
                status_i = columns.index(status_column)
            else:
                # fall back to the original layout assumption:
                # FPKM, conf_lo, conf_hi, status
                status_i = i+3
            sample_cols.append((sample, i, status_i))

    for gene_i, gene_series in cuff.iterrows():
        # str() so numeric ids still match the command-line string
        gene_id = str(gene_series['gene_id'])
        if options.gene_id is None or gene_id == options.gene_id:
            for sample, fpkm_i, status_i in sample_cols:
                # .iloc: plain integer indexing of a labelled Series is deprecated
                if gene_series.iloc[status_i] == 'OK':
                    cols = [str(gene_series['tracking_id']), sample, str(gene_series.iloc[fpkm_i])]
                    print('\t'.join(cols))
44 | 
45 | 
46 | ################################################################################
47 | # __main__
48 | ################################################################################
49 | if __name__ == '__main__':
50 |     main()
51 | 


--------------------------------------------------------------------------------
/size.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | from sys import getsizeof, stderr
 3 | from itertools import chain
 4 | from collections import deque
 5 | try:
 6 |     from reprlib import repr
 7 | except ImportError:
 8 |     pass
 9 | 
def total_size(o, handlers=None, verbose=False):
    """ Returns the approximate memory footprint an object and all of its contents.

    Automatically finds the contents of the following builtin containers and
    their subclasses:  tuple, list, deque, dict, set and frozenset.
    To search other containers, add handlers to iterate over their contents:

        handlers = {SomeContainerClass: iter,
                    OtherContainerClass: OtherContainerClass.get_elements}

    User handlers are consulted before the builtin ones, so a handler for a
    subclass of a builtin container overrides the builtin treatment. Each
    object is counted once, however many references point to it. With
    verbose=True the size, type and repr of every visited object is written
    to stderr.
    """
    dict_handler = lambda d: chain.from_iterable(d.items())
    builtin_handlers = [(tuple, iter),
                        (list, iter),
                        (deque, iter),
                        (dict, dict_handler),
                        (set, iter),
                        (frozenset, iter),
                       ]

    # user handlers take precedence: they go first in the lookup order and
    # replace a builtin handler registered for the same type
    user_handlers = list(handlers.items()) if handlers else []
    user_types = set(typ for typ, _ in user_handlers)
    all_handlers = user_handlers + [(typ, handler) for typ, handler in builtin_handlers
                                    if typ not in user_types]

    seen = set()                      # track which object id's have already been seen
    default_size = getsizeof(0)       # estimate sizeof object without __sizeof__

    def sizeof(o):
        if id(o) in seen:       # do not double count the same object
            return 0
        seen.add(id(o))
        s = getsizeof(o, default_size)

        if verbose:
            print(s, type(o), repr(o), file=stderr)

        # only the first matching handler is applied
        for typ, handler in all_handlers:
            if isinstance(o, typ):
                s += sum(map(sizeof, handler(o)))
                break
        return s

    return sizeof(o)
49 | 
50 | 
51 | ##### Example call #####
52 | 
53 | if __name__ == '__main__':
54 |     d = dict(a=1, b=2, c=3, d=[4,5,6,7], e='a string of chars')
55 |     print(total_size(d, verbose=True))
56 | 


--------------------------------------------------------------------------------
/w5_bw.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | import h5py
 5 | import numpy as np
 6 | import pyBigWig
 7 | 
 8 | '''
 9 | w5_bw.py
10 | 
11 | Convert a coverage wiggle HDF5 to BigWig.
12 | '''
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Convert a coverage wiggle HDF5 file to BigWig.

    The first argument is the input HDF5 file and the second the output
    BigWig. Every chromosome (all HDF5 keys, sorted, or the -c list) is
    written as one fixed-step track starting at position 0 with span and
    step 1.
    """
    # input comes first, output second (see the argument handling below)
    usage = 'usage: %prog [options] <in_h5_file> <out_bw_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='chr', 
        default=None, help='Comma-separated chromosomes')
    parser.add_option('-v', dest='verbose',
        default=False, action='store_true')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input HDF5 and output BigWig.')
    hdf5_file = args[0]
    bw_file = args[1]

    # open input first so a bad -c list is reported before any output exists
    h5_in = h5py.File(hdf5_file, 'r')
    try:
        # construct header
        if options.chr is not None:
            chroms = options.chr.split(',')
            missing = [chrom for chrom in chroms if chrom not in h5_in]
            if missing:
                parser.error('Chromosomes not found in HDF5: %s' % ','.join(missing))
        else:
            chroms = sorted(h5_in.keys())

        # chromosome and length
        header = [(chrom,len(h5_in[chrom])) for chrom in chroms]

        bw_out = pyBigWig.open(bw_file, 'w')
        try:
            # write header
            bw_out.addHeader(header)

            for chrom, length in header:
                if options.verbose:
                    print(chrom)

                # read values
                x = np.array(h5_in[chrom])

                # write the values into the BigWig, one per base from position 0
                bw_out.addEntries(chrom, 0, values=x, span=1, step=1)
        finally:
            bw_out.close()
    finally:
        h5_in.close()
63 | 
64 | 
65 | ################################################################################
66 | # __main__
67 | ################################################################################
68 | if __name__ == '__main__':
69 |     main()
70 | 


--------------------------------------------------------------------------------
/gtf2bed.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import gff
 4 | 
 5 | ################################################################################
 6 | # gtf2bed.py
 7 | #
 8 | # Convert a gtf file to a bed file.
 9 | ################################################################################
10 | 
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Convert a GTF file to BED12, one line per transcript, on stdout.

    The blocks are the transcript's exons, or its CDS segments with -c.
    Transcripts that have no blocks of the requested kind (e.g. non-coding
    transcripts with -c) are skipped.
    """
    usage = 'usage: %prog [options] <gtf file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='cds', action='store_true', default=False, help='Use CDS, not exons [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide gtf file')
    gtf_file = args[0]

    genes = gff.read_genes(gtf_file)

    for transcript_id in genes:
        g = genes[transcript_id]

        # NOTE(review): assumes g.cds / g.exons are position-sorted sequences
        # of objects with 1-based inclusive start/end - confirm in gff.py
        blocks = g.cds if options.cds else g.exons
        if not blocks:
            # nothing to report, and blocks[0] below would fail
            continue

        span_start = blocks[0].start
        block_sizes = ','.join([str(ex.end-ex.start+1) for ex in blocks])
        block_starts = ','.join([str(ex.start-span_start) for ex in blocks])

        cols = [g.chrom, str(span_start-1), str(blocks[-1].end), transcript_id, '0', g.strand, '0', '0', '255,0,0', str(len(blocks)), block_sizes, block_starts]

        print('\t'.join(cols))
44 |     
45 | 
46 | ################################################################################
47 | # __main__
48 | ################################################################################
49 | if __name__ == '__main__':
50 |     main()
51 | 


--------------------------------------------------------------------------------
/reservoir_sample.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from __future__ import print_function
 3 | 
 4 | from optparse import OptionParser
 5 | import gzip
 6 | import random
 7 | import sys
 8 | 
 9 | ################################################################################
10 | # reservoir_sample.py
11 | #
12 | # Randomly choose a subset of lines in a file using single pass
13 | # reservoir sampling.
14 | ################################################################################
15 | 
16 | 
17 | ################################################################################
18 | # main
19 | ################################################################################
def main():
    """Print a uniform random sample of lines using reservoir sampling.

    Reads <input_file> ('-' or 'stdin' for standard input, -z for gzip) in a
    single pass and prints <sample_num> lines chosen uniformly at random; a
    shorter input is printed whole. With -d the first line is treated as a
    header and always printed first.
    """
    usage = 'usage: %prog [options] <sample_num> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='header',
            default=False, action='store_true')
    parser.add_option('-z', dest='gzip',
            default=False, action='store_true')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide file and sample number')
    sample_num = int(args[0])
    input_file = args[1]

    if input_file in ['-','stdin']:
        input_in = sys.stdin
    elif options.gzip:
        input_in = gzip.open(input_file, 'rt')
    else:
        input_in = open(input_file)

    reservoir = []
    try:
        if options.header:
            print(input_in.readline(), end='')

        seen = 0    # number of data lines read so far
        for line in input_in:
            if seen < sample_num:
                # fill
                reservoir.append(line)
            else:
                # sample: this is line number seen+1, so it must be kept with
                # probability sample_num/(seen+1). randint is inclusive on
                # both ends, hence the upper bound is seen, not seen+1.
                j = random.randint(0, seen)
                if j < sample_num:
                    reservoir[j] = line
            seen += 1
    finally:
        if input_in is not sys.stdin:
            input_in.close()

    # print
    print(''.join(reservoir), end='')
63 | 
64 | ################################################################################
65 | # __main__
66 | ################################################################################
67 | if __name__ == '__main__':
68 |     main()
69 | 


--------------------------------------------------------------------------------
/bl2gff.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import glob, sys, pdb
 4 | 
 5 | ################################################################################
 6 | # bl2gff.py
 7 | #
 8 | # Convert alignments from my Blast output to features in a .gff file.
 9 | ################################################################################
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Convert whitespace-delimited alignment rows into GFF feature lines.

    Command line: ``<blast file>``. For every non-blank row the following
    whitespace-separated fields are read (0-based indexes):

      0, 1   start and end written to the GFF line
      3, 4   coordinates whose order decides the strand ('+' if 3 < 4)
      7      aligned length on the 2nd sequence
      10     length of the 2nd sequence
      12     % identity
      -2, -1 names used as GFF seqname and attribute column

    A row is printed when its identity exceeds ``-i`` and its aligned
    length exceeds ``-p`` times the 2nd sequence length.
    """
    usage = 'usage: %prog [options] <blast file>'
    parser = OptionParser(usage)
    parser.add_option('-f', dest='feature_name', default='domain', help='Feature name [Default: %default]')
    parser.add_option('-p', dest='pct_t', type='float', default=0, help='Percentage of the 2nd sequence that must be covered by the alignment [Default: %default]')
    parser.add_option('-i', dest='idy_t', type='float', default=0, help='% identity that must be exceeded by the alignment [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        # parser.error() exits the program itself
        parser.error('Must provide blast output file')
    blast_file = args[0]

    with open(blast_file) as blast_in:
        for line in blast_in:
            a = line.split()
            if not a:
                # tolerate blank lines
                continue

            header1 = a[-2]
            header2 = a[-1]

            start1 = int(a[0])
            end1 = int(a[1])

            alen2 = int(a[7])
            len2 = int(a[10])
            idy = float(a[12])
            if int(a[3]) < int(a[4]):
                strand = '+'
            else:
                strand = '-'

            if idy > options.idy_t and alen2 > len2*options.pct_t:
                gff_a = [header1, 'blast', options.feature_name, str(start1), str(end1), '.', strand, '.', header2]
                # single-argument print() behaves the same on Python 2 and 3
                print('\t'.join(gff_a))
48 |         
49 | 
50 | ################################################################################
51 | # __main__
52 | ################################################################################
53 | if __name__ == '__main__':
54 |     main()
55 |     #pdb.runcall(main)
56 | 


--------------------------------------------------------------------------------
/r/plot_gff_cov_meta.r:
--------------------------------------------------------------------------------
# Plot coverage against transcript position from a table, first as raw
# values and then (when a Type column is present) normalized per group.
#
# Positional command-line arguments:
#   1  input table (read with a header line)
#   2  output file prefix
#   3  span passed to the loess smoother
#   4  legend label for the "Primary" group
#   5  legend label for the "Control" group
library(ggplot2)
# NOTE(review): no plyr function is called anywhere in this script; confirm
# before dropping the dependency.
library(plyr)

ca = commandArgs(trailing=T)
df.file = ca[1]
out.pre = ca[2]
smooth.span = as.numeric(ca[3])
label.primary = ca[4]
label.control = ca[5]

# the plots below use the columns Index, Coverage and (optionally) Type
df = read.table(df.file, header=T, quote="\"")

# unnormalized
if (ncol(df) == 2) {
    # two columns: a single series without a grouping column
    gp = ggplot(df, aes(x=Index, y=Coverage))
} else {
    # more than two columns: color the points by Type and relabel the legend
    gp = ggplot(df, aes(x=Index, y=Coverage, color=Type)) +
        scale_x_continuous("% in transcript") +
        scale_color_manual("", values=c("#F46D43", "#66BD63"), breaks=c("Primary","Control"), labels=c(label.primary, label.control))

        #scale_color_brewer(palette="Set1")
}

gp +
    geom_point() +
    stat_smooth(method="loess", span=smooth.span) +
    theme_bw() +
    theme(text=element_text(size=25)) +
    theme(legend.justification=c(1,0), legend.position=c(1,0))

# ggsave() is called without a plot argument, so it presumably saves the plot
# built just above (ggplot2's last plot) -- keep these statements adjacent.
ggsave(paste(out.pre,"_raw.pdf",sep=""))


# normalized
if (ncol(df) > 2) {
    # the values are so low, I want to boost them up
    fudge=10

    # total coverage per group; every row whose Type is not "Control" counts
    # towards the primary sum
    control.sum = sum(df[df$Type=="Control",]$Coverage)
    primary.sum = sum(df[df$Type!="Control",]$Coverage)

    # scale each row by fudge / (total coverage of its own group)
    df$Coverage.Norm = df$Coverage
    for (i in 1:nrow(df)) {
        if (df[i,"Type"] == "Control") {
            df[i,"Coverage.Norm"] = fudge * df[i,"Coverage"] / control.sum
        } else {
            df[i,"Coverage.Norm"] = fudge * df[i,"Coverage"] / primary.sum
        }
    }

    ggplot(df, aes(x=Index, y=Coverage.Norm, color=Type)) +
        scale_x_continuous("% in transcript") +
        scale_color_manual("", values=c("#F46D43", "#66BD63"), breaks=c("Primary","Control"), labels=c(label.primary, label.control)) +
        geom_point() +
        stat_smooth(method="loess", span=smooth.span) +
        scale_y_continuous("Normalized coverage") +
        theme_bw() +
        theme(text=element_text(size=25)) +
        theme(legend.justification=c(1,0), legend.position=c(1,0))

# scale_color_brewer(palette="Set1")

    # as above, no plot argument is given to ggsave()
    ggsave(paste(out.pre,"_norm.pdf",sep=""))
}
65 | 


--------------------------------------------------------------------------------
/bam_len_hist.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | from rpy2.robjects.packages import importr
 4 | import rpy2.robjects as ro
 5 | import rpy2.robjects.lib.ggplot2 as ggplot2
 6 | import pdb
 7 | import pysam
 8 | 
 9 | grdevices = importr('grDevices')
10 | 
11 | ################################################################################
12 | # bam_len_hist.py
13 | #
14 | # Plot a histogram of the length of alignments in a BAM file.
15 | ################################################################################
16 | 
17 | 
18 | ################################################################################
19 | # main
20 | ################################################################################
def main():
    """Count the ``qlen`` of every read in a BAM file and plot the counts.

    Command line: a single BAM file path. A bar chart with one bar per
    length between the smallest and largest observed value (lengths never
    seen get a count of zero) is written to ``align_lengths.pdf`` in the
    current directory.
    """
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    #parser.add_option()
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide BAM file')
    bam_file = args[0]

    # tally reads per length
    length_counts = {}
    for read in pysam.Samfile(bam_file, 'rb'):
        qlen = read.qlen
        length_counts[qlen] = length_counts.get(qlen, 0) + 1

    shortest = min(length_counts)
    longest = max(length_counts)
    all_lengths = range(shortest, longest+1)

    # R data frame with one row per length in the observed range
    lengths_r = ro.IntVector(all_lengths)
    counts_r = ro.IntVector([length_counts.get(n, 0) for n in all_lengths])
    df = ro.DataFrame({'length':lengths_r, 'counts':counts_r})

    # assemble the plot layer by layer
    gp = ggplot2.ggplot(df)
    gp = gp + ggplot2.aes_string(x='length', y='counts')
    gp = gp + ggplot2.geom_bar(stat='identity')
    gp = gp + ggplot2.scale_x_continuous('Alignment length')
    gp = gp + ggplot2.scale_y_continuous('')

    # render into the PDF device
    grdevices.pdf(file='align_lengths.pdf')
    gp.plot()
    grdevices.dev_off()
56 | 
57 |     
58 | 
59 | ################################################################################
60 | # __main__
61 | ################################################################################
62 | if __name__ == '__main__':
63 |     main()
64 |     #pdb.runcall(main)
65 | 


--------------------------------------------------------------------------------
/isoforms_fpkm.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | ################################################################################
 5 | # isoforms_fpkm.py
 6 | #
 7 | # Print the FPKM values for all isoforms of the given gene.
 8 | ################################################################################
 9 | 
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Print the FPKM of every isoform of one gene, one line per sample.

    Command line: ``<gene_id> <iso_ft>`` where ``iso_ft`` is a tab-separated
    tracking table with a header line. The gene id is read from the 4th
    column; sample data starts at the 10th column in groups of four, with
    the FPKM value first and the status fourth. The sample name is the
    header of the FPKM column with its trailing ``_FPKM`` removed.

    For samples whose status is ``FAIL`` or ``HIDATA`` the status is printed
    in place of the FPKM value.
    """
    usage = 'usage: %prog [options] <gene_id> <iso_ft>'
    parser = OptionParser(usage)
    #parser.add_option()
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide a gene_id and isoforms.fpkm_tracking file')
    gene_id, iso_ft = args

    with open(iso_ft) as fpkm_in:
        # get headers
        headers = fpkm_in.readline().split()

        # width of the sample column: the longest sample name
        sample_len = max([0] + [len(h[:-5]) for h in headers if h.endswith('_FPKM')])

        for line in fpkm_in:
            a = line.split('\t')
            a[-1] = a[-1].rstrip()

            if len(a) < 4:
                # blank or truncated line: no gene id column to compare
                continue

            tracking_id = a[0]
            if a[3] != gene_id:
                continue

            # each sample occupies four consecutive columns
            for i in range(9, len(a), 4):
                sample = headers[i][:-5]
                status = a[i+3]

                if status in ['FAIL','HIDATA']:
                    print('%-18s  %*s  %11s' % (tracking_id, sample_len, sample, status))
                else:
                    print('%-18s  %*s  %11.3f' % (tracking_id, sample_len, sample, float(a[i])))
61 |     
62 | 
63 | ################################################################################
64 | # __main__
65 | ################################################################################
66 | if __name__ == '__main__':
67 |     main()
68 | 


--------------------------------------------------------------------------------
/vcf_tss.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import os
 4 | import pdb
 5 | 
 6 | import pybedtools
 7 | 
 8 | '''
 9 | vcf_tss.py
10 | 
11 | Add TSS distance INFO column to a VCF file.
12 | '''
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Annotate each VCF record with its distance to the closest TSS.

    Command line: ``<in_vcf_file> <out_vcf_file>``. The header of the input
    VCF is copied to the output with an extra ``##INFO`` line describing the
    new ``TS`` tag inserted just before the ``#CHROM`` line. Every record is
    then matched to its closest feature in the TSS GFF (pybedtools
    ``closest`` with ``d=True, t='first'``) and the last field of the result,
    the distance, is appended to the INFO column as ``TS=<distance>``.

    Only the first eight VCF columns are written for each record.

    The TSS GFF is taken from ``-g``; when ``-g`` is not given it is looked
    up under the directory named by the ``HG38`` environment variable.
    """
    usage = 'usage: %prog [options] <in_vcf_file> <out_vcf_file>'
    parser = OptionParser(usage)
    # The default is resolved after parsing so a missing HG38 variable does
    # not break runs that pass -g explicitly (or just ask for --help).
    parser.add_option('-g', dest='tss_gff_file', default=None,
        help='TSS GFF file [Default: $HG38/genes/gencode28/gencode.v28.basic.annotation.tss.gff]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input and output VCF files')
    in_vcf_file, out_vcf_file = args

    if options.tss_gff_file is None:
        if 'HG38' not in os.environ:
            parser.error('Must provide -g or set the HG38 environment variable')
        options.tss_gff_file = '%s/genes/gencode28/gencode.v28.basic.annotation.tss.gff' % os.environ['HG38']

    with open(out_vcf_file, 'w') as out_vcf_open:
        # print header
        with open(in_vcf_file) as in_vcf_open:
            for line in in_vcf_open:
                if not line.startswith('#'):
                    break
                if line.startswith('#CHROM'):
                    # TS is written to the INFO column, so it must be
                    # declared as an INFO (not FORMAT) field
                    print('##INFO=<ID=TS,Number=1,Type=Integer,Description="TSS distance">', file=out_vcf_open)
                print(line, end='', file=out_vcf_open)

        # intersect
        in_vcf_bedtool = pybedtools.BedTool(in_vcf_file)
        tss_bedtool = pybedtools.BedTool(options.tss_gff_file)

        for closest_a in in_vcf_bedtool.closest(tss_bedtool, d=True, t='first'):
            a = closest_a[:8]
            # a[-1] is the INFO column; '.' marks an empty INFO
            if a[-1] == '.':
                a[-1] = 'TS=%s' % closest_a[-1]
            else:
                a[-1] += ';TS=%s' % closest_a[-1]
            print('\t'.join(a), file=out_vcf_open)
58 | 
59 | 
60 | ################################################################################
61 | # __main__
62 | ################################################################################
63 | if __name__ == '__main__':
64 |     main()
65 | 


--------------------------------------------------------------------------------
/peaks_venn.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import os, subprocess
 4 | import math, os, stats, subprocess
 5 | 
 6 | import matplotlib
 7 | matplotlib.use('Agg')
 8 | import matplotlib.pyplot as plt
 9 | from matplotlib_venn import venn2
10 | 
11 | ################################################################################
12 | # peaks_venn.py
13 | #
14 | # Make a venn diagram comparing two sets of peak calls.
15 | ################################################################################
16 | 
17 | 
18 | ################################################################################
19 | # main
20 | ################################################################################
def main():
    """Draw a two-set Venn diagram comparing two peak BED files.

    Command line: ``<peaks1_bed> <peaks2_bed> <out_pdf>``; ``--l1`` and
    ``--l2`` set the labels of the two circles.

    The set sizes are the line counts of the two files. The overlap is the
    number of lines printed by ``intersectBed -u -a peaks1 -b peaks2``, and
    that same number is subtracted from both set sizes.

    Raises:
        RuntimeError: if ``intersectBed`` exits with a non-zero status.
    """
    usage = 'usage: %prog [options] <peaks1_bed> <peaks2_bed> <out_pdf>'
    parser = OptionParser(usage)
    parser.add_option('--l1', dest='label1', default='peaks1', help='Label for peak set 1')
    parser.add_option('--l2', dest='label2', default='peaks2', help='Label for peak set 2')
    (options,args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide two peaks BED files and output PDF')
    peaks1_bed, peaks2_bed, out_pdf = args

    # count individual
    peaks1_count = count_peaks(peaks1_bed)
    peaks2_count = count_peaks(peaks2_bed)

    # count overlap; an argument list (no shell) keeps file names containing
    # spaces or shell metacharacters intact
    p = subprocess.Popen(['intersectBed', '-u', '-a', peaks1_bed, '-b', peaks2_bed], stdout=subprocess.PIPE)
    copeaks_count = sum(1 for _ in p.stdout)
    p.communicate()
    if p.returncode != 0:
        # a failed intersection would otherwise be drawn as "no overlap"
        raise RuntimeError('intersectBed exited with status %d' % p.returncode)

    plt.figure()
    venn2(subsets=(peaks1_count-copeaks_count, peaks2_count-copeaks_count, copeaks_count), set_labels=[options.label1, options.label2], set_colors=['#e41a1c', '#A1A838'])
    plt.savefig(out_pdf)
    plt.close()
50 | 
51 | 
def count_peaks(bed_file):
    """Return the number of lines in ``bed_file``.

    Every line is counted, whatever its content. The file is closed before
    returning.
    """
    with open(bed_file) as bed_in:
        return sum(1 for _ in bed_in)
57 | 
58 | ################################################################################
59 | # __main__
60 | ################################################################################
61 | if __name__ == '__main__':
62 |     main()
63 | 


--------------------------------------------------------------------------------
/multiz_lncrna.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | from pygr import worldbase
 4 | import gff
 5 | 
 6 | ################################################################################
 7 | # multiz_lncrna.py
 8 | #
 9 | # Return hg19 46-way multiz alignments of a specified lncRNA gene. By default,
10 | # just do the exons, but as an option do the entire span.
11 | ################################################################################
12 | 
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Print multiz alignment edges for every GTF interval of one gene.

    Command line: ``<gene id>``. Each line of the lncRNA GTF (``-l``) whose
    ``gene_id`` attribute matches is turned into an hg19 sequence interval,
    and for each interval the edges of the hg19 46-way multiz alignment are
    printed: the query interval, source and destination intervals, the
    destination's genome name and the edge length, separated by spaces.

    NOTE(review): the ``-s`` option is parsed but never consulted below.
    """
    usage = 'usage: %prog [options] <gene id>'
    parser = OptionParser(usage)
    parser.add_option('-l', dest='lncrna_gtf', default='/Users/dk/research/common/data/lncrna/lnc_catalog.gtf', help='lncRNA gtf file [Default: %default]')
    parser.add_option('-s', dest='span', action='store_true', default=False, help='Map the gene\'s entire span, i.e. introns too [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide gene id')
    gene_id = args[0]

    # get human genome
    hg19 = worldbase.Bio.Seq.Genome.HUMAN.hg19()

    # get gene exon intervals
    gene_ivals = []
    with open(options.lncrna_gtf) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            if gff.gtf_kv(a[8])['gene_id'] == gene_id:
                chrom = a[0]
                start = int(a[3])
                end = int(a[4])
                # ignoring orientation at the moment

                # GTF coordinates are 1-based and inclusive, sequence slices
                # are 0-based and end-exclusive: shift the start by one so
                # the first base of the feature is not dropped.
                gene_ivals.append(hg19[chrom][start-1:end])

    # get hg19 msa
    msa = worldbase.Bio.MSA.UCSC.hg19_multiz46way()

    # map returned sequences back to genome name
    idDict = ~(msa.seqDict)

    # print alignments
    for gi in gene_ivals:
        for src, dest, edg in msa[gi].edges():
            # joined by hand so the output is the same on Python 2 and 3
            print(' '.join([repr(gi), repr(src), repr(dest), str(idDict[dest]), str(edg.length())]))
54 | 
55 | 
56 | ################################################################################
57 | # __main__
58 | ################################################################################
59 | if __name__ == '__main__':
60 |     main()
61 | 


--------------------------------------------------------------------------------
/fpkm_hist.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import math, os, pdb, random
 4 | import cufflinks, gff
 5 | 
 6 | from rpy2.robjects.packages import importr
 7 | import rpy2.robjects as ro
 8 | import rpy2.robjects.lib.ggplot2 as ggplot2
 9 | grdevices = importr('grDevices')
10 | 
11 | ################################################################################
12 | # fpkm_hist.py
13 | #
14 | # Plot a histogram of the max log2 FPKM values for the genes in a gtf file.
15 | ################################################################################
16 | 
17 | ################################################################################
18 | # main
19 | ################################################################################
def main():
    """Plot a histogram of log2 of each gene's largest FPKM value.

    Command line: ``<gtf file> <fpkm tracking>``. Gene ids are collected
    from the ``gene_id`` attribute of every GTF line; for each gene the
    maximum of ``cuff.gene_expr(gene_id)`` is taken and, when positive, its
    base-2 logarithm is added to the histogram. The plot is written next to
    the GTF as ``<gtf prefix>_fpkmhist.pdf``.
    """
    usage = 'usage: %prog [options] <gtf file> <fpkm tracking>'
    parser = OptionParser(usage)
    #parser.add_option('-m', dest='fpkm_min', type='float', default=0.25, help='Minimum FPKM [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error(usage)
    gtf_file, fpkm_tracking_file = args

    # unique gene ids named in the GTF
    gene_ids = set(gff.gtf_kv(gtf_line.split('\t')[8])['gene_id']
                   for gtf_line in open(gtf_file))

    # log2 of the largest expression value per gene, zeros dropped
    cuff = cufflinks.fpkm_tracking(fpkm_tracking_file)
    peak_fpkms = (max(cuff.gene_expr(gid)) for gid in gene_ids)
    log_fpkms = [math.log(peak, 2) for peak in peak_fpkms if peak > 0]

    # hand the values over to R
    df = ro.DataFrame({'fpkm':ro.FloatVector(log_fpkms)})

    # histogram with fixed-width bins
    gp = ggplot2.ggplot(df)
    gp = gp + ggplot2.aes_string(x='fpkm')
    gp = gp + ggplot2.geom_histogram(binwidth=0.2)

    # output name is derived from the GTF path without its extension
    gtf_pre, _ = os.path.splitext(gtf_file)
    grdevices.pdf(file='%s_fpkmhist.pdf' % gtf_pre)
    gp.plot()
    grdevices.dev_off()
60 | 
61 | 
62 | ################################################################################
63 | # __main__
64 | ################################################################################
65 | if __name__ == '__main__':
66 |     main()
67 |     #pdb.runcall(main)
68 | 


--------------------------------------------------------------------------------
/bg_w5.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import gzip
 4 | 
 5 | import h5py
 6 | import numpy as np
 7 | 
 8 | '''
 9 | bg_w5.py
10 | 
11 | Convert a BedGraph w/o overlapping entries to wig5.
12 | '''
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Convert a BedGraph file into an HDF5 file of per-position arrays.

    Command line: ``<in_bg_file> <genome_file> <out_h5_file>``.

    ``genome_file`` lists one chromosome per line as ``<name> <length>``; a
    zero-filled float16 array of that length is created for each. Every
    BedGraph entry ``chrom start end value`` then assigns ``value`` to
    ``array[start:end]`` (divided by ``end-start`` first when ``-l`` is
    given). Input whose name ends in ``.gz`` is read through gzip. Each
    array is written, after ``np.nan_to_num``, as a gzip-compressed float16
    dataset named after its chromosome.

    Raises:
        KeyError: if a BedGraph entry names a chromosome that is missing
            from ``genome_file``.
    """
    usage = 'usage: %prog [options] <in_bg_file> <genome_file> <out_h5_file>'
    parser = OptionParser(usage)
    parser.add_option('-l', dest='norm_len',
        default=False, action='store_true',
        help='Normalize values by site length [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 3:
        # the input is a BedGraph, not a BigWig
        parser.error('Must provide input BedGraph, genome file, output HDF5.')
    bg_file, genome_file, hdf5_file = args

    # initialize chromosome arrays
    chrm_values = {}
    with open(genome_file) as genome_in:
        for line in genome_in:
            a = line.split()
            if len(a) < 2:
                # blank or incomplete line
                continue
            chrm_values[a[0]] = np.zeros(int(a[1]), dtype='float16')

    # write bedgraph entries
    if bg_file.endswith('.gz'):
        bg_open = gzip.open(bg_file, 'rt')
    else:
        bg_open = open(bg_file)

    with bg_open:
        for line in bg_open:
            # comments and UCSC track/browser lines carry no data
            if line.startswith(('#', 'track', 'browser')):
                continue

            a = line.split()
            if len(a) < 4:
                continue

            chrm = a[0]
            start = int(a[1])
            end = int(a[2])
            v = float(a[3])
            if options.norm_len and end > start:
                # a zero-length entry covers no positions, so there is
                # nothing to normalize (and nothing to divide by)
                v /= (end-start)
            chrm_values[chrm][start:end] = v

    # write gzipped into HDF5
    with h5py.File(hdf5_file, 'w') as h5_out:
        for chrm in chrm_values:
            h5_out.create_dataset(chrm, data=np.nan_to_num(chrm_values[chrm]),
                dtype='float16', compression='gzip', shuffle=True)
66 | 
67 | 
68 | 
69 | ################################################################################
70 | # __main__
71 | ################################################################################
72 | if __name__ == '__main__':
73 |     main()
74 | 


--------------------------------------------------------------------------------
/gtf_homologues.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import os, subprocess, tempfile
 4 | import gff
 5 | 
 6 | ################################################################################
 7 | # gtf_homologues.py
 8 | #
 9 | # Make a table describing candidate homologue genes as determined by a
10 | # transmap from one genome to another.
11 | ################################################################################
12 | 
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Print, for every gene of one GTF, the genes it maps onto in another.

    Command line: ``<chain_file> <net_file> <gtf_from> <gtf_to>``.

    ``gtf_from`` is first mapped with the external ``chain_map.py -k gene_id``
    script into a temporary file, which is then intersected with ``gtf_to``
    using ``intersectBed -wo -s``. For each intersection line the source
    gene is read from the second ``;``-separated entry of column 9 and the
    target gene from the ``gene_id`` attribute of column 18.

    One line is printed per gene of ``gtf_from``: the gene id, a tab, and
    the space-separated target gene ids (``-`` when there are none).

    The temporary file is removed even when a step fails.
    """
    usage = 'usage: %prog [options] <chain_file> <net_file> <gtf_from> <gtf_to>'
    parser = OptionParser(usage)
    #parser.add_option()
    (options,args) = parser.parse_args()

    if len(args) != 4:
        parser.error('Must provide chain file, net file, and two GTF files')
    chain_file, net_file, gtf_from, gtf_to = args

    homologues = {}
    from_map_gtf_fd, from_map_gtf_file = tempfile.mkstemp()
    try:
        # transmap to new genome; argument lists (no shell) keep file names
        # with spaces or shell metacharacters intact
        with os.fdopen(from_map_gtf_fd, 'w') as from_map_gtf_out:
            subprocess.call(['chain_map.py', '-k', 'gene_id', '-n', net_file, chain_file, gtf_from], stdout=from_map_gtf_out)

        # intersect w/ gtf_to; universal_newlines gives text lines on
        # Python 3 as well
        p = subprocess.Popen(['intersectBed', '-wo', '-s', '-a', from_map_gtf_file, '-b', gtf_to], stdout=subprocess.PIPE, universal_newlines=True)
        for line in p.stdout:
            a = line.split('\t')

            kv_to = gff.gtf_kv(a[17])

            gid_from = a[8].split(';')[1].strip()
            gid_to = kv_to['gene_id']

            homologues.setdefault(gid_from,set()).add(gid_to)
        p.communicate()
    finally:
        os.remove(from_map_gtf_file)

    # find all genes
    genes = set()
    with open(gtf_from) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            genes.add(gff.gtf_kv(a[8])['gene_id'])

    # print table
    for g in genes:
        print('%s\t%s' % (g,' '.join(homologues.get(g,['-']))))
61 | 
62 | 
63 | ################################################################################
64 | # __main__
65 | ################################################################################
66 | if __name__ == '__main__':
67 |     main()
68 | 


--------------------------------------------------------------------------------
/gsea_rnk.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import math, os
 4 | 
 5 | ################################################################################
 6 | # gsea_rnk.py
 7 | #
 8 | # Output a set of .rnk files for GSEA from a cuffdiff .diff file.
 9 | ################################################################################
10 | 
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Write one GSEA ``.rnk`` file per sample pair found in a .diff table.

    Command line: ``<diff>``, a tab-separated file with a header line.
    Columns read (0-based): 2 gene name, 4 and 5 sample names, 6 status,
    7 and 8 the two FPKM values, 9 fold change, 10 test statistic.

    A row is used when its status is ``OK``, its test statistic is not NaN
    and, if ``-m`` is given, at least one of the two FPKM values exceeds it.
    Each used row adds ``<gene name>\\t<fold change>`` to
    ``<out_dir>/<sample1>_<sample2>.rnk``, with hyphens in the sample names
    replaced by underscores. The output directory is created if missing.
    """
    usage = 'usage: %prog [options] <diff>'
    parser = OptionParser(usage)
    parser.add_option('-m', dest='min_fpkm', type='float', help='Minimum FPKM that at least one of the two samples must exceed')
    parser.add_option('-o', dest='out_dir', default='.', help='Output directory [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide .diff')
    diff_file = args[0]

    if not os.path.isdir(options.out_dir):
        os.makedirs(options.out_dir)

    # one open output file per (sample1, sample2) pair
    comparison_out = {}
    try:
        with open(diff_file) as diff_in:
            diff_in.readline()  # skip header
            for line in diff_in:
                a = line.split('\t')

                gene_name = a[2]
                sample1 = a[4].replace('-','_')  # cmd line gsea cannot handle hyphens
                sample2 = a[5].replace('-','_')
                status = a[6]
                fpkm1 = float(a[7])
                fpkm2 = float(a[8])
                fold_change = float(a[9])
                tstat = float(a[10])

                if status != 'OK' or math.isnan(tstat):
                    continue
                if not (options.min_fpkm is None or fpkm1 > options.min_fpkm or fpkm2 > options.min_fpkm):
                    continue

                pair = (sample1, sample2)
                if pair not in comparison_out:
                    comparison_out[pair] = open('%s/%s_%s.rnk' % (options.out_dir, sample1, sample2), 'w')

                comparison_out[pair].write('%s\t%f\n' % (gene_name, fold_change))
    finally:
        # close every output even if a malformed row aborts the loop
        for rnk_out in comparison_out.values():
            rnk_out.close()
60 | 
61 | 
62 | ################################################################################
63 | # __main__
64 | ################################################################################
65 | if __name__ == '__main__':
66 |     main()
67 | 


--------------------------------------------------------------------------------
/lnc_expression.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import cufflinks, gff
 4 | import os
 5 | 
 6 | ################################################################################
# lnc_expression.py
 8 | #
 9 | # Print a summary of the lncrna gene's expression.
10 | ################################################################################
11 | 
12 | 
13 | ################################################################################
14 | # main
15 | ################################################################################
def main():
    """Print the expression of one lncRNA gene or of its transcripts.

    Command line: ``<gene/transcript id>``. An id containing ``XLOC`` is
    treated as a gene id, anything else as a transcript id.

    With ``-t`` the ``isoforms.fpkm_tracking`` file in the cufflinks
    directory is used and expression is printed per transcript: for a gene
    id, every transcript of that gene listed in the lncRNA GTF; for a
    transcript id, just that transcript. Without ``-t`` the
    ``genes.fpkm_tracking`` file is used and a transcript id is first
    translated to its gene through ``gff.t2g``.
    """
    usage = 'usage: %prog [options] <gene/transcript id>'
    parser = OptionParser(usage)
    home_dir = os.path.expanduser('~')
    parser.add_option('-c', dest='cuff_dir', default='%s/research/common/data/lncrna'%home_dir, help='Cufflinks output directory with .fpkm_tracking files [Default: %default]')
    parser.add_option('-l', dest='lnc_gtf', default='%s/research/common/data/lncrna/lnc_catalog.gtf'%home_dir, help='lncRNA catalog gtf file [Default: %default]')
    parser.add_option('-t', dest='transcript_expr', default=False, action='store_true', help='Return transcript expression rather than gene [Default: %default]')
    (options,args) = parser.parse_args()

    if not args:
        # report a usage error rather than failing on args[0] below
        parser.error('Must provide gene or transcript id')
    query_id = args[0]
    is_gene_id = 'XLOC' in query_id

    if options.transcript_expr:
        cuff = cufflinks.fpkm_tracking('%s/isoforms.fpkm_tracking' % options.cuff_dir)

        if is_gene_id:
            # collect every transcript the GTF assigns to this gene
            trans_ids = set()
            with open(options.lnc_gtf) as gtf_in:
                for line in gtf_in:
                    a = line.split('\t')
                    kv = gff.gtf_kv(a[8])
                    if kv['gene_id'] == query_id:
                        trans_ids.add(kv['transcript_id'])
        else:
            trans_ids = [query_id]

        for trans_id in trans_ids:
            print('%s:' % trans_id)
            cuff.gene_expr_print(trans_id)

    else:
        cuff = cufflinks.fpkm_tracking('%s/genes.fpkm_tracking' % options.cuff_dir)

        if is_gene_id:
            gene_id = query_id
        else:
            t2g = gff.t2g(options.lnc_gtf)
            gene_id = t2g[query_id]

        cuff.gene_expr_print(gene_id)
51 |     
52 | 
53 | ################################################################################
54 | # __main__
55 | ################################################################################
56 | if __name__ == '__main__':
57 |     main()
58 | 


--------------------------------------------------------------------------------
/nuc2gff.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import glob, sys, pdb
 4 | 
 5 | ################################################################################
 6 | # nuc2gff.py
 7 | #
 8 | # Convert alignments from a nucmer coords file to features in a .gff file.
 9 | ################################################################################
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    """Convert alignments in a nucmer coords file to GFF lines on stdout.

    The first five lines of the coords file are skipped as header. For each
    remaining line, the alignment is printed as a feature on the first
    sequence when its identity exceeds -i and the aligned span on the second
    sequence exceeds the -p fraction of that sequence's length. The strand
    is '-' when the second sequence's coordinates are descending.
    """
    usage = 'usage: %prog [options] <coords file>'
    parser = OptionParser(usage)
    parser.add_option('-f', dest='feature_name', default='domain', help='Feature name [Default: %default]')
    parser.add_option('-p', dest='pct_t', type='float', default=0.9, help='Percentage of the 2nd sequence that must be covered by the alignment [Default: %default]')
    parser.add_option('-i', dest='idy_t', type='float', default=0.8, help='% identity that must be exceeded by the alignment [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide nucmer output coords file')
    coords_file = args[0]

    with open(coords_file) as cf:
        for line_i, line in enumerate(cf):
            # skip the 5-line header
            if line_i < 5:
                continue

            a = line.split()
            if not a:
                # tolerate blank lines (e.g. at the end of the file)
                continue

            # the last two columns are the sequence names
            header1 = a[-2]
            header2 = a[-1]

            start1 = int(a[0])
            end1 = int(a[1])

            # identity is given as a percentage
            idy = float(a[9])/100.0
            len2 = int(a[12])

            # order the second sequence's coordinates and infer the strand
            if int(a[3]) < int(a[4]):
                strand = '+'
                start2 = int(a[3])
                end2 = int(a[4])
            else:
                strand = '-'
                start2 = int(a[4])
                end2 = int(a[3])

            if idy > options.idy_t and end2-start2+1 > len2*options.pct_t:
                gff_a = [header1, 'nucmer', options.feature_name, str(start1), str(end1), '.', strand, '.', header2]
                print('\t'.join(gff_a))
58 |  
59 | 
60 | ################################################################################
61 | # __main__
62 | ################################################################################
63 | if __name__ == '__main__':
64 |     main()
65 |     #pdb.runcall(main)
66 | 


--------------------------------------------------------------------------------
/plot_fragment_lengths.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import pysam
 4 | 
 5 | import matplotlib
 6 | matplotlib.use('Agg')
 7 | import matplotlib.pyplot as plt
 8 | 
 9 | import seaborn as sns
10 | sns.set_style('ticks')
11 | 
12 | ################################################################################
13 | # plot_fragment_lengths.py
14 | #
15 | # Plot the distribution of fragment lengths
16 | ################################################################################
17 | 
18 | 
19 | ################################################################################
20 | # main
21 | ################################################################################
def main():
    """Plot the distribution of absolute template lengths in a BAM file.

    Counts abs(template_length) over every alignment, zeroes the count for
    length 0, and plots counts for lengths below the maximum. When -m is not
    given, the maximum is the first length (capped at 1000) at which the
    cumulative count reaches 99% of the fragments shorter than 10000.
    """
    parser = OptionParser('usage: %prog [options] <bam> <out_pdf>')
    parser.add_option('-m', dest='max_length', type='int')
    options, args = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide BAM file and output PDF')
    bam_file, out_pdf = args

    # histogram of absolute template lengths
    tlens = {}
    for alignment in pysam.Samfile(bam_file):
        tl = abs(alignment.template_length)
        if tl in tlens:
            tlens[tl] += 1
        else:
            tlens[tl] = 1

    # not sure what 0 means
    tlens[0] = 0

    if options.max_length is None:
        total = 0
        for tl in range(10000):
            total += tlens.get(tl, 0)
        cutoff = 0.99*total

        # walk up the lengths until 99% of fragments are covered
        length = 1
        cumulative = tlens.get(length, 0)
        while cumulative < cutoff and length < 1000:
            print('%s %s %s' % (length, cumulative, cutoff))
            length += 1
            cumulative += tlens.get(length, 0)

        options.max_length = length

    length_counts = []
    for tl in range(options.max_length):
        length_counts.append(tlens.get(tl, 0))

    plt.figure()
    plt.plot(length_counts)
    plt.xlabel('Fragment length')
    plt.xlim(0, options.max_length+1)
    sns.despine()
    plt.savefig(out_pdf)
    plt.close()
67 |     
68 | 
69 | ################################################################################
70 | # __main__
71 | ################################################################################
72 | if __name__ == '__main__':
73 |     main()
74 | 


--------------------------------------------------------------------------------
/bam_bedg.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import pysam
 4 | 
 5 | ################################################################################
 6 | # bam_bedg
 7 | #
 8 | # Map a BAM file of aligned reads from a ChIP-seq or ATAC-seq to a BEDGRAPH
 9 | # file, counting only the events relevant to that experiment.
10 | ################################################################################
11 | 
12 | 
13 | ################################################################################
14 | # main
15 | ################################################################################
def main():
    """Map BAM alignments to weighted event positions and write a BEDGRAPH.

    Each alignment with a reference contributes its multi-map weight at
    reference_start plus half a fragment length. The fragment length is
    abs(tlen) for properly paired reads and the -l value otherwise. The
    accumulated weights are written as single-base BEDGRAPH entries, sorted
    by chromosome name and position.
    """
    usage = 'usage: %prog [options] <bam> <bedg>'
    parser = OptionParser(usage)
    # optparse rejects a type on a store_true option, so -l is a plain int option
    parser.add_option('-l', dest='frag_len', default=200, type='int', help='Fragment length assumed for reads that are not properly paired [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input BAM and output BEDGRAPH files')
    else:
        bam_file = args[0]
        bedg_file = args[1]

    chrom_events = {}

    bam_in = pysam.Samfile(bam_file, 'rb')
    try:
        for align in bam_in:
            # reads without a reference have a negative tid; indexing
            # references with it would silently pick the wrong chromosome
            if align.tid < 0:
                continue

            # get chrom
            chrom = bam_in.references[align.tid]

            # weight multi-mappers
            multi_weight = weight_multi(align)

            # determine fragment length
            if align.is_proper_pair:
                frag_len = abs(align.tlen)
            else:
                frag_len = options.frag_len

            # map to event position; integer division keeps it a valid coordinate
            # NOTE(review): the offset is added to reference_start regardless
            # of strand, and both mates of a pair are counted - confirm intent
            event_pos = align.reference_start + frag_len//2

            # save
            if chrom not in chrom_events:
                chrom_events[chrom] = {}
            chrom_events[chrom][event_pos] = chrom_events[chrom].get(event_pos,0) + multi_weight
    finally:
        bam_in.close()

    # output BEDGRAPH
    with open(bedg_file, 'w') as bedg_out:
        for chrom in sorted(chrom_events):
            events = chrom_events[chrom]
            for event_pos in sorted(events):
                bedg_out.write('%s\t%d\t%d\t%g\n' % (chrom, event_pos, event_pos+1, events[event_pos]))
54 | 
55 | 
def weight_multi(align):
    ''' Weight the alignment by its multimap properties

    Returns 1/NH, where NH is the alignment's 'NH' tag as read through
    align.opt. An alignment without the tag gets weight 1.

    I'm making this a separate function, because I might
    want to use more sophisticated weights later.
    '''
    try:
        nh_tag = align.opt('NH')
    except KeyError:
        # tag missing: treat the read as uniquely mapped
        nh_tag = 1

    multi_weight = 1.0 / nh_tag

    return multi_weight
70 | 
71 | 
72 | 
73 | ################################################################################
74 | # __main__
75 | ################################################################################
76 | if __name__ == '__main__':
77 |     main()
78 | 


--------------------------------------------------------------------------------
/bgo_w5.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import gzip
 4 | 
 5 | import h5py
 6 | import numpy as np
 7 | 
 8 | '''
 9 | bg_w5.py
10 | 
11 | Convert a BedGraph w/ overlapping entries to Wig5.
12 | '''
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Average a BedGraph with overlapping entries into per-chromosome HDF5 arrays.

    Allocates one float16 value array and one uint8 count array per
    chromosome in the genome file, adds each BedGraph entry's value over
    [start, end), divides by the number of covering entries, and writes one
    gzip-compressed float16 dataset per chromosome. Positions not covered by
    any entry end up as 0.
    """
    usage = 'usage: %prog [options] <in_bg_file> <genome_file> <out_h5_file>'
    parser = OptionParser(usage)
    # parser.add_option('-v', dest='verbose',
    #         default=False, action='store_true')
    (options,args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide input BedGraph, genome file, output HDF5.')
    else:
        bg_file = args[0]
        genome_file = args[1]
        hdf5_file = args[2]

    # initialize chromosome arrays
    # NOTE(review): counts are uint8 and sums are float16, so very deep
    # overlap or large sums can wrap/overflow; kept to preserve memory use
    chrm_values = {}
    chrm_counts = {}
    with open(genome_file) as genome_in:
        for line in genome_in:
            a = line.split()
            if len(a) < 2:
                # skip blank or malformed lines
                continue
            chrm = a[0]
            chrm_len = int(a[1])
            chrm_values[chrm] = np.zeros(chrm_len, dtype='float16')
            chrm_counts[chrm] = np.zeros(chrm_len, dtype='uint8')

    # write bedgraph entries
    if bg_file[-3:] == '.gz':
        bg_open = gzip.open(bg_file, 'rt')
    else:
        bg_open = open(bg_file)

    with bg_open:
        for line in bg_open:
            if not line.startswith('#'):
                a = line.split()
                if len(a) >= 4:
                    chrm = a[0]
                    start = int(a[1])
                    end = int(a[2])
                    v = float(a[3])
                    chrm_values[chrm][start:end] += v
                    chrm_counts[chrm][start:end] += 1

    # take mean; uncovered positions are 0/0, which nan_to_num maps to 0,
    # so the expected divide warnings are silenced
    with np.errstate(divide='ignore', invalid='ignore'):
        for chrm in chrm_values:
            chrm_values[chrm] = np.divide(chrm_values[chrm], chrm_counts[chrm])
            chrm_values[chrm] = np.nan_to_num(chrm_values[chrm])

    # write gzipped into HDF5
    with h5py.File(hdf5_file, 'w') as h5_out:
        for chrm in chrm_values:
            h5_out.create_dataset(chrm, data=chrm_values[chrm], dtype='float16',
                compression='gzip', shuffle=True)
71 | 
72 | 
73 | 
74 | ################################################################################
75 | # __main__
76 | ################################################################################
77 | if __name__ == '__main__':
78 |     main()
79 | 


--------------------------------------------------------------------------------
/make_ref_ml.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import subprocess
 4 | 
 5 | '''
 6 | make_ref_ml.py
 7 | 
 8 | Make machine learning friendly genome files, removing unplaced contigs,
 9 | and chrY.
10 | '''
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Write filtered copies of a FASTA file and a genome file.

    Records and lines whose chromosome is rejected by filter_chr are
    dropped. The FASTA copy is named by replacing '.fa' with '.ml.fa' and is
    indexed with `samtools faidx`; the genome copy is named by replacing
    '.genome' with '.ml.genome'.
    """
    usage = 'usage: %prog [options] <fasta_file> <genome_file>'
    parser = OptionParser(usage)
    #parser.add_option()
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide FASTA and genome files')
    else:
        fasta_file = args[0]
        genome_file = args[1]

    fasta_ml_file = fasta_file.replace('.fa', '.ml.fa')
    if fasta_ml_file == fasta_file:
        # no '.fa' in the name: never open the input itself for writing
        fasta_ml_file = fasta_file + '.ml.fa'

    # sequence lines before the first header belong to no record; drop them
    keep_chr = False
    with open(fasta_file) as fasta_in, open(fasta_ml_file, 'w') as fasta_ml_out:
        for line in fasta_in:
            if line[0] == '>':
                header = line[1:]
                keep_chr = filter_chr(header)
            if keep_chr:
                print(line, file=fasta_ml_out, end='')

    # argument list avoids shell interpretation of the file name
    subprocess.call(['samtools', 'faidx', fasta_ml_file])

    genome_ml_file = genome_file.replace('.genome', '.ml.genome')
    if genome_ml_file == genome_file:
        # same guard as above for names without '.genome'
        genome_ml_file = genome_file + '.ml.genome'

    with open(genome_file) as genome_in, open(genome_ml_file, 'w') as genome_ml_out:
        for line in genome_in:
            a = line.split()
            if not a:
                # skip blank lines
                continue
            if filter_chr(a[0]):
                print(line, file=genome_ml_out, end='')
53 | 
54 | 
def filter_chr(header):
    """Return True if the sequence named by header should be kept.

    A header is rejected when it contains any of the excluded substrings,
    or when, stripped of trailing whitespace, it is exactly chrM, chrMT or
    chrY.
    """
    excluded_substrings = ('chrUn', 'random', 'hap', 'alt',
                           'KI270', 'GL000', 'JH584', 'GL456')
    for substring in excluded_substrings:
        if substring in header:
            return False

    return header.rstrip() not in ('chrM', 'chrMT', 'chrY')
76 | 
77 | ################################################################################
78 | # __main__
79 | ################################################################################
80 | if __name__ == '__main__':
81 |     main()
82 | 


--------------------------------------------------------------------------------
/vcf_splice.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import os
 4 | import pdb
 5 | 
 6 | import pybedtools
 7 | 
 8 | '''
 9 | vcf_splice.py
10 | 
11 | Add splice site distance INFO column to a VCF file.
12 | '''
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Annotate VCF variants with their distance to the closest splice site.

    Copies the input header, adding an INFO meta line for the new SS key
    just before the #CHROM line. Each variant's closest feature in the
    splice GFF is found with pybedtools' closest (d=True, t='first'), and
    the reported distance is appended to the INFO column as SS=<distance>.
    Only the first eight VCF columns are written. With -t, variants whose
    distance is below the threshold are dropped.
    """
    usage = 'usage: %prog [options] <in_vcf_file> <out_vcf_file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='splice_gff_file', default='%s/genes/gencode28/gencode.v28.basic.annotation.splice.gff' % os.environ['HG38'])
    # parser.add_option('-g', dest='splice_gff_file', default='%s/genes/gencode28/gencode_basic_splice.gff' % os.environ['HG19'])
    parser.add_option('-t', dest='filter_t',
        default=None, type='int',
        help='Filter out variants less than the given distance threshold [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input and output VCF files')
    else:
        in_vcf_file = args[0]
        out_vcf_file = args[1]

    with open(out_vcf_file, 'w') as out_vcf_open:
        # print header
        with open(in_vcf_file) as in_vcf_open:
            for line in in_vcf_open:
                if not line.startswith('#'):
                    break
                if line.startswith('#CHROM'):
                    # add new INFO description first; SS is written to the
                    # INFO column, so it is declared as INFO, not FORMAT
                    print('##INFO=<ID=SS,Number=1,Type=Integer,Description="Splice site distance">', file=out_vcf_open)
                print(line, end='', file=out_vcf_open)

        # intersect
        in_vcf_bedtool = pybedtools.BedTool(in_vcf_file)
        splice_bedtool = pybedtools.BedTool(options.splice_gff_file)

        for closest_a in in_vcf_bedtool.closest(splice_bedtool, d=True, t='first'):
            # first 8 fields are the VCF columns through INFO; the last
            # field of the closest output is the distance
            a = closest_a[:8]
            splice_distance = int(closest_a[-1])
            if a[-1] == '.':
                # empty INFO: replace the placeholder
                a[-1] = 'SS=%d' % splice_distance
            else:
                a[-1] += ';SS=%d' % splice_distance

            if options.filter_t is None or splice_distance >= options.filter_t:
                print('\t'.join(a), file=out_vcf_open)
65 | 
66 | 
67 | ################################################################################
68 | # __main__
69 | ################################################################################
70 | if __name__ == '__main__':
71 |     main()
72 | 


--------------------------------------------------------------------------------
/seq_logo.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import os, re, subprocess, tempfile
 4 | 
 5 | ################################################################################
 6 | # name
 7 | #
 8 | #
 9 | ################################################################################
10 | 
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Render a fixed example sequence and heights to test_logo.eps."""
    parser = OptionParser('usage: %prog [options] arg')
    #parser.add_option()
    parser.parse_args()

    example_seq = 'ACGTACGT'
    example_heights = [1, 1, 2, 2, 1, 1, 1, 1]
    seq_logo(example_seq, example_heights, 'test_logo.eps')
25 |     
26 | 
def seq_logo(seq, heights, out_eps, weblogo_args=''):
    """Draw a sequence logo for seq with caller-supplied letter heights.

    Runs weblogo on a single-sequence FASTA file, then copies its EPS output
    to out_eps, overwriting the height field of the letter that matches seq
    at each position with the corresponding entry of heights.

    seq          -- the sequence to draw
    heights      -- one number per position of seq
    out_eps      -- path of the EPS file to write
    weblogo_args -- extra command-line arguments passed to weblogo
    """
    fasta_fd, fasta_file = tempfile.mkstemp()
    eps_fd, eps_file = tempfile.mkstemp()
    # weblogo writes the EPS through shell redirection, so only the path is needed
    os.close(eps_fd)

    try:
        # print the sequence to a temp fasta file
        with os.fdopen(fasta_fd, 'w') as fasta_out:
            fasta_out.write('>seq\n%s\n' % seq)

        # print figure to a temp eps file
        weblogo_cmd = 'weblogo --errorbars NO --show-xaxis NO --show-yaxis NO --fineprint "" -c classic -n %d %s < %s > %s' % (len(seq), weblogo_args, fasta_file, eps_file)
        subprocess.call(weblogo_cmd, shell=True)

        # copy eps file over and write in my own heights
        start_stack_re = re.compile(r'^\(\d*\) StartStack')
        with open(eps_file) as weblogo_eps_in, open(out_eps, 'w') as out_eps_open:
            si = 0
            line = weblogo_eps_in.readline()
            while line:
                # nt column begins
                if start_stack_re.search(line):
                    out_eps_open.write(line)

                    # loop over 4 nt's: the four lines after a StartStack
                    # line are read as one letter each
                    for _ in range(4):
                        line = weblogo_eps_in.readline()
                        a = line.split()

                        # third field minus its first and last characters;
                        # presumably a parenthesised letter such as (A)
                        nt = a[2][1:-1]
                        if nt != seq[si]:
                            out_eps_open.write(line)
                        else:
                            # change the nt of seq
                            a[1] = '%.6f' % heights[si]
                            out_eps_open.write(' %s\n' % ' '.join(a))

                    # move to next nucleotide
                    si += 1
                else:
                    out_eps_open.write(line)

                # advance to next line
                line = weblogo_eps_in.readline()
    finally:
        # clean up the temp files even if weblogo or the copy failed
        os.remove(fasta_file)
        os.remove(eps_file)
78 | 
79 | 
80 | ################################################################################
81 | # __main__
82 | ################################################################################
83 | if __name__ == '__main__':
84 |     main()
85 | 


--------------------------------------------------------------------------------
/rmdup_iclip.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import gzip, pdb
 4 | import pysam
 5 | 
 6 | ################################################################################
 7 | # rmdup_iclip.py
 8 | #
 9 | # Remove duplicates in Tollervey and Zamack et al's CLIP-Seq data, where the
10 | # reads have barcodes at varying positions.
11 | ################################################################################
12 | 
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Remove duplicate alignments that share position, strand and barcode.

    The barcode of each read is assembled from the characters of its FASTQ
    sequence at the given comma-separated indexes. Alignments are then
    streamed from the BAM file, and only the first alignment for each
    (tid, pos, is_reverse, barcode) key is written to <bam minus its last
    four characters>_rmdup.bam.
    """
    usage = 'usage: %prog [options] <barcode_indexes> <bam> <fastq1> ...'
    parser = OptionParser(usage)
    #parser.add_option()
    (options,args) = parser.parse_args()

    if len(args) < 3:
        parser.error('Must provide barcode indexes, BAM file, and FASTQ files')
    else:
        barcode_indexes = [int(bi) for bi in args[0].split(',')]
        bam_file = args[1]
        fastq_files = args[2:]

    # map headers to barcodes
    header_barcodes = {}
    for fastq_file in fastq_files:
        # NOTE(review): gzip.open's default mode yields bytes on Python 3,
        # which the string handling below does not expect
        if fastq_file[-2:] == 'gz':
            fastq_in = gzip.open(fastq_file)
        else:
            fastq_in = open(fastq_file)

        try:
            header = fastq_in.readline()
            while header:
                # tolerate blank lines (e.g. at the end of the file)
                if not header.strip():
                    header = fastq_in.readline()
                    continue

                seq = fastq_in.readline()
                mid = fastq_in.readline()
                qual = fastq_in.readline()

                # read name: header without its first character, up to whitespace
                align_header = header[1:].split()[0]
                barcode = ''.join([seq[bi] for bi in barcode_indexes])
                header_barcodes[align_header] = barcode

                header = fastq_in.readline()
        finally:
            fastq_in.close()

    # open BAM
    bam_in = pysam.Samfile(bam_file, 'rb')
    bam_out = pysam.Samfile(bam_file[:-4] + '_rmdup.bam', 'wb', template=bam_in)

    alignment_hash = set()

    try:
        for aligned_read in bam_in:
            # hash by chrom, start, strand, barcode
            align_key = (aligned_read.tid, aligned_read.pos, aligned_read.is_reverse, header_barcodes[aligned_read.qname])

            # if alignment not yet printed
            if align_key not in alignment_hash:
                bam_out.write(aligned_read)
                alignment_hash.add(align_key)
    finally:
        bam_in.close()
        bam_out.close()
67 | 
68 | ################################################################################
69 | # __main__
70 | ################################################################################
71 | if __name__ == '__main__':
72 |     main()
73 |     #pdb.runcall(main)
74 | 


--------------------------------------------------------------------------------
/fpkm_fpkm.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import os, math, sys
 4 | from scipy.stats import spearmanr
 5 | import gff, ggplot, cufflinks
 6 | 
 7 | ################################################################################
 8 | # fpkm_fpkm.py
 9 | #
10 | # Compare two cufflinks runs.
11 | ################################################################################
12 | 
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Compare two cufflinks FPKM tracking files sample by sample.

    For every experiment in the first file, collects log2(pseudocount+FPKM)
    pairs for genes where neither value is NaN (restricted to the genes of
    the -g GTF when given), draws a scatter plot via fpkm_fpkm_scatter.r,
    and writes the Spearman correlation to <out_dir>/<sample>_report.txt.
    """
    usage = 'usage: %prog [options] <fpkm1_file> <fpkm2_file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gtf')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-p', dest='pseudocount', default=0.125, type='float')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide two FPKM tracking files')
    else:
        fpkm1_file = args[0]
        fpkm2_file = args[1]

    cuff1 = cufflinks.fpkm_tracking(fpkm1_file)
    cuff2 = cufflinks.fpkm_tracking(fpkm2_file)

    gtf_genes = set()
    if options.gtf:
        gtf_genes = gff.gtf_gene_set(options.gtf)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    for sample in cuff1.experiments:
        # scatter plot fpkm
        df = {'fpkm1':[], 'fpkm2':[]}
        for i in range(len(cuff1.genes)):
            if len(gtf_genes) == 0 or cuff1.genes[i] in gtf_genes:
                # NOTE(review): the same index is used for both files, which
                # assumes they list genes in the same order - confirm
                fpkm1 = cuff1.gene_expr_exp(i, sample)
                fpkm2 = cuff2.gene_expr_exp(i, sample)

                if not math.isnan(fpkm1) and not math.isnan(fpkm2):
                    df['fpkm1'].append(math.log(options.pseudocount+fpkm1,2))
                    df['fpkm2'].append(math.log(options.pseudocount+fpkm2,2))

        r_script = '%s/fpkm_fpkm_scatter.r' % os.environ['RDIR']
        out_pdf = '%s/%s_scatter.pdf' % (options.out_dir, sample)
        ggplot.plot(r_script, df, [out_pdf])

        # compute correlation
        cor, p = spearmanr(df['fpkm1'], df['fpkm2'])

        # write() works on Python 2 and 3, unlike `print >>`
        with open('%s/%s_report.txt' % (options.out_dir,sample), 'w') as report_out:
            report_out.write('Spearman correlation: %f (%e)\n' % (cor, p))
63 | 
64 | 
65 | ################################################################################
66 | # __main__
67 | ################################################################################
68 | if __name__ == '__main__':
69 |     main()
70 | 


--------------------------------------------------------------------------------
/gsea.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import glob, os, subprocess, sys
 4 | 
 5 | ################################################################################
 6 | # gsea.py
 7 | #
 8 | # Helper script to run GSEA from CuffDiff output.
 9 | ################################################################################
10 | 
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Run GSEA Preranked on every rank file derived from a CuffDiff .diff.

    Locates the GSEA jar, chip file and gene sets under $GSEA, runs
    gsea_rnk.py to write .rnk files into the output directory, then runs
    GseaPreranked once per .rnk file found there.
    """
    usage = 'usage: %prog [options] <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='scheme', default='weighted', help='weighted or classic [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-s', dest='gene_set', default='go', help='Gene sets [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide .diff')
    else:
        diff_file = args[0]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # choose chip
    gsea_dir = os.environ['GSEA']
    gsea_jars = glob.glob('%s/gsea*.jar' % gsea_dir)
    if not gsea_jars:
        # report a missing jar instead of an IndexError on [0]
        parser.error('No gsea*.jar found in %s' % gsea_dir)
    gsea_jar = gsea_jars[0]
    chip_file = '%s/GENE_SYMBOL.chip' % gsea_dir

    # choose sets
    if options.gene_set.lower() in ['c5', 'go']:
        sets_file = '%s/sets/c5.all.v5.0.symbols.gmt' % gsea_dir
    else:
        sys.stderr.write('Unrecognized gene set: %s\n' % options.gene_set)
        sys.exit(1)

    # make rank files
    rank_cmd = 'gsea_rnk.py -o %s %s' % (options.out_dir, diff_file)
    subprocess.call(rank_cmd, shell=True)

    for rank_file in glob.glob('%s/*.rnk' % options.out_dir):
        # file name without directory and without the '.rnk' suffix
        rank_name = rank_file.split('/')[-1][:-4]

        # run GSEA
        gsea_cmd = 'java -cp %s -Xmx4000m xtools.gsea.GseaPreranked -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -rnk %s -scoring_scheme %s -rpt_label %s -chip %s -include_only_symbols true -make_sets true -plot_top_x 50 -rnd_seed timestamp -set_max 1000 -set_min 10 -zip_report false -out %s -gui false' % (gsea_jar, sets_file, rank_file, options.scheme, rank_name, chip_file, options.out_dir)
        subprocess.call(gsea_cmd, shell=True)

        # consider making a new excel file from the gsea_report_for_na_neg*.xls file
        #  where I strip out the redundant col 1 and stupid col 2.
55 | 
56 | 
57 | ################################################################################
58 | # __main__
59 | ################################################################################
60 | if __name__ == '__main__':
61 |     main()
62 | 


--------------------------------------------------------------------------------
/gtf_multimaps.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import gff, util
 4 | import os, subprocess, sys
 5 | 
 6 | ################################################################################
 7 | # gtf_multimaps.py
 8 | #
 9 | # Print a summary table about multimapping reads for the transcripts in a gtf
10 | # file.
11 | ################################################################################
12 | 
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Print per-transcript multimapping statistics for reads overlapping a GTF.

    Unless -i is given, intersectBed is run to write the read/feature
    overlaps to an intermediate file. For every transcript id in that file,
    one line is printed with: the id, the number of overlapping read
    records, the mean number of distinct (chrom, start) alignments per
    read, and the fraction of reads with more than one such alignment.
    """
    usage = 'usage: %prog [options] <gtf file> <bam file>'
    parser = OptionParser(usage)
    parser.add_option('-i', dest='intersect_done', default=False, action='store_true', help='intersectBed is already done [Default: %default]')
    parser.add_option('-o', dest='output_prefix', help='Prefix for the intersectBed intermediate file [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and bam file')
    else:
        gtf_file = args[0]
        bam_file = args[1]

    if options.output_prefix:
        ib_file = '%s_reads_genes.gff' % options.output_prefix
    else:
        ib_file = 'reads_genes.gff'

    if not options.intersect_done:
        # overlap genes w/ aligned reads; call() waits for the command to finish
        subprocess.call('intersectBed -s -wo -abam -bed -a %s -b %s > %s' % (bam_file,gtf_file,ib_file), shell=True)

    # one pass over the overlaps:
    #  read_aligns - distinct (chrom, start) alignments per read
    #  gene_reads  - read ids per transcript id
    read_aligns = {}
    gene_reads = {}
    with open(ib_file) as ib_in:
        for line in ib_in:
            a = line.split('\t')
            chrom = a[0]
            start = int(a[1])
            read_id = a[3]
            read_aligns.setdefault(read_id,set()).add((chrom,start))

            gene_id = gff.gtf_kv(a[14])['transcript_id']
            gene_reads.setdefault(gene_id,[]).append(read_id)

    # print gene stats
    for gene_id in gene_reads:
        align_counts = [len(read_aligns[read_id]) for read_id in gene_reads[gene_id]]
        multi_count = float(len([ac for ac in align_counts if ac > 1]))
        cols = (gene_id, len(align_counts), util.mean(align_counts), multi_count/float(len(align_counts)))
        # single-argument form prints identically on Python 2 and 3
        print('%-15s %7d %7.2f %7.2f' % cols)
64 | 
65 | 
66 | ################################################################################
67 | # __main__
68 | ################################################################################
69 | if __name__ == '__main__':
70 |     main()
71 | 


--------------------------------------------------------------------------------
/meme2possum.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | import numpy as np
 5 | 
 6 | ################################################################################
 7 | # meme2possum.py
 8 | #
 9 | # Convert a file of MEME PWMs to Possum's input format.
10 | ################################################################################
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Convert a file of MEME PWMs to Possum's input format.

    Command line: <meme_file> <possum_file>.

    Every 'MOTIF <id>' line starts a new motif.  The numeric lines that follow
    (the 'letter-probability matrix' line itself is skipped) are collected as
    matrix rows until a blank line, the next 'MOTIF' line, or end of file.
    Each motif is written as a 'BEGIN FLOAT' ... 'END' record inside a single
    'BEGIN GROUP' ... 'END' block.
    """
    usage = 'usage: %prog [options] <meme_file> <possum_file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input MEME file and output Possum file')
    meme_file, possum_file = args

    #######################################################
    # input MEME motifs
    #######################################################
    motif_pwms = {}
    motif_id = None
    pwm_cols = []
    in_motif = False
    with open(meme_file) as meme_in:
        for line in meme_in:
            if line.startswith('MOTIF'):
                if in_motif:
                    # the previous motif was not terminated by a blank line;
                    # save it before starting the next one
                    motif_pwms[motif_id] = np.array(pwm_cols)
                motif_id = line.split()[1]
                in_motif = True
                pwm_cols = []
            elif in_motif:
                if line.startswith('letter-probability matrix'):
                    continue
                if line.strip() == '':
                    in_motif = False
                    motif_pwms[motif_id] = np.array(pwm_cols)
                else:
                    pwm_cols.append([float(p) for p in line.split()])

    # the last motif may run to the end of the file
    if in_motif:
        motif_pwms[motif_id] = np.array(pwm_cols)

    #######################################################
    # output Possum
    #######################################################
    # file.write works identically under Python 2 and 3
    with open(possum_file, 'w') as possum_out:
        possum_out.write('BEGIN GROUP\n')

        for motif_id in motif_pwms:
            mpwm = motif_pwms[motif_id]
            motif_len = mpwm.shape[0]

            possum_out.write('BEGIN FLOAT\n')
            possum_out.write('ID %s\n' % motif_id)
            possum_out.write('AP DNA\n')
            possum_out.write('LE %d\n' % motif_len)
            for ci in range(motif_len):
                possum_out.write('MA %s\n' % ' '.join([str(n) for n in mpwm[ci]]))
            possum_out.write('END\n')
            possum_out.write('\n')

        possum_out.write('END\n')
71 | 
72 | 
73 | ################################################################################
74 | # __main__
75 | ################################################################################
76 | if __name__ == '__main__':
77 |     main()
78 | 


--------------------------------------------------------------------------------
/cutFasta.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import gzip, sys
 4 | 
 5 | ############################################################
 6 | # cutFasta
 7 | #
 8 | # Extract a sequence from a fasta file, using 1-based
 9 | # indexing
10 | ############################################################
11 | 
12 | 
13 | ############################################################
14 | # main
15 | ############################################################
def main():
    """Parse the command line and print the requested FASTA slice.

    Options -x/-y give the cut start/end, -s the sequence header to look for,
    and -c switches header matching to substring containment.  Exactly one
    FASTA file must be given; the entry returned by cf() is printed to stdout.
    """
    usage = 'usage: %prog [options] <fasta_file>'
    parser = OptionParser(usage)
    parser.add_option('-x', dest='start', type='int', help='Cut start')
    parser.add_option('-y', dest='end', type='int', help='Cut end')
    parser.add_option('-s', dest='header', help='Sequence header')
    parser.add_option('-c', dest='contain', action='store_true', default=False, help='Grab all sequences that contain the header pattern')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Please provide a single fasta file')

    # print(x) with a single argument behaves the same under Python 2 and 3
    print(cf(options.start, options.end, options.header, options.contain, args[0]))
28 | 
29 | ############################################################
30 | # cf
31 | #
32 | # Pull out the sequence from start to end in the entry
33 | # header in the file fasta_file
34 | ############################################################
def cf(start, end, header, contain, fasta_file):
    """Extract (part of) one entry from a FASTA file.

    Args:
        start: 1-based first position to keep, or None/0 for the beginning.
        end: 1-based last position to keep (inclusive), or None/0 for the end.
        header: header text (without '>') of the wanted entry; if falsy, the
            first entry in the file is used.
        contain: if true, an entry matches when its header contains `header`
            rather than equalling it.
        fasta_file: path to the FASTA file; a name ending in '.gz' is read
            through gzip.

    Returns:
        A FASTA-formatted string: a '>' header line annotated with the cut
        coordinates, a newline, and the extracted sequence.  If no entry
        matches, the sequence part is empty.
    """
    if fasta_file[-3:] == '.gz':
        # Python 3's gzip.open defaults to bytes; ask for text there
        mode = 'rt' if sys.version_info[0] >= 3 else 'rb'
        ff = gzip.open(fasta_file, mode)
    else:
        ff = open(fasta_file)

    # collect sequence pieces up to end
    seq_parts = []
    seq_len = 0
    get_seq = False
    try:
        for line in ff:
            if line[0] == '>':
                if get_seq:
                    # already found, stop
                    break

                # check header
                h = line[1:].rstrip()
                if not header:
                    get_seq = True
                    header = h
                elif h == header or (contain and h.find(header) != -1):
                    get_seq = True

            elif get_seq:
                piece = line.rstrip()
                seq_parts.append(piece)
                seq_len += len(piece)

            # if past end, stop
            if end and seq_len > end:
                break
    finally:
        ff.close()

    seq = ''.join(seq_parts)

    if start and end:
        return '>%s_(%d-%d)\n%s' % (header, start, end, seq[start-1:end])
    elif start:
        return '>%s_(%d-)\n%s' % (header, start, seq[start-1:])
    elif end:
        # label with `end`; `start` is None/0 in this branch
        return '>%s_(-%d)\n%s' % (header, end, seq[:end])
    else:
        return '>%s\n%s' % (header, seq)
77 | 
78 | ############################################################
79 | # __main__
80 | ############################################################
81 | if __name__ == '__main__':
82 |     main()
83 | 


--------------------------------------------------------------------------------
/peaks3_venn.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import math, os, stats, subprocess, tempfile
 4 | 
 5 | import matplotlib.pyplot as plt
 6 | from matplotlib_venn import venn3
 7 | import seaborn as sns
 8 | 
 9 | ################################################################################
10 | # peaks3_venn.py
11 | #
12 | # Make a venn diagram comparing three sets of peak calls.
13 | ################################################################################
14 | 
15 | 
16 | ################################################################################
17 | # main
18 | ################################################################################
def main():
    """Draw a three-way Venn diagram of peak-call overlap.

    Command line: <peaks1_bed> <peaks2_bed> <peaks3_bed> <out_pdf>.

    The three BED files are concatenated, sorted, and merged with bedtools
    into a temporary file.  Each merged peak (identified by its line index in
    the `bedtools intersect -c` output) is assigned to every input set that
    overlaps it at least once, and the three index sets are plotted with
    venn3 and saved to <out_pdf>.
    """
    usage = 'usage: %prog [options] <peaks1_bed> <peaks2_bed> <peaks3_bed> <out_pdf>'
    parser = OptionParser(usage)
    parser.add_option('--l1', dest='label1', default='peaks1', help='Label for peak set 1')
    parser.add_option('--l2', dest='label2', default='peaks2', help='Label for peak set 2')
    parser.add_option('--l3', dest='label3', default='peaks3', help='Label for peak set 3')
    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error('Must provide three peaks BED files and output PDF')
    peak_beds = args[:3]
    out_pdf = args[3]

    merge_fd, merge_bed = tempfile.mkstemp()
    # only the path is needed; the shell redirect below writes the file
    os.close(merge_fd)

    try:
        # merge peaks
        cmd = 'cat %s %s %s | awk \'{OFS="\t"} {print $1, $2, $3}\' | bedtools sort -i stdin | bedtools merge -i stdin > %s' % (peak_beds[0], peak_beds[1], peak_beds[2], merge_bed)
        subprocess.call(cmd, shell=True)

        # annotate merged peaks with each individual set
        peak_overlaps = [set(), set(), set()]

        for bi in range(3):
            cmd = 'bedtools intersect -c -a %s -b %s' % (merge_bed, peak_beds[bi])
            p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
            for pi, line in enumerate(p.stdout):
                a = line.split()
                # last column is the overlap count added by -c
                if int(a[-1]) > 0:
                    peak_overlaps[bi].add(pi)
            p.stdout.close()
            # reap the child so it does not linger as a zombie
            p.wait()

        # plot
        plt.figure()
        venn3(peak_overlaps, set_labels=[options.label1, options.label2, options.label3])
        plt.savefig(out_pdf)
        plt.close()

    finally:
        # clean up the temporary merge file even if a step above failed
        os.remove(merge_bed)
62 | 
63 | 
def count_peaks(bed_file):
    """Return the number of lines in bed_file (one peak per line)."""
    # the context manager closes the handle deterministically
    with open(bed_file) as bed_in:
        return sum(1 for _ in bed_in)
69 | 
70 | ################################################################################
71 | # __main__
72 | ################################################################################
73 | if __name__ == '__main__':
74 |     main()
75 | 


--------------------------------------------------------------------------------
/trf_mask.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import re
 4 | 
 5 | ################################################################################
 6 | # trf_mask.py
 7 | #
 8 | # Mask the tandem repeats found by Tandem Repeat Finder.
 9 | ################################################################################
10 | 
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Mask tandem repeats in a FASTA file and print the result.

    Command line: <seq file> <trf file 1> ... <trf file N>.

    Repeats from every TRF file are gathered by get_repeats() into one dict
    keyed by sequence header.  Each FASTA entry with a non-empty header is
    then printed to stdout with its repeat intervals replaced by mask_seq().
    """
    usage = 'usage: %prog [options] <seq file> <trf file 1> ... <trf file N>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) < 2:
        parser.error('Please provide sequence file and TRF output file')
    seq_file = args[0]
    trf_files = args[1:]

    repeats = {}
    for trf_file in trf_files:
        get_repeats(trf_file, repeats)

    header = ''
    # pieces of the current entry; joined once per entry to avoid quadratic
    # string concatenation. Text before the first header is discarded.
    seq_parts = []
    with open(seq_file) as seq_in:
        for line in seq_in:
            if line[0] == '>':
                if header:
                    mseq = mask_seq(''.join(seq_parts), repeats.get(header, []))
                    # single-argument print() is identical in Python 2 and 3
                    print('>%s\n%s' % (header, mseq))

                header = line[1:].rstrip()
                seq_parts = []

            else:
                seq_parts.append(line.rstrip())

    # flush the final entry
    if header:
        mseq = mask_seq(''.join(seq_parts), repeats.get(header, []))
        print('>%s\n%s' % (header, mseq))
47 | 
48 | 
49 | ################################################################################
50 | # get_repeats
51 | #
52 | # Save the repeats in a dict keyed by the header
53 | ################################################################################
def get_repeats(trf_file, repeats):
    """Add the repeats listed in a TRF output file to `repeats`.

    Args:
        trf_file: path to a Tandem Repeat Finder text report.
        repeats: dict updated in place; maps the text following 'Sequence: '
            to a list of (start, end) tuples, where start is the reported
            first index minus one and end is the reported last index (i.e.
            a 0-based, end-exclusive interval).
    """
    # raw string: the pattern is unchanged, but no longer relies on Python
    # passing unknown escapes such as '\d' through untouched
    indices_re = re.compile(r'Indices: (\d+)\-\-(\d+)\s*Score')
    with open(trf_file) as trf_in:
        for line in trf_in:
            if line.startswith('Sequence:'):
                # drop the 'Sequence: ' prefix (10 characters)
                header = line[10:].rstrip()
            else:
                m = indices_re.search(line)
                if m:
                    (start, end) = m.group(1, 2)
                    repeats.setdefault(header, []).append((int(start)-1, int(end)))
64 | 
65 | 
66 | ################################################################################
67 | # mask_seq
68 | #
69 | # Mask the sequence using the list of repeats
70 | ################################################################################
def mask_seq(seq, seq_repeats):
    """Return `seq` with every repeat interval overwritten by 'N'.

    Each item of `seq_repeats` is indexed as (start, end); positions
    start .. end-1 of the sequence are replaced.
    """
    masked = list(seq)
    for interval in seq_repeats:
        pos = interval[0]
        stop = interval[1]
        while pos < stop:
            masked[pos] = 'N'
            pos += 1
    return ''.join(masked)
77 | 
78 | 
79 | ################################################################################
80 | # __main__
81 | ################################################################################
82 | if __name__ == '__main__':
83 |     main()
84 | 


--------------------------------------------------------------------------------
/vcf_ld.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import pdb
 4 | import os
 5 | 
 6 | import pandas as pd
 7 | 
 8 | from basenji.emerald import EmeraldVCF
 9 | 
10 | '''
11 | vcf_ld.py
12 | 
13 | Transform an input VCF to add all linked variants above some threshold.
14 | Makes use of Emerald for LD queries.
15 | '''
16 | 
17 | ################################################################################
18 | # main
19 | ################################################################################
def main():
    """Expand a VCF with the variants linked to each of its SNPs.

    Command line: <in_vcf> <out_vcf>.

    Header lines are copied through.  For every record, the reference panel
    is queried for SNPs in LD above the -l threshold; the matching reference
    panel records are written to <out_vcf> with the query rsid and LD value
    appended.  SNPs missing from the panel produce a warning on stdout.
    """
    usage = 'usage: %prog [options] <in_vcf> <out_vcf>'
    parser = OptionParser(usage)
    parser.add_option('-l','--ld', dest='ld_t',
        default=0.8, type='float',
        help='LD threshold to include SNP [Default: %default]')

    # only build the default from $HG19 when it is set, so that passing -r
    # works without the environment variable
    hg19_dir = os.environ.get('HG19')
    if hg19_dir:
        default_stem = '%s/popgen/1000G/phase3/eur/1000G.EUR.QC' % hg19_dir
    else:
        default_stem = None
    parser.add_option('-r', dest='refpanel_stem',
        default=default_stem,
        help='Reference panel chromosome VCF stem [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input and output VCF files')
    in_vcf_file = args[0]
    out_vcf_file = args[1]

    if options.refpanel_stem is None:
        parser.error('Must provide -r or set the HG19 environment variable')

    # initialize reference panel
    refp_em = EmeraldVCF(options.refpanel_stem)

    with open(in_vcf_file) as in_vcf_open, open(out_vcf_file, 'w') as out_vcf_open:
        for line in in_vcf_open:
            if line[0] == '#':
                # print header
                print(line, end='', file=out_vcf_open)
                continue

            a = line.split()
            chrm = a[0]
            pos = int(a[1])
            rsid = a[2]

            # query LD SNPs
            snp_df = refp_em.query_ld(rsid, chrm, pos,
                                      options.ld_t, return_pos=True)

            if snp_df.shape[0] == 0:
                print('WARNING: %s not found in reference panel.' % rsid)
                continue

            # set SNP id index
            snp_df.set_index('snp', inplace=True)

            # fetch VCF lines spanning the first to last returned position
            # NOTE(review): assumes query_ld returns rows ordered by
            # position - confirm against EmeraldVCF
            pos_start = snp_df.pos.iloc[0]
            pos_end = snp_df.pos.iloc[-1]
            for snp_rec in refp_em.fetch(chrm, pos_start-1, pos_end):
                if snp_rec.id in snp_df.index:
                    snp_str = snp_rec.__str__().rstrip()
                    # NOTE(review): the appended text starts with '=', so it
                    # presumably completes a trailing key in the reference
                    # record's last column - confirm against the panel format
                    snp_str += '=%s;LD=%.2f' % (rsid, snp_df.loc[snp_rec.id].r)
                    print(snp_str, file=out_vcf_open)
80 | 
81 | 
82 | ################################################################################
83 | # __main__
84 | ################################################################################
85 | if __name__ == '__main__':
86 |     main()
87 | 


--------------------------------------------------------------------------------
/quantile_normalization.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import copy
 4 | 
 5 | import numpy as np
 6 | 
 7 | '''
 8 | quantile_normalization.py
 9 | '''
10 | 
11 | ################################################################################
12 | # main
13 | ################################################################################
def main():
    ''' Demonstrate quantile normalization on random data.

    Builds a 10x5 standard-normal matrix whose column ti is scaled by ti+1,
    prints it, prints its quantile-normalized version, and then prints the
    column means of both. '''
    parser = OptionParser('usage: %prog [options] arg')
    (options, args) = parser.parse_args()

    raw = np.random.randn(10, 5)
    # scale column ti by (ti+1) via broadcasting
    raw *= np.arange(1, raw.shape[1] + 1)

    print(raw, end='\n\n')

    normalized = quantile_normalize(raw)
    print(normalized, end='\n\n')

    for matrix in (raw, normalized):
        print(matrix.mean(axis=0))
31 | 
32 | 
def quantile_normalize_expr(gene_expr, quantile_stat='median'):
    ''' Quantile normalize across targets.

    quantile_normalize() labels the variables more generally, but should
    return the same answer.

    Args:
        gene_expr: 2D array indexed [gene, target]; it is not modified.
        quantile_stat: 'median' or 'mean', the statistic taken across
            targets at each sorted rank.

    Returns:
        A copy of gene_expr in which every value is replaced by the
        statistic of its within-column rank.

    Raises:
        ValueError: if quantile_stat is not 'median' or 'mean'. '''

    # validate first; raising replaces a print to sys.stderr that could not
    # work because this module does not import sys
    if quantile_stat == 'median':
        stat_fn = np.median
    elif quantile_stat == 'mean':
        stat_fn = np.mean
    else:
        raise ValueError('Unrecognized quantile statistic %s' % quantile_stat)

    # make a copy and sort values within each column
    gene_expr_qn = copy.copy(gene_expr)
    gene_expr_qn.sort(axis=0)

    # compute the mean/median in each row of the sorted matrix
    sorted_index_stats = stat_fn(gene_expr_qn, axis=1)

    # set new values: the gene at rank gi in a column gets statistic gi
    for ti in range(gene_expr.shape[1]):
        sorted_indexes = np.argsort(gene_expr[:,ti])
        gene_expr_qn[sorted_indexes,ti] = sorted_index_stats

    return gene_expr_qn
61 | 
def quantile_normalize(X, quantile_stat='median'):
    ''' Quantile normalize features across samples.

    Args:
        X: 2D array whose columns are normalized against each other; it is
            not modified.
        quantile_stat: 'median' or 'mean', the statistic taken across
            columns at each sorted rank.

    Returns:
        A copy of X in which every value is replaced by the statistic of its
        within-column rank.

    Raises:
        ValueError: if quantile_stat is not 'median' or 'mean'. '''

    # validate first; raising replaces a print to sys.stderr that could not
    # work because this module does not import sys
    if quantile_stat == 'median':
        stat_fn = np.median
    elif quantile_stat == 'mean':
        stat_fn = np.mean
    else:
        raise ValueError('Unrecognized quantile statistic %s' % quantile_stat)

    # make a copy and sort values within each column
    Xq = copy.copy(X)
    Xq.sort(axis=0)

    # compute the mean/median in each row of the sorted matrix
    sorted_index_stats = stat_fn(Xq, axis=1)

    # set new values: the row at rank si in a column gets statistic si
    for fi in range(X.shape[1]):
        sorted_indexes = np.argsort(X[:,fi])
        Xq[sorted_indexes,fi] = sorted_index_stats

    return Xq
88 | 
89 | 
90 | ################################################################################
91 | # __main__
92 | ################################################################################
93 | if __name__ == '__main__':
94 |     main()
95 | 


--------------------------------------------------------------------------------
/ggplot.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import os, subprocess, sys, tempfile
 4 | 
 5 | ################################################################################
 6 | # ggplot.py
 7 | #
 8 | # Make a plot given an R script, dict data frame, and arguments.
 9 | ################################################################################
10 | 
11 | 
12 | ################################################################################
13 | # plot
14 | ################################################################################
def plot(r_script, df_dict, args, df_file=None, print_cmd=False, sep=' '):
    """Write a dict as a data frame file and run an R script on it.

    Args:
        r_script: path of the R script, fed to R on stdin.
        df_dict: maps column name to a list of values; all lists must have
            the same length.  Columns are written in sorted key order.
        args: extra values passed to R after the data frame file name.
        df_file: where to write the data frame.  If None, a temporary file
            is used and removed afterwards.
        print_cmd: if true, echo the R command line to stderr.
        sep: separator used for the data frame columns and for joining args.

    Exits with status 1 (after a message on stderr) if the lists differ in
    length; this is checked before any file is written.
    """
    headers = sorted(df_dict.keys())

    # check list lengths
    length = len(df_dict[headers[0]])
    for head in headers[1:]:
        if length != len(df_dict[head]):
            sys.stderr.write('Lists in dict vary in length.\n')
            sys.exit(1)

    # open temp file
    df_fd = None
    if df_file is None:
        df_fd, df_file = tempfile.mkstemp()

    try:
        # print data frame (file.write is identical under Python 2 and 3)
        with open(df_file, 'w') as df_out:
            df_out.write(sep.join([str(head) for head in headers]) + '\n')
            for i in range(length):
                df_out.write(sep.join([str(df_dict[head][i]) for head in headers]) + '\n')

        # convert args to one string
        args_str = sep.join([str(a) for a in args])

        # plot in R
        cmd = 'R --slave --args %s %s < %s' % (df_file, args_str, r_script)

        if print_cmd:
            sys.stderr.write(cmd + '\n')

        subprocess.call(cmd, shell=True)

    finally:
        # clean up the temporary file even if writing or R failed
        if df_fd is not None:
            os.close(df_fd)
            os.remove(df_file)
54 | 
55 | 
56 | ################################################################################
57 | # print_df
58 | #
59 | # Just print the given data frame dictionary to the output file given.
60 | ################################################################################
def print_df(df_dict, out_file=None):
    """Write a data frame dictionary to a space-separated file.

    Args:
        df_dict: maps column name to a list of values; all lists must have
            the same length.  Columns are written in sorted key order.
        out_file: path to write.  If None, a temporary file is created.

    Returns:
        None when out_file was given; otherwise the (fd, path) pair from
        tempfile.mkstemp, which the caller is responsible for closing and
        removing.

    Exits with status 1 (after listing each column's length on stderr) if
    the lists differ in length; this is checked before any file is written.
    """
    headers = sorted(df_dict.keys())

    # check list lengths
    length = len(df_dict[headers[0]])
    for head in headers[1:]:
        if length != len(df_dict[head]):
            sys.stderr.write('Lists in dict vary in length:\n')
            for h in headers:
                sys.stderr.write('%s %s\n' % (h, len(df_dict[h])))
            sys.exit(1)

    # open
    df_fd = None
    if out_file is None:
        df_fd, df_file = tempfile.mkstemp()
    else:
        df_file = out_file

    # print data frame (file.write is identical under Python 2 and 3)
    with open(df_file, 'w') as df_out:
        df_out.write(' '.join([str(head) for head in headers]) + '\n')
        for i in range(length):
            df_out.write(' '.join([str(df_dict[head][i]) for head in headers]) + '\n')

    if out_file is None:
        return df_fd, df_file
    return None
91 | 


--------------------------------------------------------------------------------
/strand_specifity.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import os, subprocess
 4 | import pysam
 5 | 
 6 | ################################################################################
 7 | # strand_specificity.py
 8 | #
 9 | # Print information relevant to determining the strand specificity of the
10 | # sequencing in a BAM file using a TopHat junctions.bed file.
11 | ################################################################################
12 | 
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Print read counts that indicate the strand specificity of a BAM file.

    Command line: <bam> <junctions.bed>.

    Junctions on the '+' strand are written to junctions_fwd.bed and the BAM
    is intersected with them into fwd.bam (both in the working directory).
    With -s, reads are counted by alignment strand; otherwise properly
    paired reads whose CIGAR contains operation code 3 are counted as read 1
    versus read 2.  Both intermediate files are removed on exit.
    """
    usage = 'usage: %prog [options] <bam> <junctions.bed>'
    parser = OptionParser(usage)
    parser.add_option('-s', dest='single', default=False, action='store_true', help='Single-stranded [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide BAM file and junctions.bed file.')
    bam_file = args[0]
    juncs_bed_file = args[1]

    try:
        # filter junctions for forward only
        subprocess.call('awk \'$6 == "+"\' %s > junctions_fwd.bed' % juncs_bed_file, shell=True)

        if options.single:
            subprocess.call('intersectBed -abam %s -b junctions_fwd.bed > fwd.bam' % bam_file, shell=True)

            # count forward/reverse reads
            forward = 0
            reverse = 0
            for aligned_read in pysam.Samfile('fwd.bam'):
                if aligned_read.is_reverse:
                    reverse += 1
                else:
                    forward += 1

            # single-argument print() is identical in Python 2 and 3
            print('Read\'s aligning + and intersecting + junctions: %9d' % forward)
            print('Read\'s aligning - and intersecting + junctions: %9d' % reverse)

        else:
            # intersect BAM with forward junctions
            subprocess.call('intersectBed -s -abam %s -b junctions_fwd.bed > fwd.bam' % bam_file, shell=True)

            # count first/second reads
            first = 0
            second = 0
            for aligned_read in pysam.Samfile('fwd.bam'):
                if not aligned_read.is_proper_pair:
                    continue

                # a CIGAR operation with code 3 marks the read as spliced
                spliced = any(code == 3 for (code, size) in aligned_read.cigar)
                if spliced:
                    if aligned_read.is_read1:
                        first += 1
                    else:
                        second += 1

            print('Read1\'s aligning + and intersecting + junctions: %9d' % first)
            print('Read2\'s aligning + and intersecting + junctions: %9d' % second)

    finally:
        # remove the intermediates even if a step above failed
        for tmp_file in ('junctions_fwd.bed', 'fwd.bam'):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
71 | 
72 | 
73 | ################################################################################
74 | # __main__
75 | ################################################################################
76 | if __name__ == '__main__':
77 |     main()
78 | 


--------------------------------------------------------------------------------
/geneid2transid.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | ################################################################################
 5 | # geneid2transid.py
 6 | #
 7 | # Given a gene id, produce a transcript id to punch into the browser
 8 | ################################################################################
 9 | 
10 | lnc_catalog = '/Users/dk/research/common/data/lncrna/lnc_catalog.gtf'
11 | 
12 | ################################################################################
13 | # main
14 | ################################################################################
def main():
    """Print the longest transcript id for one gene id or a file of them.

    If the first argument starts with 'XLOC' it is treated as a gene id;
    otherwise it is opened as a file with one gene id per line.
    """
    usage = 'usage: %prog [options] <gene id>\nUsage: %prog [options] <gene id file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    # parse input
    if len(args) == 0:
        parser.error('Must provide gene id or file of gene ids')

    # single-argument print() is identical in Python 2 and 3
    if args[0].startswith('XLOC'):
        print(find_longest_transcript(args[0]))
    else:
        with open(args[0]) as gene_ids_in:
            for line in gene_ids_in:
                print(find_longest_transcript(line.rstrip()))
30 | 
31 | 
32 | 
33 | ################################################################################
34 | # find_longest_transcript
35 | #
36 | # Return the longest transcript for this gene
37 | ################################################################################
def find_longest_transcript(gene_id):
    """Return the id of the longest transcript of `gene_id`.

    Scans the GTF at the module-level `lnc_catalog` path and, for every
    record whose gene_id attribute matches, adds the record's length
    (end - start + 1) to its transcript's total.

    Returns:
        The transcript id with the largest total; on ties, the first one
        encountered in dict iteration order.  None if the gene has no
        records in the catalog.
    """
    # find all transcripts and sum lengths
    transcripts = {}
    with open(lnc_catalog) as catalog_in:
        for line in catalog_in:
            a = line.split('\t')
            # skip comment, blank, or truncated lines with no attribute column
            if line.startswith('#') or len(a) < 9:
                continue

            kv = gtf_kv(a[8])
            if gene_id == kv['gene_id']:
                tx = kv['transcript_id']
                transcripts[tx] = transcripts.get(tx, 0) + int(a[4]) - int(a[3]) + 1

    # unknown gene: nothing to choose from
    if not transcripts:
        return None

    # return longest
    return max(transcripts, key=transcripts.get)
54 | 
55 | 
56 | ################################################################################
57 | # gtf_kv
58 | #
59 | # Convert the last gtf section of key/value pairs into a dict.
60 | ################################################################################
def gtf_kv(s):
    """Convert the attribute column of a GTF/GFF line into a dict.

    Attributes are separated by ';'.  An attribute containing '=' is split
    into key and value at its first '='; otherwise it is split at its first
    run of whitespace.  Keys and values are stripped of surrounding
    whitespace, and one pair of enclosing double quotes is removed from the
    value.  Attributes with no value part are ignored; later duplicates of a
    key overwrite earlier ones.
    """
    d = {}

    for key_val in s.split(';'):
        key_val = key_val.strip()
        if not key_val:
            continue

        # maxsplit=1 keeps values that themselves contain '=' or spaces
        if '=' in key_val:
            kvs = key_val.split('=', 1)
        else:
            kvs = key_val.split(None, 1)

        if len(kvs) != 2:
            continue

        key = kvs[0].strip()
        # strip before testing for quotes so padded values are unquoted too;
        # startswith/endswith are safe on an empty value
        val = kvs[1].strip()
        if val.startswith('"') and val.endswith('"'):
            val = val[1:-1]
        d[key] = val

    return d
81 | 
82 | 
83 | ################################################################################
84 | # __main__
85 | ################################################################################
86 | if __name__ == '__main__':
87 |     main()
88 | 


--------------------------------------------------------------------------------
/h5_h5z.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | 
 4 | import h5py
 5 | import numpy as np
 6 | 
 7 | from struct import pack, unpack
 8 | 
 9 | '''
10 | h5_h5z.py
11 | 
12 | Convert a coverage HDF5 to lossy compressed HDF5.
13 | '''
14 | 
15 | ################################################################################
16 | # main
17 | ################################################################################
def main():
    """Copy a coverage HDF5 into a lossy-compressed HDF5.

    Command line: <in_h5_file> <out_h5_file>.

    Every top-level dataset of the input (visited in sorted key order) is
    read as float16 and written to the output under the same name using
    HDF5 filter id 32013 (the registered id of the ZFP filter) with default
    filter options.  With -v, each dataset name is printed as it is
    processed.
    """
    usage = 'usage: %prog [options] <in_h5_file> <out_h5_file>'
    parser = OptionParser(usage)
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input HDF5 and output HDF5.')
    in_h5_file = args[0]
    out_h5_file = args[1]

    # open the input explicitly read-only so it can never be modified or
    # created by accident; both files are closed even on error
    with h5py.File(in_h5_file, 'r') as h5_in, h5py.File(out_h5_file, 'w') as h5_out:
        for chrom in sorted(h5_in.keys()):
            if options.verbose:
                print(chrom)

            # read values
            x = np.array(h5_in[chrom], dtype='float16')

            # write compressed into HDF5
            h5_out.create_dataset(chrom, data=x, chunks=True, compression=32013, compression_opts=None, shuffle=False)
54 | 
def zfp_rate_opts(rate):
    """Create compression options for ZFP in fixed-rate mode

    The float rate parameter is the number of compressed bits per value.

    Returns a 6-tuple (mode, 0, w0, w1, 0, 0) where w0 and w1 are bytes 0-3
    and 4-7 of `rate` packed as a little-endian IEEE 754 double, each read
    as a little-endian unsigned 32-bit int.
    """
    mode_rate = 1
    # reinterpret the 8 bytes of the double as two consecutive uint32 words
    first_word, second_word = unpack('<II', pack('<d', rate))
    return (mode_rate, 0, first_word, second_word, 0, 0)
65 | 
def zfp_accuracy_opts(accuracy):
    """Create compression options for ZFP in fixed-accuracy mode

    The float accuracy parameter is the absolute error tolerance (e.g. 0.001).

    Returns a 6-tuple (mode, 0, w0, w1, 0, 0) where w0 and w1 are bytes 0-3
    and 4-7 of `accuracy` packed as a little-endian IEEE 754 double, each
    read as a little-endian unsigned 32-bit int.
    """
    ZFP_MODE_ACCURACY = 3
    packed = pack('<d', accuracy)        # Pack as IEEE 754 double
    # bytes 0-3 of a little-endian double are its least-significant word
    first_word = unpack('<I', packed[0:4])[0]
    second_word = unpack('<I', packed[4:8])[0]
    # (a leftover debug print of the two words was removed here)
    return (ZFP_MODE_ACCURACY, 0, first_word, second_word, 0, 0)
77 | 
def zfp_precision_opts(precision):
    """Build the compression-options tuple for ZFP fixed-precision mode.

    The precision parameter is the number of uncompressed bits per value;
    it is placed unchanged in the third slot of the returned 6-tuple.
    """
    ZFP_MODE_PRECISION = 2
    opts = (ZFP_MODE_PRECISION, 0, precision, 0, 0, 0)
    return opts
85 | 
86 | ################################################################################
87 | # __main__
88 | ################################################################################
89 | if __name__ == '__main__':
90 |     main()
91 | 


--------------------------------------------------------------------------------
/transcripts_fasta.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | from glob import glob
 4 | import gzip, os, pdb
 5 | import gff, dna
 6 | 
 7 | ################################################################################
 8 | # transcripts_fasta.py
 9 | #
10 | # Make a fasta file of transcripts from the gtf file
11 | ################################################################################
12 | 
13 | ################################################################################
14 | # main
15 | ################################################################################
def main():
    """Print a FASTA of transcript sequences built from a genome and a GTF.

    Command line: ``<genome fasta> <transcripts gtf>``. The genome FASTA
    (optionally gzipped, detected by a name ending in 'gz') is streamed one
    chromosome at a time; each chromosome is handed to process_chrom, which
    accumulates transcript sequences. The result is printed to stdout as
    ``>transcript_id gene=gene_id`` records.
    """
    usage = 'usage: %prog [options] <genome fasta> <transcripts gtf>'
    parser = OptionParser(usage)
    #parser.add_option()
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide genome fasta file and transcripts gtf file')
    else:
        genome_fasta = args[0]
        transcripts_gtf = args[1]

    transcript_seqs = {}
    transcript_genes = {}

    if genome_fasta[-2:] == 'gz':
        # text mode so lines are str (not bytes) under Python 3
        genome_open = gzip.open(genome_fasta, 'rt')
    else:
        genome_open = open(genome_fasta)

    # process chromosomes
    chrom = ''
    seq_parts = []  # collect lines and join once per chromosome
    try:
        for line in genome_open:
            if line[0] == '>':
                if chrom:
                    process_chrom(transcripts_gtf, chrom, ''.join(seq_parts), transcript_seqs, transcript_genes)

                # the full header text (minus '>') is used as the chromosome name
                chrom = line[1:].rstrip()
                seq_parts = []
            else:
                seq_parts.append(line.rstrip())
    finally:
        genome_open.close()

    # flush the final chromosome; skipped when the file had no header at all
    if chrom:
        process_chrom(transcripts_gtf, chrom, ''.join(seq_parts), transcript_seqs, transcript_genes)

    # print fasta
    for tid in transcript_seqs:
        print('>%s gene=%s\n%s' % (tid,transcript_genes[tid],transcript_seqs[tid]))
54 | 
55 | 
56 | ################################################################################
57 | # process_chrom
58 | #
59 | # Build up transcript_seqs and transcript_genes hashes for the chromosome
60 | # specified.
61 | ################################################################################
def process_chrom(transcripts_gtf, chrom, seq, transcript_seqs, transcript_genes):
    """Add the sequence of every GTF line on `chrom` to its transcript.

    Args:
        transcripts_gtf: path of the GTF file; it is re-read on every call.
        chrom: chromosome name compared against GTF column 1.
        seq: full sequence of that chromosome.
        transcript_seqs: dict transcript_id -> sequence, updated in place.
        transcript_genes: dict transcript_id -> gene_id, updated in place.

    '+' strand pieces are appended; for any other strand the reverse
    complement is prepended, so lines are presumably expected in ascending
    coordinate order within a transcript.
    """
    # find chr transcripts
    with open(transcripts_gtf) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            if a[0] != chrom:
                continue

            # NOTE(review): the feature type (column 3) is not checked, so
            # every line on this chromosome is treated as an exon - confirm
            # the input GTF contains exon lines only.
            kv = gff.gtf_kv(a[8])
            tid = kv['transcript_id']
            gid = kv['gene_id']

            exon_start = int(a[3])
            exon_end = int(a[4])

            # GTF coordinates are 1-based inclusive
            exon_seq = seq[exon_start-1:exon_end]
            if a[6] == '+':
                transcript_seqs[tid] = transcript_seqs.get(tid,'') + exon_seq
            else:
                transcript_seqs[tid] = dna.rc(exon_seq) + transcript_seqs.get(tid,'')

            transcript_genes[tid] = gid
81 | 
82 | 
83 | ################################################################################
84 | # __main__
85 | ################################################################################
86 | if __name__ == '__main__':
87 |     main()
88 |     #pdb.runcall(main)
89 | 


--------------------------------------------------------------------------------
/transmapbed2gtf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import gff
 4 | 
 5 | ################################################################################
 6 | # transmapbed2gtf.py
 7 | #
 8 | # Convert the bed file that you get from the TransMap pipeline to a gtf file
 9 | # where adjacent blocks are merged.
10 | ################################################################################
11 | 
12 | 
13 | ################################################################################
14 | # main
15 | ################################################################################
def main():
    """Convert a TransMap BED12 file to GTF exon lines, merging close blocks.

    Command line: ``<bed file>`` with optional ``-g`` (original GTF used to
    map transcript ids to gene ids) and ``-m`` (merge distance). Each BED
    line becomes one transcript; a transcript id seen more than once gets a
    ``_vN`` suffix on both gene and transcript ids. Consecutive blocks whose
    gap is within the merge distance are emitted as a single exon. Output
    goes to stdout.
    """
    usage = 'usage: %prog [options] <bed file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='orig_gtf', help='The original gtf file of the TransMap\'d genes to be used to transfer gene id\'s')
    # blocks are merged when last_end + merge_dist >= next start, i.e. this is an upper bound
    parser.add_option('-m', dest='merge_dist', type='int', default=30, help='Maximum distance two exons can be apart for them to be merged [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide bed file')
    else:
        bed_file = args[0]

    # map transcript id's to gene id's if possible
    t2g = {}
    if options.orig_gtf:
        with open(options.orig_gtf) as gtf_in:
            for line in gtf_in:
                a = line.split('\t')
                kv = gff.gtf_kv(a[8])
                t2g[kv['transcript_id']] = kv['gene_id']

    # hash to disambiguate multi-mapping transcripts
    transcript_maps = {}

    with open(bed_file) as bed_in:
        for line in bed_in:
            if not line.strip():
                # tolerate blank lines instead of failing on a[3]
                continue

            a = line.split('\t')
            a[-1] = a[-1].rstrip()

            tid = a[3]
            gid = t2g.get(a[3],a[3])

            transcript_maps[tid] = transcript_maps.get(tid,0) + 1
            if transcript_maps[tid] > 1:
                gid += '_v%d' % transcript_maps[tid]
                tid += '_v%d' % transcript_maps[tid]

            gene_start = int(a[1])

            # BED12 lists may carry a trailing comma, hence the `if x`
            block_sizes = [int(x) for x in a[10].split(',') if x]
            block_starts = [int(x) for x in a[11].split(',') if x]

            exon_cols = []
            last_end = None
            exon_num = 1
            for block_start, block_size in zip(block_starts, block_sizes):
                # BED is 0-based half-open; GTF is 1-based inclusive
                exon_start = gene_start+1+block_start
                exon_end = exon_start+block_size-1

                if last_end is not None and last_end+options.merge_dist >= exon_start:
                    # merge w/ last
                    exon_cols[-1][4] = str(exon_end)
                else:
                    exon_cols.append([a[0], 'TransMap', 'exon', str(exon_start), str(exon_end), '.', a[5], '.', 'gene_id "%s"; transcript_id "%s"; exon_number "%d"' % (gid,tid,exon_num)])
                    exon_num += 1

                last_end = exon_end

            for cols in exon_cols:
                print('\t'.join(cols))
75 |     
76 | 
77 | ################################################################################
78 | # __main__
79 | ################################################################################
80 | if __name__ == '__main__':
81 |     main()
82 | 


--------------------------------------------------------------------------------
/tss_bam_replot.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | from rpy2.robjects.packages import importr
 4 | import rpy2.robjects as ro
 5 | import rpy2.robjects.lib.ggplot2 as ggplot2
 6 | 
 7 | grdevices = importr('grDevices')
 8 | 
 9 | ################################################################################
10 | # tss_bam_replot.py
11 | #
12 | # Make a new plot from the raw data generated by tss_bam_plot.py
13 | ################################################################################
14 | 
15 | 
16 | ################################################################################
17 | # main
18 | ################################################################################
def main():
    """Re-plot TSS coverage from a raw three-column file.

    Command line: ``<raw file>`` whose whitespace-separated columns are read
    as position, main coverage and control coverage. A loess-smoothed
    main-vs-control plot is written to ``<out_prefix>_and.pdf``.

    The x axis is rebuilt from -u/-d rather than taken from the file, so the
    file is presumably expected to hold upstream+downstream+1 rows.
    """
    usage = 'usage: %prog [options] <raw file>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', default=2000, type='int', help='TSS downstream [Default: %default]')
    parser.add_option('-o', dest='out_prefix', default='tss', help='Output prefix [Default: %default]')
    parser.add_option('-u', dest='upstream', default=5000, type='int', help='TSS upstream [Default: %default]')
    parser.add_option('--ymax', dest='ymax', default=None, type='float', help='Y-coordinate limit [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide raw file')
    else:
        raw_file = args[0]

    # collect data
    coords = []  # parsed (validates column 1) but not used for plotting
    main_cov = []
    control_cov = []
    with open(raw_file) as raw_in:
        for line in raw_in:
            a = line.split()
            coords.append(int(a[0]))
            main_cov.append(float(a[1]))
            control_cov.append(float(a[2]))

    # data structures: main rows first, then control rows
    window = options.upstream+options.downstream+1
    tss_i = ro.IntVector(range(-options.upstream,options.downstream+1))
    labels = ro.StrVector(['Main']*window+['Control']*window)
    cov = ro.FloatVector(main_cov + control_cov)

    df = ro.DataFrame({'tss_i':tss_i, 'cov':cov, 'label':labels})

    # plot
    gp = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='tss_i', y='cov', colour='label') + \
        ggplot2.geom_smooth(method='loess', size=1, span=0.2, se=False) + \
        ggplot2.scale_x_continuous('TSS Position') + \
        ggplot2.scale_colour_discrete('') + \
        ggplot2.theme_bw()

    if options.ymax is None:
        gp += ggplot2.scale_y_continuous('Coverage')
    else:
        gp += ggplot2.scale_y_continuous('Coverage', limits=ro.FloatVector([0,options.ymax]))

    # save to file
    grdevices.pdf(file='%s_and.pdf' % options.out_prefix)
    gp.plot()
    grdevices.dev_off()
74 | 
75 | 
76 | 
77 | ################################################################################
78 | # __main__
79 | ################################################################################
80 | if __name__ == '__main__':
81 |     main()
82 | 


--------------------------------------------------------------------------------
/attach_nh.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | from optparse import OptionParser
  3 | import sys
  4 | import pysam
  5 | 
  6 | ################################################################################
  7 | # attach_nh.py
  8 | #
  9 | # Attach NH tags to a stream of SAM alignments from Bowtie2.
 10 | #
 11 | # Note: I'm not sure how paired end reads will stream in.
 12 | ################################################################################
 13 | 
 14 | ################################################################################
 15 | # main
 16 | ################################################################################
 17 | def main():
 18 |     usage = 'usage: %prog [options] arg'
 19 |     parser = OptionParser(usage)
 20 |     #parser.add_option()
 21 |     (options,args) = parser.parse_args()
 22 | 
 23 |     sam_in = pysam.AlignmentFile('-', 'r')
 24 | 
 25 |     # previously required, but apparently not anymore
 26 |     # write_header(sam_in.header)
 27 | 
 28 |     sam_out = pysam.AlignmentFile('-', 'w', template=sam_in)
 29 | 
 30 |     last_id = 'not a header'
 31 |     read1_aligns = []
 32 |     read2_aligns = []
 33 | 
 34 |     for align in sam_in:
 35 |         if align.is_unmapped:
 36 |             # read stream concludes
 37 |             output_read(sam_out, read1_aligns)
 38 |             output_read(sam_out, read2_aligns)
 39 |             read1_aligns = []
 40 |             read2_aligns = []
 41 | 
 42 |         else:
 43 |             read_id = align.query_name
 44 | 
 45 |             if not match_id(read_id, last_id, align.is_paired):
 46 |                 # read stream concludes
 47 | 
 48 |                 # output
 49 |                 output_read(sam_out, read1_aligns)
 50 |                 output_read(sam_out, read2_aligns)
 51 | 
 52 |                 # reset
 53 |                 read1_aligns = []
 54 |                 read2_aligns = []
 55 | 
 56 |             # read stream continues
 57 |             if align.is_read1:
 58 |                 read1_aligns.append(align)
 59 |             else:
 60 |                 read2_aligns.append(align)
 61 | 
 62 |             # update read id
 63 |             last_id = read_id
 64 | 
 65 |     sam_in.close()
 66 |     sam_out.close()
 67 | 
 68 | 
 69 | def match_id(id1, id2, paired):
 70 |     ''' Match read_id's.
 71 | 
 72 |     First case handles most datasets.
 73 |     Second case handles paired end datasets where they got fancy.
 74 |     '''
 75 | 
 76 |     return id1 == id2 or (paired and id1[:-1] == id2[:-1] and id1[-1] in '12' and id2[-1] in '12')
 77 | 
 78 | 
 79 | def output_read(sam_out, read_aligns):
 80 |     nh_tag = len(read_aligns)
 81 |     for align in read_aligns:
 82 |         align.set_tag('NH',nh_tag)
 83 |         sam_out.write(align)
 84 | 
 85 | 
 86 | def write_header(header):
 87 |     hd = header['HD']
 88 |     print('@HD\tVN:%s\tSO:%s' % (hd['VN'], hd['SO']))
 89 | 
 90 |     for sq in header['SQ']:
 91 |         print('@SQ\tSN:%s\tLN:%d' % (sq['SN'],sq['LN']))
 92 | 
 93 |     pg = header['PG'][0]
 94 |     print('@PG\tID:%s\tPN:%s\tVN:%s\tCL:%s' % (pg['ID'],pg['PN'],pg['VN'],pg['CL']), flush=True)
 95 | 
 96 | 
 97 | ################################################################################
 98 | # __main__
 99 | ################################################################################
100 | if __name__ == '__main__':
101 |     main()
102 | 


--------------------------------------------------------------------------------
/citemelike.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import urllib2
 4 | import pdb, re, math, random
 5 | 
 6 | ################################################################################
 7 | # citemelike.py
 8 | #
 9 | # Choose a random paper from my citeulike account to read.
10 | ################################################################################
11 | 
# citeulike account whose reading list is sampled
user = 'dakelley'
# exponent applied to (stars/5) in main; larger values favor high-starred papers
star_factor = 10

# raw strings keep \d a regex escape rather than an invalid string escape;
# the article pattern is built from `user` so the two cannot drift apart
url_re = re.compile(r'href="(/user/%s/article/\d+)"' % re.escape(user))
star_re = re.compile(r'src="/static/img/star(\d).png"')
17 | 
18 | ################################################################################
19 | # main
20 | ################################################################################
def main():
    """Print the URL of one randomly chosen to-read paper.

    Papers come from get_papers for the account's to-read list, optionally
    restricted with ``--tag``. Each paper is weighted by
    (stars/5) ** star_factor and one is drawn in proportion to its weight.
    """
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    parser.add_option('--tag', dest='tag', help='Choose a paper with the given tag')
    (options,args) = parser.parse_args()

    if options.tag:
        citeulike_url = 'http://www.citeulike.org/user/%s/tag/%s/order/to_read' % (user,options.tag)
    else:
        citeulike_url = 'http://www.citeulike.org/user/%s/order/to_read' % user

    # get papers
    papers = get_papers(citeulike_url)

    if not papers:
        if options.tag:
            parser.error('No papers with the tag %s' % options.tag)
        else:
            parser.error('No papers found')

    # re-score stars
    papers = [(math.pow(stars/5.0,star_factor),paper) for (stars,paper) in papers]

    # choose random paper
    max_rand = sum([score for (score,paper) in papers])
    if max_rand == 0:
        # every paper has zero weight (no stars): fall back to a uniform draw
        chosen = random.choice(papers)[1]
    else:
        rand_score = random.uniform(0,max_rand)
        rand_tmp = 0.0
        # default to the last paper so a draw equal to max_rand still picks one
        chosen = papers[-1][1]
        for (score,paper) in papers:
            rand_tmp += score
            if rand_tmp > rand_score:
                chosen = paper
                break

    print('http://www.citeulike.org%s' % chosen)
50 |         
51 | ################################################################################
52 | # get_papers
53 | #
54 | # Get all papers from the following base url
55 | ################################################################################
def get_papers(citeulike_url):
    """Scrape (stars, paper_url) pairs for unread papers from a listing.

    Pages ``<citeulike_url>/page/N`` are fetched in order starting at 1.
    A line containing class="title" supplies the paper URL, a following
    star-image line supplies its rating and records the paper. Paging stops
    at the first page with no titles, or as soon as an 'already read'
    marker is seen (the paper recorded just before it is discarded).

    Returns:
        list of (int stars, str relative_url) tuples.
    """
    papers = []

    page_num = 1
    no_read = True
    unread_found = True
    paper_url = None
    while no_read and unread_found:
        unread_found = False

        #f = urllib2.urlopen('%s/page/%d' % (citeulike_url,page_num))
        req = urllib2.Request('%s/page/%d' % (citeulike_url,page_num), headers={'User-Agent':"Magic Broswer"})
        f = urllib2.urlopen(req)
        try:
            cul_text = f.read()
        finally:
            f.close()

        for line in cul_text.split('\n'):
            if line.find('class="title"') != -1:
                url_match = url_re.search(line)
                if url_match is None:
                    # a title line without an article link: do not let the
                    # next star line attach to the previous paper's URL
                    paper_url = None
                else:
                    paper_url = url_match.group(1)
                    unread_found = True

            elif line.find('/static/img/star') != -1:
                star_match = star_re.search(line)
                if star_match is not None and paper_url is not None:
                    papers.append((int(star_match.group(1)),paper_url))

            elif line.find('radio') == -1 and line.find('already read') != -1:
                # the most recently recorded paper is the one already read
                papers = papers[:-1]
                # stop paging: everything after this point has been read
                no_read = False
                break

        page_num += 1

    return papers
91 | 
92 | ################################################################################
93 | # __main__
94 | ################################################################################
95 | if __name__ == '__main__':
96 |     main()
97 |     #pdb.runcall(main)
98 | 


--------------------------------------------------------------------------------
/cuff_rep_cor.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import os, pdb, sys
 4 | from numpy import array
 5 | from scipy.stats import spearmanr
 6 | import gff, ggplot
 7 | 
 8 | ################################################################################
 9 | # cuff_rep_cor.py
10 | #
11 | # Compute correlations between replicates in a cufflinks run.
12 | ################################################################################
13 | 
14 | ################################################################################
15 | # main
16 | ################################################################################
def main():
    """Print pairwise Spearman correlations between cufflinks replicates.

    Command line: ``<.read_group_tracking>`` with optional ``-g`` (restrict
    to gene ids found in a GTF) and ``-o`` (output heatmap pdf). For every
    pair of (condition, replicate) samples the correlation of FPKM over
    genes with status OK in both is printed, then the table is passed to
    the cuff_rep_cor.r plotting script.
    """
    usage = 'usage: %prog [options] <.read_group_tracking>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='genes_gtf', help='Print only genes in the given GTF file')
    #parser.add_option('-p', dest='pseudocount', type='float', default=0.125, help='FPKM pseudocount for taking logs [Default: %default]')
    parser.add_option('-o', dest='out_pdf', default='cor_heat.pdf', help='Output heatmap pdf [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error(usage)
    else:
        read_group_tracking = args[0]

    # get gene_ids
    gene_set = set()
    if options.genes_gtf:
        with open(options.genes_gtf) as gtf_in:
            for line in gtf_in:
                a = line.split('\t')
                gid = gff.gtf_kv(a[8])['gene_id']
                gene_set.add(gid)

    # initialize diff data structures: (cond, rep) -> {gene_id: fpkm}
    cond_rep_gene_fpkm = {}

    # read read group tracking file
    with open(read_group_tracking) as rgt_in:
        rgt_in.readline()  # skip the header line
        for line in rgt_in:
            a = line.split('\t')

            gene_id = a[0]
            cond = a[1]
            rep = int(a[2])
            fpkm = float(a[6])
            status = a[8].rstrip()

            # an empty gene_set means no -g filter was requested
            if status == 'OK' and (len(gene_set) == 0 or gene_id in gene_set):
                cond_rep_gene_fpkm.setdefault((cond,rep), {})[gene_id] = fpkm

    df_dict = {'Sample1':[], 'Sample2':[], 'Correlation':[]}
    # a sorted list is indexable on Python 3 and gives a reproducible order
    cond_reps = sorted(cond_rep_gene_fpkm.keys())

    for i, (cond1, rep1) in enumerate(cond_reps):
        for cond2, rep2 in cond_reps[i+1:]:
            # only genes measured in both samples are compared
            genes12 = set(cond_rep_gene_fpkm[(cond1,rep1)].keys()) & set(cond_rep_gene_fpkm[(cond2,rep2)].keys())

            fpkms1 = array([cond_rep_gene_fpkm[(cond1,rep1)][gene_id] for gene_id in genes12])
            fpkms2 = array([cond_rep_gene_fpkm[(cond2,rep2)][gene_id] for gene_id in genes12])

            rho, pval = spearmanr(fpkms1, fpkms2)

            cols = (cond1,rep1,cond2,rep2,rho)
            print('%-15s  %1d  %-15s  %1d  %.4f' % cols)

            df_dict['Sample1'].append('%s_%d' % (cond1,rep1))
            df_dict['Sample2'].append('%s_%d' % (cond2,rep2))
            df_dict['Correlation'].append(rho)

    # this is broken
    ggplot.plot('%s/cuff_rep_cor.r' % os.environ['RDIR'], df_dict, [options.out_pdf], debug=True)
88 | 
89 | 
90 | ################################################################################
91 | # __main__
92 | ################################################################################
93 | if __name__ == '__main__':
94 |     main()
95 |     #pdb.runcall(main)
96 | 


--------------------------------------------------------------------------------
/bedtools.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from optparse import OptionParser
 3 | import os, subprocess, tempfile
 4 | import pysam
 5 | 
 6 | ################################################################################
 7 | # bedtools.py
 8 | #
 9 | #
10 | ################################################################################
11 | 
12 | 
13 | ################################################################################
14 | # main
15 | ################################################################################
def main():
    """Parse the command line and do nothing else.

    No options are defined; the parsed result is discarded.
    """
    parser = OptionParser('usage: %prog [options] arg')
    #parser.add_option()
    parser.parse_args()
21 |     
22 | 
23 | ################################################################################
24 | # abam_f1
25 | #
26 | # Intersect the BAM file with the BED file using the "-f 1" option, but correct
27 | # for the loss of spliced reads.
28 | ################################################################################
def abam_f1(bam_file, bed_file, out_file):
    """Intersect a BAM with a BED using "-f 1" without losing spliced reads.

    The BAM is split into spliced and unspliced reads (see spliced).
    Unspliced reads are intersected with ``intersectBed -f 1``, spliced
    reads with plain ``intersectBed``, and the two results are merged into
    out_file with ``samtools merge -f``.

    Args:
        bam_file: input BAM path.
        bed_file: BED path passed to intersectBed as -b.
        out_file: output BAM path (overwritten).

    Raises:
        subprocess.CalledProcessError: if intersectBed or samtools exits
            with a non-zero status.

    Temporary files are created in $HOME/research/scratch/temp, which must
    already exist, and are removed even when a step fails.
    """
    temp_dir = '%s/research/scratch/temp' % os.environ['HOME']
    temp_files = []

    def _new_temp():
        # only the path is needed; close the descriptor immediately
        fd, path = tempfile.mkstemp(dir=temp_dir)
        os.close(fd)
        temp_files.append(path)
        return path

    try:
        ############################################
        # divide BAM by splicing
        ############################################
        spliced_bam_file = _new_temp()
        unspliced_bam_file = _new_temp()

        # open BAMs
        opened = []
        try:
            bam_in = pysam.Samfile(bam_file, 'rb')
            opened.append(bam_in)
            spliced_bam_out = pysam.Samfile(spliced_bam_file, 'wb', template=bam_in)
            opened.append(spliced_bam_out)
            unspliced_bam_out = pysam.Samfile(unspliced_bam_file, 'wb', template=bam_in)
            opened.append(unspliced_bam_out)

            # divide
            for aligned_read in bam_in:
                if spliced(aligned_read):
                    spliced_bam_out.write(aligned_read)
                else:
                    unspliced_bam_out.write(aligned_read)
        finally:
            # close BAMs
            for bam in opened:
                bam.close()

        ############################################
        # intersect and merge
        ############################################
        spliced_is_bam_file = _new_temp()
        unspliced_is_bam_file = _new_temp()

        # argument lists (no shell) so paths with spaces or shell
        # metacharacters are passed through verbatim
        with open(unspliced_is_bam_file, 'wb') as is_out:
            subprocess.check_call(['intersectBed', '-f', '1', '-abam', unspliced_bam_file, '-b', bed_file], stdout=is_out)
        with open(spliced_is_bam_file, 'wb') as is_out:
            subprocess.check_call(['intersectBed', '-abam', spliced_bam_file, '-b', bed_file], stdout=is_out)

        subprocess.check_call(['samtools', 'merge', '-f', out_file, unspliced_is_bam_file, spliced_is_bam_file])

    finally:
        ############################################
        # clean
        ############################################
        for path in temp_files:
            if os.path.exists(path):
                os.remove(path)
75 |     
76 | 
77 | ################################################################################
78 | # spliced
79 | #
80 | # Return true if the read is spliced.
81 | ################################################################################
def spliced(aligned_read):
    """Return True if any CIGAR operation of the read has code 3.

    aligned_read.cigar is iterated as (code, size) pairs.
    """
    return any(op_code == 3 for op_code, _op_len in aligned_read.cigar)
88 | 
89 | ################################################################################
90 | # __main__
91 | ################################################################################
92 | if __name__ == '__main__':
93 |     main()
94 | 


--------------------------------------------------------------------------------