├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── INSTALL ├── THANKS ├── data_structure.json ├── manual.md ├── releases.md └── supplementary.md ├── rseqc ├── __init__.py ├── modules │ ├── FPKM_count.py │ ├── RNA_fragment_size.py │ ├── RPKM_saturation.py │ ├── __init__.py │ ├── bam2fq.py │ ├── bam2wig.py │ ├── bam_stats.py │ ├── clipping_profile.py │ ├── deletion_profile.py │ ├── divide_bam.py │ ├── geneBody_coverage.py │ ├── geneBody_coverage2.py │ ├── infer_experiment.py │ ├── inner_distance.py │ ├── insertion_profile.py │ ├── junction_annotation.py │ ├── junction_saturation.py │ ├── mismatch_profile.py │ ├── normalize_bigwig.py │ ├── overlay_bigwig.py │ ├── read_GC.py │ ├── read_NVC.py │ ├── read_dist.py │ ├── read_duplication.py │ ├── read_hexamer.py │ ├── read_quality.py │ ├── split_bam.py │ ├── split_paired_bam.py │ └── tin.py ├── parsers │ ├── BED.py │ ├── BedWrapper.py │ ├── GTF.py │ └── __init__.py └── qcmodule │ ├── BED.py │ ├── FrameKmer.py │ ├── PSL.py │ ├── SAM.py │ ├── __init__.py │ ├── annoGene.py │ ├── bam_cigar.py │ ├── changePoint.py │ ├── cigar.py │ ├── dotProduct.py │ ├── fasta.py │ ├── fastq.py │ ├── fickett.py │ ├── getBamFiles.py │ ├── mystat.py │ ├── orf.py │ ├── poisson.py │ ├── quantile.py │ ├── twoList.py │ └── wiggle.py ├── scripts └── rseqc ├── setup.cfg ├── setup.py └── src ├── binBits.c ├── binBits.h ├── bunzip ├── micro-bunzip.c └── micro-bunzip.h ├── cluster.c ├── cluster.h ├── kent ├── bits.c ├── bits.h ├── common.c └── common.h ├── pwm_utils.c ├── pwm_utils.h └── samtools ├── bgzf.c ├── bgzf.h └── khash.h /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled python modules. 2 | *.pyc 3 | 4 | # Setuptools distribution folder. 5 | /dist/ 6 | 7 | # Python egg metadata, regenerated from source files by setuptools. 8 | /*.egg-info 9 | /*.egg 10 | 11 | # 12 | /build/ 13 | 14 | #text editor specific files - VIM 15 | *.swp 16 | 17 | # sample data 18 | /sample-data/ 19 | 20 | working.py 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | LICENSE 2 | ========== 3 | RSeQC is distributed under `GNU General Public License (GPLv3) `_ 4 | 5 | This program is free software; you can redistribute it and/or modify it under 6 | the terms of the GNU General Public License as published by the Free Software 7 | Foundation; either version 3 of the License, or (at your option) any later 8 | version. This program is distributed in the hope that it will be useful,but 9 | WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 11 | details. 
You should have received a copy of the GNU General Public License along 12 | with this program; if not, write to the Free Software Foundation, Inc., 13 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 14 | 15 | Contact: 16 | Liguo Wang: wangliguo78@gmail.com 17 | Shengqin Wang: wzsqwang@gmail.com 18 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | include requirements.txt 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RSeQC: An RNA-seq Quality Control Package 2 | 3 | > This is a fork of the original RSeQC package from the [sourceforge site](http://rseqc.sourceforge.net). 4 | > I'm making drastic rearrangements to this package to make it easier to follow. I'm also making changes to the code base. 5 | > At this stage only the `read_dist` (read_distribution) and `bam_stats` (bam_stat) modules have been incorporated, and both can now be accessed from the main executable `scripts/rseqc`. 6 | 7 | **Original message** 8 | 9 | > RSeQC package provides a number of useful modules that can comprehensively evaluate high 10 | > throughput sequence data especially RNA-seq data. Some basic modules quickly inspect sequence 11 | > quality, nucleotide composition bias, PCR bias and GC bias, while RNA-seq specific modules 12 | > evaluate sequencing saturation, mapped reads distribution, coverage uniformity, strand specificity, transcript level RNA integrity etc. 13 | 14 | ## Table of contents 15 | 16 | - [Quick start](#quick-start) 17 | - [Installation](#installation) 18 | - [Input format](#input-format) 19 | - [Contact](#contact) 20 | - [Reference](#reference) 21 | 22 | ## Quick start 23 | 24 | Once installed, use the main executable `rseqc` to run any of the sub-commands (modules). 25 | 26 | e.g. 27 | 28 | ```BASH 29 | rseqc read_dist --input_file yourBamFile.bam --gene_models yourGTFfile.gtf 30 | ``` 31 | 32 | OR 33 | 34 | ```BASH 35 | rseqc read_dist --input_file yourBamFile.bam --gene_models yourBED12file.bed --file_type bed 36 | ``` 37 | 38 | ## Installation 39 | 40 | You will need either `sudo` or [virtualenvs](http://docs.python-guide.org/en/latest/dev/virtualenvs/) (which is my preferred method). If you are going to use `sudo`, please prefix `python setup.py install` and `pip install numpy` with `sudo`. 41 | 42 | ```BASH 43 | git clone --branch fresh https://github.com/MonashBioinformaticsPlatform/RSeQC.git 44 | cd RSeQC 45 | python setup.py install 46 | rseqc --help 47 | ``` 48 | 49 | I haven't figured out why, but `numpy` needs to be installed separately. It doesn't get pulled in correctly from the dependencies list in `setup.py`. 50 | 51 | ```BASH 52 | pip install numpy 53 | ``` 54 | 55 | ## Input format 56 | 57 | - [BED](http://genome.ucsc.edu/FAQ/FAQformat.html) file is a tab-separated, 12-column, plain-text file that represents gene models 58 | - [GTF](http://mblab.wustl.edu/GTF22.html) file also represents gene models. It is an alternative to the BED12 format 59 | - [SAM/BAM](http://www.htslib.org/doc/sam.html) file holds information about read alignments to the reference genome. 60 | 61 | ## Contact 62 | 63 | - Liguo Wang: wangliguo78@gmail.com 64 | - Shengqin Wang: wzsqwang@gmail.com 65 | - Wei Li: superliwei@gmail.com 66 | 67 | ## Reference 68 | 69 | - Wang, L., Wang, S., & Li, W. (2012). 
**RSeQC: quality control of RNA-seq experiments**. *Bioinformatics* (Oxford, England), 28(16), 2184–2185. http://doi.org/10.1093/bioinformatics/bts356 70 | - Wang, L., Nie, J., Sicotte, H., Li, Y., Eckel-Passow, J. E., Dasari, S., et al. (2016). **Measure transcript integrity using RNA-seq data**. *BMC Bioinformatics*, 17(1), 1–16. http://doi.org/10.1186/s12859-016-0922-z 71 | -------------------------------------------------------------------------------- /docs/INSTALL: -------------------------------------------------------------------------------- 1 | Installation guide for RSeQC 2 | ============================== 3 | 4 | 5 | Use pip to install RSeQC 6 | ------------------------ 7 | 8 | You must first install pip following the instructions: https://pypi.python.org/pypi/pip 9 | Then type: 10 | pip install RSeQC 11 | 12 | 13 | Install RSeQC from source code (Not recommended) 14 | ------------------------------------------------ 15 | 16 | Prerequisite: 17 | * gcc 18 | * python2.7 19 | * numpy 20 | * If your computer can not connnect to internet, nose>= 0.10.4 and distribute-0.6.10 are also required. 21 | 22 | Below is an example of installing RSeQC on Linux system using BASH. You need to change '--root' directory, PYTHONPATH and PATH accordingly 23 | 24 | 1) tar zxf RSeQC-VERSION.tar.gz 25 | 2) cd RSeQC-VERSION 26 | 3) python setup.py install #will install RSeQC in system level. require root previledge 27 | 4) python setup.py install --root=/home/user/RSeQC #will install RSeQC at user specified location 28 | 5) export PYTHONPATH=/home/user/RSeQC/usr/local/lib/python2.7/site-packages:$PYTHONPATH. #setup PYTHONPATH, so that RSeQC knows where to import modules 29 | 6) export PATH=/home/user/RSeQC/usr/local/bin:$PATH #setup PATH, so that system knows where to find executable file 30 | 31 | NOTE: 32 | * To install RSeQC on MAC OSX, user need to download and install Xcode beforehand. 33 | * To produce graphical outputs, R must be installed. 34 | * If the installation failed with error like: /usr/bin/ld: cannot find -lz, you may need to install a shared zlib library on your system. 
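* A quick way to sanity-check the install (a minimal check, assuming pip also pulled in the pysam and bx-python dependencies named in the release notes): start the python2.7 interpreter you installed into and run "import numpy, pysam, bx". An ImportError here usually means PYTHONPATH does not include the install location chosen in steps 4) and 5) above.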
35 | 36 | 37 | Online Manual: 38 | http://rseqc.sourceforge.net/ 39 | 40 | -------------------------------------------------------------------------------- /docs/THANKS: -------------------------------------------------------------------------------- 1 | Acknowledgements goes to: 2 | 1) Samtools and pysam contributors 3 | 2) bx-python contributors 4 | 3) Other members in Wei Li's Computational Laboratory (Baylor College of Medicine) 5 | 4) Users' valuable feedback 6 | -------------------------------------------------------------------------------- /docs/data_structure.json: -------------------------------------------------------------------------------- 1 | {GeneId: {Chromosome: chr, 2 | 3 | Strand: strnd, 4 | 5 | Gene_biotype: gtype, 6 | 7 | Gene_name: name, 8 | 9 | Tag: tag, 10 | 11 | Exons: {ExonId: {start: value, end: value}, 12 | ExonId: {start: value, end: value}, 13 | ExonId: {start: value, end: value} 14 | }, 15 | 16 | CDS: {cdsId: {start: value, end: value}, 17 | cdsId: {start: value, end: value}, 18 | cdsId: {start: value, end: value} 19 | }, 20 | 21 | Transcripts: {TranscriptId_1: {ExonIds: [exon_1, exon_3, exon_4], 22 | cdsIds: [cds_1, cds_2, cds_5], 23 | tprime_utr: {start: value, end: value}, 24 | fprime_utr: {start: value, end: value}, 25 | ATG: value_1, 26 | TGA: value_1, 27 | transcript_biotype: type, 28 | transcript_support_level: value 29 | }, 30 | 31 | TranscriptId_2: {ExonIds: [exon_2, exon_3, exon_5], 32 | cdsIds: [cds_1, cds_2, cds_5], 33 | tprime_utr: {start: value, end: value}, 34 | fprime_utr: {start: value, end: value}, 35 | ATG: value_2, 36 | TGA: value_2, 37 | transcript_biotype: type, 38 | transcript_support_level: value 39 | }, 40 | TranscriptId_3: {ExonIds: [exon_5, exon_4, exon_6], 41 | cdsIds: [cds_1, cds_2, cds_5], 42 | tprime_utr: {start: value, end: value}, 43 | fprime_utr: {start: value, end: value}, 44 | ATG: value_3, 45 | TGA: value_3, 46 | transcript_biotype: type, 47 | transcript_support_level: value 48 | }, 49 | }, 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /docs/releases.md: -------------------------------------------------------------------------------- 1 | # Release history 2 | 3 | ## RSeQC v2.6.4 4 | 5 | * Two dependency packages bx-python and pysam are **not** shipped with ## RSeQC starting from v2.6.4. 6 | * Users could install ## RSeQC using pip: **pip install ## RSeQC**. bx-python and pysam will be installed automatically if they haven’t been installed before. 7 | 8 | ## RSeQC v2.6.3 9 | 10 | * Fix a bug in "read_quality.py" that does not return results if input file containing less than 1000 reads. 11 | * ## Remove "## RPKM_count.py" as it generates erroneous results especially for longer reads. Use "FPKM_count.py" instead. 12 | * "bam_stat.py" prints summary statistics to STDOUT. 13 | 14 | ## RSeQC v2.6.2 15 | 16 | * fix bugs in "insertion_profile.py", "clipping_profile.py", and "inner_distance.py " 17 | 18 | ## RSeQC v2.6.1 19 | 20 | * Fix bug in "junction_annotation.py" in that it would report some "novel splice junctions" that don't exist in the BAM files. This happened when reads were clipped and spliced mapped simultaneously. 21 | * Add FPKM.py. FPKM.py will report "raw fragment count", "FPM" and "FPKM" for each gene. It does not report exon and intron level count. 
22 | 23 | ## RSeQC v2.6 24 | 25 | Add 3 new modules: 26 | 27 | * deletion_profile.py 28 | * insertion_profile.py 29 | * mismatch_profile.py 30 | 31 | ## RSeQC v2.5 32 | 33 | * read_duplication.py: 34 | 35 | * add '-q' option to filter alignments with low mapping quality. 36 | * Fix a bug related to the labels of the right Y-axis 37 | 38 | * bam2fq: add '-c' option to call the 'gzip' command to compress output fastq file(s). 39 | * divide_bam.py: add '-s' option to skip unmapped reads. 40 | * clipping_profile.py: 41 | 42 | * add '-q' option to filter alignments with low mapping quality. 43 | * Issue a warning and exit if no clipped reads are found. 44 | 45 | 46 | ## RSeQC v2.4 47 | Rewrote "geneBody_coverage.py": 48 | 49 | * Memory-efficient: consumes < 100M RAM 50 | * Flexible input to handle one or more BAM files: 51 | 52 | * Input a single BAM file. e.g. **-i test.bam** 53 | * Input several BAM files (separated by ","). e.g. **-i test1.bam,test2.bam,test3.bam** 54 | * Input a plain text file containing the path of BAM file(s). e.g. **-i bam_path.txt** 55 | * Input a directory containing BAM file(s). e.g. **-i my_folder** 56 | 57 | * Generate a heatmap to visualize gene body coverage over many samples. 58 | 59 | ## RSeQC v2.3.9 60 | 61 | * Add bam2fq.py. Transform BAM files into fastq format. 62 | * Bugs fixed. 63 | 64 | ## RSeQC v2.3.7 65 | 66 | * bam_stat.py: now counts 'Proper-paired reads map to different chrom' 67 | * bam2wig.py: automatically calls 'wigToBigwig' if it can be found in the system $PATH 68 | * inner_distance.py: add 'PE_within_diff_chrom' 69 | 70 | ## RSeQC v2.3.3 71 | 72 | * Minor bugs fixed. 73 | 74 | ## RSeQC v2.3.2 75 | 76 | * Add split_bam.py: Split the original BAM file into small BAM files based on a provided gene list. Users can use this module to estimate ribosomal RNA amount if the input gene list is ribosomal RNA. 77 | * Add read_hexamer.py: Calculate hexamer frequency for multiple input files (fasta or fastq). 78 | * Some parts were optimized and run a little faster. 79 | 80 | ## RSeQC v2.3.1 81 | 82 | * Add a normalization option to bam2wig.py. With this option, users can normalize different sequencing depths to the same scale when converting BAM into wiggle format. 83 | * Add another script, geneBody_coverage2.py. This script uses BigWig instead of BAM as input, and requires much less memory (~200M) 84 | -------------------------------------------------------------------------------- /docs/supplementary.md: -------------------------------------------------------------------------------- 1 | ## Download test datasets 2 | 3 | Paired-end strand-specific (Illumina). BAM file md5sum=fbd1fb1c153e3d074524ec70e6e21fb9 4 | 5 | * `Pairend_StrandSpecific_51mer_Human_hg19.bam `_ 6 | * `Pairend_StrandSpecific_51mer_Human_hg19.bam.bai `_ 7 | 8 | Paired-end non-strand-specific (Illumina). BAM file md5sum=ba014f6b397b8a29c456b744237a12de 9 | 10 | * `Pairend_nonStrandSpecific_36mer_Human_hg19.bam `_ 11 | * `Pairend_nonStrandSpecific_36mer_Human_hg19.bam.bai `_ 12 | 13 | Single-end strand-specific (SOLiD). 
BAM file md5sum=b39951a6ba4639ca51983c2f0bf5dfce 14 | 15 | * `SingleEnd_StrandSpecific_50mer_Human_hg19.bam `_ 16 | * `SingleEnd_StrandSpecific_50mer_Human_hg19.bam.bai `_ 17 | 18 | ## Download gene models (update on 08/07/2014) 19 | 20 | * `Human (hg38, hg19) `_ 21 | * `Mouse (mm10,mm9) `_ 22 | * `Fly (dm3) `_ 23 | * `Zebrafish (danRer7) `_ 24 | 25 | **NOTE:** 26 | 27 | * BED file for other species and the most recent release of these files can be downloaded from `UCSC Table Browser `_ 28 | * Make sure the annotation file and the genome assembly are matched. For example, if you aligned RNA-seq reads to `hg19/GRCh37 `_ you should download `hg19/GRCh37 `_ based BED files. 29 | 30 | ## Download ribosome RNA (update on 07/08/2015) 31 | We only provide rRNA bed files for human and mouse. We download these ribosome RNAs from UCSC table browser, 32 | we provide them here to facilitate users with NO WARRANTY in completeness. 33 | 34 | * `GRCh38_rRNA.bed `_ 35 | * `hg19_rRNA.bed `_ 36 | * `mm10_rRNA.bed `_ 37 | * `mm9_rRNA.bed `_ 38 | 39 | -------------------------------------------------------------------------------- /rseqc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MonashBioinformaticsPlatform/RSeQC/5658c4d7c5c1c9a8ece1461df82479b84c5509f8/rseqc/__init__.py -------------------------------------------------------------------------------- /rseqc/modules/RNA_fragment_size.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | calculate fragment size for each gene/transcript. For each transcript/gene, it Will report: 4 | 1) # of fragment that was used. 5 | 2) mean of fragment size 6 | 3) median of fragment size 7 | 4) stdev of fragment size 8 | ''' 9 | 10 | import pysam 11 | import sys,os 12 | from numpy import mean,median,std 13 | from optparse import OptionParser 14 | 15 | __author__ = "Liguo Wang" 16 | __copyright__ = "Copyleft" 17 | __credits__ = [] 18 | __license__ = "GPL" 19 | __version__="2.6.4" 20 | __maintainer__ = "Liguo Wang" 21 | __email__ = "wang.liguo@mayo.edu" 22 | __status__ = "Production" 23 | 24 | """ 25 | def overlap_length1(lst1, lst2): 26 | l = 0 27 | for chr, st, end in BED.intersectBed3(lst1, lst2): 28 | l += end -st 29 | return l 30 | """ 31 | def overlap_length2(lst1, lst2): 32 | l = 0 33 | for x in lst1: 34 | for y in lst2: 35 | l += len(range(max(x[0], y[0]), min(x[-1], y[-1])+1)) 36 | return l 37 | 38 | 39 | def fragment_size(bedfile, samfile,qcut=30,ncut=5): 40 | '''calculate the fragment size for each gene''' 41 | for line in open(bedfile,'r'): 42 | exon_range = [] 43 | if line.startswith(('#','track','browser')):continue 44 | fields = line.split() 45 | chrom = fields[0] 46 | tx_start = int( fields[1] ) 47 | tx_end = int( fields[2] ) 48 | geneName = fields[3] 49 | trand = fields[5].replace(" ","_") 50 | exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) 51 | exon_starts = map((lambda x: x + tx_start ), exon_starts) 52 | exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) 53 | exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends) 54 | geneID = "\t".join([str(i) for i in (chrom, tx_start, tx_end, geneName)]) 55 | 56 | for st,end in zip(exon_starts,exon_ends): 57 | exon_range.append([st+1,end+1]) 58 | #exon_range.append([chrom, st,end]) 59 | 60 | try: 61 | alignedReads = samfile.fetch(chrom,tx_start,tx_end) 62 | except: 63 | yield '\t'.join([str(i) for i in (geneID, 0,0,0)]) 64 | 
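            # samfile.fetch() can raise when the contig named in the BED record is absent from the
            # BAM header or the region is invalid; such transcripts are reported with zeros and
            # skipped. Note that this fallback row carries one column fewer than the header printed
            # in main() (there is no separate frag_count field), so downstream parsers should be
            # prepared for short rows.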
continue 65 | 66 | frag_sizes = [] 67 | for aligned_read in alignedReads: 68 | if not aligned_read.is_paired: #skip single sequencing 69 | continue 70 | if aligned_read.is_read2: 71 | continue 72 | if aligned_read.mate_is_unmapped: 73 | continue 74 | if aligned_read.is_qcfail:continue #skip low quanlity 75 | if aligned_read.is_duplicate:continue #skip duplicate read 76 | if aligned_read.is_secondary:continue #skip non primary hit 77 | if aligned_read.mapq < qcut: 78 | continue 79 | 80 | read_st = aligned_read.pos 81 | mate_st = aligned_read.pnext 82 | if read_st > mate_st: 83 | (read_st, mate_st) = (mate_st, read_st) 84 | if read_st < tx_start or mate_st > tx_end: 85 | continue 86 | read_len = aligned_read.qlen 87 | map_range = [[read_st+1, mate_st]] 88 | #map_range = [[chrom, read_st, mate_st]] 89 | frag_len = overlap_length2(exon_range, map_range) + read_len 90 | frag_sizes.append(frag_len) 91 | if len(frag_sizes) < ncut: 92 | yield '\t'.join([str(i) for i in (geneID, len(frag_sizes), 0,0,0)]) 93 | else: 94 | yield '\t'.join([str(i) for i in (geneID, len(frag_sizes), mean(frag_sizes), median(frag_sizes), std(frag_sizes))]) 95 | 96 | 97 | def main(): 98 | usage="%prog [options]" + '\n' + __doc__ + "\n" 99 | parser = OptionParser(usage,version="%prog " + __version__) 100 | parser.add_option("-i","--input",action="store",type="string",dest="input_file",help="Input BAM file") 101 | parser.add_option("-r","--refgene",action="store",type="string",dest="refgene_bed",help="Reference gene model in BED format. Must be strandard 12-column BED file. [required]") 102 | parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default") 103 | parser.add_option("-n","--frag-num",action="store",type="int",dest="fragment_num",default=3,help="Minimum number of fragment. default=%default") 104 | 105 | (options,args)=parser.parse_args() 106 | 107 | if not (options.input_file and options.refgene_bed): 108 | parser.print_help() 109 | sys.exit(0) 110 | if not os.path.exists(options.input_file + '.bai'): 111 | print >>sys.stderr, "cannot find index file of input BAM file" 112 | print >>sys.stderr, options.input_file + '.bai' + " does not exists" 113 | sys.exit(0) 114 | 115 | for file in (options.input_file, options.refgene_bed): 116 | if not os.path.exists(file): 117 | print >>sys.stderr, file + " does NOT exists" + '\n' 118 | sys.exit(0) 119 | 120 | print '\t'.join([str(i) for i in ("chrom","tx_start", "tx_end","symbol","frag_count","frag_mean","frag_median","frag_std")]) 121 | for tmp in fragment_size(options.refgene_bed, pysam.Samfile(options.input_file), options.map_qual, options.fragment_num): 122 | print tmp 123 | 124 | if __name__ == '__main__': 125 | main() 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /rseqc/modules/RPKM_saturation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''------------------------------------------------------------------------------------------------- 3 | For each gene, check whether the RPKM value was saturated or not. Saturation analysis is based on 4 | re-sampling. For example, sample 5%, 10%, ... , 95%, 100% from total mapped reads, then 5 | calculate RPKM value for each step. Strand specific sequencing protocol is supported. 
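Example: with the default sampling options (-l 5 -u 100 -s 5), RPKM is re-estimated at
5%, 10%, ... , 100% of the alignments and written to <out_prefix>.eRPKM.xls; the companion
<out_prefix>.saturation.r / .saturation.pdf then plot the percent relative error against the
resampling percentage for each expression quartile (Q1-Q4).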
6 | -------------------------------------------------------------------------------------------------''' 7 | 8 | #import built-in modules 9 | import os,sys 10 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 11 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' + str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 12 | sys.exit() 13 | 14 | import re 15 | import string 16 | from optparse import OptionParser 17 | import warnings 18 | import string 19 | import collections 20 | import math 21 | import sets 22 | import numpy as np 23 | from time import strftime 24 | import operator 25 | import subprocess 26 | 27 | #import third-party modules 28 | from bx.bitset import * 29 | from bx.bitset_builders import * 30 | from bx.intervals import * 31 | 32 | #import my own modules 33 | from qcmodule import SAM 34 | #changes to the paths 35 | 36 | #changing history to this module 37 | 38 | 39 | __author__ = "Liguo Wang" 40 | __copyright__ = "Copyleft" 41 | __credits__ = [] 42 | __license__ = "GPL" 43 | __version__="2.6.4" 44 | __maintainer__ = "Liguo Wang" 45 | __email__ = "wang.liguo@mayo.edu" 46 | __status__ = "Production" 47 | 48 | 49 | def printlog (mesg): 50 | '''print progress into stderr and log file''' 51 | mesg="@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg 52 | LOG=open('class.log','a') 53 | print >>sys.stderr,mesg 54 | print >>LOG,mesg 55 | 56 | def normalize(lst): 57 | '''normalize all numbers between 0 and 1''' 58 | norm_lst=[] 59 | if max(lst) == min(lst): 60 | return norm_lst 61 | if max(lst) - min(lst)==0: 62 | return norm_lst 63 | for i in lst: 64 | norm_lst.append( (i-min(lst))/(max(lst)-min(lst)) ) 65 | return norm_lst 66 | 67 | def square_error(lst): 68 | '''transform list into normalized squared error (squared error divided by range)''' 69 | SE = [] 70 | true_rpkm = lst[-1] 71 | rang = max(lst) - min(lst) 72 | if true_rpkm == 0: 73 | return None 74 | if rang == 0: 75 | return None 76 | for i in lst: 77 | SE.append( abs(i - true_rpkm)/true_rpkm ) 78 | return SE 79 | 80 | 81 | def show_saturation (infile,outfile,rpkm_cut=0.01): 82 | 83 | RPKM_values = collections.defaultdict(list) 84 | RPKM_mean = {} 85 | gene_count = 0 86 | Quan = {'Q1':[0,0.25],'Q2':[0.25,0.5],'Q3':[0.5,0.75],'Q4':[0.75,1]} 87 | ROUT = open(outfile,'w') 88 | 89 | for line in open(infile): 90 | line=line.strip() 91 | fields=line.split() 92 | if fields[0].startswith('#'): 93 | head = [i.replace('%','') for i in fields[6:]] 94 | continue 95 | mykey = '\t'.join(fields[0:6]) 96 | myvalue = [float(i) for i in fields[6:]] 97 | if max(myvalue) == 0: continue 98 | if max(myvalue) - min(myvalue) == 0: continue 99 | if np.mean(myvalue) < rpkm_cut: continue 100 | 101 | RPKM_values[mykey] = square_error(myvalue) 102 | RPKM_mean[mykey] = np.mean(myvalue) 103 | gene_count += 1 104 | if (len(head)==0): 105 | print >>sys.stderr, "No head line found, exit." 
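        # 'head' holds the resampling percentages parsed from the header row of the eRPKM table
        # (the line starting with '#'; columns 7 onwards, with any '%' stripped). The boxplots
        # below use these values as x-axis labels, so bail out if no header row was seen. Note
        # that if the table has no '#' line at all, 'head' is never assigned and the len() check
        # above raises a NameError before this message is printed.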
106 | sys.exit(1) 107 | 108 | 109 | print >>ROUT, "pdf('%s')" % (outfile.replace('.r','.pdf')) 110 | print >>ROUT, 'par(mfrow=c(2,2))' 111 | for quantile in sorted(Quan): 112 | line_count = 0 113 | norm_RPKM = collections.defaultdict(list) 114 | for k,v in sorted (RPKM_mean.iteritems(), key=operator.itemgetter(1)): 115 | line_count += 1 116 | if (line_count > gene_count * Quan[quantile][0]) and (line_count <= gene_count * Quan[quantile][1]): 117 | for i,j in enumerate(RPKM_values[k]): 118 | norm_RPKM[head[i]].append(str(j)) 119 | print >>ROUT,"name=c(%s)" % (','.join(head[:-1])) 120 | for i in head[:-1]: 121 | print >>ROUT, "S%s=c(%s)" % (i, ','.join(norm_RPKM[i])) 122 | print >>ROUT, "boxplot(%s,names=name,outline=F,ylab='Percent Relative Error',main='%s',xlab='Resampling percentage')" % (','.join(['100*S' + i for i in head[:-1]]),quantile) 123 | print >>ROUT, 'dev.off()' 124 | 125 | 126 | def main(): 127 | usage="%prog [options]" + '\n' + __doc__ + "\n" 128 | parser = OptionParser(usage,version="%prog " + __version__) 129 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format. [required]") 130 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s). [required]") 131 | parser.add_option("-r","--refgene",action="store",type="string",dest="refgene_bed",help="Reference gene model in bed fomat. [required]") 132 | parser.add_option("-d","--strand",action="store",type="string",dest="strand_rule",default=None,help="How read(s) were stranded during sequencing. For example: --strand='1++,1--,2+-,2-+' means that this is a pair-end, strand-specific RNA-seq, and the strand rule is: read1 mapped to '+' => parental gene on '+'; read1 mapped to '-' => parental gene on '-'; read2 mapped to '+' => parental gene on '-'; read2 mapped to '-' => parental gene on '+'. If you are not sure about the strand rule, run \'infer_experiment.py' default=%default (Not a strand specific RNA-seq data)") 133 | parser.add_option("-l","--percentile-floor",action="store",type="int",dest="percentile_low_bound",default=5, help="Sampling starts from this percentile. A integer between 0 and 100. default=%default") 134 | parser.add_option("-u","--percentile-ceiling",action="store",type="int",dest="percentile_up_bound",default=100, help="Sampling ends at this percentile. A integer between 0 and 100. default=%default") 135 | parser.add_option("-s","--percentile-step",action="store",type="int",dest="percentile_step",default=5, help="Sampling frequency. Smaller value means more sampling times. A integer between 0 and 100. default=%default") 136 | parser.add_option("-c","--rpkm-cutoff",action="store",type="float",dest="rpkm_cutoff",default=0.01, help="Transcripts with RPKM smaller than this number will be ignored in visualization plot. default=%default") 137 | parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". 
default=%default") 138 | 139 | (options,args)=parser.parse_args() 140 | 141 | if not (options.output_prefix and options.input_file): 142 | parser.print_help() 143 | sys.exit(0) 144 | if options.percentile_low_bound <0 or options.percentile_low_bound >100: 145 | print >>sys.stderr, "percentile_low_bound must be larger than 0 and samller than 100" 146 | sys.exit(0) 147 | if options.percentile_up_bound <0 or options.percentile_up_bound >100: 148 | print >>sys.stderr, "percentile_up_bound must be larger than 0 and samller than 100" 149 | sys.exit(0) 150 | if options.percentile_up_bound < options.percentile_low_bound: 151 | print >>sys.stderr, "percentile_up_bound must be larger than percentile_low_bound" 152 | sys.exit(0) 153 | if options.percentile_step <0 or options.percentile_step > options.percentile_up_bound: 154 | print >>sys.stderr, "percentile_step must be larger than 0 and samller than percentile_up_bound" 155 | sys.exit(0) 156 | if os.path.exists(options.input_file): 157 | obj = SAM.ParseBAM(options.input_file) 158 | obj.saturation_RPKM(outfile=options.output_prefix, refbed=options.refgene_bed, sample_start=options.percentile_low_bound,sample_end=options.percentile_up_bound,sample_step=options.percentile_step,strand_rule=options.strand_rule, q_cut = options.map_qual) 159 | show_saturation(infile=options.output_prefix + ".eRPKM.xls", outfile=options.output_prefix + ".saturation.r",rpkm_cut = options.rpkm_cutoff) 160 | try: 161 | subprocess.call("Rscript " + options.output_prefix + ".saturation.r", shell=True) 162 | except: 163 | pass 164 | else: 165 | print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n' 166 | #parser.print_help() 167 | sys.exit(0) 168 | 169 | 170 | 171 | if __name__ == '__main__': 172 | main() 173 | -------------------------------------------------------------------------------- /rseqc/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MonashBioinformaticsPlatform/RSeQC/5658c4d7c5c1c9a8ece1461df82479b84c5509f8/rseqc/modules/__init__.py -------------------------------------------------------------------------------- /rseqc/modules/bam2fq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Description: Convert alignments in BAM or SAM format into fastq format. 4 | ''' 5 | 6 | #import built-in modules 7 | import os,sys 8 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 9 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' 
+ str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 10 | sys.exit() 11 | 12 | import re 13 | import string 14 | from optparse import OptionParser 15 | import warnings 16 | import string 17 | import collections 18 | import subprocess 19 | 20 | from time import strftime 21 | 22 | #import my own modules 23 | from qcmodule import SAM 24 | #changes to the paths 25 | 26 | #changing history to this module 27 | 28 | 29 | __author__ = "Liguo Wang" 30 | __copyright__ = "Copyleft" 31 | __credits__ = [] 32 | __license__ = "GPL" 33 | __version__="2.6.4" 34 | __maintainer__ = "Liguo Wang" 35 | __email__ = "wang.liguo@mayo.edu" 36 | __status__ = "Production" 37 | 38 | 39 | 40 | def main(): 41 | usage="%prog [options]" + '\n' + __doc__ + "\n" 42 | parser = OptionParser(usage,version="%prog " + __version__) 43 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format.") 44 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output fastq files(s).") 45 | parser.add_option("-s","--single-end",action="store_true",dest="single", help="Specificy '-s' or '--single-end' for single-end sequencing.") 46 | parser.add_option("-c","--compress",action="store_true",dest="gzip", help="Specificy '-c' or '--compress' to compress output fastq file(s) using 'gzip' command.") 47 | (options,args)=parser.parse_args() 48 | 49 | 50 | #print options.single 51 | if not (options.output_prefix and options.input_file): 52 | parser.print_help() 53 | sys.exit(0) 54 | if os.path.exists(options.input_file): 55 | obj = SAM.ParseBAM(options.input_file) 56 | if options.single is True: 57 | obj.bam2fq(prefix=options.output_prefix, paired = False) 58 | if options.gzip is True: 59 | try: 60 | print >>sys.stderr, "run gzip ... ", 61 | subprocess.call("gzip " + options.output_prefix + '.fastq', shell=True) 62 | print >>sys.stderr, "Done." 63 | except: 64 | pass 65 | else: 66 | obj.bam2fq(prefix=options.output_prefix, paired = True) 67 | if options.gzip is True: 68 | try: 69 | print >>sys.stderr, "run gzip ..." 70 | subprocess.call("gzip " + options.output_prefix + '.R1.fastq', shell=True) 71 | subprocess.call("gzip " + options.output_prefix + '.R2.fastq', shell=True) 72 | print >>sys.stderr, "Done." 73 | except: 74 | pass 75 | else: 76 | print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n' 77 | #parser.print_help() 78 | sys.exit(0) 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /rseqc/modules/bam2wig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Convert BAM file into wig file. BAM file must be sorted and indexed using SAMtools. 4 | Note: SAM format file is not supported. 5 | ''' 6 | 7 | #import built-in modules 8 | import os,sys 9 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 10 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' 
+ str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 11 | sys.exit() 12 | 13 | import string 14 | from optparse import OptionParser 15 | import warnings 16 | import string 17 | import collections 18 | 19 | #import third-party modules 20 | from bx.bitset import * 21 | from bx.bitset_builders import * 22 | from bx.intervals import * 23 | 24 | #import my own modules 25 | from qcmodule import SAM 26 | from qcmodule import BED 27 | #changes to the paths 28 | 29 | #changing history to this module 30 | 31 | 32 | __author__ = "Liguo Wang" 33 | __copyright__ = "Copyleft" 34 | __credits__ = [] 35 | __license__ = "GPL" 36 | __version__="2.6.4" 37 | __maintainer__ = "Liguo Wang" 38 | __email__ = "wang.liguo@mayo.edu" 39 | __status__ = "Production" 40 | 41 | 42 | def printlog (mesg): 43 | '''print progress into stderr and log file''' 44 | mesg="@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg 45 | LOG=open('class.log','a') 46 | print >>sys.stderr,mesg 47 | print >>LOG,mesg 48 | 49 | def load_chromsize(file): 50 | '''read chrom.size file''' 51 | chromSize={} 52 | for line in open(file,'r'): 53 | if line.startswith('#'):continue 54 | if not line.strip():continue 55 | fields = line.strip().split() 56 | chromSize[fields[0]] = int(fields[1]) 57 | return chromSize 58 | 59 | 60 | def main(): 61 | usage="%prog [options]" + '\n' + __doc__ + "\n" 62 | parser = OptionParser(usage,version="%prog " + __version__) 63 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM format. BAM file must be sorted and indexed using samTools. .bam and .bai files should be placed in the same directory.") 64 | parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name/ID, second column is chromosome size. Chromosome name (such as \"chr1\") should be consistent between this file and the BAM file.") 65 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output wiggle files(s). One wiggle file will be generated for non strand-specific data, two wiggle files (\"Prefix_Forward.wig\" and \"Prefix_Reverse.wig\") will be generated for strand-specific RNA-seq data.") 66 | parser.add_option("-t","--wigsum",action="store",type="int",dest="total_wigsum",help="Specified wigsum. Eg: 1,000,000,000 equals to coverage of 10 million 100nt reads. Ignore this option to disable normalization") 67 | parser.add_option("-u","--skip-multi-hits",action="store_true",dest="skip_multi",help="Skip non-unique hit reads.") 68 | parser.add_option("-d","--strand",action="store",type="string",dest="strand_rule",default=None,help="How read(s) were stranded during sequencing. For example: --strand='1++,1--,2+-,2-+' means that this is a pair-end, strand-specific RNA-seq data, and the strand rule is: read1 mapped to '+' => parental gene on '+'; read1 mapped to '-' => parental gene on '-'; read2 mapped to '+' => parental gene on '-'; read2 mapped to '-' => parental gene on '+'. If you are not sure about the strand rule, run \'infer_experiment.py' default=%default (Not a strand specific RNA-seq data).") 69 | parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality to determine \"uniquely mapped\". 
default=%default") 70 | 71 | (options,args)=parser.parse_args() 72 | 73 | 74 | if not (options.output_prefix and options.input_file and options.chromSize and options.output_prefix): 75 | parser.print_help() 76 | sys.exit(0) 77 | for file in (options.input_file,options.chromSize): 78 | if not os.path.exists(file): 79 | print >>sys.stderr, '\n\n' + file + " does NOT exists" + '\n' 80 | sys.exit(0) 81 | if not os.path.exists(options.input_file + '.bai'): 82 | print >>sys.stderr, "index file " + options.input_file + '.bai' + " does not exists" 83 | sys.exit(0) 84 | 85 | if options.skip_multi:print "Skip multi-hits:True" 86 | else:print "Skip multi-hits:False" 87 | 88 | 89 | chromSizes = load_chromsize(options.chromSize) 90 | 91 | norm_factor=None 92 | if options.total_wigsum: 93 | obj = SAM.ParseBAM(options.input_file) 94 | wig_sum = obj.calWigSum(chrom_sizes = chromSizes, skip_multi=options.skip_multi) 95 | print >>sys.stderr, "\n\ntotal wigsum is:" + str(wig_sum) + '\n' 96 | try: 97 | norm_factor = options.total_wigsum / wig_sum 98 | except: 99 | norm_factor = None 100 | 101 | obj = SAM.ParseBAM(options.input_file) 102 | obj.bamTowig(outfile = options.output_prefix, chrom_sizes = chromSizes, chrom_file = options.chromSize, q_cut = options.map_qual, skip_multi=options.skip_multi,strand_rule = options.strand_rule, WigSumFactor=norm_factor) 103 | 104 | if __name__ == '__main__': 105 | main() 106 | -------------------------------------------------------------------------------- /rseqc/modules/bam_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Summarizing mapping statistics of a BAM or SAM file. 4 | ''' 5 | 6 | #import my own modules 7 | from rseqc.qcmodule import SAM 8 | 9 | __author__ = "Liguo Wang" 10 | __copyright__ = "Copyleft." 11 | __credits__ = [] 12 | __license__ = "GPL" 13 | __version__="2.6.4" 14 | __maintainer__ = "Liguo Wang" 15 | __email__ = "wang.liguo@mayo.edu" 16 | __status__ = "Production" 17 | 18 | 19 | def main(input_file, mapq): 20 | obj = SAM.ParseBAM(input_file) 21 | obj.stat(q_cut = mapq) 22 | -------------------------------------------------------------------------------- /rseqc/modules/clipping_profile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | This program is used estimate clipping profile of RNA-seq reads from BAM or SAM file 4 | Note that to use this funciton, CIGAR strings within SAM/BAM file should have 'S' operation 5 | (This means your reads mapper should support clipped mapping). 6 | ''' 7 | 8 | #import built-in modules 9 | import os,sys 10 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 11 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' 
+ str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 12 | sys.exit() 13 | 14 | import re 15 | import string 16 | from optparse import OptionParser 17 | import warnings 18 | import string 19 | import collections 20 | import math 21 | import sets 22 | from time import strftime 23 | import subprocess 24 | 25 | 26 | #import third-party modules 27 | from bx.bitset import * 28 | from bx.bitset_builders import * 29 | from bx.intervals import * 30 | 31 | #import my own modules 32 | from qcmodule import SAM 33 | #changes to the paths 34 | 35 | #changing history to this module 36 | 37 | 38 | __author__ = "Liguo Wang" 39 | __copyright__ = "Copyleft" 40 | __credits__ = [] 41 | __license__ = "GPL" 42 | __version__="2.6.4" 43 | __maintainer__ = "Liguo Wang" 44 | __email__ = "wang.liguo@mayo.edu" 45 | __status__ = "Production" 46 | 47 | 48 | def main(): 49 | usage="%prog [options]" + '\n' + __doc__ + "\n" 50 | parser = OptionParser(usage,version="%prog " + __version__) 51 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format.") 52 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s).") 53 | parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default") 54 | parser.add_option("-s","--sequencing",action="store",dest="layout",help="Sequencing layout. \"SE\"(single-end) or \"PE\"(pair-end). ") 55 | (options,args)=parser.parse_args() 56 | 57 | if not (options.input_file and options.output_prefix and options.layout): 58 | parser.print_help() 59 | sys.exit(0) 60 | for input_file in ([options.input_file]): 61 | if not os.path.exists(input_file): 62 | print >>sys.stderr, '\n\n' + input_file + " does NOT exists" + '\n' 63 | sys.exit(0) 64 | 65 | obj = SAM.ParseBAM(options.input_file) 66 | if options.layout == "SE": 67 | obj.clipping_profile(outfile=options.output_prefix, q_cut = options.map_qual,type="S", PE=False) 68 | elif options.layout == "PE": 69 | obj.clipping_profile(outfile=options.output_prefix, q_cut = options.map_qual, type="S", PE=True) 70 | else: 71 | print >>sys.stderr, "unknow sequencing layout. Must be \"SE\" or \"PE\"" 72 | try: 73 | subprocess.call("Rscript " + options.output_prefix + '.clipping_profile.r',shell=True) 74 | except: 75 | print >>sys.stderr, "Cannot generate pdf file from " + options.output_prefix + '.clipping_profile.r' 76 | pass 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /rseqc/modules/deletion_profile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Calculate the distributions of deleted nucleotides across reads. 4 | ''' 5 | 6 | #import built-in modules 7 | import os,sys 8 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 9 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' 
+ str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 10 | sys.exit() 11 | 12 | import string 13 | import subprocess 14 | from optparse import OptionParser 15 | from time import strftime 16 | 17 | 18 | #import my own modules 19 | from qcmodule import SAM 20 | from qcmodule import fasta 21 | #changes to the paths 22 | 23 | #changing history to this module 24 | 25 | 26 | __author__ = "Liguo Wang" 27 | __copyright__ = "Copyleft" 28 | __credits__ = [] 29 | __license__ = "GPL" 30 | __version__="2.6.4" 31 | __maintainer__ = "Liguo Wang" 32 | __email__ = "wang.liguo@mayo.edu" 33 | __status__ = "Production" 34 | 35 | 36 | def printlog (mesg): 37 | '''print progress into stderr and log file''' 38 | mesg="@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg 39 | print >>sys.stderr,mesg 40 | 41 | 42 | def main(): 43 | usage="%prog [options]" + '\n' + __doc__ + "\n" 44 | parser = OptionParser(usage,version="%prog " + __version__) 45 | parser.add_option("-i","--input",action="store",type="string",dest="input_bam",help='Input BAM file. [required]') 46 | parser.add_option("-l","--read-align-length",action="store",type="int", dest="read_alignment_length",help="Alignment length of read. It is usually set to the orignial read length. For example, all these cigar strings (\"101M\", \"68M140N33M\", \"53M1D48M\") suggest the read alignment length is 101. [required]") 47 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s). [required]") 48 | parser.add_option("-n","--read-num",action="store",type="int",default=1000000, dest="read_number",help="Number of aligned reads with deletions used to calculate the deletion profile. default=%default") 49 | parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality. default=%default") 50 | (options,args)=parser.parse_args() 51 | 52 | if not (options.input_bam): 53 | parser.print_help() 54 | sys.exit(0) 55 | for f in ([options.input_bam]): 56 | if not os.path.exists(f): 57 | print >>sys.stderr, '\n\n' + f + " does NOT exists" + '\n' 58 | parser.print_help() 59 | sys.exit(0) 60 | 61 | if not (options.output_prefix): 62 | print >>sys.stderr, '\n\n You must specify the output prefix' 63 | parser.print_help() 64 | sys.exit(0) 65 | 66 | if not (options.read_alignment_length): 67 | print >>sys.stderr, '\n\n You must specify read alignment length. It is usually the read length.' 68 | parser.print_help() 69 | sys.exit(0) 70 | 71 | obj = SAM.ParseBAM(options.input_bam) 72 | obj.deletionProfile(read_length = options.read_alignment_length, read_num = options.read_number, q_cut = options.map_qual, outfile = options.output_prefix) 73 | 74 | try: 75 | subprocess.call("Rscript " + options.output_prefix + '.deletion_profile.r',shell=True) 76 | except: 77 | print >>sys.stderr, "Cannot generate pdf file from " + options.output_prefix + '.deletion_profile.r' 78 | pass 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /rseqc/modules/divide_bam.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Equally divide BAM file (m alignments) into n parts. Each part contains roughly m/n alignments 4 | that are randomly sampled from total alignments. 
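For example, '-i input.bam -n 3 -o sub' writes sub_0.bam, sub_1.bam and sub_2.bam; each
alignment (optionally excluding unmapped reads via '-s') is assigned to one of the output
files uniformly at random, and the number of alignments written to each file is reported
at the end.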
5 | ''' 6 | 7 | #import built-in modules 8 | import os,sys 9 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 10 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' + str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 11 | sys.exit() 12 | 13 | import string 14 | from optparse import OptionParser 15 | import warnings 16 | import string 17 | import collections 18 | import sets 19 | from random import randrange 20 | #import third-party modules 21 | import pysam 22 | 23 | __author__ = "Liguo Wang" 24 | __copyright__ = "Copyleft" 25 | __credits__ = [] 26 | __license__ = "GPL" 27 | __version__="2.6.4" 28 | __maintainer__ = "Liguo Wang" 29 | __email__ = "wang.liguo@mayo.edu" 30 | __status__ = "Production" 31 | 32 | def main(): 33 | usage="%prog [options]" + '\n' + __doc__ + "\n" 34 | parser = OptionParser(usage,version="%prog " + __version__) 35 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM format. BAM file should be sorted and indexed.") 36 | parser.add_option("-n","--subset-num",action="store",type="int",dest="subset_num",help="Number of small BAM files") 37 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output BAM files. Output \"Prefix_num.bam\".") 38 | parser.add_option("-s","--skip-unmap",action="store_true",dest="skip_unmap", help="Skip unmapped reads.") 39 | (options,args)=parser.parse_args() 40 | 41 | if not (options.input_file and options.subset_num and options.output_prefix): 42 | parser.print_help() 43 | sys.exit(0) 44 | if not os.path.exists(options.input_file): 45 | print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n' 46 | sys.exit(0) 47 | 48 | samfile = pysam.Samfile(options.input_file,'rb') 49 | 50 | sub_bam = {} 51 | count_bam={} 52 | for i in range(0,options.subset_num): 53 | sub_bam[i] = pysam.Samfile(options.output_prefix + '_' + str(i) +'.bam','wb',template=samfile) 54 | count_bam[i] = 0 55 | 56 | total_alignment = 0 57 | print >>sys.stderr, "Dividing " + options.input_file + " ...", 58 | try: 59 | while(1): 60 | aligned_read = samfile.next() 61 | if aligned_read.is_unmapped and options.skip_unmap is True: 62 | continue 63 | total_alignment += 1 64 | tmp = randrange(0,options.subset_num) 65 | sub_bam[tmp].write(aligned_read) 66 | count_bam[tmp] += 1 67 | 68 | except StopIteration: 69 | print >>sys.stderr, "Done" 70 | 71 | for i in range(0,options.subset_num): 72 | print "%-55s%d" % (options.output_prefix + '_' + str(i) +'.bam', count_bam[i]) 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /rseqc/modules/geneBody_coverage2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Calculate the RNA-seq reads coverage over gene body. 4 | This module uses bigwig file as input. 5 | ''' 6 | 7 | #import built-in modules 8 | import os,sys 9 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 10 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' 
+ str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 11 | sys.exit() 12 | 13 | import re 14 | import string 15 | from optparse import OptionParser 16 | import warnings 17 | import string 18 | import collections 19 | import math 20 | import sets 21 | from time import strftime 22 | import subprocess 23 | 24 | #import third-party modules 25 | import numpy as np 26 | from bx.bitset import * 27 | from bx.bitset_builders import * 28 | from bx.intervals import * 29 | from bx.bbi.bigwig_file import BigWigFile 30 | 31 | #import my own modules 32 | from qcmodule import SAM 33 | from qcmodule import mystat 34 | #changes to the paths 35 | __author__ = "Liguo Wang" 36 | __copyright__ = "Copyleft" 37 | __credits__ = [] 38 | __license__ = "GPL" 39 | __version__="2.6.4" 40 | __maintainer__ = "Liguo Wang" 41 | __email__ = "wang.liguo@mayo.edu" 42 | __status__ = "Production" 43 | 44 | 45 | def coverageGeneBody_bigwig(bigFile,refbed,outfile,gtype="png"): 46 | '''Calculate reads coverage over gene body, from 5'to 3'. each gene will be equally divided 47 | into 100 regsions. bigFile is bigwig format file''' 48 | if refbed is None: 49 | print >>sys.stderr,"You must specify a bed file representing gene model\n" 50 | exit(0) 51 | OUT1 = open(outfile + ".geneBodyCoverage_plot.r",'w') 52 | OUT2 = open(outfile + ".geneBodyCoverage.txt",'w') 53 | 54 | bw = BigWigFile( file = open(bigFile) ) 55 | print >>sys.stderr, "calculating coverage over gene body ..." 56 | coverage=collections.defaultdict(int) 57 | flag=0 58 | gene_count = 0 59 | for line in open(refbed,'r'): 60 | try: 61 | if line.startswith(('#','track','browser')):continue 62 | gene_count += 1 63 | # Parse fields from gene tabls 64 | fields = line.split() 65 | chrom = fields[0] 66 | tx_start = int( fields[1] ) 67 | tx_end = int( fields[2] ) 68 | geneName = fields[3] 69 | strand = fields[5] 70 | 71 | exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) 72 | exon_starts = map((lambda x: x + tx_start ), exon_starts) 73 | exon_ends = map( int, fields[10].rstrip( ',\n' ).split( ',' ) ) 74 | exon_ends = map((lambda x, y: x + y ), exon_starts, exon_ends); 75 | except: 76 | print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line, 77 | continue 78 | gene_all_base=[] 79 | percentile_base=[] 80 | mRNA_len =0 81 | flag=0 82 | for st,end in zip(exon_starts,exon_ends): 83 | gene_all_base.extend(range(st+1,end+1)) #0-based coordinates on genome 84 | mRNA_len = len(gene_all_base) 85 | if mRNA_len <100: 86 | flag=1 87 | break 88 | if flag==1: continue 89 | if strand == '-': 90 | gene_all_base.sort(reverse=True) #deal with gene on minus stand 91 | else: 92 | gene_all_base.sort(reverse=False) 93 | percentile_base = mystat.percentile_list (gene_all_base) #get 101 points from each gene's coordinates 94 | 95 | for i in range(0,len(percentile_base)): 96 | #try: 97 | sig = bw.get_as_array(chrom,percentile_base[i]-1,percentile_base[i]) 98 | if sig is None:continue 99 | coverage[i] += np.nan_to_num(sig[0]) 100 | #except: 101 | # continue 102 | print >>sys.stderr, " %d genes finished\r" % gene_count, 103 | 104 | x_coord=[] 105 | y_coord=[] 106 | print >>OUT2, "percentile\tcount" 107 | for i in coverage: 108 | x_coord.append(str(i)) 109 | y_coord.append(str(coverage[i])) 110 | print >>OUT2, str(i) + '\t' + str(coverage[i]) 111 | 112 | print >>OUT1, "%s(\'%s\')" % (gtype, outfile + ".geneBodyCoverage." 
+ gtype) 113 | print >>OUT1, "x=1:100" 114 | print >>OUT1, "y=c(" + ','.join(y_coord) + ')' 115 | print >>OUT1, "plot(x,y/%s,xlab=\"percentile of gene body (5'->3')\",ylab='average wigsum',type='s')" % gene_count 116 | print >>OUT1, "dev.off()" 117 | 118 | def main(): 119 | usage="%prog [options]" + '\n' + __doc__ + "\n" 120 | parser = OptionParser(usage,version="%prog " + __version__) 121 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Coverage signal file in bigwig format") 122 | parser.add_option("-r","--refgene",action="store",type="string",dest="ref_gene_model",help="Reference gene model in bed format. [required]") 123 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s). [required]") 124 | parser.add_option("-t","--graph-type",action="store",type="string",dest="graph_type",default="pdf",help="Graphic file type in \"pdf\", \"jpeg\", \"bmp\", \"bmp\", \"tiff\" or \"png\".default=%default [optional]") 125 | (options,args)=parser.parse_args() 126 | 127 | gt = options.graph_type.lower() 128 | if gt not in ("pdf","png",'bmp','jpeg','tiff'): 129 | print >>sys.stderr, "graphic file type must be 'pdf' or 'png'" 130 | parser.print_help() 131 | sys.exit(0) 132 | if not (options.output_prefix and options.input_file and options.ref_gene_model): 133 | parser.print_help() 134 | sys.exit(0) 135 | 136 | if not os.path.exists(options.ref_gene_model): 137 | print >>sys.stderr, '\n\n' + options.ref_gene_model + " does NOT exists" + '\n' 138 | #parser.print_help() 139 | sys.exit(0) 140 | if os.path.exists(options.input_file): 141 | coverageGeneBody_bigwig(options.input_file,options.ref_gene_model,options.output_prefix,gtype=options.graph_type) 142 | try: 143 | subprocess.call("Rscript " + options.output_prefix + '.geneBodyCoverage_plot.r',shell=True) 144 | except: 145 | print >>sys.stderr, "Cannot generate plot from " + options.output_prefix + '.geneBodyCoverage_plot.r' 146 | pass 147 | else: 148 | print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n' 149 | #parser.print_help() 150 | sys.exit(0) 151 | 152 | 153 | 154 | 155 | if __name__ == '__main__': 156 | main() 157 | 158 | -------------------------------------------------------------------------------- /rseqc/modules/infer_experiment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''================================================================================================= 3 | Infer RNA-seq experiment design from SAM/BAM file. This module will determine if the RNA-seq 4 | experiment is: 5 | 1) pair-end or single-end 6 | 2) if experiment is strand-specific, how reads were stranded. 
7 | * For pair-end RNA-seq, there are two different ways to strand reads: 8 | i) 1++,1--,2+-,2-+ 9 | read1 mapped to '+' strand indicates parental gene on '+' strand 10 | read1 mapped to '-' strand indicates parental gene on '-' strand 11 | read2 mapped to '+' strand indicates parental gene on '-' strand 12 | read2 mapped to '-' strand indicates parental gene on '+' strand 13 | ii) 1+-,1-+,2++,2-- 14 | read1 mapped to '+' strand indicates parental gene on '-' strand 15 | read1 mapped to '-' strand indicates parental gene on '+' strand 16 | read2 mapped to '+' strand indicates parental gene on '+' strand 17 | read2 mapped to '-' strand indicates parental gene on '-' strand 18 | * For single-end RNA-seq, there are two different ways to strand reads: 19 | i) ++,-- 20 | read mapped to '+' strand indicates parental gene on '+' strand 21 | read mapped to '-' strand indicates parental gene on '-' strand 22 | ii) +-,-+ 23 | read mapped to '+' strand indicates parental gene on '-' strand 24 | read mapped to '-' strand indicates parental gene on '+' strand 25 | 26 | NOTE: 27 | You don't need to know the RNA sequencing protocol before mapping your reads to the reference 28 | genome. Mapping your RNA-seq reads as if they were non-strand specific, this script can 29 | "guess" how RNA-seq reads were stranded. 30 | =================================================================================================''' 31 | 32 | #import built-in modules 33 | import os,sys 34 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 35 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' + str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 36 | sys.exit() 37 | 38 | import re 39 | import string 40 | from optparse import OptionParser 41 | import warnings 42 | import string 43 | import collections 44 | import math 45 | import sets 46 | from time import strftime 47 | 48 | #import third-party modules 49 | from bx.bitset import * 50 | from bx.bitset_builders import * 51 | from bx.intervals import * 52 | 53 | #import my own modules 54 | from qcmodule import SAM 55 | #changes to the paths 56 | 57 | #changing history to this module 58 | 59 | 60 | __author__ = "Liguo Wang" 61 | __copyright__ = "Copyleft" 62 | __credits__ = [] 63 | __license__ = "GPL" 64 | __version__="2.6.4" 65 | __maintainer__ = "Liguo Wang" 66 | __email__ = "wang.liguo@mayo.edu" 67 | __status__ = "Production" 68 | 69 | 70 | def printlog (mesg): 71 | '''print progress into stderr and log file''' 72 | mesg="@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg 73 | LOG=open('class.log','a') 74 | print >>sys.stderr,mesg 75 | print >>LOG,mesg 76 | 77 | 78 | def main(): 79 | usage="%prog [options]" + "\n" 80 | parser = OptionParser(usage,version="%prog " + __version__) 81 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Input alignment file in SAM or BAM format") 82 | parser.add_option("-r","--refgene",action="store",type="string",dest="refgene_bed",help="Reference gene model in bed fomat.") 83 | parser.add_option("-s","--sample-size",action="store",type="int",dest="sample_size",default=200000, help="Number of reads sampled from SAM/BAM file. default=%default") 84 | parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". 
default=%default") 85 | 86 | (options,args)=parser.parse_args() 87 | 88 | if not (options.input_file and options.refgene_bed): 89 | parser.print_help() 90 | print >>sys.stderr, '\n\n' + __doc__ 91 | sys.exit(0) 92 | for f in (options.input_file,options.refgene_bed): 93 | if not os.path.exists(f): 94 | print >>sys.stderr, '\n\n' + f + " does NOT exists." + '\n' 95 | sys.exit(0) 96 | if options.sample_size <1000: 97 | print >>sys.stderr, "Warn: Sample Size too small to give a accurate estimation" 98 | obj = SAM.ParseBAM(options.input_file) 99 | (protocol,sp1,sp2,other)=obj.configure_experiment(refbed=options.refgene_bed, sample_size = options.sample_size, q_cut = options.map_qual) 100 | if other <0: other=0.0 101 | if protocol == "PairEnd": 102 | print "\n\nThis is PairEnd Data" 103 | print "Fraction of reads failed to determine: %.4f" % other 104 | print "Fraction of reads explained by \"1++,1--,2+-,2-+\": %.4f" % sp1 105 | print "Fraction of reads explained by \"1+-,1-+,2++,2--\": %.4f" % sp2 106 | 107 | elif protocol == "SingleEnd": 108 | print "\n\nThis is SingleEnd Data" 109 | print "Fraction of reads failed to determine: %.4f" % other 110 | print "Fraction of reads explained by \"++,--\": %.4f" % sp1 111 | print "Fraction of reads explained by \"+-,-+\": %.4f" % sp2 112 | 113 | else: 114 | print "Unknown Data type" 115 | #print mesg 116 | 117 | if __name__ == '__main__': 118 | main() 119 | -------------------------------------------------------------------------------- /rseqc/modules/inner_distance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Calculate the inner distance (insert size) of RNA-seq fragments. 4 | 5 | RNA fragment 6 | _________________||_________________ 7 | | | 8 | | | 9 | ||||||||||------------------|||||||||| 10 | read_1 insert_size read_2 11 | 12 | fragment size = read_1 + insert_size + read_2 13 | ''' 14 | 15 | #import built-in modules 16 | import os,sys 17 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 18 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' 
+ str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 19 | sys.exit() 20 | 21 | import re 22 | import string 23 | from optparse import OptionParser 24 | import warnings 25 | import string 26 | import collections 27 | import math 28 | import sets 29 | from time import strftime 30 | import subprocess 31 | 32 | #import third-party modules 33 | from bx.bitset import * 34 | from bx.bitset_builders import * 35 | from bx.intervals import * 36 | 37 | #import my own modules 38 | from qcmodule import SAM 39 | #changes to the paths 40 | 41 | #changing history to this module 42 | 43 | 44 | __author__ = "Liguo Wang" 45 | __copyright__ = "Copyleft" 46 | __credits__ = [] 47 | __license__ = "GPL" 48 | __version__="2.6.4" 49 | __maintainer__ = "Liguo Wang" 50 | __email__ = "wang.liguo@mayo.edu" 51 | __status__ = "Production" 52 | 53 | 54 | def printlog (mesg): 55 | '''print progress into stderr and log file''' 56 | mesg="@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg 57 | LOG=open('class.log','a') 58 | print >>sys.stderr,mesg 59 | print >>LOG,mesg 60 | 61 | 62 | def main(): 63 | usage="%prog [options]" + '\n' + __doc__ + "\n" 64 | parser = OptionParser(usage,version="%prog " + __version__) 65 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format.") 66 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s)") 67 | parser.add_option("-r","--refgene",action="store",type="string",dest="ref_gene",help="Reference gene model in BED format.") 68 | parser.add_option("-k","--sample-size",action="store",type="int",dest="sampleSize",default=1000000,help="Number of read-pairs used to estimate inner distance. default=%default") 69 | parser.add_option("-l","--lower-bound",action="store",type="int",dest="lower_bound_size",default=-250,help="Lower bound of inner distance (bp). This option is used for ploting histograme. default=%default") 70 | parser.add_option("-u","--upper-bound",action="store",type="int",dest="upper_bound_size",default=250,help="Upper bound of inner distance (bp). This option is used for plotting histogram. default=%default") 71 | parser.add_option("-s","--step",action="store",type="int",dest="step_size",default=5,help="Step size (bp) of histograme. This option is used for plotting histogram. default=%default") 72 | parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". 
default=%default") 73 | 74 | (options,args)=parser.parse_args() 75 | 76 | if not (options.output_prefix and options.input_file and options.ref_gene): 77 | parser.print_help() 78 | sys.exit(0) 79 | for input_file in ([options.input_file,options.ref_gene]): 80 | if not os.path.exists(input_file): 81 | print >>sys.stderr, '\n\n' + input_file + " does NOT exists" + '\n' 82 | parser.print_help() 83 | sys.exit(0) 84 | if options.step_size <=0: 85 | print >>sys.stderr, "step size is a positive interger" 86 | sys.exit(0) 87 | obj = SAM.ParseBAM(options.input_file) 88 | obj.mRNA_inner_distance(outfile=options.output_prefix,low_bound=options.lower_bound_size,up_bound=options.upper_bound_size,step=options.step_size,refbed=options.ref_gene,sample_size=options.sampleSize, q_cut = options.map_qual) 89 | try: 90 | subprocess.call("Rscript " + options.output_prefix + '.inner_distance_plot.r',shell=True) 91 | except: 92 | print >>sys.stderr, "Cannot generate pdf file from " + options.output_prefix + '.inner_distance_plot.r' 93 | pass 94 | 95 | if __name__ == '__main__': 96 | main() 97 | -------------------------------------------------------------------------------- /rseqc/modules/insertion_profile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Calculate the distributions of inserted nucleotides across reads 4 | Note CIGAR strings within SAM/BAM file should have 'I' operation 5 | ''' 6 | 7 | #import built-in modules 8 | import os,sys 9 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 10 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' + str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 11 | sys.exit() 12 | 13 | import re 14 | import string 15 | from optparse import OptionParser 16 | import warnings 17 | import string 18 | import collections 19 | import math 20 | import sets 21 | from time import strftime 22 | import subprocess 23 | 24 | 25 | #import third-party modules 26 | from bx.bitset import * 27 | from bx.bitset_builders import * 28 | from bx.intervals import * 29 | 30 | #import my own modules 31 | from qcmodule import SAM 32 | #changes to the paths 33 | 34 | #changing history to this module 35 | 36 | 37 | __author__ = "Liguo Wang" 38 | __copyright__ = "Copyleft" 39 | __credits__ = [] 40 | __license__ = "GPL" 41 | __version__="2.6.4" 42 | __maintainer__ = "Liguo Wang" 43 | __email__ = "wang.liguo@mayo.edu" 44 | __status__ = "Production" 45 | 46 | 47 | def main(): 48 | usage="%prog [options]" + '\n' + __doc__ + "\n" 49 | parser = OptionParser(usage,version="%prog " + __version__) 50 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format.") 51 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s).") 52 | parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default") 53 | parser.add_option("-s","--sequencing",action="store",dest="layout",help="Sequencing layout. \"SE\"(single-end) or \"PE\"(pair-end). 
") 54 | (options,args)=parser.parse_args() 55 | 56 | if not (options.input_file and options.output_prefix and options.layout): 57 | parser.print_help() 58 | sys.exit(0) 59 | for input_file in ([options.input_file]): 60 | if not os.path.exists(input_file): 61 | print >>sys.stderr, '\n\n' + input_file + " does NOT exists" + '\n' 62 | sys.exit(0) 63 | 64 | obj = SAM.ParseBAM(options.input_file) 65 | if options.layout == "SE": 66 | obj.insertion_profile(outfile=options.output_prefix, q_cut = options.map_qual,PE=False) 67 | elif options.layout == "PE": 68 | obj.insertion_profile(outfile=options.output_prefix, q_cut = options.map_qual,PE=True) 69 | else: 70 | print >>sys.stderr, "unknow sequencing layout. Must be \"SE\" or \"PE\"" 71 | try: 72 | subprocess.call("Rscript " + options.output_prefix + '.insertion_profile.r',shell=True) 73 | except: 74 | print >>sys.stderr, "Cannot generate pdf file from " + options.output_prefix + '.insertion_profile.r' 75 | pass 76 | 77 | if __name__ == '__main__': 78 | main() 79 | -------------------------------------------------------------------------------- /rseqc/modules/junction_annotation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Annotate splicing reads against gene model in two levels: reads level and juncion level. 4 | Note: 5 | 1) A read, especially long read, can be spliced 2 or more times 6 | 2) Multiple splicing reads spanning the same intron can be consolidated into one splicing junction. 7 | ''' 8 | 9 | #import built-in modules 10 | import os,sys 11 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 12 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' + str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 13 | sys.exit() 14 | 15 | import re 16 | import string 17 | from optparse import OptionParser 18 | import warnings 19 | import string 20 | import collections 21 | import math 22 | import sets 23 | from time import strftime 24 | import subprocess 25 | 26 | #import third-party modules 27 | from bx.bitset import * 28 | from bx.bitset_builders import * 29 | from bx.intervals import * 30 | 31 | #import my own modules 32 | from qcmodule import SAM 33 | #changes to the paths 34 | 35 | #changing history to this module 36 | 37 | 38 | __author__ = "Liguo Wang" 39 | __copyright__ = "Copyleft" 40 | __credits__ = [] 41 | __license__ = "GPL" 42 | __version__="2.6.4" 43 | __maintainer__ = "Liguo Wang" 44 | __email__ = "wang.liguo@mayo.edu" 45 | __status__ = "Production" 46 | 47 | 48 | def printlog (mesg): 49 | '''print progress into stderr and log file''' 50 | mesg="@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg 51 | LOG=open('class.log','a') 52 | print >>sys.stderr,mesg 53 | print >>LOG,mesg 54 | 55 | def generate_bed12(infile,size=10): 56 | ''' 57 | infile: input file. eg: chrX 66766604 66788677 348 partial_novel 58 | size: the block size representing exons 59 | ''' 60 | 61 | outfile = infile.replace('.xls','.bed') 62 | OUT = open(outfile,'w') 63 | for line in open(infile,'r'): 64 | if line.startswith('chrom'):continue 65 | line = line.strip() 66 | f = line.split() 67 | if len(f) != 5:continue 68 | chrom = f[0] 69 | start = int(f[1]) - size 70 | end = int(f[2]) + size 71 | score = int(f[3]) 72 | strand = '.' 
73 | name = f[4] 74 | thick_st = start 75 | thick_end = end 76 | if name == 'annotated': 77 | color = '205,0,0' 78 | elif name == 'partial_novel': 79 | color = '0,205,0' 80 | elif name == 'complete_novel': 81 | color = '0,0,205' 82 | else: 83 | color = '0,0,0' 84 | blockCount = 2 85 | blockSizes = ','.join((str(size),str(size))) 86 | blockStarts = '0,' + str(end-size-start) 87 | print >>OUT, '\t'.join( [str(i) for i in [chrom, start, end, name, score, strand, thick_st, thick_end,color, blockCount, blockSizes,blockStarts]]) 88 | 89 | 90 | def main(): 91 | usage="%prog [options]" + '\n' + __doc__ + "\n" 92 | parser = OptionParser(usage,version="%prog " + __version__) 93 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format.") 94 | parser.add_option("-r","--refgene",action="store",type="string",dest="ref_gene_model",help="Reference gene model in bed format. This file is better to be a pooled gene model as it will be used to annotate splicing junctions [required]") 95 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s). [required]") 96 | parser.add_option("-m","--min-intron",action="store",type="int",dest="min_intron",default=50, help="Minimum intron length (bp). default=%default [optional]") 97 | parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". default=%default") 98 | 99 | (options,args)=parser.parse_args() 100 | 101 | if not (options.output_prefix and options.input_file and options.ref_gene_model): 102 | parser.print_help() 103 | sys.exit(0) 104 | if not os.path.exists(options.ref_gene_model): 105 | print >>sys.stderr, '\n\n' + options.ref_gene_model + " does NOT exists" + '\n' 106 | sys.exit(0) 107 | if os.path.exists(options.input_file): 108 | obj = SAM.ParseBAM(options.input_file) 109 | obj.annotate_junction(outfile=options.output_prefix,refgene=options.ref_gene_model,min_intron=options.min_intron, q_cut = options.map_qual) 110 | try: 111 | subprocess.call("Rscript " + options.output_prefix + '.junction_plot.r', shell=True) 112 | except: 113 | print >>sys.stderr, "Cannot generate pdf file from " + '.junction_plot.r' 114 | pass 115 | else: 116 | print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n' 117 | sys.exit(0) 118 | try: 119 | generate_bed12(options.output_prefix + '.junction.xls') 120 | except: 121 | pass 122 | 123 | 124 | if __name__ == '__main__': 125 | main() 126 | -------------------------------------------------------------------------------- /rseqc/modules/junction_saturation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Check if sequencing depth is saturated or not, based on the idea that when sequencing depth is 4 | approaching saturation, less NEW junctions will be detected. 5 | See http://rseqc.sourceforge.net/ for details. 6 | ''' 7 | 8 | #import built-in modules 9 | import os,sys 10 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 11 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' 
+ str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 12 | sys.exit() 13 | 14 | import re 15 | import string 16 | from optparse import OptionParser 17 | import warnings 18 | import string 19 | import collections 20 | import math 21 | import sets 22 | from time import strftime 23 | import subprocess 24 | 25 | #import third-party modules 26 | from bx.bitset import * 27 | from bx.bitset_builders import * 28 | from bx.intervals import * 29 | 30 | #import my own modules 31 | from qcmodule import SAM 32 | #changes to the paths 33 | 34 | #changing history to this module 35 | 36 | 37 | __author__ = "Liguo Wang" 38 | __copyright__ = "Copyleft" 39 | __credits__ = [] 40 | __license__ = "GPL" 41 | __version__="2.6.4" 42 | __maintainer__ = "Liguo Wang" 43 | __email__ = "wang.liguo@mayo.edu" 44 | __status__ = "Production" 45 | 46 | 47 | def printlog (mesg): 48 | '''print progress into stderr and log file''' 49 | mesg="@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg 50 | LOG=open('class.log','a') 51 | print >>sys.stderr,mesg 52 | print >>LOG,mesg 53 | 54 | 55 | def main(): 56 | usage="%prog [options]" + '\n' + __doc__ + "\n" 57 | parser = OptionParser(usage,version="%prog " + __version__) 58 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format.[required]") 59 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s). [required]") 60 | parser.add_option("-r","--refgene",action="store",type="string",dest="refgene_bed",help="Reference gene model in bed fomat. This gene model is used to determine known splicing junctions. [required]") 61 | parser.add_option("-l","--percentile-floor",action="store",type="int",dest="percentile_low_bound",default=5, help="Sampling starts from this percentile. A integer between 0 and 100. default=%default") 62 | parser.add_option("-u","--percentile-ceiling",action="store",type="int",dest="percentile_up_bound",default=100, help="Sampling ends at this percentile. A integer between 0 and 100. default=%default") 63 | parser.add_option("-s","--percentile-step",action="store",type="int",dest="percentile_step",default=5, help="Sampling frequency. Smaller value means more sampling times. A integer between 0 and 100. default=%default") 64 | parser.add_option("-m","--min-intron",action="store",type="int",dest="minimum_intron_size",default=50, help="Minimum intron size (bp). default=%default") 65 | parser.add_option("-v","--min-coverage",action="store",type="int",dest="minimum_splice_read",default=1, help="Minimum number of supportting reads to call a junction. default=%default") 66 | parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". 
default=%default") 67 | 68 | (options,args)=parser.parse_args() 69 | 70 | if not (options.output_prefix and options.input_file and options.refgene_bed): 71 | parser.print_help() 72 | sys.exit(0) 73 | if options.percentile_low_bound <0 or options.percentile_low_bound >100: 74 | print >>sys.stderr, "percentile_low_bound must be larger than 0 and samller than 100" 75 | sys.exit(0) 76 | if options.percentile_up_bound <0 or options.percentile_up_bound >100: 77 | print >>sys.stderr, "percentile_up_bound must be larger than 0 and samller than 100" 78 | sys.exit(0) 79 | if options.percentile_up_bound < options.percentile_low_bound: 80 | print >>sys.stderr, "percentile_up_bound must be larger than percentile_low_bound" 81 | sys.exit(0) 82 | if options.percentile_step <0 or options.percentile_step > options.percentile_up_bound: 83 | print >>sys.stderr, "percentile_step must be larger than 0 and samller than percentile_up_bound" 84 | sys.exit(0) 85 | if os.path.exists(options.input_file): 86 | obj = SAM.ParseBAM(options.input_file) 87 | obj.saturation_junction(outfile=options.output_prefix, refgene=options.refgene_bed, sample_start=options.percentile_low_bound,sample_end=options.percentile_up_bound,sample_step=options.percentile_step,min_intron=options.minimum_intron_size,recur=options.minimum_splice_read, q_cut = options.map_qual) 88 | try: 89 | subprocess.call("Rscript " + options.output_prefix + '.junctionSaturation_plot.r', shell=True) 90 | except: 91 | print >>sys.stderr, "Cannot generate pdf file from " + '.junctionSaturation_plot.r' 92 | pass 93 | else: 94 | print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n' 95 | sys.exit(0) 96 | #parser.print_help() 97 | 98 | 99 | 100 | if __name__ == '__main__': 101 | main() 102 | -------------------------------------------------------------------------------- /rseqc/modules/mismatch_profile.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Calculate the distribution of mismatches across reads. Note that the "MD" tag must exist in BAM file. 4 | ''' 5 | 6 | #import built-in modules 7 | import os,sys 8 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 9 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' + str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 10 | sys.exit() 11 | 12 | import string 13 | import subprocess 14 | from optparse import OptionParser 15 | from time import strftime 16 | 17 | 18 | #import my own modules 19 | from qcmodule import SAM 20 | from qcmodule import fasta 21 | #changes to the paths 22 | 23 | #changing history to this module 24 | 25 | 26 | __author__ = "Liguo Wang" 27 | __copyright__ = "Copyleft" 28 | __credits__ = [] 29 | __license__ = "GPL" 30 | __version__="2.6.4" 31 | __maintainer__ = "Liguo Wang" 32 | __email__ = "wang.liguo@mayo.edu" 33 | __status__ = "Production" 34 | 35 | 36 | def printlog (mesg): 37 | '''print progress into stderr and log file''' 38 | mesg="@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg 39 | print >>sys.stderr,mesg 40 | 41 | 42 | def main(): 43 | usage="%prog [options]" + '\n' + __doc__ + "\n" 44 | parser = OptionParser(usage,version="%prog " + __version__) 45 | parser.add_option("-i","--input",action="store",type="string",dest="input_bam",help='Input BAM file. [required]') 46 | parser.add_option("-l","--read-align-length",action="store",type="int", dest="read_alignment_length",help="Alignment length of read. It is usually set to the orignial read length. 
For example, all these cigar strings (\"101M\", \"68M140N33M\", \"53M1D48M\") suggest the read alignment length is 101. [required]") 47 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s). [required]") 48 | parser.add_option("-n","--read-num",action="store",type="int",default=1000000, dest="read_number",help="Number of aligned reads with mismatches used to calculate the mismatch profile. default=%default") 49 | parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality. default=%default") 50 | (options,args)=parser.parse_args() 51 | 52 | if not (options.input_bam): 53 | parser.print_help() 54 | sys.exit(0) 55 | for f in ([options.input_bam]): 56 | if not os.path.exists(f): 57 | print >>sys.stderr, '\n\n' + f + " does NOT exists" + '\n' 58 | parser.print_help() 59 | sys.exit(0) 60 | 61 | if not (options.output_prefix): 62 | print >>sys.stderr, '\n\n You must specify the output prefix' 63 | parser.print_help() 64 | sys.exit(0) 65 | 66 | if not (options.read_alignment_length): 67 | print >>sys.stderr, '\n\n You must specify read alignment length. It is usually the read length.' 68 | parser.print_help() 69 | sys.exit(0) 70 | 71 | obj = SAM.ParseBAM(options.input_bam) 72 | obj.mismatchProfile(read_length = options.read_alignment_length, read_num = options.read_number, q_cut = options.map_qual,outfile = options.output_prefix) 73 | try: 74 | subprocess.call("Rscript " + options.output_prefix + '.mismatch_profile.r',shell=True) 75 | except: 76 | print >>sys.stderr, "Cannot generate pdf file from " + options.output_prefix + '.mismatch_profile.r' 77 | pass 78 | 79 | if __name__ == '__main__': 80 | main() 81 | -------------------------------------------------------------------------------- /rseqc/modules/normalize_bigwig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''Normalize bigwig signal to fixed wigsum (equivelent to total reads). Output wiggle file''' 3 | import os,sys 4 | import collections 5 | from operator import itemgetter 6 | from itertools import groupby 7 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 8 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' + str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 9 | sys.exit() 10 | 11 | import string 12 | from optparse import OptionParser 13 | 14 | from bx.bbi.bigwig_file import BigWigFile 15 | from qcmodule import BED 16 | from qcmodule import twoList 17 | import numpy 18 | __author__ = "Liguo Wang" 19 | __copyright__ = "Copyleft" 20 | __credits__ = [] 21 | __license__ = "GPL" 22 | __version__="2.6.4" 23 | __maintainer__ = "Liguo Wang" 24 | __email__ = "wang.liguo@mayo.edu" 25 | __status__ = "Production" 26 | 27 | def load_chromsize(file): 28 | '''read chrom.size file''' 29 | chromSize={} 30 | for line in open(file,'r'): 31 | if line.startswith('#'):continue 32 | if not line.strip():continue 33 | fields = line.strip().split() 34 | chromSize[fields[0]] = int(fields[1]) 35 | return chromSize 36 | 37 | def main(): 38 | usage="%prog [options]" 39 | parser = OptionParser(usage,version="%prog " + __version__) 40 | 41 | parser.add_option("-i","--bwfile",action="store",type="string",dest="BigWig_File",help="Input BigWig file. [required]") 42 | parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file. 
[required]") 43 | parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome. [required]") 44 | parser.add_option("-t","--wigsum",action="store",type="int",dest="total_wigsum",default=100000000,help="Specified wigsum. 100000000 equals to coverage of 1 million 100nt reads. default=%default [optional]") 45 | parser.add_option("-r","--refgene",action="store",type="string",dest="refgene_bed",help="Reference gene model in bed format. [optional]") 46 | parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=500000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp) [optional]") 47 | parser.add_option("-f","--format",action="store",type="string",dest="out_format",default="bgr",help="Output format. either \"wig\" or \"bgr\". \"bgr\" save disk space but make program slower. default=%default") 48 | (options,args)=parser.parse_args() 49 | 50 | if not (options.BigWig_File and options.output_wig and options.chromSize): 51 | parser.print_help() 52 | sys.exit(0) 53 | 54 | OUT=open(options.output_wig,'w') 55 | bw = BigWigFile( file=open(options.BigWig_File) ) 56 | chrom_sizes = load_chromsize(options.chromSize) 57 | exons=[] 58 | WIG_SUM=0.0 59 | if (options.refgene_bed): 60 | print >>sys.stderr, "Extract exons from " + options.refgene_bed 61 | obj = BED.ParseBED(options.refgene_bed) 62 | exons = obj.getExon() 63 | print >>sys.stderr, "Merge overlapping exons ..." 64 | exons = BED.unionBed3(exons) 65 | print >>sys.stderr, "Calculate wigsum covered by " + options.refgene_bed + ' only' 66 | for chrom,st,end in exons: 67 | try: bw.get_as_array(chrom,0,1).size 68 | except:continue 69 | 70 | bw_signal = bw.get_as_array(chrom,st,end) 71 | tmp = numpy.nansum(bw_signal) #nan will be ignored. but if all items are 'nan', the result summay is 'nan' NOT 0 72 | if numpy.isnan(tmp):continue 73 | WIG_SUM += tmp 74 | print >>sys.stderr, "Total wigsum is %.2f\n" % WIG_SUM 75 | else: 76 | print >>sys.stderr, "Calculate wigsum from " + options.BigWig_File 77 | for chr_name, chr_size in chrom_sizes.items(): #iterate each chrom 78 | #if chr_name != "chrY":continue 79 | try: bw.get_as_array(chr_name,0,1).size 80 | except: 81 | print >>sys.stderr, "Skip " + chr_name + "!" 82 | continue 83 | 84 | print >>sys.stderr, "Processing " + chr_name + " ..." 85 | for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size): 86 | bw_signal = bw.get_as_array(interval[0],interval[1],interval[2]) 87 | tmp = numpy.nansum(bw_signal) 88 | if numpy.isnan(tmp):continue 89 | WIG_SUM += tmp 90 | print >>sys.stderr, "\nTotal wigsum is %.2f\n" % WIG_SUM 91 | 92 | try: 93 | weight = options.total_wigsum/WIG_SUM 94 | except: 95 | "Error, WIG_SUM cannot be 0" 96 | eys.exit(1) 97 | 98 | #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 99 | print >>sys.stderr, "Normalizing bigwig file ..." 100 | for chr_name, chr_size in chrom_sizes.items(): #iterate each chrom 101 | #if chr_name != "chrY":continue 102 | try: bw.get_as_array(chr_name,0,1).size 103 | except: 104 | print >>sys.stderr, "Skip " + chr_name + "!" 105 | continue 106 | 107 | if options.out_format.upper() == "WIG": 108 | print >>sys.stderr, "Writing " + chr_name + " ..." 
109 | OUT.write('variableStep chrom='+chr_name+'\n') 110 | for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size): 111 | coord = interval[1] 112 | bw_signal = bw.get_as_array(chr_name,interval[1],interval[2]) 113 | tmp = numpy.nansum(bw_signal) 114 | if numpy.isnan(tmp):continue 115 | bw_signal = numpy.nan_to_num(bw_signal) * weight 116 | for v in bw_signal: 117 | coord +=1 118 | if v != 0: print >>OUT, "%d\t%.2f" % (coord,v) 119 | elif options.out_format.upper() == "BGR": 120 | print >>sys.stderr, "Writing " + chr_name + " ..." 121 | #OUT.write('variableStep chrom='+chr_name+'\n') 122 | for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size): 123 | v2p = collections.defaultdict(list) #value to position 124 | range2p={} #coorindate range to value, bedgraph. #[start]=[len,value] 125 | coord = interval[1] 126 | bw_signal = bw.get_as_array(chr_name,interval[1],interval[2]) 127 | tmp = numpy.nansum(bw_signal) 128 | if numpy.isnan(tmp):continue 129 | bw_signal = numpy.nan_to_num(bw_signal) * weight 130 | for v in bw_signal: 131 | coord +=1 132 | #if v != 0: print >>OUT, "%d\t%.2f" % (coord,v) 133 | if v != 0: v2p[v].append(coord) 134 | for v in v2p: 135 | for k,g in groupby(enumerate(v2p[v]), lambda (i,x):i-x): 136 | for l in [map(itemgetter(1), g)]: 137 | range2p[l[0]-1] = [len(l),v] 138 | for i in sorted(range2p): 139 | print >>OUT, chr_name + '\t' + str(i) +'\t' + str(i + range2p[i][0]) + '\t' + str(range2p[i][1]) 140 | else: 141 | print >>sys.stderr, "unknown output format" 142 | sys.exit(1) 143 | 144 | 145 | 146 | if __name__=='__main__': 147 | main() 148 | -------------------------------------------------------------------------------- /rseqc/modules/overlay_bigwig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''Manipulate two bigwig files''' 3 | import os,sys 4 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 5 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' + str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 6 | sys.exit() 7 | 8 | import string 9 | from optparse import OptionParser 10 | 11 | from bx.bbi.bigwig_file import BigWigFile 12 | from qcmodule import BED 13 | from qcmodule import twoList 14 | import numpy 15 | __author__ = "Liguo Wang" 16 | __copyright__ = "Copyleft" 17 | __credits__ = [] 18 | __license__ = "GPL" 19 | __version__="2.6.4" 20 | __maintainer__ = "Liguo Wang" 21 | __email__ = "wang.liguo@mayo.edu" 22 | __status__ = "Production" 23 | 24 | def load_chromsize(file): 25 | '''read chrom.size file''' 26 | chromSize={} 27 | for line in open(file,'r'): 28 | if line.startswith('#'):continue 29 | if not line.strip():continue 30 | fields = line.strip().split() 31 | chromSize[fields[0]] = int(fields[1]) 32 | return chromSize 33 | 34 | def main(): 35 | usage="%prog [options]" 36 | parser = OptionParser(usage,version="%prog " + __version__) 37 | 38 | parser.add_option("-i","--bwfile1",action="store",type="string",dest="BigWig_File1",help="One BigWig file") 39 | parser.add_option("-j","--bwfile2",action="store",type="string",dest="BigWig_File2",help="Another BigWig file") 40 | parser.add_option("-a","--action",action="store",type="string",dest="action",help='After pairwise align two bigwig files, perform the follow actions (Only select one keyword):"Add" = add signals. "Average" = average signals. "Division"= divide bigwig2 from bigwig1. Add 1 to both bigwig. 
"Max" = pick the signal that is larger. "Min" = pick the signal that is smaller. "Product" = multiply signals. "Subtract" = subtract signals in 2nd bigwig file from the corresponiding ones in the 1st bigwig file. "geometricMean" = take the geometric mean of signals.') 41 | parser.add_option("-o","--output",action="store",type="string",dest="output_wig",help="Output wig file") 42 | parser.add_option("-s","--chromSize",action="store",type="string",dest="chromSize",help="Chromosome size file. Tab or space separated text file with 2 columns: first column is chromosome name, second column is size of the chromosome.") 43 | parser.add_option("-c","--chunk",action="store",type="int",dest="chunk_size",default=100000,help="Chromosome chunk size. Each chomosome will be cut into samll chunks of this size. Decrease chunk size will save more RAM. default=%default (bp)") 44 | parser.add_option("-m","--min_signal",action="store",type="float",dest="min_score",default=0.0,help="To redude the size of output wigfile, genomic positions with signal value smaller than (<) this threshold will be filtered out. default=%default") 45 | (options,args)=parser.parse_args() 46 | 47 | if not (options.BigWig_File1 and options.BigWig_File2 and options.output_wig and options.chromSize): 48 | parser.print_help() 49 | sys.exit(0) 50 | OUT=open(options.output_wig,'w') 51 | bw1 = BigWigFile( file=open(options.BigWig_File1) ) 52 | bw2 = BigWigFile( file=open(options.BigWig_File2) ) 53 | chrom_sizes = load_chromsize(options.chromSize) 54 | for chr_name, chr_size in chrom_sizes.items(): #iterate each chrom 55 | print >>sys.stderr, "Processing " + chr_name + " ..." 56 | OUT.write('variableStep chrom='+chr_name+'\n') 57 | for interval in BED.tillingBed(chrName = chr_name,chrSize = chr_size,stepSize = options.chunk_size): 58 | coord = interval[1] 59 | try: 60 | bw_signal1 = bw1.get_as_array(chr_name,interval[1],interval[2]) 61 | except: 62 | bw_signal1 = numpy.array() 63 | try: 64 | bw_signal2 = bw2.get_as_array(chr_name,interval[1],interval[2]) 65 | except: 66 | bw_signal2 = numpy.array() 67 | if bw_signal1 is None and bw_signal2 is None: 68 | continue 69 | if numpy.isnan(numpy.nansum(bw_signal1)) and numpy.isnan(numpy.nansum(bw_signal2)): 70 | continue 71 | if len(bw_signal1) == 0 and len(bw_signal2) == 0: 72 | continue 73 | bw_signal1 = numpy.nan_to_num( bw_signal1 ) 74 | bw_signal2 = numpy.nan_to_num( bw_signal2 ) 75 | 76 | call_back = getattr(twoList,options.action) 77 | for v in call_back(bw_signal1,bw_signal2): 78 | coord +=1 79 | if v >= options.min_score: print >>OUT, "%d\t%.2f" % (coord,v) 80 | 81 | if __name__=='__main__': 82 | main() 83 | -------------------------------------------------------------------------------- /rseqc/modules/read_GC.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''------------------------------------------------------------------------------------------------- 3 | Calculate distribution of reads' GC content 4 | -------------------------------------------------------------------------------------------------''' 5 | 6 | #import built-in modules 7 | import os,sys 8 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 9 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' 
+ str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 10 | sys.exit() 11 | 12 | import re 13 | import string 14 | from optparse import OptionParser 15 | import warnings 16 | import string 17 | import collections 18 | import math 19 | import sets 20 | from time import strftime 21 | import subprocess 22 | 23 | 24 | #import third-party modules 25 | from bx.bitset import * 26 | from bx.bitset_builders import * 27 | from bx.intervals import * 28 | 29 | #import my own modules 30 | from qcmodule import SAM 31 | #changes to the paths 32 | 33 | #changing history to this module 34 | 35 | 36 | __author__ = "Liguo Wang" 37 | __copyright__ = "Copyleft" 38 | __credits__ = [] 39 | __license__ = "GPL" 40 | __version__="2.6.4" 41 | __maintainer__ = "Liguo Wang" 42 | __email__ = "wang.liguo@mayo.edu" 43 | __status__ = "Production" 44 | 45 | 46 | def printlog (mesg): 47 | '''print progress into stderr and log file''' 48 | mesg="@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg 49 | LOG=open('class.log','a') 50 | print >>sys.stderr,mesg 51 | print >>LOG,mesg 52 | 53 | 54 | def main(): 55 | usage="%prog [options]" + '\n' + __doc__ + "\n" 56 | parser = OptionParser(usage,version="%prog " + __version__) 57 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format.") 58 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s).") 59 | parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default") 60 | (options,args)=parser.parse_args() 61 | 62 | if not (options.output_prefix and options.input_file): 63 | parser.print_help() 64 | sys.exit(0) 65 | if os.path.exists(options.input_file): 66 | obj = SAM.ParseBAM(options.input_file) 67 | obj.readGC(outfile=options.output_prefix, q_cut = options.map_qual) 68 | try: 69 | subprocess.call("Rscript " + options.output_prefix + ".GC_plot.r",shell=True) 70 | except: 71 | pass 72 | else: 73 | print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n' 74 | #parser.print_help() 75 | sys.exit(0) 76 | 77 | 78 | 79 | if __name__ == '__main__': 80 | main() 81 | -------------------------------------------------------------------------------- /rseqc/modules/read_NVC.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''------------------------------------------------------------------------------------------------- 3 | For each nucleotide position of read (5'->3'), check the nucleotide frequency. The generated R script will 4 | gives NVC (Nucleotide Versus Cycle) plot. 5 | -------------------------------------------------------------------------------------------------''' 6 | 7 | #import built-in modules 8 | import os,sys 9 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 10 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' 
+ str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 11 | sys.exit() 12 | 13 | import re 14 | import string 15 | from optparse import OptionParser 16 | import warnings 17 | import string 18 | import collections 19 | import math 20 | import sets 21 | from time import strftime 22 | import subprocess 23 | 24 | #import third-party modules 25 | from bx.bitset import * 26 | from bx.bitset_builders import * 27 | from bx.intervals import * 28 | 29 | #import my own modules 30 | from qcmodule import SAM 31 | #changes to the paths 32 | 33 | #changing history to this module 34 | 35 | 36 | __author__ = "Liguo Wang" 37 | __copyright__ = "Copyleft" 38 | __credits__ = [] 39 | __license__ = "GPL" 40 | __version__="2.6.4" 41 | __maintainer__ = "Liguo Wang" 42 | __email__ = "wang.liguo@mayo.edu" 43 | __status__ = "Production" 44 | 45 | 46 | def printlog (mesg): 47 | '''print progress into stderr and log file''' 48 | mesg="@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg 49 | LOG=open('class.log','a') 50 | print >>sys.stderr,mesg 51 | print >>LOG,mesg 52 | 53 | 54 | def main(): 55 | usage="%prog [options]" + '\n' + __doc__ + "\n" 56 | parser = OptionParser(usage,version="%prog " + __version__) 57 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Input file in BAM or SAM format.[required]") 58 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s). [required]") 59 | parser.add_option("-x","--nx",action="store_true",dest="unknown_nucleotide",help="Flag option. Presense of this flag tells program to include N,X in output NVC plot [required]") 60 | parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default") 61 | (options,args)=parser.parse_args() 62 | 63 | if not (options.output_prefix and options.input_file): 64 | parser.print_help() 65 | sys.exit(0) 66 | if os.path.exists(options.input_file): 67 | obj = SAM.ParseBAM(options.input_file) 68 | obj.readsNVC(outfile=options.output_prefix,nx=options.unknown_nucleotide, q_cut = options.map_qual) 69 | try: 70 | subprocess.call("Rscript " + options.output_prefix + ".NVC_plot.r",shell=True) 71 | except: 72 | pass 73 | else: 74 | print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n' 75 | #parser.print_help() 76 | sys.exit(0) 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /rseqc/modules/read_duplication.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | Calculte reads' duplication rate. 4 | Sequence-based: Reads with identical sequence are considered as "duplicate reads". 5 | Mapping-based: Reads mapped to the exact same location are considered as "duplicate reads". 6 | ''' 7 | 8 | #import built-in modules 9 | import os,sys 10 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 11 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' 
+ str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 12 | sys.exit() 13 | 14 | import re 15 | import string 16 | from optparse import OptionParser 17 | import warnings 18 | import string 19 | import collections 20 | import math 21 | import sets 22 | from time import strftime 23 | import subprocess 24 | 25 | #import third-party modules 26 | from bx.bitset import * 27 | from bx.bitset_builders import * 28 | from bx.intervals import * 29 | 30 | #import my own modules 31 | from qcmodule import SAM 32 | #changes to the paths 33 | 34 | #changing history to this module 35 | 36 | 37 | __author__ = "Liguo Wang" 38 | __copyright__ = "Copyleft" 39 | __credits__ = [] 40 | __license__ = "GPL" 41 | __version__="2.6.4" 42 | __maintainer__ = "Liguo Wang" 43 | __email__ = "wang.liguo@mayo.edu" 44 | __status__ = "Production" 45 | 46 | 47 | def printlog (mesg): 48 | '''print progress into stderr and log file''' 49 | mesg="@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg 50 | LOG=open('class.log','a') 51 | print >>sys.stderr,mesg 52 | print >>LOG,mesg 53 | 54 | 55 | def main(): 56 | usage="%prog [options]" + '\n' + __doc__ + "\n" 57 | parser = OptionParser(usage,version="%prog " + __version__) 58 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format.") 59 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s).") 60 | parser.add_option("-u","--up-limit",action="store",type="int",dest="upper_limit",default=500,help="Upper limit of reads' occurrence. Only used for plotting, default=%default (times)") 61 | parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be considered as \"uniquely mapped\". 
default=%default") 62 | (options,args)=parser.parse_args() 63 | 64 | if not (options.output_prefix and options.input_file): 65 | parser.print_help() 66 | sys.exit(0) 67 | if os.path.exists(options.input_file): 68 | obj = SAM.ParseBAM(options.input_file) 69 | obj.readDupRate(outfile=options.output_prefix,up_bound=options.upper_limit, q_cut = options.map_qual) 70 | try: 71 | subprocess.call("Rscript " + options.output_prefix + ".DupRate_plot.r", shell=True) 72 | except: 73 | pass 74 | else: 75 | print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n' 76 | #parser.print_help() 77 | sys.exit(0) 78 | 79 | 80 | 81 | 82 | if __name__ == '__main__': 83 | main() 84 | -------------------------------------------------------------------------------- /rseqc/modules/read_hexamer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''--------------------------------------------------------------------------------------- 3 | Calculate Hexamer frequency 4 | ------------------------------------------------------------------------------------------''' 5 | 6 | import os,sys 7 | import string 8 | from optparse import OptionParser 9 | import warnings 10 | import string 11 | from qcmodule import FrameKmer 12 | 13 | __author__ = "Liguo Wang" 14 | __copyright__ = "Copyleft" 15 | __credits__ = [] 16 | __license__ = "GPL" 17 | __version__="2.6.4" 18 | __maintainer__ = "Liguo Wang" 19 | __email__ = "wang.liguo@mayo.edu" 20 | __status__ = "Production" 21 | 22 | 23 | def file_exist(file): 24 | try: 25 | with open(file) as f: return True 26 | except IOError as e: 27 | return False 28 | 29 | 30 | def main(): 31 | usage = "\n%prog [options]" 32 | parser = OptionParser(usage,version="%prog " + __version__) 33 | parser.add_option("-i","--input",action="store",dest="input_read",help="Read sequence in fasta or fastq format. Multiple fasta/fastq files should be separated by ','. For example: read.fq,read2.fa,read3,fa ") 34 | parser.add_option("-r","--refgenome",action="store",type="string",dest="ref_genome",help="Reference genome sequence in fasta format. Optional") 35 | parser.add_option("-g","--refgene",action="store",type="string",dest="ref_gene",help="Reference mRNA sequence in fasta format. Optional") 36 | (options,args)=parser.parse_args() 37 | 38 | if not options.input_read: 39 | parser.print_help() 40 | sys.exit(0) 41 | 42 | read_table={} 43 | read_file_names=[] #base name 44 | read_file_sum = {} #sum of hexamer 45 | 46 | for read_file in options.input_read.split(','): 47 | if not file_exist(read_file): 48 | print >>sys.stderr, read_file, ' does NOT exist!' 
49 | continue 50 | print >>sys.stderr, "Calculate hexamer of " + read_file + ' file ...', 51 | read_table[os.path.basename(read_file)] = FrameKmer.kmer_freq_file(fastafile = read_file, word_size = 6, step_size = 1, frame = 0) 52 | read_file_names.append(os.path.basename(read_file)) 53 | read_file_sum[os.path.basename(read_file)] = float(sum(read_table[os.path.basename(read_file)].values())) 54 | print >>sys.stderr, "Done" 55 | 56 | if options.ref_genome and file_exist(options.ref_genome): 57 | print >>sys.stderr, "Calculate hexamer of " + options.ref_genome + ' file ...', 58 | read_table[os.path.basename(options.ref_genome)] = FrameKmer.kmer_freq_file(fastafile = options.ref_genome, word_size = 6, step_size = 1, frame = 0) 59 | read_file_names.append(os.path.basename(options.ref_genome)) 60 | read_file_sum[os.path.basename(options.ref_genome)] = float(sum(read_table[os.path.basename(options.ref_genome)].values())) 61 | print >>sys.stderr, "Done." 62 | 63 | if options.ref_gene and file_exist(options.ref_gene): 64 | print >>sys.stderr, "Calculate hexamer of " + options.ref_gene + ' file ...', 65 | read_table[os.path.basename(options.ref_gene)]= FrameKmer.kmer_freq_file(fastafile = options.ref_gene, word_size = 6, step_size = 1, frame = 0) 66 | read_file_names.append(os.path.basename(options.ref_gene)) 67 | read_file_sum[os.path.basename(options.ref_gene)] = float(sum(read_table[os.path.basename(options.ref_gene)].values())) 68 | print >>sys.stderr, "Done." 69 | 70 | print '\n\nHexamer' + '\t' + '\t'.join(read_file_names) 71 | 72 | for kmer in FrameKmer.all_possible_kmer(6): 73 | if 'N' in kmer:continue 74 | print kmer + '\t', 75 | try: 76 | print '\t'.join([str(read_table[name][kmer] / (read_file_sum[name])) for name in read_file_names]) 77 | except: 78 | print '\t'.join([str(read_table[name][kmer] / (read_file_sum[name]+1)) for name in read_file_names]) 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /rseqc/modules/read_quality.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''------------------------------------------------------------------------------------------------- 3 | Calculating Phred Quality Score for each position on read. Note that each read should have 4 | the fixed (same) length 5 | -------------------------------------------------------------------------------------------------''' 6 | 7 | #import built-in modules 8 | import os,sys 9 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 10 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' 
+ str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 11 | sys.exit() 12 | 13 | import re 14 | import string 15 | from optparse import OptionParser 16 | import warnings 17 | import string 18 | import collections 19 | import math 20 | import sets 21 | from time import strftime 22 | import subprocess 23 | 24 | #import third-party modules 25 | from bx.bitset import * 26 | from bx.bitset_builders import * 27 | from bx.intervals import * 28 | 29 | #import my own modules 30 | from qcmodule import SAM 31 | #changes to the paths 32 | 33 | #changing history to this module 34 | 35 | 36 | __author__ = "Liguo Wang" 37 | __copyright__ = "Copyleft" 38 | __credits__ = [] 39 | __license__ = "GPL" 40 | __version__="2.6.2" 41 | __maintainer__ = "Liguo Wang" 42 | __email__ = "wang.liguo@mayo.edu" 43 | __status__ = "Production" 44 | 45 | 46 | def printlog (mesg): 47 | '''print progress into stderr and log file''' 48 | mesg="@ " + strftime("%Y-%m-%d %H:%M:%S") + ": " + mesg 49 | LOG=open('class.log','a') 50 | print >>sys.stderr,mesg 51 | print >>LOG,mesg 52 | 53 | 54 | def main(): 55 | usage="%prog [options]" + '\n' + __doc__ + "\n" 56 | parser = OptionParser(usage,version="%prog " + __version__) 57 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format. [required]") 58 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output files(s). [required]") 59 | parser.add_option("-r","--reduce",action="store",type="int",dest="reduce_fold",default=1,help="To avoid making huge vector in R, nucleotide with particular phred score less frequent than this number will be ignored. Increase this number save more memory while reduce precision. Set to 1 achieves maximum precision (i.e. every nucleotide will be considered). This option only applies to the 'boxplot'. default=%default") 60 | parser.add_option("-q","--mapq",action="store",type="int",dest="map_qual",default=30,help="Minimum mapping quality (phred scaled) for an alignment to be called \"uniquely mapped\". default=%default") 61 | (options,args)=parser.parse_args() 62 | 63 | if not (options.output_prefix and options.input_file): 64 | parser.print_help() 65 | sys.exit(0) 66 | if os.path.exists(options.input_file): 67 | obj = SAM.ParseBAM(options.input_file) 68 | obj.readsQual_boxplot(outfile=options.output_prefix, q_cut = options.map_qual, shrink = options.reduce_fold) 69 | try: 70 | subprocess.call("Rscript " + options.output_prefix + ".qual.r",shell=True) 71 | except: 72 | pass 73 | else: 74 | print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n' 75 | #parser.print_help() 76 | sys.exit(0) 77 | 78 | 79 | 80 | 81 | if __name__ == '__main__': 82 | main() 83 | -------------------------------------------------------------------------------- /rseqc/modules/split_bam.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''------------------------------------------------------------------------------------------------- 3 | Split bam file according to input gene list (bed). 4 | -------------------------------------------------------------------------------------------------''' 5 | 6 | #import built-in modules 7 | import os,sys 8 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 9 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' 
+ str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 10 | sys.exit() 11 | 12 | import string 13 | from optparse import OptionParser 14 | import warnings 15 | import string 16 | import collections 17 | import sets 18 | 19 | #import third-party modules 20 | from bx.bitset import * 21 | from bx.bitset_builders import * 22 | from bx.intervals import * 23 | from bx.binned_array import BinnedArray 24 | from bx_extras.fpconst import isNaN 25 | from bx.bitset_utils import * 26 | import pysam 27 | 28 | #import my own modules 29 | from qcmodule import BED 30 | from qcmodule import SAM 31 | from qcmodule import bam_cigar 32 | 33 | __author__ = "Liguo Wang" 34 | __copyright__ = "Copyleft" 35 | __credits__ = [] 36 | __license__ = "GPL" 37 | __version__="2.6.4" 38 | __maintainer__ = "Liguo Wang" 39 | __email__ = "wang.liguo@mayo.edu" 40 | __status__ = "Production" 41 | 42 | def searchit(exon_range, exon_list): 43 | '''return 1 if find, return 0 if cannot find''' 44 | for chrom, st, end in exon_list: 45 | if chrom.upper() not in exon_range: 46 | return 0 47 | elif len(exon_range[chrom].find(st,end)) >=1: 48 | return 1 49 | return 0 50 | 51 | def build_bitsets(list): 52 | '''build intevalTree from list''' 53 | ranges={} 54 | for l in list: 55 | chrom =l[0].upper() 56 | st = int(l[1]) 57 | end = int(l[2]) 58 | if chrom not in ranges: 59 | ranges[chrom] = Intersecter() 60 | ranges[chrom].add_interval( Interval( st, end ) ) 61 | return ranges 62 | 63 | 64 | def main(): 65 | usage="%prog [options]" + '\n' + __doc__ + "\n" 66 | parser = OptionParser(usage,version="%prog " + __version__) 67 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format. BAM file should be sorted and indexed.") 68 | parser.add_option("-r","--genelist",action="store",type="string",dest="gene_list",help="Gene list in bed foramt. All reads hits to exon regions (defined by this gene list) will be saved into one BAM file, the remaining reads will saved into another BAM file.") 69 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output BAM files. \"prefix.in.bam\" file contains reads mapped to the gene list specified by \"-r\", \"prefix.ex.bam\" contains reads that cannot mapped to gene list. \"prefix.junk.bam\" contains qcfailed or unmapped reads.") 70 | (options,args)=parser.parse_args() 71 | 72 | if not (options.input_file and options.gene_list): 73 | parser.print_help() 74 | sys.exit(0) 75 | if not os.path.exists(options.gene_list): 76 | print >>sys.stderr, '\n\n' + options.gene_list + " does NOT exists" + '\n' 77 | #parser.print_help() 78 | sys.exit(0) 79 | if not os.path.exists(options.input_file): 80 | print >>sys.stderr, '\n\n' + options.input_file + " does NOT exists" + '\n' 81 | sys.exit(0) 82 | 83 | #build bitset for gene list 84 | print >>sys.stderr, 'reading ' + options.gene_list + ' ... 
', 85 | obj = BED.ParseBED(options.gene_list) 86 | exons = obj.getExon() 87 | exon_ranges = build_bitsets(exons) 88 | print >>sys.stderr, 'Done' 89 | 90 | samfile = pysam.Samfile(options.input_file,'rb') 91 | out1 = pysam.Samfile(options.output_prefix + '.in.bam','wb',template=samfile) #bam file containing reads hit to exon region 92 | out2 = pysam.Samfile(options.output_prefix + '.ex.bam','wb',template=samfile) #bam file containing reads not hit to exon region 93 | out3 = pysam.Samfile(options.output_prefix + '.junk.bam','wb',template=samfile) #bam file containing reads not hit to exon region 94 | 95 | total_alignment = 0 96 | in_alignment = 0 97 | ex_alignment = 0 98 | bad_alignment = 0 99 | print >>sys.stderr, "spliting " + options.input_file + " ...", 100 | try: 101 | while(1): 102 | aligned_read = samfile.next() 103 | total_alignment += 1 104 | 105 | if aligned_read.is_qcfail: 106 | bad_alignment +=1 107 | out3.write(aligned_read) 108 | continue 109 | if aligned_read.is_unmapped: 110 | bad_alignment +=1 111 | out3.write(aligned_read) 112 | continue 113 | 114 | chrom = samfile.getrname(aligned_read.tid) 115 | chrom=chrom.upper() 116 | read_start = aligned_read.pos 117 | mate_start = aligned_read.mpos 118 | 119 | #read_exons = bam_cigar.fetch_exon(chrom, aligned_read.pos, aligned_read.cigar) 120 | if aligned_read.mate_is_unmapped: #only one end mapped 121 | if chrom not in exon_ranges: 122 | out2.write(aligned_read) 123 | ex_alignment += 1 124 | continue 125 | else: 126 | if len(exon_ranges[chrom].find(read_start, read_start +1)) >= 1: 127 | out1.write(aligned_read) 128 | in_alignment += 1 129 | continue 130 | elif len(exon_ranges[chrom].find(read_start, read_start +1)) == 0: 131 | out2.write(aligned_read) 132 | ex_alignment += 1 133 | continue 134 | else: #both end mapped 135 | if chrom not in exon_ranges: 136 | out2.write(aligned_read) 137 | ex_alignment += 1 138 | continue 139 | else: 140 | if (len(exon_ranges[chrom].find(read_start, read_start +1)) >= 1) or (len(exon_ranges[chrom].find(mate_start, mate_start +1)) >= 1): 141 | out1.write(aligned_read) 142 | in_alignment += 1 143 | else: 144 | out2.write(aligned_read) 145 | ex_alignment += 1 146 | 147 | except StopIteration: 148 | print >>sys.stderr, "Done" 149 | 150 | print "%-55s%d" % ("Total records:",total_alignment) 151 | print "%-55s%d" % (options.output_prefix + '.in.bam (Reads consumed by input gene list):',in_alignment) 152 | print "%-55s%d" % (options.output_prefix + '.ex.bam (Reads not consumed by input gene list):',ex_alignment) 153 | print "%-55s%d" % (options.output_prefix + '.junk.bam (qcfailed, unmapped reads):',bad_alignment) 154 | 155 | if __name__ == '__main__': 156 | main() 157 | -------------------------------------------------------------------------------- /rseqc/modules/split_paired_bam.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''------------------------------------------------------------------------------------------------- 3 | Split bam file (pair-end) into 2 single-end bam file 4 | -------------------------------------------------------------------------------------------------''' 5 | 6 | #import built-in modules 7 | import os,sys 8 | if sys.version_info[0] != 2 or sys.version_info[1] != 7: 9 | print >>sys.stderr, "\nYou are using python" + str(sys.version_info[0]) + '.' 
+ str(sys.version_info[1]) + " RSeQC needs python2.7!\n" 10 | sys.exit() 11 | 12 | import string 13 | from optparse import OptionParser 14 | import warnings 15 | import string 16 | import collections 17 | import sets 18 | 19 | #import third-party modules 20 | from bx.bitset import * 21 | from bx.bitset_builders import * 22 | from bx.intervals import * 23 | from bx.binned_array import BinnedArray 24 | from bx_extras.fpconst import isNaN 25 | from bx.bitset_utils import * 26 | import pysam 27 | 28 | #import my own modules 29 | from qcmodule import BED 30 | from qcmodule import SAM 31 | from qcmodule import bam_cigar 32 | 33 | __author__ = "Liguo Wang" 34 | __copyright__ = "Copyleft" 35 | __credits__ = [] 36 | __license__ = "GPL" 37 | __version__="2.6.4" 38 | __maintainer__ = "Liguo Wang" 39 | __email__ = "wang.liguo@mayo.edu" 40 | __status__ = "Production" 41 | 42 | 43 | def main(): 44 | usage="%prog [options]" + '\n' + __doc__ + "\n" 45 | parser = OptionParser(usage,version="%prog " + __version__) 46 | parser.add_option("-i","--input-file",action="store",type="string",dest="input_file",help="Alignment file in BAM or SAM format. BAM file should be sorted and indexed") 47 | parser.add_option("-o","--out-prefix",action="store",type="string",dest="output_prefix",help="Prefix of output BAM files. \"prefix.R1.bam\" file contains the 1st read, \"prefix.R2.bam\" file contains the 2nd read") 48 | (options,args)=parser.parse_args() 49 | 50 | if not (options.input_file): 51 | parser.print_help() 52 | sys.exit(0) 53 | if not os.path.exists(options.input_file): 54 | print >>sys.stderr, '\n\n' + options.input_file + " does NOT exist" + '\n' 55 | sys.exit(0) 56 | 57 | samfile = pysam.Samfile(options.input_file,'rb') 58 | OUT1 = pysam.Samfile(options.output_prefix + '.R1.bam','wb',template=samfile) #bam file containing the first read (R1) of each pair 59 | OUT2 = pysam.Samfile(options.output_prefix + '.R2.bam','wb',template=samfile) #bam file containing the second read (R2) of each pair 60 | OUT3 = pysam.Samfile(options.output_prefix + '.unmap.bam','wb',template=samfile) #bam file containing unmapped reads 61 | 62 | total_alignment = 0 63 | r1_alignment = 0 64 | r2_alignment = 0 65 | unmapped = 0 66 | 67 | print >>sys.stderr, "splitting " + options.input_file + " ...", 68 | try: 69 | while(1): 70 | new_alignment = pysam.AlignedRead() # create AlignedRead object 71 | old_alignment = samfile.next() 72 | total_alignment += 1 73 | 74 | new_alignment.qname = old_alignment.qname # 1st column. read name. 75 | #new_alignment.flag = old_alignment.flag # 2nd column. subject to change. flag value 76 | new_alignment.tid = old_alignment.tid # 3rd column. samfile.getrname(tid) == chrom name 77 | new_alignment.pos = old_alignment.pos # 4th column. reference Start position of the aligned part (of read) [0-based] 78 | new_alignment.mapq = old_alignment.mapq # 5th column. mapping quality 79 | new_alignment.cigar= old_alignment.cigar # 6th column. subject to change. 80 | #new_alignment.rnext = old_alignment.rnext # 7th column. tid of the reference (mate read mapped to) 81 | #new_alignment.pnext = old_alignment.pnext # 8th column. position of the reference (0 based, mate read mapped to) 82 | #new_alignment.tlen = old_alignment.tlen # 9th column. insert size 83 | new_alignment.seq = old_alignment.seq # 10th column. read sequence. all bases. 84 | new_alignment.qual = old_alignment.qual # 11th column. read sequence quality. all bases.
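# The FLAG of each new single-end record is rebuilt from scratch below: it starts at 0x0000 and only
# the strand (0x0010), secondary-alignment (0x0100), QC-fail (0x0200) and duplicate (0x0400) bits are
# carried over, so all pairing-related bits (0x0001, 0x0002, 0x0008, 0x0020, 0x0040, 0x0080) are dropped
# and each mate becomes a self-contained single-end alignment. For example, a reverse-strand PCR
# duplicate keeps only 0x0010 | 0x0400 = 0x0410.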
85 | new_alignment.tags = old_alignment.tags # 12 - columns 86 | new_alignment.flag = 0x0000 87 | if old_alignment.is_unmapped: 88 | OUT3.write(old_alignment) 89 | unmapped += 1 90 | continue 91 | if old_alignment.is_reverse: 92 | new_alignment.flag = new_alignment.flag | 0x0010 93 | 94 | if old_alignment.is_secondary: 95 | new_alignment.flag = new_alignment.flag | 0x0100 96 | if old_alignment.is_qcfail: 97 | new_alignment.flag = new_alignment.flag | 0x0200 98 | if old_alignment.is_duplicate: 99 | new_alignment.flag = new_alignment.flag | 0x0400 100 | if old_alignment.is_read1: 101 | OUT1.write(new_alignment) 102 | r1_alignment += 1 103 | else: 104 | OUT2.write(new_alignment) 105 | r2_alignment += 1 106 | 107 | except StopIteration: 108 | print >>sys.stderr, "Done" 109 | 110 | print "%-55s%d" % ("Total records:",total_alignment) 111 | print "%-55s%d" % (options.output_prefix + 'Read 1:',r1_alignment) 112 | print "%-55s%d" % (options.output_prefix + 'Read 2:',r2_alignment) 113 | print "%-55s%d" % (options.output_prefix + 'Unmapped:',unmapped) 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /rseqc/parsers/BedWrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: iso-8859-15 -*- 3 | 4 | import sys 5 | from rseqc.parsers.GTF import GeneModels 6 | #from bx.bitset import * 7 | #from bx.bitset_builders import * 8 | #from bx.intervals import * 9 | 10 | class BedWrapper(GeneModels): 11 | 12 | def getCDSExon(self): 13 | cds = [] 14 | 15 | for c in self.get_cds(): 16 | cds.append([c['chr'], c['start'], c['end']]) 17 | 18 | return cds 19 | 20 | def getIntron(self): 21 | 22 | introns = [] 23 | 24 | for i in self.get_introns(): 25 | introns.append([i['chr'], i['start'], i['end']]) 26 | 27 | return introns 28 | 29 | def getUTR(self, utr = 35): 30 | 31 | tprime_utrs = [] 32 | 33 | if utr == 3: 34 | for i in self.get_utrs(biotype = "protein_coding", tsl = 1, utr = 3): 35 | tprime_utrs.append([i['chr'], i['start'], i['end'], i['name'], '0', i['strand']]) 36 | 37 | if utr == 5: 38 | for i in self.get_utrs(biotype = "protein_coding", tsl = 1, utr = 5): 39 | tprime_utrs.append([i['chr'], i['start'], i['end'], i['name'], '0', i['strand']]) 40 | 41 | return tprime_utrs 42 | 43 | def getIntergenic(self, direction = 'up', size = 1000): 44 | '''get intergenic regions. 
direction = up or down or both.''' 45 | 46 | regions = [] 47 | 48 | for t in self.get_cds(): 49 | 50 | chrom = t['chr'] 51 | tx_start = t['start'] 52 | tx_end = t['end'] 53 | strand = t['strand'] 54 | 55 | if(direction == "up" or direction == "both"): 56 | if strand == '-': 57 | region_st = tx_end 58 | region_end = tx_end + size 59 | else: 60 | region_st = max(tx_start - size, 0) 61 | region_end = tx_start 62 | 63 | regions.append([chrom, region_st, region_end]) 64 | 65 | if (direction == "down" or direction == "both"): 66 | if strand == '-': 67 | region_st = max(0,tx_start-size) 68 | region_end = tx_start 69 | else: 70 | region_st = tx_end 71 | region_end = tx_end+size 72 | regions.append([chrom, region_st, region_end]) 73 | 74 | return regions 75 | -------------------------------------------------------------------------------- /rseqc/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MonashBioinformaticsPlatform/RSeQC/5658c4d7c5c1c9a8ece1461df82479b84c5509f8/rseqc/parsers/__init__.py -------------------------------------------------------------------------------- /rseqc/qcmodule/FrameKmer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''deal with Kmer. DNA sequence should only A, C, G, T. python2.7 or newer''' 3 | 4 | #import built-in modules 5 | import os,sys 6 | import numpy 7 | import math 8 | from collections import Counter 9 | import re 10 | import itertools 11 | 12 | def word_generator(seq,word_size,step_size,frame=0): 13 | '''generate DNA word from sequence using word_size and step_size. Frame is 0, 1 or2''' 14 | for i in range(frame,len(seq),step_size): 15 | word = seq[i:i+word_size] 16 | if len(word) == word_size: 17 | yield word 18 | 19 | def seq_generator(fastafile): 20 | '''DNA sequence only contains A,C,G,T,N. sequence with other characters will be removed''' 21 | tmpseq='' 22 | name='' 23 | DNA_pat = re.compile(r'^[ACGTN]+$') 24 | for line in open(fastafile,'r'): 25 | line=line.strip().upper() 26 | if line.startswith(('#',' ','\n')):continue 27 | if line.startswith(('>','@')): 28 | if tmpseq: 29 | yield [name,tmpseq] 30 | tmpseq='' 31 | name = line.split()[0][1:] 32 | elif DNA_pat.match(line): 33 | tmpseq += line 34 | yield [name,tmpseq] 35 | 36 | def all_possible_kmer(l): 37 | '''return all possible combinations of A,C,G,T,N. only support A,C,G,T,N. l is length of kmer''' 38 | for i in itertools.product(['A','C','G','T','N'],repeat=l): 39 | yield ''.join(i) 40 | 41 | def kmer_freq_file (fastafile,word_size,step_size=1,frame=0,min_count=0): 42 | '''Calculate kmer frequency from fasta file''' 43 | seq_num = 0 44 | ret_dict={} 45 | for n,s in seq_generator(fastafile): 46 | seq_num += 1 47 | if seq_num == 1: 48 | count_table = Counter(word_generator(s,word_size=word_size,step_size=step_size,frame=frame)) 49 | else: 50 | count_table.update( word_generator(s,word_size=word_size,step_size=step_size,frame=frame) ) 51 | 52 | #return count_table 53 | for kmer in all_possible_kmer(word_size): 54 | if kmer not in count_table: count_table[kmer]=0 55 | if count_table[kmer] >= min_count: 56 | #print kmer + '\t' + str(count_table[kmer]) 57 | if 'N' in kmer:continue 58 | ret_dict[kmer] = count_table[kmer] 59 | return ret_dict 60 | 61 | def kmer_freq_seq (seq,word_size,step_size=1,frame=0,min_count=0): 62 | '''Calculate kmer frequency from DNA sequence. coding. 
genome is hexamer table calculated 63 | from coding region and whole genome (as background control) 64 | ''' 65 | count_table = Counter(word_generator(seq,word_size=word_size,step_size=step_size,frame=frame)) 66 | for kmer in all_possible_kmer(word_size): 67 | if kmer not in count_table: count_table[kmer]=0 68 | if count_table[kmer] >= min_count: 69 | print((kmer + '\t' + str(count_table[kmer]))) 70 | 71 | def kmer_ratio(seq,word_size,step_size,coding,noncoding): 72 | if len(seq) < word_size: 73 | return 0 74 | 75 | sum_of_log_ratio_0 = 0.0 76 | sum_of_log_ratio_1 = 0.0 77 | sum_of_log_ratio_2 = 0.0 78 | frame0_count=0.0 79 | frame1_count=0.0 80 | frame2_count=0.0 81 | for k in word_generator(seq=seq, word_size = word_size, step_size=step_size,frame=0): 82 | if (k not in coding) or (k not in noncoding): 83 | continue 84 | if coding[k]>0 and noncoding[k] >0: 85 | sum_of_log_ratio_0 += math.log( coding[k] / noncoding[k]) 86 | elif coding[k]>0 and noncoding[k] == 0: 87 | sum_of_log_ratio_0 += 1 88 | elif coding[k] == 0 and noncoding[k] == 0: 89 | continue 90 | elif coding[k] == 0 and noncoding[k] >0 : 91 | sum_of_log_ratio_0 -= 1 92 | else: 93 | continue 94 | frame0_count += 1 95 | ''' 96 | for k in word_generator(seq=seq, word_size = word_size, step_size=step_size,frame=1): 97 | if (not coding.has_key(k)) or (not noncoding.has_key(k)): 98 | continue 99 | if coding[k]>0 and noncoding[k] >0: 100 | sum_of_log_ratio_1 += math.log( coding[k] / noncoding[k]) 101 | elif coding[k]>0 and noncoding[k] == 0: 102 | sum_of_log_ratio_1 += 1 103 | elif coding[k] == 0 and noncoding[k] == 0: 104 | continue 105 | elif coding[k] == 0 and noncoding[k] >0 : 106 | sum_of_log_ratio_1 -= 1 107 | else: 108 | continue 109 | frame1_count += 1 110 | 111 | for k in word_generator(seq=seq, word_size = word_size, step_size=step_size,frame=2): 112 | if (not coding.has_key(k)) or (not noncoding.has_key(k)): 113 | continue 114 | if coding[k]>0 and noncoding[k] >0: 115 | sum_of_log_ratio_2 += math.log( coding[k] / noncoding[k]) 116 | elif coding[k]>0 and noncoding[k] == 0: 117 | sum_of_log_ratio_2 += 1 118 | elif coding[k] == 0 and noncoding[k] == 0: 119 | continue 120 | elif coding[k] == 0 and noncoding[k] >0 : 121 | sum_of_log_ratio_2 -= 1 122 | else: 123 | continue 124 | frame2_count += 1 125 | return max(sum_of_log_ratio_0/frame0_count, sum_of_log_ratio_1/frame1_count,sum_of_log_ratio_2/frame2_count) 126 | ''' 127 | return sum_of_log_ratio_0/frame0_count 128 | 129 | 130 | -------------------------------------------------------------------------------- /rseqc/qcmodule/PSL.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''manipulate blat PSL file.''' 3 | 4 | #import built-in modules 5 | import os,sys 6 | import re 7 | import string 8 | from optparse import OptionParser 9 | import warnings 10 | 11 | #import third-party modules 12 | 13 | #changes to the paths 14 | 15 | #changing history to this module 16 | 17 | 18 | __author__ = "Liguo Wang" 19 | __copyright__ = "Copyright 2010, Wei Li's Lab" 20 | __credits__ = [] 21 | __license__ = "GPL" 22 | __version__ = "1.0.1" 23 | __maintainer__ = "Liguo Wang" 24 | __email__ = "liguow@bcm.edu" 25 | __status__ = "Development" #Prototype or Production 26 | 27 | 28 | class PSL: 29 | '''manipulate PSL format file (blat output file)''' 30 | 31 | def __init__(self,inputfile_name,score=20,block=2): 32 | '''initialize this class 33 | arg1: inputfile psl file 34 | arg2: score cutoff. 
only report alignments with score >= this value. default=20 35 | arg3: block count cutoff. only report alignments with block number <= this value. default=2 36 | eg: a=PSL("filename",30,2) or a=PSL.PSL("filename",30,2) 37 | ''' 38 | self.__inputfile=open(inputfile_name,'r') 39 | self.__scoreCutoff=score #matched score smaller than this value will be removed 40 | self.__blockCutoff=block #matched blcok bigger than this value will be removed 41 | self.__pslLine=re.compile(r'^\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+[+-]') #use this re to remove head lines 42 | self.__pslSplit=re.compile(r'\s+') 43 | self.__blankLine=re.compile(r'\s*\n') 44 | self.__data=[] 45 | totalLine=0 46 | usedLine=0 47 | nonPslLine=0 48 | field=[] 49 | while True: 50 | self.__line=self.__inputfile.readline() 51 | if self.__blankLine.match(self.__line): #skip blank line 52 | continue 53 | elif self.__pslLine.match(self.__line): 54 | totalLine=totalLine+1 55 | field=self.__pslSplit.split(self.__line) 56 | if string.atoi(field[0])< self.__scoreCutoff: 57 | continue 58 | elif string.atoi(field[17]) > self.__blockCutoff: 59 | continue 60 | usedLine = usedLine+1 61 | self.__data.append(self.__line) 62 | 63 | else: 64 | nonPslLine = nonPslLine+1 65 | if not self.__line:break #end of file 66 | 67 | print "\nTotal: ", totalLine, "lines" 68 | print "Used: ", usedLine, "lines" 69 | print "Non-PSL: ", nonPslLine,"lines","\n" 70 | 71 | def head(self,limit=10): 72 | '''print out header lines of PSL file, default first 10 lines 73 | eg: a.head(50) 74 | ''' 75 | count=0 #count how many lines have been printed 76 | field=[] 77 | for line in self.__data: 78 | line=line.rstrip("\n") 79 | print line 80 | count=count+1 81 | if count >= limit:break 82 | 83 | def psl2bedFile(self,output_file=None): 84 | '''transform psl format into bed format. creat 6 column bed files. 85 | col1: target name, typically a chromosome 86 | col2: start coordinate 87 | col3: end coordinate 88 | col4: name of query. if a query is split. sequential number will be added 89 | at the end of the name 90 | col5: score. 
"matchScore.mismatch" 91 | col6: strand 92 | ''' 93 | 94 | if output_file is None: 95 | sys.stdout=sys.__stdout__ 96 | else: 97 | outfile=open(output_file,'w') 98 | sys.stdout=outfile 99 | field=[] 100 | blockSize=[] 101 | blockStart=[] 102 | for line in self.__data: 103 | line=line.rstrip('\n') 104 | field=self.__pslSplit.split(line) 105 | if string.atoi(field[17]) == 1: 106 | print field[13],"\t",field[15],"\t",field[16],"\t",field[9],"\t",field[0]+'.'+field[1],"\t",field[8] 107 | else: 108 | blockSize=field[18].split(',') 109 | blockSize.pop(-1) 110 | blockStart=field[20].split(',') 111 | blockStart.pop(-1) 112 | for i in range(0,len(blockSize)): 113 | print field[13],"\t",blockStart[i],"\t",string.atoi(blockStart[i])+string.atoi(blockSize[i]),"\t",field[9]+'.'+str(i+1),"\t",field[0]+'.'+field[1],"\t",field[8] 114 | 115 | def main(): 116 | parser=OptionParser() 117 | parser.add_option('-i','--input_file',dest="inputFileName",help="Input file name") 118 | parser.add_option('-o','--output_file',dest="outputFileName",help="Output file name") 119 | parser.add_option('-s','--score',dest="scoreCutoff",help="blat mapping score cutoff",type="int",default=20) 120 | parser.add_option('-b','--block_num',dest="blockCutoff",help="block number cutoff",type="int",default=2) 121 | (options,args)=parser.parse_args() 122 | 123 | obj=PSL(options.inputFileName,options.scoreCutoff,options.blockCutoff) 124 | obj.psl2bedFile(options.outputFileName) 125 | 126 | if __name__== '__main__': 127 | main() 128 | else: 129 | print >>sys.stderr, "module " + __name__ + " imported!" 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /rseqc/qcmodule/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MonashBioinformaticsPlatform/RSeQC/5658c4d7c5c1c9a8ece1461df82479b84c5509f8/rseqc/qcmodule/__init__.py -------------------------------------------------------------------------------- /rseqc/qcmodule/bam_cigar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | '''manipulate CIGAR string represented as list of tuples (BAM file) 4 | BAM OP Description 5 | 0 M alignment match 6 | 1 I insertion to read. (relative to reference genome) 7 | 2 D deletion from read. (relative to reference genome) 8 | 3 N skipped region from the reference 9 | 4 S soft clipping (clipped sequence present in SEQ) 10 | 5 H hard clipping (clipped sequences NOT present in SEQ) 11 | 6 P padding (silent deletion from padded reference) 12 | 7 = sequence match 13 | 8 X sequence mismatch 14 | 15 | Example: 16 | The tuple [ (0,3), (1,5), (0,2) ] refers to an alignment with 3 matches, 17 | 5 insertions and another 2 matches. 
18 | 19 | NOTE: only deal with Match, Gap, Soft Clip, Insertion, Deletion 20 | ''' 21 | 22 | __author__ = "Liguo Wang" 23 | __copyright__ = "Copyright 2012 Mayo Clinic" 24 | __credits__ = [] 25 | __license__ = "GPL" 26 | __version__ = "1.0.1" 27 | __maintainer__ = "Liguo Wang" 28 | __email__ = "wangliguo78@gmail.com" 29 | __status__ = "Development" #Prototype or Production 30 | 31 | 32 | def map_bounds(start_pos, cigar): 33 | '''return the start, end corrdinates (on genome) of mapped region''' 34 | span_size = 0 #the actual genomic region size that covered by read 35 | for c,s in cigar: #code and size 36 | if c == 0: #match 37 | span_size += s 38 | elif c == 1: #insertion to the ref 39 | continue 40 | elif c == 2: # deleltion 41 | span_size += s 42 | elif c == 3: # skipping 43 | span_size += s 44 | else: 45 | continue 46 | return (start_pos, start_pos+span_size) 47 | 48 | 49 | def fetch_exon(chrom, st, cigar): 50 | ''' fetch exon regions defined by cigar. st must be zero based 51 | return list of tuple of (chrom,st, end) 52 | ''' 53 | #match = re.compile(r'(\d+)(\D)') 54 | chrom_st = st 55 | exon_bound =[] 56 | for c,s in cigar: #code and size 57 | if c==0: #match 58 | exon_bound.append((chrom, chrom_st,chrom_st + s)) 59 | chrom_st += s 60 | elif c==1: #insertion to ref 61 | continue 62 | elif c==2: #deletion to ref 63 | chrom_st += s 64 | elif c==3: #gap or intron 65 | chrom_st += s 66 | elif c==4: #soft clipping. We do NOT include soft clip as part of exon 67 | chrom_st += s 68 | else: 69 | continue 70 | return exon_bound 71 | 72 | def fetch_intron(chrom, st, cigar): 73 | ''' fetch intron regions defined by cigar. st must be zero based 74 | return list of tuple of (chrom,st, end) 75 | ''' 76 | #match = re.compile(r'(\d+)(\D)') 77 | chrom_st = st 78 | intron_bound =[] 79 | for c,s in cigar: #code and size 80 | if c==0: #match 81 | chrom_st += s 82 | elif c==1: #insertion to ref 83 | continue 84 | elif c==2: #deletion to ref 85 | chrom_st += s 86 | elif c==3: #gap or intron 87 | intron_bound.append((chrom, chrom_st,chrom_st+s)) 88 | chrom_st += s 89 | elif c==4: #soft clipping. We do NOT include soft clip as part of exon 90 | #chrom_st += s 91 | continue 92 | else: 93 | continue 94 | return intron_bound 95 | 96 | def fetch_clip(chrom, st, cigar): 97 | ''' fetch head soft clip regions defined by cigar. st must be zero based 98 | return list of tuple of (chrom,st, end) 99 | ''' 100 | #match = re.compile(r'(\d+)(\D)') 101 | chrom_st = st 102 | clip_bound =[] 103 | for c,s in cigar: #code and size 104 | if c==0: #match 105 | chrom_st += s 106 | elif c==1: #insertion to ref 107 | continue 108 | elif c==2: #deletion to ref 109 | chrom_st += s 110 | elif c==3: #gap or intron 111 | chrom_st += s 112 | elif c==4: #soft clipping. We do NOT include soft clip as part of exon 113 | clip_bound.append((chrom, chrom_st, chrom_st + s)) 114 | chrom_st += s 115 | else: 116 | continue 117 | return clip_bound 118 | 119 | def fetch_deletion(chrom, st, cigar): 120 | ''' fetch deletion regions defined by cigar. st must be zero based 121 | return list of tuple of (chrom,st, end) 122 | ''' 123 | #match = re.compile(r'(\d+)(\D)') 124 | chrom_st = st 125 | del_bound =[] 126 | for c,s in cigar: #code and size 127 | if c==0: #match 128 | chrom_st += s 129 | elif c==1: #insertion to ref 130 | continue 131 | elif c==2: #deletion to ref 132 | del_bound.append((chrom, chrom_st, chrom_st + s)) 133 | chrom_st += s 134 | elif c==3: #gap or intron 135 | chrom_st += s 136 | elif c==4: #soft clipping. 
We do NOT include soft clip as part of exon 137 | chrom_st += s 138 | else: 139 | continue 140 | return del_bound 141 | 142 | def fetch_deletion_range(cigar): 143 | ''' fetch deletion regions defined by cigar. st must be zero based 144 | return list of tuple of (st, end). 'st','end' is relative to the 145 | start of read. 146 | ''' 147 | del_bound =[] 148 | st = 0 149 | for c,s in cigar: #code and size 150 | if c==0: #match 151 | st += s 152 | elif c==4: 153 | st += s #soft clip 154 | elif c==1: #insertion to ref 155 | st += s 156 | elif c==2: #deletion to ref 157 | del_bound.append((st,s)) #only record the start position of deletion, and the deletion size 158 | elif c==3: #gap or intron 159 | continue 160 | else: 161 | continue 162 | return del_bound 163 | 164 | 165 | def fetch_insertion_range(cigar): 166 | ''' fetch insertion regions defined by cigar. st must be zero based 167 | return list of tuple of (st, end). 'st','end' is relative to the 168 | start of read. 169 | ''' 170 | ins_bound =[] 171 | st = 0 172 | for c,s in cigar: #code and size 173 | if c==0: #match 174 | st += s 175 | elif c==4: 176 | st += s #soft clip 177 | elif c==1: #insertion to ref 178 | ins_bound.append((st, s)) 179 | st += s 180 | elif c==2: #deletion to ref 181 | continue 182 | elif c==3: #gap or intron 183 | continue 184 | else: 185 | continue 186 | return ins_bound 187 | 188 | 189 | 190 | def fetch_insertion(chrom, st, cigar): 191 | ''' fetch insertion regions defined by cigar. st must be zero based 192 | return list of tuple of (chrom,st, end) 193 | 194 | NOTE: insertion region does NOT present in reference genome and there 195 | fore cannot represented using reference coordinates[start, end]. So we 196 | use [start, SIZE). 197 | [(100,2)] means 2nt insert between 100 and 101 198 | ''' 199 | #match = re.compile(r'(\d+)(\D)') 200 | chrom_st = st 201 | ins_bound =[] 202 | for c,s in cigar: #code and size 203 | if c==0: #match 204 | chrom_st += s 205 | elif c==1: #insertion to ref 206 | ins_bound.append((chrom, chrom_st, s)) 207 | continue 208 | elif c==2: #deletion to ref 209 | chrom_st += s 210 | elif c==3: #gap or intron 211 | chrom_st += s 212 | elif c==4: #soft clipping. 
We do NOT include soft clip as part of exon 213 | chrom_st += s 214 | else: 215 | continue 216 | return ins_bound 217 | 218 | 219 | def list2str (lst): 220 | '''translate samtools returned cigar_list into cigar_string 221 | 222 | [(4, 1), (0, 9)] ==> '1S9M' 223 | ''' 224 | code2Char={'0':'M','1':'I','2':'D','3':'N','4':'S','5':'H','6':'P','7':'=','8':'X'} 225 | 226 | cigar_str='' 227 | for i in lst: 228 | cigar_str += str(i[1]) + code2Char[str(i[0])] 229 | return cigar_str 230 | 231 | def list2longstr (lst): 232 | '''translate samtools returned cigar_list into LONG cigar_string 233 | Sum of lengths of the M/I/S/=/X operations shall equal the length of SEQ 234 | ''' 235 | 236 | code2Char={'0':'M','1':'I','2':'D','3':'N','4':'S','5':'H','6':'P','7':'=','8':'X'} 237 | 238 | cigar_str='' 239 | for i in lst: 240 | (code,length) = (i[0],i[1]) 241 | if code in (0,1,4,7,8): 242 | cigar_str += code2Char[str(code)] * int(length) 243 | else: 244 | continue 245 | return cigar_str 246 | 247 | -------------------------------------------------------------------------------- /rseqc/qcmodule/changePoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''manipulate fasta for fastq format files.''' 3 | 4 | #import built-in modules 5 | import re 6 | import sys 7 | from string import maketrans 8 | from random import shuffle 9 | from heapq import nlargest 10 | #import third-party modules 11 | 12 | #changes to the paths 13 | 14 | #changing history to this module 15 | 16 | 17 | __author__ = "Liguo Wang" 18 | __copyright__ = "Copyright 2010, Wei Li's Lab" 19 | __credits__ = [] 20 | __license__ = "GPL" 21 | __version__ = "1.0.1" 22 | __maintainer__ = "Liguo Wang" 23 | __email__ = "liguow@bcm.edu" 24 | __status__ = "Development" #Prototype or Production 25 | 26 | 27 | def S_diff(lst): 28 | '''Given a list of int or float, calculate S_diff and S_point''' 29 | 30 | S_avg = sum(lst) / len(lst) 31 | S_dist = [i-S_avg for i in lst] #distance to average 32 | S_cum=[] #list of cumulative sum 33 | S_cum.append(0) 34 | for i in range(0,len(S_dist)): 35 | S_cum.append(S_cum[i] + S_dist[i]) 36 | return [nlargest(1,range(0,len(S_cum)),key=lambda i: S_cum[i]),(max(S_cum) - min(S_cum))] 37 | #return the index of maximum_diff index, and maximum_diff 38 | 39 | def bootstrap(lst,obs,rep=1000): 40 | '''Given a list of int or float (lst) and an observation value(obs). 
calcualte the chance (pvalue) 41 | of getting this observation through bootstrapping.''' 42 | 43 | shuffled_diff=[] 44 | count=0 45 | tmp=lst 46 | for i in range(0,rep): 47 | shuffle(tmp) 48 | shuffled_diff.append(S_diff(tmp)) 49 | 50 | for i in sorted(shuffled_diff): 51 | if (i>=obs): 52 | count += 1 53 | if count/rep <0.5: 54 | return count/rep 55 | else: 56 | return 1- count/rep 57 | -------------------------------------------------------------------------------- /rseqc/qcmodule/cigar.py: -------------------------------------------------------------------------------- 1 | '''manipulate CIGAR string''' 2 | 3 | #import built-in modules 4 | import os,sys 5 | import re 6 | import string 7 | import warnings 8 | import string 9 | import collections 10 | import math 11 | import sets 12 | import random 13 | 14 | head_clip = re.compile(r'^(\d+)S') 15 | tail_clip = re.compile(r'(\d+)S$') 16 | insertion = re.compile(r'(\d+)I') 17 | deletion = re.compile(r'(\d+)D') 18 | matching = re.compile(r'(\d+)M') 19 | skipping = re.compile(r'(\d+)N') 20 | read_part = re.compile(r'(\d+)[MIS=X]') 21 | ref_part = re.compile(r'(\d+)[MISND=X]') 22 | 23 | prior_insertion = re.compile(r'(.+?)(\d+)I') 24 | prior_deletion = re.compile(r'(.+?)(\d+)D') 25 | prior_intron = re.compile(r'(.+?)(\d+)N') 26 | prior_exon = re.compile(r'(.*?)(\d+)M') 27 | 28 | def fetch_head_clip(chr, st, cigar): 29 | '''return genome coordinate of the head clip part encoded in cigar string 30 | NOTE: returned coordinates are 0-based.NOTE: st is 0-based''' 31 | 32 | block=[] 33 | chrom_end = int(st) 34 | tmp = head_clip.findall(cigar) 35 | if len(tmp)==0: 36 | return block 37 | else: 38 | chrom_st = chrom_end - int(tmp[0]) 39 | block.append([chr,chrom_st,chrom_end]) 40 | return block 41 | 42 | def fetch_tail_clip(chr, st, cigar): 43 | '''return genome coordinates of the tail clip part encoded in cigar string 44 | NOTE: returned coordinates are 0-based . NOTE: st is 0-based''' 45 | 46 | block=[] 47 | h = head_clip.findall(cigar) 48 | t = tail_clip.findall(cigar) 49 | if len(t)==0:return block 50 | else:t_len = int(t[0]) 51 | 52 | if len(h)==0:h_len=0 53 | else:h_len=int(h[0]) 54 | ref_length = sum([int(i) for i in ref_part.findall(cigar)]) 55 | #print read_length 56 | chrom_end = int(st) + (ref_length - h_len) #because SAM is 1-based 57 | chrom_st = chrom_end - t_len 58 | block.append([chr,chrom_st,chrom_end]) 59 | return block 60 | 61 | def fetch_insertion(chr, st, cigar): 62 | '''return genome coordinates of the insertion (to reference) encoded in cigar string 63 | NOTE: returned coordinates are 0-based. Insertion to the reference. NOTE: st is 0-based''' 64 | 65 | block=[] 66 | h = head_clip.findall(cigar) 67 | if len(h)==0:h_len=0 68 | else:h_len=int(h[0]) 69 | 70 | ref_length=0 71 | m = prior_insertion.findall(cigar) 72 | if len(m)==0: 73 | return block 74 | else: 75 | for j in m: 76 | ref_length += sum([int(i) for i in ref_part.findall(j[0])]) 77 | chrom_st = int(st) + (ref_length - h_len) 78 | chrom_end = chrom_st + int(j[1]) 79 | block.append([chr,chrom_st,chrom_end]) 80 | return block 81 | 82 | def fetch_deletion(chr, st, cigar): 83 | '''return genome coordinates of the insertion (to reference) encoded in cigar string 84 | NOTE: returned coordinates are 0-based. Deletion to the reference. 
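For example, fetch_deletion('chr1', 100, '5S10M2D20M') would return [['chr1', 110, 112]]: the 10 M bases
cover 100-110 and the 2-base deletion then spans 110-112.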
NOTE: st is 0-based''' 85 | 86 | block=[] 87 | h = head_clip.findall(cigar) 88 | if len(h)==0:h_len=0 89 | else:h_len=int(h[0]) 90 | 91 | ref_length=0 92 | m = prior_deletion.findall(cigar) 93 | if len(m)==0: 94 | return block 95 | else: 96 | for j in m: 97 | ref_length += sum([int(i) for i in ref_part.findall(j[0])]) 98 | chrom_st = int(st) + (ref_length - h_len) 99 | chrom_end = chrom_st + int(j[1]) 100 | block.append([chr,chrom_st,chrom_end]) 101 | ref_length += int(j[1]) 102 | return block 103 | 104 | 105 | def fetch_intron(chr, st, cigar): 106 | '''return genome coordinates of the introns encoded in cigar string 107 | NOTE: returned coordinates are 0-based. Deletion to the reference NOTE: 108 | st is 0-based ''' 109 | 110 | block=[] 111 | h = head_clip.findall(cigar) 112 | if len(h)==0:h_len=0 113 | else:h_len=int(h[0]) 114 | 115 | ref_length=0 116 | m = prior_intron.findall(cigar) 117 | if len(m)==0: 118 | return block 119 | else: 120 | for j in m: 121 | ref_length += sum([int(i) for i in ref_part.findall(j[0])]) 122 | chrom_st = int(st) + (ref_length - h_len) 123 | chrom_end = chrom_st + int(j[1]) 124 | block.append([chr,chrom_st,chrom_end]) 125 | ref_length += int(j[1]) 126 | return block 127 | 128 | def fetch_exon(chr, st, cigar): 129 | '''return genome coordinates of the exon encoded in cigar string 130 | NOTE: returned coordinates are 0-based. NOTE: st is 0-based''' 131 | 132 | block=[] 133 | h = head_clip.findall(cigar) 134 | if len(h)==0:h_len=0 135 | else:h_len=int(h[0]) 136 | 137 | ref_length=0 138 | m = prior_exon.findall(cigar) 139 | if len(m)==0: 140 | return block 141 | else: 142 | for j in m: 143 | ref_length += sum([int(i) for i in ref_part.findall(j[0])]) 144 | chrom_st = int(st) + (ref_length - h_len) 145 | chrom_end = chrom_st + int(j[1]) 146 | block.append([chr,chrom_st,chrom_end]) 147 | ref_length += int(j[1]) 148 | return block 149 | 150 | def list2str (lst): 151 | '''translate samtools returned cigar_list into cigar_string''' 152 | code2Char={'0':'M','1':'I','2':'D','3':'N','4':'S','5':'H','6':'P','7':'=','8':'X'} 153 | 154 | cigar_str='' 155 | for i in lst: 156 | cigar_str += str(i[1]) + code2Char[str(i[0])] 157 | return cigar_str 158 | -------------------------------------------------------------------------------- /rseqc/qcmodule/dotProduct.py: -------------------------------------------------------------------------------- 1 | import timeit # module with timing subroutines 2 | import random # module to generate random numnbers 3 | from itertools import imap,starmap,izip 4 | from operator import mul 5 | 6 | def v(N=50,min=-10,max=10): 7 | """Generates a random vector (in an array) of dimension N; the 8 | values are integers in the range [min,max].""" 9 | out = [] 10 | for k in range(N): 11 | out.append(random.randint(min,max)) 12 | return out 13 | 14 | def check(v1,v2): 15 | if len(v1)!=len(v2): 16 | raise ValueError,"the lenght of both arrays must be the same" 17 | pass 18 | 19 | def d0(v1,v2): 20 | """ 21 | d0 is Nominal approach: 22 | multiply/add in a loop 23 | """ 24 | check(v1,v2) 25 | out = 0 26 | for k in range(len(v1)): 27 | out += v1[k] * v2[k] 28 | return out 29 | 30 | def d1(v1,v2): 31 | """ 32 | d1 uses an imap (from itertools) 33 | """ 34 | check(v1,v2) 35 | return sum(imap(mul,v1,v2)) 36 | 37 | def d2(v1,v2): 38 | """ 39 | d2 uses a conventional map 40 | """ 41 | check(v1,v2) 42 | return sum(map(mul,v1,v2)) 43 | 44 | def d3(v1,v2): 45 | """ 46 | d3 uses a starmap (itertools) to apply the mul operator on an izipped (v1,v2) 47 | """ 48 | 
check(v1,v2) 49 | return starmap(mul,izip(v1,v2)) 50 | -------------------------------------------------------------------------------- /rseqc/qcmodule/fasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''manipulate fasta for fastq format files.''' 3 | 4 | #import built-in modules 5 | import numpy 6 | import re 7 | import sys 8 | import string 9 | from optparse import OptionParser 10 | import collections 11 | from rseqc.qcmodule import FrameKmer 12 | #import third-party modules 13 | 14 | #changes to the paths 15 | 16 | #changing history to this module 17 | 18 | 19 | __author__ = "Liguo Wang" 20 | __copyright__ = "Copyright 2010, Wei Li's Lab" 21 | __credits__ = [] 22 | __license__ = "GPL" 23 | __version__ = "1.0.1" 24 | __maintainer__ = "Liguo Wang" 25 | __email__ = "liguow@bcm.edu" 26 | __status__ = "Development" #Prototype or Production 27 | 28 | 29 | 30 | class Fasta: 31 | '''manipulate fasta or fastq format file 32 | ''' 33 | 34 | def __init__(self,fastafile=None): 35 | '''initialize object, lowercase in sequence is automatically converted into uppercase''' 36 | self.seqs={} 37 | self.IDs=[] 38 | self.transtab = string.maketrans("ACGTNX","TGCANX") 39 | self.filename = fastafile 40 | tmpseq='' 41 | if fastafile is not None: 42 | for line in open(fastafile,'r'): 43 | line=line.strip(' \n') 44 | if line.startswith('>'): 45 | if(tmpseq): 46 | self.seqs[name]=tmpseq 47 | name=line[1:] 48 | tmpseq ='' 49 | self.IDs.append(name) 50 | print("\tloading "+name+' ...', file=sys.stderr) 51 | else: 52 | tmpseq += line.upper() 53 | self.seqs[name]=tmpseq 54 | 55 | def addSeq(self,id,seq): 56 | '''add sequence to current data''' 57 | if id in self.seqs: 58 | print(id +" already exists!", file=sys.stderr) 59 | return 60 | else: 61 | self.seqs[id]=seq.upper() 62 | self.IDs.append(id) 63 | 64 | def getNames (self,file=None): 65 | '''return all sequence IDs''' 66 | return self.IDs 67 | 68 | def getSeq(self,seqID=None): 69 | '''return sequence for sepcified seqID, otherwise all sequences are returned''' 70 | if seqID is None: 71 | return list(self.seqs.values()) 72 | else: 73 | return self.seqs[seqID] 74 | 75 | def printSeqs(self,n=50): 76 | '''print all seqs ''' 77 | for k,v in list(self.seqs.items()): 78 | print('>' + k) 79 | for i in range(0, len(v), n): 80 | print(v[i:i+n]) 81 | 82 | 83 | def getSeqLen(self,seqID=None): 84 | seqlen=collections.defaultdict(dict) 85 | if seqID is None: 86 | for (k,v) in list(self.seqs.items()): 87 | seqlen[k]=len(v) 88 | else: 89 | try: 90 | seqlen[seqID]=len(self.seqs[seqID]) 91 | except: 92 | print("Not found", file=sys.stderr) 93 | return seqlen 94 | 95 | def countBase(self,pattern=None): 96 | '''count occurence of substring (defined by pattern), otherwise count A,C,G,T,N,X 97 | NOTE: pattern is counted non-overlappingly''' 98 | if pattern is None: 99 | print("ID\tTotal\tA\tC\tG\tT\tN\tX") 100 | for (k,v) in list(self.seqs.items()): 101 | print(k+"\t", end=' ') 102 | print(len(v),"\t", end=' ') 103 | print(str(v.count('A'))+"\t", end=' ') 104 | print(str(v.count('C'))+"\t", end=' ') 105 | print(str(v.count('G'))+"\t", end=' ') 106 | print(str(v.count('T'))+"\t", end=' ') 107 | print(str(v.count('N'))+"\t", end=' ') 108 | print(v.count('X')) 109 | else: 110 | for (k,v) in list(self.seqs.items()): 111 | print(k+"\t", end=' ') 112 | print(str(len(v))+"\t", end=' ') 113 | print(v.count(pattern)) 114 | def cal_entropy(self,l=3): 115 | '''calculate entropy for each sequence''' 116 | for (id,seq) in 
list(self.seqs.items()): 117 | entropy = 0 118 | dna_chars_uniq = FrameKmer.all_possible_kmer(l) 119 | dna_len = len(seq) 120 | 121 | for c in dna_chars_uniq: 122 | if 'N' in c: 123 | continue 124 | prop = seq.count(c)/(1.0*dna_len) 125 | if prop ==0: 126 | continue 127 | information = numpy.log2(1.0/prop) 128 | entropy += prop * information 129 | yield(id, entropy) 130 | 131 | def revComp(self,seqID=None): 132 | '''return reverse-complemented sequence for sepcified seqID, otherwise all sequences are 133 | reverse-complemented''' 134 | if seqID is None: 135 | for (k,v) in list(self.seqs.items()): 136 | print(">" + k + "_rev") 137 | tmp = v.translate(self.transtab) 138 | return tmp[::-1] 139 | else: 140 | return self.seqs[seqID].translate(self.transtab)[::-1] 141 | 142 | 143 | def getUniqSeqs(self): 144 | '''remove redundancy from original fasta files. 145 | duplicated sequences will be only report once''' 146 | 147 | seq2Name={} 148 | seq2Count={} 149 | for (key,value) in list(self.seqs.items()): 150 | seq2Name[value]=key 151 | if value in seq2Count: 152 | seq2Count[value]+=1 153 | else: 154 | seq2Count[value]=1 155 | for value in list(seq2Name.keys()): 156 | print('>'+ str(seq2Name[value]) + '_' + str(seq2Count[value])) 157 | print(value) 158 | 159 | 160 | def findPattern(self,pat,outfile,seqID=None,rev=True): 161 | ''' find pattern in all sequence unless seqID is specified, coordinates will be returned as bed format file''' 162 | 163 | fout=open(outfile,'w') 164 | length=len(pat) 165 | 166 | Pat=pat.upper() 167 | start=0 168 | 169 | 170 | if seqID is None: 171 | for (k,v) in list(self.seqs.items()): 172 | loopSwitch=0 173 | start=0 174 | while loopSwitch !=-1: 175 | loopSwitch = v.find(Pat,start) 176 | if loopSwitch !=-1: 177 | print(k + "\t" + str(loopSwitch) + "\t" + str(loopSwitch + length) + "\t" + Pat + "\t0\t+", file=fout) 178 | start = loopSwitch +1 179 | 180 | else: 181 | loopSwitch=0 182 | start=0 183 | while loopSwitch !=-1: 184 | loopSwitch = self.seqs[seqID].find(Pat,start) 185 | print(seqID + "\t" + str(loopSwitch) + "\t" + str(loopSwitch + length) + "\t" + Pat + "\t0\t+", file=fout) 186 | start = loopSwitch +1 187 | 188 | if rev==True: 189 | Pat_rev=Pat.translate(self.transtab)[::-1] 190 | if seqID is None: 191 | for (k,v) in list(self.seqs.items()): 192 | loopSwitch=0 193 | start=0 194 | while loopSwitch !=-1: 195 | loopSwitch = v.find(Pat_rev,start) 196 | if loopSwitch !=-1: 197 | print(k + "\t" + str(loopSwitch) + "\t" + str(loopSwitch + length) + "\t" + Pat + "\t0\t-", file=fout) 198 | start = loopSwitch +1 199 | 200 | else: 201 | loopSwitch=0 202 | start=0 203 | while loopSwitch !=-1: 204 | loopSwitch = self.seqs[seqID].find(Pat_rev,start) 205 | print(seqID + "\t" + str(loopSwitch) + "\t" + str(loopSwitch + length) + "\t" + Pat + "\t0\t-", file=fout) 206 | start = loopSwitch +1 207 | 208 | def fetchSeq(self,chr=None,st=None,end=None,infile=None,outfile=None): 209 | ''' Fetching sequence based on chrName (should be exactly the same as fasta file), St, End. 210 | NOTE: the coordinate is 0-based,half-open. use infile to specify multiple coordinates. 
infile 211 | should be bed3, bed6 or bed12''' 212 | 213 | if (infile is not None) and (outfile is not None): 214 | fout=open(outfile,'w') 215 | for line in open(infile): 216 | fields=line.strip().split() 217 | if (len(fields)==3): 218 | print(fields[0]+":"+fields[1]+"-"+fields[2]+"\t"+"strand=+", file=fout) 219 | print(self.seqs[fields[0]][int(fields[1]):int(fields[2])].upper(), file=fout) 220 | elif (len(fields)>3): 221 | if fields[5]=='-': 222 | print(fields[0]+":"+fields[1]+"-"+fields[2]+"\t"+"strand=-", file=fout) 223 | print(self.seqs[fields[0]][int(fields[1]):int(fields[2])].translate(self.transtab)[::-1].upper(), file=fout) 224 | else: 225 | print(fields[0]+":"+fields[1]+"-"+fields[2]+"\t"+"strand=+", file=fout) 226 | print(self.seqs[fields[0]][int(fields[1]):int(fields[2])].upper(), file=fout) 227 | else: 228 | try: 229 | return self.seqs[chr][st:end].upper() 230 | except: 231 | print("cannot fetch sequence from " + self.filename + " for " + chr + ":" + str(st) + "-" + str(end), file=sys.stderr) 232 | return '' 233 | #print >>sys.stderr, chr + "\t" + str(st) +'\t' + str(end) + " Please input chr,st,end" 234 | 235 | def main(): 236 | parser = OptionParser() 237 | parser.add_option("-i","--input_file",dest="in_file",help="input file name") 238 | (options,args)=parser.parse_args() 239 | obj=Fasta(options.in_file) 240 | obj.printSeqs(n=80) 241 | 242 | if __name__ == "__main__": 243 | main() 244 | 245 | -------------------------------------------------------------------------------- /rseqc/qcmodule/fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''manipulate fastq files''' 3 | 4 | #import built-in modules 5 | import os,sys 6 | import re 7 | import string 8 | from optparse import OptionParser 9 | import warnings 10 | import string 11 | import collections 12 | import math 13 | 14 | #import third-party modules 15 | from bx.bitset import * 16 | from bx.bitset_builders import * 17 | from bx.intervals import * 18 | 19 | #changes to the paths 20 | 21 | #changing history to this module 22 | 23 | 24 | __author__ = "Liguo Wang" 25 | __copyright__ = "Copyright 2010, Wei Li's Lab" 26 | __credits__ = [] 27 | __license__ = "GPL" 28 | __version__ = "1.0.1" 29 | __maintainer__ = "Liguo Wang" 30 | __email__ = "liguow@bcm.edu" 31 | __status__ = "Development" #Prototype or Production 32 | 33 | 34 | 35 | class FQ: 36 | '''provides method to processing fastaq files''' 37 | 38 | def __init__(self,fqFile): 39 | '''This is constructor of FQ''' 40 | self.f=open(fqFile,'r') 41 | self.fileName=os.path.basename(fqFile) 42 | self.ABS_fileName=fqFile 43 | -------------------------------------------------------------------------------- /rseqc/qcmodule/fickett.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''calculate coding potential''' 3 | 4 | # Fickett TESTCODE data 5 | # NAR 10(17) 5303-531 6 | position_prob ={ 7 | 'A':[0.94,0.68,0.84,0.93,0.58,0.68,0.45,0.34,0.20,0.22], 8 | 'C':[0.80,0.70,0.70,0.81,0.66,0.48,0.51,0.33,0.30,0.23], 9 | 'G':[0.90,0.88,0.74,0.64,0.53,0.48,0.27,0.16,0.08,0.08], 10 | 'T':[0.97,0.97,0.91,0.68,0.69,0.44,0.54,0.20,0.09,0.09] 11 | } 12 | position_weight={'A':0.26,'C':0.18,'G':0.31,'T':0.33} 13 | position_para =[1.9,1.8,1.7,1.6,1.5,1.4,1.3,1.2,1.1,0.0] 14 | 15 | content_prob={ 16 | 'A':[0.28,0.49,0.44,0.55,0.62,0.49,0.67,0.65,0.81,0.21], 17 | 'C':[0.82,0.64,0.51,0.64,0.59,0.59,0.43,0.44,0.39,0.31], 18 | 
'G':[0.40,0.54,0.47,0.64,0.64,0.73,0.41,0.41,0.33,0.29], 19 | 'T':[0.28,0.24,0.39,0.40,0.55,0.75,0.56,0.69,0.51,0.58] 20 | } 21 | content_weight={'A':0.11,'C':0.12,'G':0.15,'T':0.14} 22 | content_para =[0.33,0.31,0.29,0.27,0.25,0.23,0.21,0.17,0] 23 | 24 | def look_up_position_prob(value, base): 25 | '''look up positional probability by base and value''' 26 | if float(value)<0: 27 | return None 28 | for idx,val in enumerate (position_para): 29 | if (float(value) >= val): 30 | return float(position_prob[base][idx]) * float(position_weight[base]) 31 | 32 | def look_up_content_prob(value, base): 33 | '''look up content probability by base and value''' 34 | if float(value)<0: 35 | return None 36 | for idx,val in enumerate (content_para): 37 | if (float(value) >= val): 38 | return float(content_prob[base][idx]) * float(content_weight[base]) 39 | 40 | def fickett_value(dna): 41 | '''calculate Fickett value. Input is DNA sequence''' 42 | if (len(dna)<1): 43 | return None 44 | fickett_score=0 45 | dna=dna.upper() 46 | total_base = len(dna) 47 | A_content = float(dna.count('A'))/total_base 48 | C_content = float(dna.count('C'))/total_base 49 | G_content = float(dna.count('G'))/total_base 50 | T_content = float(dna.count('T'))/total_base 51 | #print "A content\t" + str(A_content) 52 | #print "C content\t" + str(C_content) 53 | #print "G content\t" + str(G_content) 54 | #print "T content\t" + str(T_content) 55 | 56 | phase_0 = [dna[i] for i in range(0,len(dna)) if i % 3==0] 57 | phase_1 = [dna[i] for i in range(0,len(dna)) if i % 3==1] 58 | phase_2 = [dna[i] for i in range(0,len(dna)) if i % 3==2] 59 | 60 | A_position=max(phase_0.count('A'),phase_1.count('A'),phase_2.count('A'))/(min(phase_0.count('A'),phase_1.count('A'),phase_2.count('A')) +1.0) 61 | C_position=max(phase_0.count('C'),phase_1.count('C'),phase_2.count('C'))/(min(phase_0.count('C'),phase_1.count('C'),phase_2.count('C')) +1.0) 62 | G_position=max(phase_0.count('G'),phase_1.count('G'),phase_2.count('G'))/(min(phase_0.count('G'),phase_1.count('G'),phase_2.count('G')) +1.0) 63 | T_position=max(phase_0.count('T'),phase_1.count('T'),phase_2.count('T'))/(min(phase_0.count('T'),phase_1.count('T'),phase_2.count('T')) +1.0) 64 | #print "A position\t" + str(A_position) 65 | #print "C position\t" + str(C_position) 66 | #print "G position\t" + str(G_position) 67 | #print "T position\t" + str(T_position) 68 | 69 | 70 | #for i (A_content,C_content,G_content,T_content): 71 | fickett_score += look_up_content_prob(A_content,'A') 72 | fickett_score += look_up_content_prob(C_content,'C') 73 | fickett_score += look_up_content_prob(G_content,'G') 74 | fickett_score += look_up_content_prob(T_content,'T') 75 | 76 | fickett_score += look_up_position_prob(A_position,'A') 77 | fickett_score += look_up_position_prob(C_position,'C') 78 | fickett_score += look_up_position_prob(G_position,'G') 79 | fickett_score += look_up_position_prob(T_position,'T') 80 | 81 | return fickett_score 82 | 83 | 84 | -------------------------------------------------------------------------------- /rseqc/qcmodule/getBamFiles.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | from os.path import abspath,join, getsize 3 | """ 4 | Get bam files from input, input could be: 5 | 1) directory that containing one or more bam files 6 | 2) plain text file containing one or more bam file paths 7 | 3) a single bam file 8 | 4) ',' separated multiple bam files 9 | 10 | in all cases, the index .bai file(s) should be exist in the same location. 
eg, if test.bam 11 | exists, test.bam.bai must also exist. 12 | """ 13 | 14 | def isbamfile(infile): 15 | '''check if it is bam file, if it is empty and if the .bai file exists''' 16 | if os.path.isfile(infile) and infile[-4:].lower() == '.bam': 17 | if getsize(infile) != 0: 18 | if os.path.isfile(infile + '.bai'): 19 | return True 20 | else: 21 | print >>sys.stderr, "Warning: %s.bai does not exist! Skip it." % (infile) 22 | return False 23 | else: 24 | print >>sys.stderr, "The size of %s is 0! Skip it." % (infile) 25 | return False 26 | else: 27 | return False 28 | 29 | def get_bam_files (input,printit=False): 30 | bam_files = [] 31 | 32 | #dir 33 | if os.path.isdir(input): 34 | for root, directories, files in os.walk(input,followlinks=True): 35 | full_names = [join(abspath(root), name) for name in files] 36 | for fn in full_names: 37 | if isbamfile(fn): 38 | bam_files.append(fn) 39 | #single bam file 40 | elif isbamfile(input): 41 | bam_files.append(input) 42 | #plain text file 43 | elif os.path.isfile(input): 44 | try: 45 | for line in open(input): 46 | line = line.strip() 47 | if line.startswith('#'):continue 48 | if isbamfile(line): 49 | bam_files.append(line) 50 | except: 51 | pass 52 | else: 53 | tmp = input.split(',') 54 | if len(tmp) <2: pass 55 | for i in tmp: 56 | if isbamfile(i): 57 | bam_files.append(i) 58 | 59 | if printit: 60 | for i in bam_files: 61 | print i 62 | return bam_files 63 | 64 | 65 | if __name__ == '__main__': 66 | get_bam_files(sys.argv[1]) -------------------------------------------------------------------------------- /rseqc/qcmodule/mystat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | #import built-in modules 4 | import os,sys 5 | import re 6 | import string 7 | from optparse import OptionParser 8 | import warnings 9 | import string 10 | import collections 11 | import math 12 | 13 | #import third-party modules 14 | 15 | #changes to the paths 16 | 17 | #changing history to this module 18 | #05/26/2011: suppport multiple spliced mapped reads 19 | 20 | __author__ = "Liguo Wang" 21 | __copyright__ = "Copyright 2010, Wei Li's Lab" 22 | __credits__ = [] 23 | __license__ = "GPL" 24 | __version__ = "1.1.0" 25 | __maintainer__ = "Liguo Wang" 26 | __email__ = "liguow@bcm.edu" 27 | __status__ = "Development" #Prototype or Production 28 | 29 | 30 | def RSS(arg): 31 | '''calculate Square root of sum of square. Input is ',' separated numbers''' 32 | lst=arg.split(',') 33 | lst_sum=0 34 | for i in [ int(i)**2 for i in lst]: 35 | lst_sum += i 36 | #nsr=10*math.log10((1+noi_sum**0.5)/(1+sig_sum**0.5)) 37 | return lst_sum**0.5 38 | 39 | def H_mean(arg): 40 | '''calculate harmornic mean. 
Input is ',' separated numbers''' 41 | lst=[1/float(i) for i in arg.split(',') if float(i) !=0] 42 | if len(lst) == 0: 43 | return "NA" 44 | else: 45 | return len(lst)/(sum(lst)) 46 | 47 | def shannon_entropy(arg): 48 | '''calculate shannon's entropy (or Shannon-Wiener index).''' 49 | lst=arg 50 | lst=[float(i) for i in lst if float(i)>0] 51 | entropy=0.0 52 | for i in lst: 53 | entropy += (i/sum(lst)) * math.log((i/sum(lst))) 54 | if entropy == 0: 55 | return 0 56 | else: 57 | return -entropy 58 | 59 | 60 | def shannon_entropy_es(arg): 61 | '''calculate estimator of shannon's entropy (Chao & Shen, 2003)''' 62 | lst=arg 63 | lst=[float(i) for i in lst if float(i)>0] 64 | if sum(lst)<=0 or min(lst)<0:return "NA" #if there is no fragmental splicing 65 | if (len(lst)==1): return 0 #if there is only 1 fragmental splicing 66 | lst.append(2) 67 | 68 | #estimate C_bar 69 | singleton=0 70 | entropy=0.0 71 | for i in lst: 72 | if i ==1:singleton +=1 73 | 74 | C_bar = 1- (singleton/sum(lst)) 75 | for i in lst:entropy += ( (C_bar*i/sum(lst)) * math.log((C_bar*i/sum(lst))) )/(1-(1-C_bar*i/sum(lst))**sum(lst)) 76 | if entropy == 0: 77 | return 0 78 | else: 79 | return -entropy 80 | 81 | def shannon_entropy_ht(arg): 82 | '''calculate estimator of shannon's entropy based on Horzitz-Thompson''' 83 | lst=arg.split(',') 84 | lst=[float(i) for i in lst if float(i)>0] 85 | if sum(lst)<=0 or min(lst)<0:return "NA" #if there is no fragmental splicing 86 | if (len(lst)==1): return 0 #if there is only 1 fragmental splicing 87 | 88 | #estimate C_bar 89 | entropy=0.0 90 | for i in lst: 91 | entropy += ( (i/sum(lst)) * math.log((i/sum(lst))) )/(1-(1-i/sum(lst))**sum(lst)) 92 | return -entropy 93 | 94 | def simpson_index(arg): 95 | '''calculate Gini-Simpson's index. Input is ',' separated numbers''' 96 | lst=arg.split(',') 97 | lst=[float(i) for i in lst if float(i)>0] 98 | simpson=0.0 99 | 100 | try: 101 | for i in lst: 102 | simpson = simpson + (i/sum(lst))**2 103 | return 1-simpson 104 | except: return 0 105 | 106 | def simpson_index_es(arg): 107 | '''calculate estimator Gini-Simpson's index. Input is ',' separated numbers''' 108 | lst=arg.split(',') 109 | lst=[float(i) for i in lst if float(i)>0] 110 | simpson=0.0 111 | 112 | try: 113 | for i in lst: 114 | simpson = simpson + i*(i-1) 115 | return 1- (simpson/(sum(lst)*(sum(lst)-1))) 116 | except: return 0 117 | 118 | def Hill_number(arg,qvalue=1): 119 | '''Calculate real diversity (Hill's number). Input is ',' separated numbers. qvalue is the only 120 | parameter for Hill's function. When q=1, it return exp(H) which is the effective number of junctions 121 | calculated by Shannon's entropy. When q<1, Hill's function was favors low frequency junctions. 122 | When q>1, Hill's function was favors high frequency junctions (common junctions). Simpon's Index 123 | is particular case of Hill's function as q=2''' 124 | 125 | lst=arg.split(',') 126 | lst=[float(i) for i in lst if float(i)>0] 127 | freq=[(i/sum(lst))**qvalue for i in lst] 128 | try: 129 | return (sum(freq))**(1/(1-qvalue)) 130 | except: 131 | return math.exp(shannon_entropy(arg)) 132 | import math 133 | import functools 134 | 135 | def percentile(N, percent, key=lambda x:x): 136 | """ 137 | Find the percentile of a list of values. 138 | 139 | @parameter N - is a list of values. Note N MUST BE already sorted. 140 | @parameter percent - a float value from 0 to 100. 141 | @parameter key - optional key function to compute value from each element of N. 
142 | 143 | @return - the percentile of the values 144 | """ 145 | if not N: 146 | return None 147 | k = (len(N)-1) * percent/100.0 148 | f = math.floor(k) 149 | c = math.ceil(k) 150 | if f == c: 151 | return key(N[int(k)]) 152 | d0 = key(N[int(f)]) * (c-k) 153 | d1 = key(N[int(c)]) * (k-f) 154 | return d0+d1 155 | 156 | def percentile_list(N): 157 | """ 158 | Find the percentile of a list of values. 159 | @parameter N - is a list of values. Note N MUST BE already sorted. 160 | @return - the list of percentile of the values 161 | """ 162 | if not N:return None 163 | if len(N) <100: return N 164 | per_list=[] 165 | for i in range(1,101): 166 | k = (len(N)-1) * i/100.0 167 | f = math.floor(k) 168 | c = math.ceil(k) 169 | if f == c: 170 | per_list.append( int(N[int(k)]) ) 171 | else: 172 | d0 = N[int(f)] * (c-k) 173 | d1 = N[int(c)] * (k-f) 174 | per_list.append(int(round(d0+d1))) 175 | return per_list 176 | -------------------------------------------------------------------------------- /rseqc/qcmodule/orf.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import sys 3 | 4 | start_coden = ['ATG'] 5 | stop_coden = ["TAG","TAA","TGA"] 6 | def _reverse_comp(seq): 7 | swap = {"A":"T", "T":"A", "C":"G", "G":"C","N":"N","X":"X"} 8 | tmp = "".join(swap[b] for b in seq) 9 | return tmp[::-1] 10 | 11 | def longest_orf(seq,strandness,sc=None,tc=None): 12 | '''find the longest ORF in input mRNA sequence. strand=0 only search '+' strand, strand=1, only 13 | search '-' strand, strand=2, search both strand''' 14 | orf_ranges=collections.defaultdict(list) 15 | dna_seq = seq.upper() 16 | possible_orf={} #[orf-st, orf_end] ==>size 17 | largest_orf="" 18 | 19 | if sc is not None: 20 | start_coden = sc.strip(',').split(',') 21 | if len(start_coden)==0: 22 | print >>sys.stderr,"Unkown start codon" 23 | sys.exit(1) 24 | else: 25 | for cd in start_coden: 26 | if len(cd) != 3: 27 | print >>sys.stderr,"Unkown start codon" + str(cd) 28 | sys.exit(1) 29 | if tc is not None: 30 | stop_coden = tc.strip(',').split(',') 31 | if len(stop_coden)==0: 32 | print >>sys.stderr,"Unkown stop codon" 33 | sys.exit(1) 34 | else: 35 | for cd in stop_coden: 36 | if len(cd) != 3: 37 | print >>sys.stderr,"Unkown stop codon" + str(cd) 38 | sys.exit(1) 39 | 40 | strand = strandness 41 | start_pos = [] 42 | end_pos=[] 43 | orf_ranges.clear() 44 | if strand == '-': 45 | dna_seq=_reverse_comp(dna_seq) 46 | for sc in start_coden: 47 | start_found = dna_seq.find(sc) 48 | while start_found >-1: 49 | start_pos.append(start_found) 50 | start_found = dna_seq.find(sc,start_found +1) 51 | for sc in stop_coden: 52 | end_found = dna_seq.find(sc) 53 | while end_found >-1: 54 | end_pos.append(end_found) 55 | end_found = dna_seq.find(sc,end_found+1) 56 | 57 | for st in start_pos: 58 | for end in end_pos: 59 | if end <= st: continue 60 | if (end - st) % 3==0: 61 | orf_ranges[st].append(end) 62 | for k in sorted(orf_ranges): 63 | possible_orf[str(k) + '\t' + str(min(orf_ranges[k])) + '\t' + strand] = min(orf_ranges[k]) - k 64 | 65 | 66 | for k,v in possible_orf.items(): 67 | if v == max(possible_orf.values()): 68 | #print "#" + k 69 | fields=k.split() 70 | largest_orf = dna_seq[int(fields[0]):int(fields[1])] 71 | return largest_orf #could be None, '' or DNA sequencee 72 | 73 | 74 | def longest_orf_bed(seq,bedline,sc=None,tc=None): 75 | '''find the longest ORF in input mRNA sequence. 
strand=0 only search '+' strand, strand=1, only 76 | search '-' strand, strand=2, search both strand''' 77 | orf_ranges=collections.defaultdict(list) 78 | dna_seq = seq.upper() 79 | possible_orf={} #[orf-st, orf_end] ==>size 80 | largest_orf="" 81 | 82 | if sc is not None: 83 | start_coden = sc.strip(',').split(',') 84 | if len(start_coden)==0: 85 | print >>sys.stderr,"Unkown start codon" 86 | sys.exit(1) 87 | else: 88 | for cd in start_coden: 89 | if len(cd) != 3: 90 | print >>sys.stderr,"Unkown start codon" + str(cd) 91 | sys.exit(1) 92 | if tc is not None: 93 | stop_coden = tc.strip(',').split(',') 94 | if len(stop_coden)==0: 95 | print >>sys.stderr,"Unkown stop codon" 96 | sys.exit(1) 97 | else: 98 | for cd in stop_coden: 99 | if len(cd) != 3: 100 | print >>sys.stderr,"Unkown stop codon" + str(cd) 101 | sys.exit(1) 102 | 103 | fields = bedline.split() 104 | txStart = int(fields[1]) 105 | exon_sizes = [int(i) for i in fields[10].rstrip(',\n').split(',')] 106 | exon_starts = map(int,fields[11].rstrip(',').split(',')) 107 | exon_starts = map((lambda x: x + txStart),exon_starts) 108 | exon_ends = map(int,fields[10].rstrip(',').split(',')) 109 | exon_ends = map((lambda x,y:x+y),exon_starts,exon_ends) 110 | strand = fields[5] 111 | 112 | start_pos = [] 113 | end_pos=[] 114 | orf_ranges.clear() 115 | if strand == '-': 116 | dna_seq=_reverse_comp(dna_seq) 117 | for sc in start_coden: 118 | start_found = dna_seq.find(sc) 119 | while start_found >-1: 120 | start_pos.append(start_found) 121 | start_found = dna_seq.find(sc,start_found +1) 122 | for sc in stop_coden: 123 | end_found = dna_seq.find(sc) 124 | while end_found >-1: 125 | end_pos.append(end_found) 126 | end_found = dna_seq.find(sc,end_found+1) 127 | 128 | for st in start_pos: 129 | for end in end_pos: 130 | if end <= st: continue 131 | if (end - st) % 3==0: 132 | orf_ranges[st].append(end) 133 | for k in sorted(orf_ranges): 134 | possible_orf[str(k) + '\t' + str(min(orf_ranges[k])) + '\t' + strand] = min(orf_ranges[k]) - k 135 | 136 | for k,v in possible_orf.items(): 137 | if v == max(possible_orf.values()): 138 | #print k + '\t' + str(v) 139 | col=k.split() 140 | cds_st = int(col[0]) 141 | cds_end = int(col[1]) 142 | strand = col[2] 143 | 144 | if strand =='+': 145 | #determine CDS start site 146 | for exon_size, exon_st in zip(exon_sizes,exon_starts): 147 | if cds_st > exon_size: 148 | cds_st = cds_st - exon_size 149 | continue 150 | else: 151 | cds_pos1 = exon_st + cds_st 152 | break 153 | #determine CDS end site 154 | for exon_size, exon_st in zip(exon_sizes,exon_starts): 155 | if cds_end > exon_size: 156 | cds_end = cds_end - exon_size 157 | continue 158 | else: 159 | cds_pos2 = exon_st + cds_end + 3 160 | break 161 | fields[6] = str(cds_pos1) 162 | fields[7] = str(cds_pos2) 163 | return '\t'.join(fields) 164 | 165 | if strand =='-': 166 | exon_sizes = exon_sizes[::-1] 167 | exon_ends = exon_ends[::-1] 168 | for exon_size, exon_end in zip(exon_sizes,exon_ends): 169 | if cds_st > exon_size: 170 | cds_st = cds_st - exon_size 171 | continue 172 | else: 173 | cds_pos1 = exon_end - cds_st 174 | break 175 | for exon_size, exon_end in zip(exon_sizes,exon_ends): 176 | if cds_end > exon_size: 177 | cds_end = cds_end - exon_size 178 | continue 179 | else: 180 | cds_pos2 = exon_end - cds_end - 3 181 | break 182 | fields[7] = str(cds_pos1) 183 | fields[6] = str(cds_pos2) 184 | return '\t'.join(fields) 185 | -------------------------------------------------------------------------------- /rseqc/qcmodule/poisson.py: 
-------------------------------------------------------------------------------- 1 | import math 2 | def point_poip(actual, mean): 3 | '''give poisson pvalue. P[obs ==mean]''' 4 | # naive: math.exp(-mean) * mean**actual / factorial(actual) 5 | # iterative, to keep the components from getting too large or small: 6 | p = math.exp(-mean) 7 | for i in xrange(actual): 8 | p *= mean 9 | p /= i+1 10 | return p 11 | 12 | def cumu_poip(num, mean,logp=False): 13 | '''give poisson pvalue P[obs >=mean]''' 14 | s=0.0 15 | for i in range(0,num+1): 16 | s += point_poip(i,mean) 17 | if logp is True: 18 | try: 19 | return -10*math.log10(1-s) 20 | except: 21 | return 3000 22 | else: 23 | return 1-s 24 | 25 | -------------------------------------------------------------------------------- /rseqc/qcmodule/quantile.py: -------------------------------------------------------------------------------- 1 | """ 2 | File quantile.py 3 | Desc computes sample quantiles 4 | Author Ernesto P. Adorio, PhD. 5 | UPDEPP (U.P. at Clarkfield) 6 | Version 0.0.1 August 7. 2009 7 | """ 8 | from math import modf, floor 9 | def quantile(x, q, qtype = 7, issorted = False): 10 | """ 11 | Args: 12 | x - input data 13 | q - quantile 14 | qtype - algorithm 15 | issorted- True if x already sorted. 16 | Compute quantiles from input array x given q.For median, 17 | specify q=0.5. 18 | References: 19 | http://reference.wolfram.com/mathematica/ref/Quantile.html 20 | http://wiki.r-project.org/rwiki/doku.php?id=rdoc:stats:quantile 21 | Author: 22 | Ernesto P.Adorio Ph.D. 23 | UP Extension Program in Pampanga, Clark Field. 24 | """ 25 | if not issorted: 26 | y = sorted(x) 27 | else: 28 | y = x 29 | if not (1 <= qtype <= 9): 30 | return None # error! 31 | # Parameters for the Hyndman and Fan algorithm 32 | abcd = [(0, 0, 1, 0), # inverse empirical distrib.function., R type 1 33 | (0.5, 0, 1, 0), # similar to type 1, averaged, R type 2 34 | (0.5, 0, 0, 0), # nearest order statistic,(SAS) R type 3 35 | (0, 0, 0, 1), # California linear interpolation, R type 4 36 | (0.5, 0, 0, 1), # hydrologists method, R type 5 37 | (0, 1, 0, 1), # mean-based estimate(Weibull method), (SPSS,Minitab), type 6 38 | (1, -1, 0, 1), # mode-based method,(S, S-Plus), R type 7 39 | (1.0/3, 1.0/3, 0, 1), # median-unbiased , R type 8 40 | (3/8.0, 0.25, 0, 1) # normal-unbiased, R type 9. 41 | ] 42 | a, b, c, d = abcd[qtype-1] 43 | n = len(x) 44 | g, j = modf( a + (n+b) * q -1) 45 | if j < 0: 46 | return y[0] 47 | elif j >= n: 48 | return y[n-1] # oct. 8, 2010 y[n]???!! uncaught off by 1 error!!! 
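    # Worked illustration (hypothetical numbers): with the default qtype = 7 the
    # parameters are a = 1, b = -1, c = 0, d = 1, so a + (n + b)*q - 1 reduces to
    # (n - 1)*q.  For y = [1, 2, 3, 4] and q = 0.5 that gives 1.5, hence j = 1,
    # g = 0.5, and the interpolation below returns y[1] + (y[2] - y[1])*0.5 = 2.5,
    # matching R's default quantile(type = 7).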
49 | j = int(floor(j)) 50 | if g == 0: 51 | return y[j] 52 | else: 53 | return y[j] + (y[j+1]- y[j])* (c + d * g) 54 | def Test(): 55 | x = [11.4, 17.3, 21.3, 25.9, 40.1, 50.5, 60.0, 70.0, 75] 56 | for qtype in range(1,10): 57 | print qtype, quantile(x, 0.35, qtype) 58 | if __name__ == "__main__": 59 | Test() 60 | -------------------------------------------------------------------------------- /rseqc/qcmodule/twoList.py: -------------------------------------------------------------------------------- 1 | '''manipulate ndarray list''' 2 | from itertools import imap,starmap,izip 3 | from operator import mul,add,sub 4 | 5 | def check_list(v1,v2): 6 | '''check if the length of two list is same''' 7 | if v1.size != v2.size: 8 | raise ValueError,"the lenght of both arrays must be the same" 9 | pass 10 | 11 | def Add(v1,v2): 12 | '''add two list''' 13 | check_list(v1,v2) 14 | return v1.__add__(v2) 15 | 16 | def Subtract(v1,v2): 17 | '''subtract v2 from v1''' 18 | check_list(v1,v2) 19 | return v1.__sub__(v2) 20 | 21 | def Product(v1,v2): 22 | '''return product of two list''' 23 | check_list(v1,v2) 24 | return v1.__mul__(v2) 25 | 26 | def Division(v1,v2): 27 | '''return divide v1 by v2. add 1 to both v1 and v2''' 28 | check_list(v1,v2) 29 | return (v1+1).__div__(v2+1) 30 | 31 | def Average(v1,v2): 32 | '''return arithmetic mean of two list''' 33 | check_list(v1,v2) 34 | return v1.__add__(v2)/2 35 | 36 | def geometricMean(v1,v2): 37 | '''return geometric mean of two list''' 38 | check_list(v1,v2) 39 | return (v1.__mul__(v2))**0.5 40 | 41 | def Max(v1,v2): 42 | '''pairwise comparison two list. return the max one between two paried number''' 43 | check_list(v1,v2) 44 | return imap(max,izip(v1,v2)) 45 | 46 | def Min(v1,v2): 47 | '''pairwise comparison two list. return the max one between two paried number''' 48 | check_list(v1,v2) 49 | return imap(min,izip(v1,v2)) 50 | def euclidean_distance(v1,v2): 51 | '''return euclidean distance''' 52 | check_list(v1,v2) 53 | return (sum((v1.__sub__(v2))**2) / v1.size)**0.5 -------------------------------------------------------------------------------- /rseqc/qcmodule/wiggle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #Liguo Wang 3 | #04/13/2011 4 | 5 | #import built-in modules 6 | import os,sys 7 | import re 8 | import string 9 | #from optparse import OptionParser 10 | import warnings 11 | import string 12 | import collections 13 | import math 14 | 15 | #import third-party modules 16 | #from bx.bitset import * 17 | #from bx.bitset_builders import * 18 | #from bx.intervals import * 19 | #import fasta 20 | import bx.wiggle 21 | from bx.binned_array import BinnedArray 22 | from bx_extras.fpconst import isNaN 23 | 24 | 25 | class ParseWig: 26 | '''provie methods to manipulate wiggle format file. 
For wiggle format see: 27 | http://genome.ucsc.edu/goldenPath/help/wiggle.html''' 28 | 29 | def __init__(self,wigFile): 30 | '''read wig file, creat wig obj''' 31 | self.scores = {} 32 | self.num_re=re.compile(r'[\d\.\-\+]+'); 33 | fh=open(wigFile) 34 | #infile=open(wigFile,'r') 35 | for i, ( chrom, pos, val ) in enumerate( bx.wiggle.Reader( fh ) ): 36 | chrom=chrom.upper() 37 | if not chrom in self.scores: self.scores[ chrom ] = BinnedArray() 38 | self.scores[chrom][pos] = val 39 | if i % 100000 == 0: print "%i datapoints loaded \r" % i 40 | #print self.scores.keys() 41 | print "total " + str(i) + " points loaded" 42 | 43 | def fetch_all_scores(self,chr,st,end): 44 | '''fetch all wiggle scores defined by st and end. NOTE: 45 | 1)st and end are 0-based, half-open. (st,end] 46 | 2)points without score are indicated as "nan" 47 | ''' 48 | chr=chr.upper() 49 | return [ self.scores[chr][i] for i in range(st,end)] 50 | 51 | def fetch_max_scores(self,chr,st,end): 52 | ''' fetch maximum score defined by chr, st, end 53 | 1)st and end are 0-based, half-open. (st,end] 54 | ''' 55 | 56 | chr=chr.upper() 57 | return max([ self.scores[chr][i] for i in range(st,end)]) 58 | 59 | def fetch_min_scores(self,chr,st,end): 60 | ''' fetch minimum score defined by chr, st, end 61 | 1)st and end are 0-based, half-open. (st,end] 62 | ''' 63 | 64 | chr=chr.upper() 65 | return min([ self.scores[chr][i] for i in range(st,end)]) 66 | 67 | def fetch_avg_scores(self,chr,st,end): 68 | ''' fetch average score defined by chr, st, end 69 | 1)st and end are 0-based, half-open. (st,end] 70 | ''' 71 | 72 | chr=chr.upper() 73 | lst=[ float(self.scores[chr][i]) for i in range(st,end) if self.num_re.match(str(self.scores[chr][i]))] 74 | return sum(lst)/len(range(st,end)) 75 | 76 | def fetch_sum_scores(self,chr,st,end): 77 | ''' fetch sum score defined by chr, st, end 78 | 1)st and end are 0-based, half-open. (st,end] 79 | ''' 80 | 81 | chr=chr.upper() 82 | lst=[ float(self.scores[chr][i]) for i in range(st,end) if self.num_re.match(str(self.scores[chr][i]))] 83 | return sum(lst) 84 | #if __name__ == "__main__": main() 85 | 86 | class ParseWig2: 87 | '''provie methods to manipulate wiggle format file. For wiggle format see: 88 | http://genome.ucsc.edu/goldenPath/help/wiggle.html. The same coordinate could occur more than 89 | one time in wig file, and the scores will be sumed up. Slower than ParseWig''' 90 | 91 | def __init__(self,wigFile): 92 | '''read wig file, creat wig obj''' 93 | self.scores = {} 94 | self.num_re=re.compile(r'[\d\.\-\+]+'); 95 | fh=open(wigFile) 96 | #infile=open(wigFile,'r') 97 | for i, ( chrom, pos, val ) in enumerate( bx.wiggle.Reader( fh ) ): 98 | chrom=chrom.upper() 99 | if not chrom in self.scores: self.scores[chrom] = BinnedArray() 100 | tmp=self.scores[chrom][pos] 101 | if isNaN(tmp): 102 | self.scores[chrom][pos] = val 103 | else: 104 | self.scores[chrom][pos] += val 105 | if i % 100000 == 0: print "%i datapoints loaded \r" % i 106 | #print self.scores.keys() 107 | print "total " + str(i) + " points loaded" 108 | 109 | def fetch_all_scores_by_range(self,chr,st,end): 110 | '''fetch all wiggle scores defined by st and end. NOTE: 111 | 1)st and end are 0-based, half-open. (st,end] 112 | 2)points without score are indicated as "nan"''' 113 | chr=chr.upper() 114 | return [ self.scores[chr][i] for i in range(st,end)] 115 | 116 | def fetch_all_scores_by_positions(self,chr,lst): 117 | '''fetch all wiggle scores defined by st and end. 
NOTE: 118 | 2)points without score are indicated as "nan"''' 119 | chr=chr.upper() 120 | return [ self.scores[chr][i] for i in lst] 121 | 122 | def fetch_max_scores_by_range(self,chr,st,end): 123 | ''' fetch maximum score defined by chr, st, end 124 | 1)st and end are 0-based, half-open. (st,end] 125 | ''' 126 | chr=chr.upper() 127 | return max([ self.scores[chr][i] for i in range(st,end)]) 128 | 129 | def fetch_max_scores_by_positions(self,chr,lst): 130 | '''fetch maximum score defined by chr, st, end''' 131 | 132 | chr=chr.upper() 133 | return max([ self.scores[chr][i] for i in lst]) 134 | 135 | def fetch_min_scores_by_range(self,chr,st,end): 136 | ''' fetch minimum score defined by chr, st, end 137 | 1)st and end are 0-based, half-open. (st,end] 138 | ''' 139 | chr=chr.upper() 140 | return min([ self.scores[chr][i] for i in range(st,end)]) 141 | 142 | def fetch_min_scores_by_positions(self,chr,lst): 143 | ''' fetch minimum score defined by chr, st, end 144 | ''' 145 | chr=chr.upper() 146 | return min([ self.scores[chr][i] for i in lst]) 147 | 148 | 149 | def fetch_avg_scores_by_range(self,chr,st,end): 150 | ''' fetch average score defined by chr, st, end 151 | 1)st and end are 0-based, half-open. (st,end] 152 | ''' 153 | chr=chr.upper() 154 | lst=[ float(self.scores[chr][i]) for i in range(st,end) if self.num_re.match(str(self.scores[chr][i]))] 155 | return sum(lst)/len(range(st,end)) 156 | 157 | def fetch_avg_scores_by_positions(self,chr,lst): 158 | ''' fetch average score defined by chr, st, end 159 | ''' 160 | chr=chr.upper() 161 | lst_score =[ float(self.scores[chr][i]) for i in lst if self.num_re.match(str(self.scores[chr][i]))] 162 | return sum(lst_score)/len(lst_score) 163 | 164 | def fetch_sum_scores_by_range(self,chr,st,end): 165 | ''' fetch sum score defined by chr, st, end 166 | ''' 167 | chr=chr.upper() 168 | lst=[ float(self.scores[chr][i]) for i in range(st,end) if self.num_re.match(str(self.scores[chr][i]))] 169 | return sum(lst) 170 | 171 | def fetch_sum_scores_by_positions(self,chr,lst): 172 | ''' fetch sum score defined by chr, st, end 173 | ''' 174 | chr=chr.upper() 175 | lst_score=[ float(self.scores[chr][i]) for i in lst if self.num_re.match(str(self.scores[chr][i]))] 176 | return sum(lst_score) 177 | 178 | def distriub_wig(self,bed,till_count=100): 179 | '''calculate coverage over bed file (only consider exon regions). The mRNA sequences in input 180 | bed file will be cut into 100 tills of equal size''' 181 | 182 | print >>sys.stderr,"Reading " + bed + " ..." 
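        # Illustration of the BED12 block arithmetic in the loop below, using a
        # hypothetical record: with txStart = 100, fields[10] (blockSizes) = "50,80,"
        # and fields[11] (blockStarts) = "0,200,", exon_start becomes [100, 300]
        # (blockStarts shifted by txStart) and exon_end becomes [150, 380]
        # (exon_start plus blockSizes), i.e. exons spanning 100-150 and 300-380.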
183 | for line in open(bed,'r'): 184 | try: 185 | if line.startswith(('#','track','browser')):continue 186 | fields=line.rstrip('\r\n').split() 187 | txStart=int(fields[1]) 188 | chrom=fields[0] 189 | strand=fields[5] 190 | geneName=fields[3] 191 | score=fields[4] 192 | exon_start=map(int,fields[11].rstrip(',').split(',')) 193 | exon_start=map((lambda x: x + txStart),exon_start) 194 | exon_end=map(int,fields[10].rstrip(',').split(',')) 195 | exon_end=map((lambda x,y:x+y),exon_start,exon_end) 196 | except: 197 | print >>sys.stderr,"[NOTE:input bed must be 12-column] skipped this line: " + line, 198 | continue 199 | 200 | #if __name__ == "__main__": main() 201 | -------------------------------------------------------------------------------- /scripts/rseqc: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import argparse 5 | import logging 6 | 7 | from rseqc.modules import read_dist 8 | from rseqc.modules import bam_stats 9 | 10 | def arg_parser(parser = None): 11 | ''' 12 | Parse is an instance of the argparse which can be optionaly extended 13 | ''' 14 | 15 | # create the top-level parser 16 | 17 | if parser is None: 18 | parser = argparse.ArgumentParser(add_help=True) 19 | 20 | subparsers = parser.add_subparsers(help='sub-command help', 21 | dest = 'command' 22 | ) 23 | # bam_stats options 24 | parser_bam_stats = subparsers.add_parser('bam_stats', 25 | help = 'Summarizing mapping statistics of a BAM or SAM file' 26 | ) 27 | parser_bam_stats.add_argument('-i', '--input_file', 28 | required = True, 29 | help = 'specify your alignment file in SAM or BAM format' 30 | ) 31 | parser_bam_stats.add_argument('-q', '--mapq', 32 | default = 30, 33 | help = 'Minimum mapping quality to determine uniquely mapped read' 34 | ) 35 | # read_distribution options 36 | parser_read_dist = subparsers.add_parser('read_dist', 37 | help = 'Get read distribution amongst gene feature, e.g exon, intron etc.' 
38 | ) 39 | parser_read_dist.add_argument('-i', '--input_files', 40 | required = True, 41 | nargs = '+', 42 | help = 'specify your alignment file in SAM or BAM format' 43 | ) 44 | parser_read_dist.add_argument('-g', '--gene_models', 45 | required = True, 46 | help = 'specify your gene_models file in BED12 or GTF format' 47 | ) 48 | parser_read_dist.add_argument('-t', '--file_type', 49 | default = 'gtf', 50 | help = 'specify file type, supported fomats GTF or BED12, where each line is a transcript' 51 | ) 52 | parser_read_dist.add_argument('-o', '--outdir', 53 | default = 'rseqc/reads_dist', 54 | help = 'specify file type, supported fomats GTF or BED12, where each line is a transcript' 55 | ) 56 | 57 | args = parser.parse_args() 58 | 59 | return args 60 | 61 | 62 | def main(args): 63 | 64 | logging.basicConfig(level = logging.INFO) 65 | logger = logging.getLogger(__name__) 66 | logger.info(' Starting reads distribution calculation...') 67 | 68 | if args.command == 'read_dist': 69 | input_files = args.input_files 70 | gene_models = args.gene_models 71 | file_type = args.file_type 72 | outdir = args.outdir 73 | 74 | get_distrib = read_dist.main(input_files, gene_models, file_type, outdir) 75 | 76 | logger.info(' Finished read distribution calculation') 77 | 78 | if args.command == 'bam_stats': 79 | input_file = args.input_file 80 | mapq = args.mapq 81 | stats = bam_stats.main(input_file, mapq) 82 | 83 | if __name__ == "__main__": 84 | args = arg_parser() 85 | main(args) 86 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MonashBioinformaticsPlatform/RSeQC/5658c4d7c5c1c9a8ece1461df82479b84c5509f8/setup.cfg -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name = "RSeQC", 5 | version = "2.6.4", 6 | url = "https://github.com/MonashBioinformaticsPlatform/RSeQC", 7 | license = "GPLv2", 8 | author = "kizza_a", 9 | author_email ="kirill.tsyganov@monash.edu", 10 | description = "RNAseq QCs suite", 11 | packages=find_packages(exclude=['test']), 12 | zip_safe=False 13 | keywords = 'RNA-seq, RNAseq, QC, metrics', 14 | scripts = ['scripts/rseqc'], 15 | install_requires = ['pysam', 16 | 'bx-python' 17 | ], 18 | ) 19 | 20 | #setup_requires=['numpy'], 21 | #platforms = ['Linux','MacOS'], 22 | #classifiers=[ 23 | # "Development Status :: 3 - Alpha", 24 | # "Topic :: Utilities", 25 | # "License :: OSI Approved :: BSD License", 26 | -------------------------------------------------------------------------------- /src/binBits.c: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | #include "bits.h" 3 | #include "binBits.h" 4 | 5 | static Bits * ALL_ZERO = NULL; 6 | static Bits * ALL_ONE = ( Bits * ) &"ONE"; 7 | 8 | struct BinBits* binBitsAlloc( int size, int granularity ) 9 | { 10 | struct BinBits * bb; 11 | AllocVar(bb); 12 | bb->size = size; 13 | bb->bin_size = (int) ceil( size / (float) granularity ); 14 | bb->nbins = (int) ceil( size / (float) bb->bin_size ); 15 | AllocArray( bb->bins, bb->nbins ); 16 | return bb; 17 | } 18 | 19 | void binBitsFree( struct BinBits *bb ) 20 | { 21 | int i; 22 | for ( i = 0; i < bb->nbins; i++ ) 23 | { 24 | if ( ( bb->bins[i] != ALL_ZERO ) && ( bb->bins[i] != ALL_ONE ) ) 25 | { 
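            /* Only bins that were actually allocated reach this free; bins still
               pointing at the ALL_ZERO / ALL_ONE sentinels never owned a real
               bit array of their own. */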
26 | bitFree( &(bb->bins[i]) ); 27 | } 28 | } 29 | freeMem( bb ); 30 | } 31 | 32 | #ifdef _MSC_VER 33 | #define INLINE static __inline 34 | #else 35 | #define INLINE static inline 36 | #endif 37 | 38 | INLINE int binBitsGetBin( struct BinBits * bb, int pos ) 39 | { 40 | return pos / bb->bin_size; 41 | } 42 | 43 | INLINE int binBitsGetOffset( struct BinBits * bb, int pos ) 44 | { 45 | return pos % bb->bin_size; 46 | } 47 | 48 | boolean binBitsReadOne( struct BinBits * bb, int pos ) 49 | { 50 | int bin = binBitsGetBin( bb, pos ); 51 | 52 | if ( bb->bins[bin] == ALL_ZERO ) 53 | { 54 | return 0; 55 | } 56 | else if ( bb->bins[bin] == ALL_ONE ) 57 | { 58 | return 1; 59 | } 60 | else 61 | { 62 | return bitReadOne( bb->bins[bin], binBitsGetOffset( bb, pos ) ); 63 | } 64 | } 65 | 66 | void binBitsSetOne( struct BinBits * bb, int pos ) 67 | { 68 | int bin = binBitsGetBin( bb, pos ); 69 | int offset = binBitsGetOffset( bb, pos ); 70 | if ( bb->bins[bin] == ALL_ONE ) 71 | { 72 | return; 73 | } 74 | if ( bb->bins[bin] == ALL_ZERO ) 75 | { 76 | bb->bins[bin] = bitAlloc( bb->bin_size ); 77 | } 78 | bitSetOne( bb->bins[bin], offset ); 79 | } 80 | 81 | void binBitsClearOne( struct BinBits * bb, int pos ) 82 | { 83 | int bin = binBitsGetBin( bb, pos ); 84 | int offset = binBitsGetOffset( bb, pos ); 85 | if ( bb->bins[bin] == ALL_ZERO ) 86 | { 87 | return; 88 | } 89 | if ( bb->bins[bin] == ALL_ONE ) 90 | { 91 | bb->bins[bin] = bitAlloc( bb->bin_size ); 92 | bitSetRange( bb->bins[bin], 0, bb->bin_size ); 93 | } 94 | bitClearOne( bb->bins[bin], offset ); 95 | } 96 | 97 | void binBitsSetRange( struct BinBits *bb, int start, int size ) 98 | { 99 | int bin, offset, delta; 100 | while ( size > 0 ) 101 | { 102 | bin = binBitsGetBin( bb, start ); 103 | offset = binBitsGetOffset( bb, start ); 104 | delta = bb->bin_size - offset; 105 | if ( bb->bins[bin] == ALL_ZERO ) 106 | { 107 | bb->bins[bin] = bitAlloc( bb->bin_size ); 108 | } 109 | if ( delta < size ) 110 | { 111 | if ( bb->bins[bin] != ALL_ONE ) 112 | { 113 | bitSetRange( bb->bins[bin], offset, delta ); 114 | } 115 | size -= delta; 116 | start += delta; 117 | } 118 | else 119 | { 120 | if ( bb->bins[bin] != ALL_ONE ) 121 | { 122 | bitSetRange( bb->bins[bin], offset, size ); 123 | } 124 | size = 0; 125 | } 126 | } 127 | } 128 | 129 | int binBitsCountRange( struct BinBits *bb, int start, int size ) 130 | { 131 | int delta; 132 | int count = 0; 133 | while ( size > 0 ) 134 | { 135 | int bin = binBitsGetBin( bb, start ); 136 | int offset = binBitsGetOffset( bb, start ); 137 | delta = bb->bin_size - offset; 138 | if ( bb->bins[bin] == ALL_ZERO ) 139 | { 140 | if ( delta < size ) 141 | { 142 | size -= delta; 143 | start += delta; 144 | } 145 | else 146 | { 147 | size = 0; 148 | } 149 | } 150 | else if ( bb->bins[bin] == ALL_ONE ) 151 | { 152 | if ( delta < size ) 153 | { 154 | count += ( delta - offset ); 155 | size -= delta; 156 | start += delta; 157 | } 158 | else 159 | { 160 | count += ( size - offset ); 161 | size = 0; 162 | } 163 | } 164 | else if ( delta < size ) 165 | { 166 | count += bitCountRange( bb->bins[bin], offset, delta ); 167 | size -= delta; 168 | start += delta; 169 | } 170 | else 171 | { 172 | count += bitCountRange( bb->bins[bin], offset, size ); 173 | size = 0; 174 | } 175 | } 176 | return count; 177 | } 178 | 179 | int binBitsFindSet( struct BinBits *bb, int start ) 180 | { 181 | int ns; 182 | int bin = binBitsGetBin( bb, start ); 183 | int offset = binBitsGetOffset( bb, start ); 184 | while ( bin < bb->nbins ) 185 | { 186 | if ( bb->bins[bin] 
== ALL_ONE ) 187 | { 188 | return bin * bb->bin_size + offset; 189 | } 190 | else if ( bb->bins[bin] != ALL_ZERO ) 191 | { 192 | ns = bitFindSet( bb->bins[bin], offset, bb->bin_size ); 193 | if ( ns < bb->bin_size ) 194 | { 195 | return bin * bb->bin_size + ns; 196 | } 197 | } 198 | bin += 1; 199 | offset = 0; 200 | } 201 | return bb->size; 202 | } 203 | 204 | int binBitsFindClear( struct BinBits *bb, int start ) 205 | { 206 | int ns; 207 | int bin = binBitsGetBin( bb, start ); 208 | int offset = binBitsGetOffset( bb, start ); 209 | while ( bin < bb->nbins ) 210 | { 211 | if ( bb->bins[bin] == ALL_ZERO ) 212 | { 213 | return bin*bb->bin_size + offset; 214 | } 215 | else if ( bb->bins[bin] != ALL_ONE ) 216 | { 217 | ns = bitFindClear( bb->bins[bin], offset, bb->bin_size ); 218 | if ( ns < bb->bin_size ) 219 | { 220 | return bin*bb->bin_size + ns; 221 | } 222 | } 223 | bin += 1; 224 | offset = 0; 225 | } 226 | return bb->size; 227 | } 228 | 229 | void binBitsAnd( struct BinBits *bb1, struct BinBits *bb2 ) 230 | { 231 | int i; 232 | assert( bb1->bin_size == bb2->bin_size && bb1->nbins == bb2->nbins && bb1->size == bb2->size ); 233 | 234 | for ( i = 0; i < bb1->nbins; i++ ) 235 | { 236 | if ( bb1->bins[i] == ALL_ZERO ) 237 | { 238 | // Do nothing 239 | } 240 | else if ( bb2->bins[i] == ALL_ZERO ) 241 | { 242 | if ( bb1->bins[i] != ALL_ONE ) 243 | { 244 | bitFree( &bb1->bins[i] ); 245 | } 246 | bb1->bins[i] = ALL_ZERO; 247 | } 248 | else if ( bb2->bins[i] == ALL_ONE ) 249 | { 250 | // Do nothing 251 | } 252 | else if ( bb1->bins[i] == ALL_ONE ) 253 | { 254 | bb1->bins[i] = bitClone( bb2->bins[i], bb1->bin_size ); 255 | } 256 | else 257 | { 258 | bitAnd( bb1->bins[i], bb2->bins[i], bb1->bin_size ); 259 | } 260 | } 261 | } 262 | 263 | void binBitsOr( struct BinBits *bb1, struct BinBits *bb2 ) 264 | { 265 | int i; 266 | assert( bb1->bin_size == bb2->bin_size && bb1->nbins == bb2->nbins && bb1->size == bb2->size ); 267 | 268 | for ( i = 0; i < bb1->nbins; i++ ) 269 | { 270 | if ( bb1->bins[i] == ALL_ONE ) 271 | { 272 | // Do nothing 273 | } 274 | else if ( bb2->bins[i] == ALL_ONE ) 275 | { 276 | if ( bb1->bins[i] != ALL_ZERO ) 277 | { 278 | bitFree( &bb1->bins[i] ); 279 | } 280 | bb1->bins[i] = ALL_ONE; 281 | } 282 | else if ( bb2->bins[i] == ALL_ZERO ) 283 | { 284 | // Do nothing 285 | } 286 | else if ( bb1->bins[i] == ALL_ZERO ) 287 | { 288 | bb1->bins[i] = bitClone( bb2->bins[i], bb1->bin_size ); 289 | } 290 | else 291 | { 292 | bitOr( bb1->bins[i], bb2->bins[i], bb1->bin_size ); 293 | } 294 | } 295 | } 296 | 297 | void binBitsNot( struct BinBits *bb ) 298 | { 299 | int i; 300 | 301 | for ( i = 0; i < bb->nbins; i++ ) 302 | { 303 | if ( bb->bins[i] == ALL_ONE ) 304 | { 305 | bb->bins[i] = ALL_ZERO; 306 | } 307 | else if ( bb->bins[i] == ALL_ZERO ) 308 | { 309 | bb->bins[i] = ALL_ONE; 310 | } 311 | else 312 | { 313 | bitNot( bb->bins[i], bb->bin_size ); 314 | } 315 | } 316 | } 317 | -------------------------------------------------------------------------------- /src/binBits.h: -------------------------------------------------------------------------------- 1 | #ifndef BINBITS_H 2 | #define BINBITS_H 3 | 4 | #include "common.h" 5 | #include "bits.h" 6 | 7 | struct BinBits 8 | { 9 | int size; 10 | int bin_size; 11 | int nbins; 12 | Bits ** bins; 13 | }; 14 | 15 | struct BinBits* binBitsAlloc( int size, int granularity ); 16 | void binBitsFree( struct BinBits *bb ); 17 | boolean binBitsReadOne( struct BinBits * bb, int pos ); 18 | void binBitsSetOne( struct BinBits * bb, int pos ); 19 | void 
binBitsClearOne( struct BinBits * bb, int pos ); 20 | void binBitsSetRange( struct BinBits *bb, int start, int size ); 21 | int binBitsCountRange( struct BinBits *bb, int start, int size ); 22 | int binBitsFindSet( struct BinBits *bb, int start ); 23 | int binBitsFindClear( struct BinBits *bb, int start ); 24 | void binBitsAnd( struct BinBits *bb1, struct BinBits *bb2 ); 25 | void binBitsOr( struct BinBits *bb1, struct BinBits *bb2 ); 26 | void binBitsNot( struct BinBits *bb ); 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /src/bunzip/micro-bunzip.h: -------------------------------------------------------------------------------- 1 | #ifndef __MICRO_BUNZIP_H__ 2 | #define __MICRO_BUNZIP_H__ 3 | 4 | /* ---- Duplicated from micro-bzip.c -------------------------------------- */ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | /* Constants for huffman coding */ 14 | #define MAX_GROUPS 6 15 | #define GROUP_SIZE 50 /* 64 would have been more efficient */ 16 | #define MAX_HUFCODE_BITS 20 /* Longest huffman code allowed */ 17 | #define MAX_SYMBOLS 258 /* 256 literals + RUNA + RUNB */ 18 | #define SYMBOL_RUNA 0 19 | #define SYMBOL_RUNB 1 20 | 21 | /* Status return values */ 22 | #define RETVAL_OK 0 23 | #define RETVAL_LAST_BLOCK (-1) 24 | #define RETVAL_NOT_BZIP_DATA (-2) 25 | #define RETVAL_UNEXPECTED_INPUT_EOF (-3) 26 | #define RETVAL_UNEXPECTED_OUTPUT_EOF (-4) 27 | #define RETVAL_DATA_ERROR (-5) 28 | #define RETVAL_OUT_OF_MEMORY (-6) 29 | #define RETVAL_OBSOLETE_INPUT (-7) 30 | 31 | #define RETVAL_END_OF_BLOCK (-8) 32 | #define RETVAL_STOPCHAR (-9) 33 | #define RETVAL_BUFFER_FULL (-10) 34 | 35 | /* Other housekeeping constants */ 36 | #define IOBUF_SIZE 4096 37 | 38 | /* This is what we know about each huffman coding group */ 39 | struct group_data { 40 | /* We have an extra slot at the end of limit[] for a sentinal value. */ 41 | int limit[MAX_HUFCODE_BITS+1],base[MAX_HUFCODE_BITS],permute[MAX_SYMBOLS]; 42 | int minLen, maxLen; 43 | }; 44 | 45 | /* Structure holding all the housekeeping data, including IO buffers and 46 | memory that persists between calls to bunzip */ 47 | typedef struct { 48 | /* State for interrupting output loop */ 49 | int writeCopies,writePos,writeRunCountdown,writeCount,writeCurrent; 50 | /* I/O tracking data (file handles, buffers, positions, etc.) 
*/ 51 | int in_fd,out_fd,inbufCount,inbufPos /*,outbufPos*/; 52 | unsigned char *inbuf /*,*outbuf*/; 53 | unsigned int inbufBitCount, inbufBits; 54 | /* The CRC values stored in the block header and calculated from the data */ 55 | unsigned int crc32Table[256],headerCRC, totalCRC, writeCRC; 56 | /* Intermediate buffer and its size (in bytes) */ 57 | unsigned int *dbuf, dbufSize; 58 | /* These things are a bit too big to go on the stack */ 59 | unsigned char selectors[32768]; /* nSelectors=15 bits */ 60 | struct group_data groups[MAX_GROUPS]; /* huffman coding tables */ 61 | /* For I/O error handling */ 62 | jmp_buf jmpbuf; 63 | } bunzip_data; 64 | 65 | static char * const bunzip_errors[]={NULL,"Bad file checksum","Not bzip data", 66 | "Unexpected input EOF","Unexpected output EOF","Data error", 67 | "Out of memory","Obsolete (pre 0.9.5) bzip format not supported."}; 68 | 69 | /* ---- Forward declarations for micro-bzip.c ---------------------------- */ 70 | 71 | unsigned int get_bits(bunzip_data *bd, char bits_wanted); 72 | int get_next_block( bunzip_data *bd ); 73 | int read_bunzip(bunzip_data *bd, char *outbuf, int len); 74 | int start_bunzip(bunzip_data **bdp, int in_fd, char *inbuf, int len); 75 | int read_bunzip_to_char(bunzip_data *bd, char *outbuf, int len, int* gotcount_out, char stopchar ); 76 | 77 | #endif -------------------------------------------------------------------------------- /src/cluster.h: -------------------------------------------------------------------------------- 1 | typedef struct struct_interval { 2 | int start; 3 | int end; 4 | int id; 5 | 6 | struct struct_interval *next; 7 | } interval; 8 | 9 | typedef struct struct_clusternode { 10 | int start; 11 | int end; 12 | int priority; 13 | 14 | struct struct_interval *interval_head; 15 | struct struct_interval *interval_tail; 16 | int num_ivals; 17 | 18 | struct struct_clusternode *left; 19 | struct struct_clusternode *right; 20 | } clusternode; 21 | 22 | typedef struct { 23 | int max_dist; 24 | int min_intervals; 25 | 26 | clusternode *root; 27 | } clustertree; 28 | 29 | typedef struct struct_treeitr { 30 | struct struct_treeitr *next; 31 | struct struct_clusternode *node; 32 | } treeitr; 33 | 34 | 35 | clusternode* clusternode_insert(clustertree *tree, clusternode *node, int start, int end, int id); 36 | clustertree* create_clustertree(int max_dist, int min_intervals); 37 | treeitr* clusteritr(clustertree *tree); 38 | void freeclusteritr(treeitr *itr); 39 | void free_tree(clustertree *tree); 40 | -------------------------------------------------------------------------------- /src/kent/bits.c: -------------------------------------------------------------------------------- 1 | /* bits - handle operations on arrays of bits. 2 | * 3 | * This file is copyright 2002 Jim Kent, but license is hereby 4 | * granted for all use - public, private or commercial. */ 5 | 6 | #include "common.h" 7 | #include "bits.h" 8 | 9 | static char const rcsid[] = "$Id: bits.c,v 1.20 2008/03/25 16:32:31 angie Exp $"; 10 | 11 | 12 | static Bits oneBit[8] = { 0x80, 0x40, 0x20, 0x10, 0x8, 0x4, 0x2, 0x1}; 13 | static Bits leftMask[8] = {0xFF, 0x7F, 0x3F, 0x1F, 0xF, 0x7, 0x3, 0x1,}; 14 | static Bits rightMask[8] = {0x80, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE, 0xFF,}; 15 | int bitsInByte[256]; 16 | 17 | static boolean inittedBitsInByte = FALSE; 18 | 19 | void bitsInByteInit(void) 20 | /* Initialize bitsInByte array. 
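 * Fills the 256-entry bitsInByte lookup table with the number of set bits in
 * each possible byte value; bitCountRange uses it to count bits a byte at a time.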
*/ 21 | { 22 | int i; 23 | 24 | if (!inittedBitsInByte) 25 | { 26 | inittedBitsInByte = TRUE; 27 | for (i=0; i<256; ++i) 28 | { 29 | int count = 0; 30 | if (i&1) 31 | count = 1; 32 | if (i&2) 33 | ++count; 34 | if (i&4) 35 | ++count; 36 | if (i&8) 37 | ++count; 38 | if (i&0x10) 39 | ++count; 40 | if (i&0x20) 41 | ++count; 42 | if (i&0x40) 43 | ++count; 44 | if (i&0x80) 45 | ++count; 46 | bitsInByte[i] = count; 47 | } 48 | } 49 | } 50 | 51 | Bits *bitAlloc(int bitCount) 52 | /* Allocate bits. */ 53 | { 54 | int byteCount = ((bitCount+7)>>3); 55 | return needLargeZeroedMem(byteCount); 56 | } 57 | 58 | Bits *bitClone(Bits* orig, int bitCount) 59 | /* Clone bits. */ 60 | { 61 | int byteCount = ((bitCount+7)>>3); 62 | Bits* bits = needLargeZeroedMem(byteCount); 63 | if(orig!=NULL) 64 | memcpy(bits, orig, byteCount); 65 | return bits; 66 | } 67 | 68 | void bitFree(Bits **pB) 69 | /* Free bits. */ 70 | { 71 | freez(pB); 72 | } 73 | 74 | void bitSetOne(Bits *b, int bitIx) 75 | /* Set a single bit. */ 76 | { 77 | b[bitIx>>3] |= oneBit[bitIx&7]; 78 | } 79 | 80 | void bitClearOne(Bits *b, int bitIx) 81 | /* Clear a single bit. */ 82 | { 83 | b[bitIx>>3] &= ~oneBit[bitIx&7]; 84 | } 85 | 86 | void bitSetRange(Bits *b, int startIx, int bitCount) 87 | /* Set a range of bits. */ 88 | { 89 | 90 | int endIx = (startIx + bitCount - 1); 91 | int startByte = (startIx>>3); 92 | int endByte = (endIx>>3); 93 | int startBits = (startIx&7); 94 | int endBits = (endIx&7); 95 | int i; 96 | 97 | if (bitCount <= 0) 98 | return; 99 | 100 | if (startByte == endByte) 101 | { 102 | b[startByte] |= (leftMask[startBits] & rightMask[endBits]); 103 | return; 104 | } 105 | b[startByte] |= leftMask[startBits]; 106 | for (i = startByte+1; i>3] & oneBit[bitIx&7]) != 0; 116 | } 117 | 118 | int bitCountRange(Bits *b, int startIx, int bitCount) 119 | /* Count number of bits set in range. */ 120 | { 121 | int endIx = (startIx + bitCount - 1); 122 | int startByte = (startIx>>3); 123 | int endByte = (endIx>>3); 124 | int startBits = (startIx&7); 125 | int endBits = (endIx&7); 126 | int i; 127 | int count = 0; 128 | 129 | if (bitCount <= 0) 130 | return 0; 131 | 132 | if (!inittedBitsInByte) 133 | bitsInByteInit(); 134 | if (startByte == endByte) 135 | return bitsInByte[b[startByte] & leftMask[startBits] & rightMask[endBits]]; 136 | count = bitsInByte[b[startByte] & leftMask[startBits]]; 137 | for (i = startByte+1; i>3); 149 | int iByte; 150 | 151 | /* scan initial byte */ 152 | while (((iBit & 7) != 0) && (iBit < bitCount)) 153 | { 154 | if (bitReadOne(b, iBit) == val) 155 | return iBit; 156 | iBit++; 157 | } 158 | 159 | /* scan byte at a time, if not already in last byte */ 160 | iByte = (iBit >> 3); 161 | if (iByte < endByte) 162 | { 163 | while ((iByte < endByte) && (b[iByte] == notByteVal)) 164 | iByte++; 165 | iBit = iByte << 3; 166 | } 167 | 168 | /* scan last byte */ 169 | while (iBit < bitCount) 170 | { 171 | if (bitReadOne(b, iBit) == val) 172 | return iBit; 173 | iBit++; 174 | } 175 | return bitCount; /* not found */ 176 | } 177 | 178 | int bitFindSet(Bits *b, int startIx, int bitCount) 179 | /* Find the index of the the next set bit. */ 180 | { 181 | return bitFind(b, startIx, TRUE, bitCount); 182 | } 183 | 184 | int bitFindClear(Bits *b, int startIx, int bitCount) 185 | /* Find the index of the the next clear bit. */ 186 | { 187 | return bitFind(b, startIx, FALSE, bitCount); 188 | } 189 | 190 | void bitClear(Bits *b, int bitCount) 191 | /* Clear many bits (possibly up to 7 beyond bitCount). 
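 * Zeroing proceeds a whole byte at a time ((bitCount+7)/8 bytes), which is why
 * up to seven bits past bitCount in the final byte may also be cleared.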
*/ 192 | { 193 | int byteCount = ((bitCount+7)>>3); 194 | zeroBytes(b, byteCount); 195 | } 196 | 197 | void bitClearRange(Bits *b, int startIx, int bitCount) 198 | /* Clear a range of bits. */ 199 | { 200 | 201 | int endIx = (startIx + bitCount - 1); 202 | int startByte = (startIx>>3); 203 | int endByte = (endIx>>3); 204 | int startBits = (startIx&7); 205 | int endBits = (endIx&7); 206 | int i; 207 | 208 | if (bitCount <= 0) 209 | return; 210 | 211 | if (startByte == endByte) 212 | { 213 | b[startByte] &= ~(leftMask[startBits] & rightMask[endBits]); 214 | return; 215 | } 216 | b[startByte] &= ~leftMask[startBits]; 217 | for (i = startByte+1; i>3); 226 | while (--byteCount >= 0) 227 | { 228 | *a = (*a & *b++); 229 | a++; 230 | } 231 | } 232 | 233 | void bitOr(Bits *a, Bits *b, int bitCount) 234 | /* Or two bitmaps. Put result in a. */ 235 | { 236 | int byteCount = ((bitCount+7)>>3); 237 | while (--byteCount >= 0) 238 | { 239 | *a = (*a | *b++); 240 | a++; 241 | } 242 | } 243 | 244 | void bitXor(Bits *a, Bits *b, int bitCount) 245 | { 246 | int byteCount = ((bitCount+7)>>3); 247 | while (--byteCount >= 0) 248 | { 249 | *a = (*a ^ *b++); 250 | a++; 251 | } 252 | } 253 | 254 | void bitNot(Bits *a, int bitCount) 255 | /* Flip all bits in a. */ 256 | { 257 | int byteCount = ((bitCount+7)>>3); 258 | while (--byteCount >= 0) 259 | { 260 | *a = ~*a; 261 | a++; 262 | } 263 | } 264 | 265 | void bitPrint(Bits *a, int startIx, int bitCount, FILE* out) 266 | /* Print part or all of bit map as a string of 0s and 1s. Mostly useful for 267 | * debugging */ 268 | { 269 | int i; 270 | for (i = startIx; i < bitCount; i++) 271 | { 272 | if (bitReadOne(a, i)) 273 | fputc('1', out); 274 | else 275 | fputc('0', out); 276 | } 277 | fputc('\n', out); 278 | } 279 | 280 | -------------------------------------------------------------------------------- /src/kent/bits.h: -------------------------------------------------------------------------------- 1 | /* bits - handle operations on arrays of bits. 2 | * 3 | * This file is copyright 2002 Jim Kent, but license is hereby 4 | * granted for all use - public, private or commercial. */ 5 | 6 | #ifndef BITS_H 7 | #define BITS_H 8 | 9 | #include "common.h" 10 | 11 | typedef unsigned char Bits; 12 | 13 | Bits *bitAlloc(int bitCount); 14 | /* Allocate bits. */ 15 | 16 | Bits *bitClone(Bits* orig, int bitCount); 17 | /* Clone bits. */ 18 | 19 | void bitFree(Bits **pB); 20 | /* Free bits. */ 21 | 22 | void bitSetOne(Bits *b, int bitIx); 23 | /* Set a single bit. */ 24 | 25 | void bitClearOne(Bits *b, int bitIx); 26 | /* Clear a single bit. */ 27 | 28 | void bitSetRange(Bits *b, int startIx, int bitCount); 29 | /* Set a range of bits. */ 30 | 31 | boolean bitReadOne(Bits *b, int bitIx); 32 | /* Read a single bit. */ 33 | 34 | int bitCountRange(Bits *b, int startIx, int bitCount); 35 | /* Count number of bits set in range. */ 36 | 37 | int bitFindSet(Bits *b, int startIx, int bitCount); 38 | /* Find the index of the the next set bit. */ 39 | 40 | int bitFindClear(Bits *b, int startIx, int bitCount); 41 | /* Find the index of the the next clear bit. */ 42 | 43 | void bitClear(Bits *b, int bitCount); 44 | /* Clear many bits (possibly up to 7 beyond bitCount). */ 45 | 46 | void bitClearRange(Bits *b, int startIx, int bitCount); 47 | /* Clear a range of bits. */ 48 | 49 | void bitAnd(Bits *a, Bits *b, int bitCount); 50 | /* And two bitmaps. Put result in a. */ 51 | 52 | void bitOr(Bits *a, Bits *b, int bitCount); 53 | /* Or two bitmaps. Put result in a. 
*/ 54 | 55 | void bitXor(Bits *a, Bits *b, int bitCount); 56 | /* Xor two bitmaps. Put result in a. */ 57 | 58 | void bitNot(Bits *a, int bitCount); 59 | /* Flip all bits in a. */ 60 | 61 | void bitPrint(Bits *a, int startIx, int bitCount, FILE* out); 62 | /* Print part or all of bit map as a string of 0s and 1s. Mostly useful for 63 | * debugging */ 64 | 65 | extern int bitsInByte[256]; 66 | /* Lookup table for how many bits are set in a byte. */ 67 | 68 | void bitsInByteInit(void); 69 | /* Initialize bitsInByte array. */ 70 | 71 | #endif /* BITS_H */ 72 | 73 | -------------------------------------------------------------------------------- /src/kent/common.c: -------------------------------------------------------------------------------- 1 | #include "common.h" 2 | 3 | void *needMem(size_t size) 4 | /* Need mem calls abort if the memory allocation fails. The memory 5 | * * is initialized to zero. */ 6 | { 7 | void *pt; 8 | if ((pt = malloc(size)) == NULL) 9 | { 10 | fprintf( stderr, "Out of memory needMem - request size %llu bytes\n", 11 | (unsigned long long)size); 12 | exit(1); 13 | } 14 | memset(pt, 0, size); 15 | return pt; 16 | } 17 | 18 | void freeMem(void *pt) 19 | /* Free memory will check for null before freeing. */ 20 | { 21 | if (pt != NULL) 22 | free(pt); 23 | } 24 | 25 | 26 | void *needLargeZeroedMem(size_t size) 27 | /* Request a large block of memory and zero it. */ 28 | { 29 | void *v; 30 | /*v = needLargeMem(size);*/ 31 | v = malloc(size); 32 | /* 33 | * If you do memset(NULL,0,size), there will be a segfault. 34 | * So check v for NULL 35 | */ 36 | if( v != NULL ) 37 | memset(v, 0, size); 38 | return v; 39 | } 40 | 41 | void freez(void *vpt) 42 | /* Pass address of pointer. Will free pointer and set it 43 | * * to NULL. */ 44 | { 45 | void **ppt = (void **)vpt; 46 | void *pt = *ppt; 47 | *ppt = NULL; 48 | freeMem(pt); 49 | } 50 | 51 | /* fill a specified area of memory with zeroes 52 | * If you do zeroBytes(NULL,count), there will be a segfault. 53 | * So check pt for NULL 54 | */ 55 | void zeroBytes(void *vpt, int count) 56 | { 57 | char *pt = (char*)vpt; 58 | if(pt != NULL ){ 59 | while (--count>=0) 60 | *pt++=0; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/kent/common.h: -------------------------------------------------------------------------------- 1 | #ifndef __COMMON_H__ 2 | #define __COMMON_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | /* Let's pretend C has a boolean type. */ 11 | #define TRUE 1 12 | #define FALSE 0 13 | #define boolean int 14 | #define bool char 15 | 16 | #define AllocVar(pt) (pt = needMem(sizeof(*pt))) 17 | /* Shortcut to allocating a single variable on the heap and 18 | * * assigning pointer to it. 
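 * Typical use in this source tree: struct BinBits *bb; AllocVar(bb);
 * (see binBitsAlloc in src/binBits.c).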
*/ 19 | 20 | #define AllocArray(pt, size) (pt = needLargeZeroedMem(sizeof(*pt) * (size))) 21 | 22 | void *needMem(size_t size); 23 | 24 | void freeMem(void *pt); 25 | 26 | void *needLargeZeroedMem(size_t size); 27 | 28 | void freez(void *vpt); 29 | 30 | void zeroBytes(void *vpt, int count); 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /src/pwm_utils.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | 6 | int symbol_match( char, char); 7 | int pattern_match( char*, char*, int); 8 | 9 | int main(int argc, char** argv) { 10 | if (argc == 3) { 11 | int string_size = strlen(argv[1]); 12 | if (strlen(argv[2]) != string_size) { 13 | fprintf(stdout, "%s != %s\n", argv[1], argv[2]); 14 | return 1; 15 | } 16 | if ( pattern_match( argv[1], argv[2], string_size) ) 17 | fprintf(stdout, "%s == %s\n", argv[1], argv[2]); 18 | else 19 | fprintf(stdout, "%s != %s\n", argv[1], argv[2]); 20 | } 21 | return 0; 22 | } 23 | 24 | int pattern_match( char* string, char* pattern, int n){ 25 | int i = 0; 26 | while (i 28 | #include 29 | #include 30 | #include 31 | #ifdef _USE_KNETFILE 32 | #include "knetfile.h" 33 | #endif 34 | 35 | //typedef int8_t bool; 36 | 37 | typedef struct { 38 | int file_descriptor; 39 | char open_mode; // 'r' or 'w' 40 | bool owned_file, is_uncompressed; 41 | #ifdef _USE_KNETFILE 42 | union { 43 | knetFile *fpr; 44 | FILE *fpw; 45 | } x; 46 | #else 47 | FILE* file; 48 | #endif 49 | int uncompressed_block_size; 50 | int compressed_block_size; 51 | void* uncompressed_block; 52 | void* compressed_block; 53 | int64_t block_address; 54 | int block_length; 55 | int block_offset; 56 | int cache_size; 57 | const char* error; 58 | void *cache; // a pointer to a hash table 59 | } BGZF; 60 | 61 | #ifdef __cplusplus 62 | extern "C" { 63 | #endif 64 | 65 | /* 66 | * Open an existing file descriptor for reading or writing. 67 | * Mode must be either "r" or "w". 68 | * A subsequent bgzf_close will not close the file descriptor. 69 | * Returns null on error. 70 | */ 71 | #ifdef __SUNPRO_C 72 | BGZF* bgzf_fdopen(int fd, const char* mode); 73 | #else 74 | BGZF* bgzf_fdopen(int fd, const char* __restrict mode); 75 | #endif 76 | 77 | /* 78 | * Open the specified file for reading or writing. 79 | * Mode must be either "r" or "w". 80 | * Returns null on error. 81 | */ 82 | #ifdef __SUNPRO_C 83 | BGZF* bgzf_open(const char* path, const char* mode); 84 | #else 85 | BGZF* bgzf_open(const char* path, const char* __restrict mode); 86 | #endif 87 | 88 | /* 89 | * Close the BGZ file and free all associated resources. 90 | * Does not close the underlying file descriptor if created with bgzf_fdopen. 91 | * Returns zero on success, -1 on error. 92 | */ 93 | int bgzf_close(BGZF* fp); 94 | 95 | /* 96 | * Read up to length bytes from the file storing into data. 97 | * Returns the number of bytes actually read. 98 | * Returns zero on end of file. 99 | * Returns -1 on error. 100 | */ 101 | int bgzf_read(BGZF* fp, void* data, int length); 102 | 103 | /* 104 | * Write length bytes from data to the file. 105 | * Returns the number of bytes written. 106 | * Returns -1 on error. 107 | */ 108 | int bgzf_write(BGZF* fp, const void* data, int length); 109 | 110 | /* 111 | * Return a virtual file pointer to the current location in the file. 112 | * No interpetation of the value should be made, other than a subsequent 113 | * call to bgzf_seek can be used to position the file at the same point. 
114 | * Return value is non-negative on success. 115 | * Returns -1 on error. 116 | */ 117 | int64_t bgzf_tell(BGZF* fp); 118 | 119 | /* 120 | * Set the file to read from the location specified by pos, which must 121 | * be a value previously returned by bgzf_tell for this file (but not 122 | * necessarily one returned by this file handle). 123 | * The where argument must be SEEK_SET. 124 | * Seeking on a file opened for write is not supported. 125 | * Returns zero on success, -1 on error. 126 | */ 127 | int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); 128 | 129 | /* 130 | * Set the cache size. Zero to disable. By default, caching is 131 | * disabled. The recommended cache size for frequent random access is 132 | * about 8M bytes. 133 | */ 134 | void bgzf_set_cache_size(BGZF *fp, int cache_size); 135 | 136 | int bgzf_check_EOF(BGZF *fp); 137 | 138 | #ifdef __cplusplus 139 | } 140 | #endif 141 | 142 | #endif 143 | --------------------------------------------------------------------------------
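The BGZF declarations above describe only the reader/writer interface. Below is a minimal usage sketch, not a definitive implementation: the file name, buffer size and error handling are illustrative, and it assumes the program is compiled together with the accompanying BGZF implementation and zlib.

```C
#include <stdio.h>
#include "bgzf.h"

int main(void)
{
    /* Open a BGZF-compressed file (e.g. a BAM file) for reading. */
    BGZF *fp = bgzf_open("example.bam", "r");
    if (fp == NULL)
    {
        fprintf(stderr, "could not open example.bam\n");
        return 1;
    }

    /* Drain the decompressed stream in fixed-size chunks. */
    char buf[4096];
    long long total = 0;
    int n;
    while ((n = bgzf_read(fp, buf, sizeof(buf))) > 0)
        total += n;

    if (n < 0)
        fprintf(stderr, "read error: %s\n", fp->error ? fp->error : "unknown");
    else
        printf("decompressed %lld bytes\n", total);

    bgzf_close(fp);
    return 0;
}
```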