├── .gitignore
├── .project
├── .pydevproject
├── .settings
    └── org.eclipse.core.resources.prefs
├── CHANGELOG.md
├── Dockerfile
├── LICENSE
├── README.md
├── REF
    ├── Mapabilities
    │   └── hg38
    │   │   ├── MapabilityExclusion.100bp.bed.gz
    │   │   ├── MapabilityExclusion.150.bed.gz
    │   │   └── MapabilityExclusion.70bp.bed.gz
    └── extra-input-files
    │   ├── Human_hg19_nonPolyA_ROI.bed
    │   ├── Human_hg19_wgEncodeDacMapabilityConsensusExcludable.bed.gz
    │   ├── Human_hg38_nonPolyA_ROI.bed
    │   ├── Mouse_mm10_nonPolyA_ROI.bed
    │   ├── Mouse_mm9_nonPolyA_ROI.bed
    │   ├── RNA.SpikeIn.ERCC.fasta.gz
    │   └── URLs
├── bin
    ├── AdaptorDetect.pl
    ├── DESeq2Constructor.R
    ├── IRFinder
    ├── IRFinderBAM
    ├── IRFinderBuildRef
    ├── IRFinderBuildRefDownload
    ├── IRFinderBuildRefFromSTARRef
    ├── IRFinderBuildRefProcess
    ├── IRFinderDiff
    ├── IRFinderFastQ
    ├── IRFinderLong
    ├── TrimBAM4IGV
    ├── analysisWithLowReplicates.pl
    ├── analysisWithNoReplicates.pl
    └── util
    │   ├── Build-BED-refs.sh
    │   ├── IRFinder-BuildRefFromEnsembl
    │   ├── IntronExclusion.pl
    │   ├── Mapability
    │   ├── adjust.R
    │   ├── bash_utils.sh
    │   ├── bed-to-intron+exon.pl
    │   ├── deseq2.R
    │   ├── generateReadsError.pl
    │   ├── gtf2bed-custom.pl
    │   ├── irfinder
    │   ├── irfinder_cnn
    │   ├── model
    │       ├── best_model.h5
    │       ├── best_model.tflite
    │       └── model_info.json
    │   ├── trim
    │   ├── warnings
    │   └── winflat
├── install.sh
└── src
    ├── cnnfilter
        ├── cnnfilter
        │   ├── actions
        │   │   ├── extract.py
        │   │   ├── models.py
        │   │   ├── resultgraph.py
        │   │   └── selectclass.py
        │   ├── main.py
        │   ├── model
        │   │   ├── best_model.h5
        │   │   └── model_info.json
        │   └── utils
        │   │   └── reader.py
        └── testCNN
        │   ├── actions
        │       ├── extract.py
        │       └── models.py
        │   ├── irfinder_cnn.py
        │   ├── model
        │       ├── best_model.h5
        │       └── model_info.json
        │   └── utils
        │       └── reader.py
    ├── irfinder
        ├── .cproject
        ├── .project
        ├── .settings
        │   ├── language.settings.xml
        │   └── org.eclipse.cdt.core.prefs
        ├── Release
        │   ├── makefile
        │   ├── objects.mk
        │   ├── sources.mk
        │   └── src
        │   │   ├── Blocks
        │   │       └── subdir.mk
        │   │   ├── ReadBlock
        │   │       └── subdir.mk
        │   │   ├── Utils
        │   │       └── subdir.mk
        │   │   └── subdir.mk
        └── src
        │   ├── Blocks
        │       ├── BAM2blocks.cpp
        │       ├── BAM2blocks.h
        │       ├── CoverageBlock.cpp
        │       ├── CoverageBlock.h
        │       ├── FragmentBlocks.cpp
        │       └── FragmentBlocks.h
        │   ├── IRFinder2.cpp
        │   ├── ReadBlock
        │       ├── CoverageBlocks.cpp
        │       ├── CoverageBlocks.h
        │       ├── ReadBlockProcessor.cpp
        │       └── ReadBlockProcessor.h
        │   └── Utils
        │       ├── crc32.cpp
        │       ├── crc32.h
        │       └── includedefine.h
    ├── trim
        ├── Makefile
        ├── TrimReads.cpp
        ├── TrimReads.h
        ├── includedefine.h
        ├── sequenceTools.cpp
        ├── sequenceTools.h
        └── trim.cpp
    └── winflat
        ├── Makefile
        ├── README
        ├── runtest.sh
        ├── winflat
        └── winflat_with_beta.c


/.gitignore:
--------------------------------------------------------------------------------
1 | img/
2 | docker_routine.sh
3 | src/irfinder/Release/irfinder
4 | src/irfinder/Release/**/*.o
5 | src/irfinder/Release/**/*.d
6 | 


--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 | 	<name>IRFinder</name>
 4 | 	<comment></comment>
 5 | 	<projects>
 6 | 	</projects>
 7 | 	<buildSpec>
 8 | 		<buildCommand>
 9 | 			<name>org.python.pydev.PyDevBuilder</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 	</buildSpec>
14 | 	<natures>
15 | 		<nature>org.python.pydev.pythonNature</nature>
16 | 	</natures>
17 | </projectDescription>
18 | 


--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 | <?eclipse-pydev version="1.0"?><pydev_project>
3 |         
4 |     <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">python3</pydev_property>
5 |         
6 |     <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python interpreter</pydev_property>
7 |     
8 | </pydev_project>
9 | 


--------------------------------------------------------------------------------
/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//src/cnnfilter/cnnfilter/main.py=utf-8
3 | encoding//src/cnnfilter/testCNN/irfinder_cnn.py=utf-8
4 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # IRFinder Changelogs
 3 | 
 4 | **2.0.0**
 5 |  1. **Novelties**
 6 |     1. New **Long** RunMode to process fast[q|a] files from long reads
 7 |     using Minimap2 as aligner.
 8 |     2. New **-l** argument in **BAM** RunMode, to process long reads using an alternative algorithm. More information in the paper.
 9 |     3. New **AI** process that uses a CNN model to detect false IR events on introns without warning in the last column of the result `IRFinder-IR-[non]dir.txt` file. It will generate a file containing only validated introns ( `IRFinder-IR-[non]dir-val.txt` )
10 |     4. New **Diff** RunMode that uses SUPPA2 ( https://github.com/comprna/SUPPA ) or DESeq2 algorithm to identify differential IR events. 
11 |     5. New **CLI** with dedicated helps for each RunMode and a verbose mode.
12 |     6. New **installation script**, to check the dependencies and install or uninstall IRFinder globally and locally.
13 | 
14 |     7. **Docker** and **Singularity** images available, based on Ubuntu 18 LTS ( bionic ) and containing IRFinder and all its dependencies ( latest versions of STAR, Minimap2 and SUPPA2).
15 | 
16 | 2. **Major changes** ( can impact the results between different versions ) 
17 |     1. **NonUniformIntronCover** warning threshold simplified: now it uses the  25th/50th/75th percentile of intronic depth. Changed from:
18 | ``
19 | (max(Column13, Column14) > 2 + Column9 && max(Column13, Column14) > Column9 * 1.5 ) || (min(Column13, Column14) + 2 < Column9 && min(Column13, Column14)*1.5 < Column9 ) 
20 | ``
21 | to 
22 | `` Column12-Column10 > Column11 ``
23 |     2. **Default  Mapability read length** is now 100 instead of 70. It is no longer hard-coded and can be changed with the argument **-n** in the RunModes *BuildRef[Process|FromSTARRef]*
24 |     3. **Paired reads with one pair unmapped**  are now processed as single reads instead of being removed.
25 | 
26 | 3. **Minor changes** ( no impact on the results but improves the usability)
27 |    1. The mapability file can be given as argument **-M** in the RunModes *BuildRef[Process|FromSTARRef]*. Precomputed mapabilities for hg38 are available under the git subdirectory */REF/Mapabilities/hg38/* for different read lengths ( 70, 100 and 150). This will drastically reduce the time needed to build the IRFinder reference.
28 |    2. New argument **-l** in *BuildRefFromSTARRef* to create a **l**ink to the existing reference STAR folder, the genome file and the annotation file, instead of copying them. This will save disk space in case of multiple IRFinder reference directories using the same STAR reference.
29 | 
30 | **1.3.1**    
31 | 1. IRFinder now exits immediately after error, instead of trying to complete the remaining processes.    
32 | 2. Improved Perl version judgement during Phase 3 of reference preparation.    
33 |     
34 | **1.3.0**    
35 | New features:    
36 | 1. New `BuildRefFromSTARRef` mode. This allows users to use an existing STAR reference to build IRFinder reference, which significantly reduces the total preparation time. This new mode also tries to automatically figure out the original FASTA and GTF files used to generate the existing STAR reference. Call `IRFinder -h` for more details.    
37 | 2. `BuildRef` and `BuildRefProcess` mode now support `-j` option to parse an integer that changes the default value of `--sjdbOverhang` argument in STAR.    
38 | 3. `FASTQ` mode now supports `-y` option to feed extra STAR arguments to control alignment behaviors.    
39 |     
40 | Improvements:    
41 | 1. `FASTQ` mode now outputs a full BAM file in "Unsorted.bam", instead of a BAM file with a trimmed QS column.   
42 | 2. IRFinder does not automatically generate "unsorted.frag.bam" to save disk space and to avoid redundancy to "Unsorted.bam". Instead, IRFinder now provides a tool at `bin/TrimBAM4IGV` to generate this kind of trimmed BAM file to facilitate visualization purpose in IGV.     
43 | 3. Re-design of standard output information during IRFinder reference preparation. It is now easier to recognize errors that occurred.    
44 | 4. Usage information now can be viewed by `-h` option.     
45 |      
46 | Bug fixes:    
47 | 1. The mapability calculation during the IRFinder reference preparation stage has been re-designed. The previous algorithm encountered buffer size issues when dealing with genomes with a huge amount of chromosomes/scaffolds. This has been fixed. Please note, the new algorithm requires `samtools` (>=1.4) executable binary ready in $PATH.    
48 | 2. Since Perl 5.28.0, `sort '_mergesort'` is no longer supported. IRFinder now checks the Perl version and uses `sort` functions correspondingly.    
49 |     
50 | **1.2.6**
51 | 1. IRFinder now keeps introns with the same effective regions as separate entries in the reference.    
52 | 2. IRFinder now automatically checks if the reference preparation stage generates empty reference files, which indicates process failure.    
53 | 3. The R object generated by Differential IR Analysis script now includes an additional slot named "MaxSplice", which represents the maximum splice reads at either end of introns. Each value is the maximum value between Column 17 and 18 in the IR quantification output.    
54 | 4. During differential IR analysis, values in "MaxSplice" are now used as the denominators in the GLM, instead of using the values of Column 19 in the IR quantification output. This makes the IR ratio in the differential IR analysis more consistent with the values of Column 20 in the IR quantification output.    
55 | 5. User manual has been updated.    
56 |     
57 | **1.2.5**
58 | 1. Headers are now correctly added to output files `IRFinder-IR-dir.txt` and `IRFinder-IR-nondir.txt`.
59 | 
60 | **1.2.4**
61 | 1. In the GLM-based method for differential IR comparison, the original matrix for DESeq2 is now made up of IR depth and correct splicing depth. In the previous versions, the latter was the sum of splicing depth and IR depth. This change is supposed to give a smoother dispersion estimation across all introns.
62 | 
63 | **1.2.3:**
64 | 1. IRFinder now supports GTF attribution tags `gene_type` and `transcript_type` upon the original requirement for typical Ensembl tags `gene_biotype` and `transcript_biotype`. Either of these two pairs is required to correctly build IRFinder reference.    
65 |     
66 | **1.2.2:**
67 | 1. In GLM-based differential IR comparison, fixed an error caused by duplicated row names when creating DESeq2 object with a version of DESeq2 later than 1.10.
68 | 
69 | **1.2.1:**
70 | 1. Improved the performance of DESeq2-based GLM analysis for differential IR. This new approach should improve the estimation of dispersion. Normal splicing from IRFinder result is now used as a variable in the GLM, instead of using the value of normal splicing as an offset. This approach is adapted from [detection of allele-specific expression](http://rpubs.com/mikelove/ase) from Michael Love. See Wiki page for details.
71 | 2. Updated some out-of-date usage information
72 | 
73 | **1.2.0:**
74 | 1. IRFinder is now compatible with GLM-based analysis. This is achieved by passing IRFinder result to DESeq2 using the function in bin/DESeq2Constructor.R. See Wiki page for details  
75 | 2. Fixed the conflict with latest version "bedtools complement" that used to cause failure in preparing IRFinder reference  
76 | 3. Improved memory usage when passing lines to bedtools genomecov. This is also supposed to benefit reference preparation of those genomes with a lot of chromosomes contigs. Thanks for the smart solution from Andreas @andpet0101.  
77 | 4. Specified the gtf file to be downloaded during reference preparation via automatic downloading. Ensembl currently holds several versions of gtf files for the same genome release. This confused IRFinder BuildRefDownload function in the previous version.
78 | 5. Added -v option to print out version number.
79 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | ARG R_VERSION=4.1.2
 2 | # Base image: rocker/r-ver at the R version selected above (override with --build-arg R_VERSION=...).
 3 | FROM rocker/r-ver:${R_VERSION}
 4 | 
 5 | LABEL version=v2.0.1
 6 | 
 7 | # Runtime environment. /Utils/bin receives the STAR, suppa.py and minimap2 symlinks created below.
 8 | ENV LD_LIBRARY_PATH="/usr/local/lib/:$LD_LIBRARY_PATH"
 9 | ENV PYTHONNOUSERSITE="true"
10 | ENV PATH="/Utils/bin/:${PATH}"
11 | 
12 | ARG DEBIAN_FRONTEND=noninteractive
13 | # python3 is requested by its generic name: a pinned minor version may not be packaged for the base image.
14 | ### All the dependencies
15 | RUN apt-get update && \
16 |     apt-get -y upgrade && \
17 |     export DEBIAN_FRONTEND=noninteractive && \
18 |     apt-get install -qy make build-essential libxml2-dev libcurl4-openssl-dev gcc bedtools samtools git gzip \
19 | 		zlib1g gawk libz-dev wget libboost-iostreams-dev python3 apt-transport-https software-properties-common \
20 | 		 python3-pip && \
21 |     apt-get clean && \
22 | 	rm -rf /var/lib/apt/lists/*
23 | # Python packages installed with pip; --no-cache-dir keeps the pip download cache out of the layer.
24 | RUN	pip3 install -U --no-cache-dir numpy pandas \
25 |     	scikit-learn scipy \
26 |     	statsmodels
27 | # Install the Bioconductor packages, then print the installed DESeq2 version to the build log.
28 | RUN	Rscript -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager", force=TRUE) } ; BiocManager::install(c("BiocManager", "tximport", "readr", "RCurl", "DESeq2"), force=TRUE,ask=F, quiet=F)' && \
29 | 	Rscript -e 'options(warn=2); installed.packages()' | awk  'BEGIN {v=0} $1=="Version" {v=1; } v==1 && $1 == "DESeq2" { gsub("\"", ""); print $2;v=0 } '
30 | 
31 | # Build STAR (tag 2.7.9a) from source and fetch SUPPA; a python3 shebang is prepended to suppa.py
32 | # so that it can be executed directly through the /Utils/bin symlink.
33 | RUN mkdir -p /Utils/bin/ && \
34 |     cd /Utils/ && \
35 |     git clone https://github.com/alexdobin/STAR.git && \
36 | 	cd ./STAR &&  git checkout tags/2.7.9a  && \
37 |     cd ./source && \
38 |     make STAR && \
39 |     ln -s /Utils/STAR/source/STAR /Utils/bin/STAR && \
40 | 	cd /Utils && \
41 | 	git clone https://github.com/comprna/SUPPA.git && \
42 | 	cd ./SUPPA && \
43 | 	echo '#!/usr/bin/env python3' > /Utils/SUPPA/suppa.py.tmp && \
44 | 	cat /Utils/SUPPA/suppa.py >> /Utils/SUPPA/suppa.py.tmp && \
45 | 	mv /Utils/SUPPA/suppa.py.tmp /Utils/SUPPA/suppa.py && \
46 | 	chmod +x /Utils/SUPPA/suppa.py && \
47 | 	ln -s /Utils/SUPPA/suppa.py /Utils/bin/suppa.py
48 | # Build minimap2 from source. NOTE(review): v2.3 is a very early tag - confirm it is intended (not e.g. v2.23).
49 | RUN cd /Utils/ && git clone https://github.com/lh3/minimap2 && \
50 | 	cd minimap2 && git checkout tags/v2.3 && make && \
51 | 	ln -s /Utils/minimap2/minimap2 /Utils/bin/minimap2
52 | # Cache buster: the downloaded random bytes differ on every build, so the COPY/install layers below
53 | # are always rebuilt. This requires network access to random.org at build time.
54 | ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache
55 | 
56 | COPY ./bin /IRFinder/bin
57 | COPY ./REF /IRFinder/REF
58 | COPY ./src /IRFinder/src
59 | COPY ./install.sh /IRFinder/
60 | RUN    cd /IRFinder/ && \
61 | 	./install.sh
62 | 
63 | # The container runs the IRFinder command; arguments given to `docker run` are passed to it.
64 | 
65 | 
66 | ENTRYPOINT ["IRFinder"]
67 | 
68 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 william ritchie
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # IRFinder-S
 3 | IRFinder-S is a suite of tools to analyse and explore intron retention events in multiple samples. It comprises:
 4 | 
 5 | - IRFinder : detect intron retention from RNA-Seq experiments. Includes an automatic CNN filter that emulates a visual inspection to validate the events.
 6 | - IRBase   : visualize and share IRFinder's results. 
 7 | 
 8 | To start using IRFinder, read our [wiki user manual.](https://github.com/RitchieLabIGH/IRFinder/wiki)
 9 | 
10 | [CHANGELOG](CHANGELOG.md)
11 | 
12 | IRFinder Version 1 is still available at https://github.com/williamritchie/IRFinder but is no longer maintained.
13 | ## About IRFinder
14 | 
15 | IRFinder, developed at the [Center for Genomic Medicine of Massachusetts General Hospital](https://cgm.massgeneral.org/), the [CNRS](http://www.cnrs.com) and the [Centenary Institute](https://www.centenary.org.au), implements an end-to-end analysis of intron retention (IR) from mRNA sequencing data in multiple species.    
16 | IRFinder includes alignment via the STAR (for short reads) and minimap2 (for long reads) algorithms, quality controls on the sample analyzed, IR detection, quantification, convolutional neural network based validation and statistical comparison between multiple samples. 
17 | IRFinder was capable of estimating IR events with low coverage or low mappability as confirmed by RT-qPCR.
18 | 
19 | 
20 |     
21 | ## Before Start: Intron Retention Database - [IRBase](http://irbase.igh.cnrs.fr/)
22 | Before diving into the IRFinder package, users might also consider [IRBase](http://irbase.igh.cnrs.fr/). It is a database for human IR inquiry and visualization, based upon pre-calculated IRFinder results from **over 935** publicly available human cell line RNA-Seq samples.     
23 | [IRBase](http://irbase.igh.cnrs.fr/) allows users to enquire, visualize and download single-gene IR results in a tissue/cell-type of interest, download transcriptome-wide IR results of a sample of interest, upload your results to compare with the public ones and share them with the community.
24 | 
25 |         
26 | ## Cite IRFinder    
27 | 
28 | Lorenzi, C., Barriere, S., Arnold, K. et al. IRFinder-S: a comprehensive suite to discover and explore intron retention. [Genome Biol 22, 307 (2021)](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02515-8). doi: [10.1186/s13059-021-02515-8](https://doi.org/10.1186/s13059-021-02515-8)
29 | 
30 | Middleton R*, Gao D*, Thomas A, Singh B, Au A, Wong JJ, Bomane A, Cosson B, Eyras E, Rasko JE, Ritchie W. **IRFinder: assessing the impact of intron retention on mammalian gene expression**. [Genome Biol. 2017 Mar 15;18(1):51](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-017-1184-4). doi: 10.1186/s13059-017-1184-4. [PubMed PMID: 28298237](https://pubmed.ncbi.nlm.nih.gov/28298237/).
31 | 
32 | 


--------------------------------------------------------------------------------
/REF/Mapabilities/hg38/MapabilityExclusion.100bp.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/REF/Mapabilities/hg38/MapabilityExclusion.100bp.bed.gz


--------------------------------------------------------------------------------
/REF/Mapabilities/hg38/MapabilityExclusion.150.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/REF/Mapabilities/hg38/MapabilityExclusion.150.bed.gz


--------------------------------------------------------------------------------
/REF/Mapabilities/hg38/MapabilityExclusion.70bp.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/REF/Mapabilities/hg38/MapabilityExclusion.70bp.bed.gz


--------------------------------------------------------------------------------
/REF/extra-input-files/Human_hg19_nonPolyA_ROI.bed:
--------------------------------------------------------------------------------
  1 | 1	28160912	28161077
  2 | 1	28975112	28975245
  3 | 1	32674695	32681797
  4 | 1	45241536	45241615
  5 | 1	45242162	45242265
  6 | 1	109642815	109643241
  7 | 1	149754245	149783928
  8 | 1	149784826	149785236
  9 | 1	149858525	149858961
 10 | 1	149859019	149859466
 11 | 1	155895749	155895877
 12 | 1	228645065	228645560
 13 | 1	228645808	228646259
 14 | 1	235291118	235291252
 15 | 10	101996913	101997059
 16 | 11	811681	811814
 17 | 11	2985001	2985123
 18 | 11	8705774	8705903
 19 | 11	9450320	9450501
 20 | 11	10823014	10823155
 21 | 11	62432894	62433042
 22 | 11	75111435	75111582
 23 | 11	75115465	75115610
 24 | 11	93464145	93464265
 25 | 11	93465527	93465665
 26 | 11	93466632	93466763
 27 | 11	93468277	93468402
 28 | 12	6619388	6619717
 29 | 12	7076500	7076769
 30 | 12	14920933	14924065
 31 | 12	49048165	49048301
 32 | 12	62995531	62997214
 33 | 12	98993413	98993661
 34 | 12	132515769	132515904
 35 | 13	27829538	27829663
 36 | 13	45911615	45911744
 37 | 14	20811207	20811844
 38 | 14	21860309	21860412
 39 | 14	21865451	21865560
 40 | 14	95999692	95999966
 41 | 14	103804186	103804311
 42 | 15	66795581	66795652
 43 | 16	2205024	2205106
 44 | 16	58582403	58582537
 45 | 17	7809440	7809578
 46 | 17	37009116	37009247
 47 | 17	58308877	58309007
 48 | 18	51748654	51748782
 49 | 19	17973397	17973529
 50 | 19	57791419	57804937
 51 | 2	86362993	86363129
 52 | 2	234184373	234184648
 53 | 2	234197322	234197586
 54 | 20	17943353	17943589
 55 | 20	37053843	37053979
 56 | 20	37058313	37058446
 57 | 20	37062508	37062641
 58 | 20	47895477	47895565
 59 | 20	47896856	47896946
 60 | 21	33749496	33749631
 61 | 3	39449880	39450030
 62 | 3	39452545	39452697
 63 | 3	160232695	160233024
 64 | 3	169482308	169482848
 65 | 3	186502585	186502653
 66 | 3	186504464	186504641
 67 | 3	186505089	186505220
 68 | 4	53579416	53579537
 69 | 5	82360023	82360156
 70 | 5	111497182	111497314
 71 | 5	138614470	138614667
 72 | 5	140090860	140090958
 73 | 5	172447731	172447931
 74 | 6	26020718	26021186
 75 | 6	26021907	26022278
 76 | 6	26027124	26027480
 77 | 6	26031817	26032288
 78 | 6	26033320	26033796
 79 | 6	26043455	26043885
 80 | 6	26045639	26046097
 81 | 6	26055968	26056699
 82 | 6	26115101	26124154
 83 | 6	26124373	26139344
 84 | 6	26156559	26157343
 85 | 6	26158349	26171577
 86 | 6	26188938	26189304
 87 | 6	26197068	26199521
 88 | 6	26199748	26200942
 89 | 6	26216428	26216872
 90 | 6	26217165	26217711
 91 | 6	26225383	26225844
 92 | 6	26250370	26250835
 93 | 6	26251879	26252303
 94 | 6	26271146	26271612
 95 | 6	26273144	26273622
 96 | 6	27093676	27100541
 97 | 6	27100832	27103070
 98 | 6	27114861	27115317
 99 | 6	27775257	27775709
100 | 6	27777842	27778314
101 | 6	27782112	27782607
102 | 6	27782822	27783267
103 | 6	27805658	27806117
104 | 6	27806323	27823487
105 | 6	27834570	27835359
106 | 6	27858093	27860963
107 | 6	27861203	27861669
108 | 6	116440086	116479910
109 | 6	160201282	160201413
110 | 7	45143948	45144081
111 | 7	45144505	45144641
112 | 8	99054314	99054445
113 | 8	128959126	128960591
114 | 9	35657748	35658015
115 | 9	95054743	95054875
116 | 9	125796806	125797975
117 | 9	130210780	130210909
118 | 


--------------------------------------------------------------------------------
/REF/extra-input-files/Human_hg19_wgEncodeDacMapabilityConsensusExcludable.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/REF/extra-input-files/Human_hg19_wgEncodeDacMapabilityConsensusExcludable.bed.gz


--------------------------------------------------------------------------------
/REF/extra-input-files/Human_hg38_nonPolyA_ROI.bed:
--------------------------------------------------------------------------------
  1 | 1	27834401	27834566
  2 | 1	28648600	28648733
  3 | 1	32209094	32216196
  4 | 1	44775864	44775943
  5 | 1	44776490	44776593
  6 | 1	109100193	109100619
  7 | 1	149782689	149812373
  8 | 1	149813271	149813681
  9 | 1	149886975	149887411
 10 | 1	149887469	149887916
 11 | 1	155925958	155926086
 12 | 1	228457364	228457859
 13 | 1	228458107	228458558
 14 | 1	235127803	235127937
 15 | 10	100237156	100237302
 16 | 11	811681	811814
 17 | 11	2963771	2963893
 18 | 11	8684227	8684356
 19 | 11	9428773	9428954
 20 | 11	10801467	10801608
 21 | 11	62665422	62665570
 22 | 11	75400391	75400538
 23 | 11	75404421	75404566
 24 | 11	93730979	93731099
 25 | 11	93732361	93732499
 26 | 11	93733466	93733597
 27 | 11	93735111	93735236
 28 | 12	6510222	6510551
 29 | 12	6967337	6967606
 30 | 12	14767999	14771131
 31 | 12	48654382	48654518
 32 | 12	62601751	62603434
 33 | 12	98599635	98599883
 34 | 12	132031224	132031359
 35 | 13	27255401	27255526
 36 | 13	45337480	45337609
 37 | 14	20343048	20343685
 38 | 14	21392150	21392253
 39 | 14	21397292	21397401
 40 | 14	95533355	95533629
 41 | 14	103337849	103337974
 42 | 15	66503243	66503314
 43 | 16	2155023	2155105
 44 | 16	58548499	58548633
 45 | 17	7906122	7906260
 46 | 17	38852863	38852994
 47 | 17	60231516	60231646
 48 | 18	54222284	54222412
 49 | 19	17862588	17862720
 50 | 19	57280051	57293569
 51 | 2	86135870	86136006
 52 | 2	233275727	233276002
 53 | 2	233288676	233288940
 54 | 20	17962710	17962946
 55 | 20	38425195	38425331
 56 | 20	38429670	38429803
 57 | 20	38433865	38433998
 58 | 20	49278940	49279028
 59 | 20	49280319	49280409
 60 | 21	32377187	32377322
 61 | 3	39408389	39408539
 62 | 3	39411054	39411206
 63 | 3	160514907	160515236
 64 | 3	169764520	169765060
 65 | 3	186784796	186784864
 66 | 3	186786675	186786852
 67 | 3	186787300	186787431
 68 | 4	52713249	52713370
 69 | 5	83064204	83064337
 70 | 5	112161485	112161617
 71 | 5	139278781	139278978
 72 | 5	140711275	140711373
 73 | 5	173020728	173020928
 74 | 6	26020490	26020958
 75 | 6	26021679	26022050
 76 | 6	26026896	26027252
 77 | 6	26031589	26032060
 78 | 6	26033092	26033568
 79 | 6	26043227	26043657
 80 | 6	26045411	26045869
 81 | 6	26055740	26056471
 82 | 6	26114873	26123926
 83 | 6	26124145	26139116
 84 | 6	26156331	26157115
 85 | 6	26158121	26171349
 86 | 6	26188710	26189076
 87 | 6	26196840	26199293
 88 | 6	26199520	26200714
 89 | 6	26216200	26216644
 90 | 6	26216937	26217483
 91 | 6	26225155	26225616
 92 | 6	26250142	26250607
 93 | 6	26251651	26252075
 94 | 6	26270918	26271384
 95 | 6	26272916	26273394
 96 | 6	27125897	27132762
 97 | 6	27133053	27135291
 98 | 6	27147082	27147538
 99 | 6	27807479	27807931
100 | 6	27810064	27810536
101 | 6	27814334	27814829
102 | 6	27815044	27815489
103 | 6	27837880	27838339
104 | 6	27838545	27855709
105 | 6	27866792	27867581
106 | 6	27890315	27893185
107 | 6	27893425	27893891
108 | 6	116118923	116158747
109 | 6	159780250	159780381
110 | 7	45104349	45104482
111 | 7	45104906	45105042
112 | 8	98042086	98042217
113 | 8	127946880	127948345
114 | 9	35657751	35658018
115 | 9	92292461	92292593
116 | 9	123034527	123035696
117 | 9	127448501	127448630
118 | 


--------------------------------------------------------------------------------
/REF/extra-input-files/Mouse_mm10_nonPolyA_ROI.bed:
--------------------------------------------------------------------------------
  1 | 1	86099026	86111970
  2 | 1	87776938	87777209
  3 | 1	87784556	87784820
  4 | 1	127375131	127375239
  5 | 1	133601223	133601355
  6 | 10	11959880	11960005
  7 | 10	34389981	34397085
  8 | 10	91118291	91118536
  9 | 10	117183335	117183411
 10 | 10	125226464	125328963
 11 | 11	6619808	6619940
 12 | 11	6620319	6620454
 13 | 11	11913621	11913748
 14 | 11	20847753	20847881
 15 | 11	58948911	58949532
 16 | 11	58954685	58956674
 17 | 11	69350518	69350661
 18 | 11	69666936	69672423
 19 | 11	97777527	97782437
 20 | 11	99017031	99017152
 21 | 11	106501245	106501377
 22 | 11	116163910	116164036
 23 | 12	16939994	16940115
 24 | 12	31258933	31259062
 25 | 12	105031075	105031349
 26 | 12	111540941	111541067
 27 | 13	21715763	21716143
 28 | 13	21717628	21718115
 29 | 13	21722098	21722478
 30 | 13	21735062	21735837
 31 | 13	21750194	21750505
 32 | 13	21753395	21753907
 33 | 13	21754123	21754503
 34 | 13	21779883	21780625
 35 | 13	21782915	21783397
 36 | 13	21786826	21787218
 37 | 13	21787461	21789213
 38 | 13	21806412	21810199
 39 | 13	21810465	21810944
 40 | 13	21811746	21812150
 41 | 13	21831767	21832196
 42 | 13	21833057	21833575
 43 | 13	21833743	21837530
 44 | 13	22035113	22035643
 45 | 13	22035821	22036299
 46 | 13	22040816	22041352
 47 | 13	22042479	22042949
 48 | 13	22043214	22043676
 49 | 13	23531044	23531519
 50 | 13	23533906	23534304
 51 | 13	23535422	23535951
 52 | 13	23542924	23543359
 53 | 13	23544052	23545055
 54 | 13	23551258	23551648
 55 | 13	23570517	23571220
 56 | 13	23571396	23572013
 57 | 13	23573736	23574196
 58 | 13	23574381	23574952
 59 | 13	23581598	23581990
 60 | 13	23583742	23621124
 61 | 13	23621755	23622502
 62 | 13	23684199	23692488
 63 | 13	23738807	23740366
 64 | 13	23744973	23745602
 65 | 13	23746734	23747202
 66 | 13	23751088	23751593
 67 | 13	23756937	23757427
 68 | 13	23760802	23761249
 69 | 13	23761853	23762386
 70 | 13	24811447	24811568
 71 | 13	49684301	49684433
 72 | 13	51202688	51203065
 73 | 13	62136762	62136831
 74 | 13	62543434	62543503
 75 | 13	74371426	74376566
 76 | 13	75905147	75905278
 77 | 13	95332618	95332749
 78 | 14	11227552	11227992
 79 | 14	26497655	26497785
 80 | 14	32191854	32192050
 81 | 14	52209785	52209894
 82 | 14	57333366	57333485
 83 | 14	57697809	57697937
 84 | 15	71794188	71794365
 85 | 15	98519716	98519853
 86 | 16	23107444	23114136
 87 | 16	43886682	43886814
 88 | 16	52105015	52105147
 89 | 16	53404086	53404193
 90 | 16	54354462	54354593
 91 | 16	71663788	71663918
 92 | 16	90602496	90602617
 93 | 17	12922790	12922917
 94 | 17	24528476	24528553
 95 | 17	55915403	55915870
 96 | 18	33795164	33795295
 97 | 18	35557032	35557227
 98 | 18	36801866	36802008
 99 | 19	8888538	8888685
100 | 19	20033504	20033635
101 | 19	44113979	44114124
102 | 19	46359036	46359144
103 | 19	47170469	47171134
104 | 2	19934863	19934953
105 | 2	32675109	32675245
106 | 2	32963291	32963420
107 | 2	37516332	37520603
108 | 2	38997476	39006168
109 | 2	85420137	85420280
110 | 2	144265979	144266216
111 | 2	158356424	158356554
112 | 2	158358360	158358472
113 | 2	158359798	158359929
114 | 2	158378222	158378354
115 | 2	167063473	167063565
116 | 2	167064998	167065086
117 | 2	171306168	171306431
118 | 3	24333046	24333552
119 | 3	30595346	30595757
120 | 3	69085105	69085447
121 | 3	86100534	86100655
122 | 3	88693930	88694057
123 | 3	96219865	96220308
124 | 3	96220361	96220880
125 | 3	96221121	96223738
126 | 3	96238110	96239127
127 | 3	96261704	96263311
128 | 3	96269721	96279001
129 | 3	96414437	96414859
130 | 3	108554338	108554751
131 | 3	128540372	128540480
132 | 3	150072590	150073620
133 | 4	43492788	43493058
134 | 4	117153827	117156243
135 | 4	129608331	129614257
136 | 4	132270080	132270213
137 | 4	132838383	132838547
138 | 4	134167808	134167895
139 | 5	92429785	92429928
140 | 5	100831281	100831414
141 | 5	110692049	110692181
142 | 5	146832890	146837032
143 | 5	149145724	149145821
144 | 6	8501236	8501356
145 | 6	39422289	39422419
146 | 6	52639234	52639355
147 | 6	71882557	71882693
148 | 6	124715232	124715502
149 | 6	132656957	132657844
150 | 6	132777179	132778162
151 | 6	136801553	136804431
152 | 7	97521808	97521916
153 | 7	99479563	99479707
154 | 7	99482785	99482932
155 | 7	109519147	109522367
156 | 7	110023210	110023342
157 | 7	110046364	110046547
158 | 7	111076060	111076227
159 | 7	118153480	118153610
160 | 7	127527874	127528003
161 | 7	141447370	141451585
162 | 7	143531394	143531520
163 | 8	13876097	13876226
164 | 8	57549775	57549888
165 | 8	69742862	69774886
166 | 8	70894722	70897443
167 | 8	95746060	95746195
168 | 8	110923116	110923263
169 | 8	121666628	121666706
170 | 9	3352657	3352786
171 | 9	15306214	15312104
172 | 9	15313802	15313932
173 | 9	15314845	15314981
174 | 9	15316489	15316588
175 | 9	64173387	64178562
176 | 9	120128780	120128935
177 | X	35838127	35838401
178 | X	93164902	93164984
179 | X	121308217	121308340
180 | X	156455999	156456095
181 | 


--------------------------------------------------------------------------------
/REF/extra-input-files/Mouse_mm9_nonPolyA_ROI.bed:
--------------------------------------------------------------------------------
  1 | 1	87995601	88008545
  2 | 1	89673513	89673784
  3 | 1	89681131	89681395
  4 | 1	129271708	129271816
  5 | 1	135497800	135497932
  6 | 10	11679678	11679803
  7 | 10	34109787	34116891
  8 | 10	90581036	90581281
  9 | 10	116620391	116620467
 10 | 10	124663520	124766019
 11 | 11	6519811	6519943
 12 | 11	6520322	6520457
 13 | 11	11813624	11813751
 14 | 11	20747756	20747884
 15 | 11	58762413	58763034
 16 | 11	58768187	58770176
 17 | 11	69164020	69164163
 18 | 11	69480438	69485925
 19 | 11	97638841	97643751
 20 | 11	98878345	98878466
 21 | 11	106362559	106362691
 22 | 11	116025224	116025350
 23 | 12	16946800	16946921
 24 | 12	31943798	31943927
 25 | 12	106269285	106269559
 26 | 12	112779152	112779278
 27 | 13	21807632	21808012
 28 | 13	21809497	21809984
 29 | 13	21813967	21814347
 30 | 13	21826931	21827706
 31 | 13	21842063	21842374
 32 | 13	21845264	21845776
 33 | 13	21845992	21846372
 34 | 13	21871752	21872494
 35 | 13	21874784	21875266
 36 | 13	21878695	21879087
 37 | 13	21879330	21881082
 38 | 13	21898281	21902068
 39 | 13	21902334	21902813
 40 | 13	21903615	21904019
 41 | 13	21923636	21924065
 42 | 13	21924926	21925444
 43 | 13	21925612	21929399
 44 | 13	22126982	22127512
 45 | 13	22127690	22128168
 46 | 13	22132685	22133221
 47 | 13	22134348	22134818
 48 | 13	22135083	22135545
 49 | 13	23622913	23623388
 50 | 13	23625775	23626173
 51 | 13	23627291	23627820
 52 | 13	23634793	23635228
 53 | 13	23635921	23636924
 54 | 13	23643127	23643517
 55 | 13	23662386	23663089
 56 | 13	23663265	23663882
 57 | 13	23665605	23666065
 58 | 13	23666250	23666821
 59 | 13	23673467	23673859
 60 | 13	23675611	23712993
 61 | 13	23713624	23714371
 62 | 13	23776068	23784357
 63 | 13	23830676	23832235
 64 | 13	23836842	23837471
 65 | 13	23838603	23839071
 66 | 13	23842957	23843462
 67 | 13	23848806	23849296
 68 | 13	23852671	23853118
 69 | 13	23853722	23854255
 70 | 13	24903316	24903437
 71 | 13	49779670	49779802
 72 | 13	51298057	51298434
 73 | 13	62238122	62238191
 74 | 13	62644794	62644863
 75 | 13	74508874	74514014
 76 | 13	76042595	76042726
 77 | 13	96102573	96102704
 78 | 14	12060066	12060506
 79 | 14	27317141	27317271
 80 | 14	33005040	33005236
 81 | 14	52829460	52829569
 82 | 14	57952203	57952322
 83 | 14	58316646	58316774
 84 | 15	71624618	71624795
 85 | 15	98350147	98350284
 86 | 16	23107517	23114209
 87 | 16	43886795	43886927
 88 | 16	52105128	52105260
 89 | 16	53404199	53404306
 90 | 16	54354575	54354706
 91 | 16	71664033	71664163
 92 | 16	90602741	90602862
 93 | 17	13115656	13115783
 94 | 17	24665421	24665498
 95 | 17	56054826	56055293
 96 | 18	33954818	33954949
 97 | 18	35716686	35716881
 98 | 18	36961520	36961662
 99 | 19	8963028	8963175
100 | 19	20107994	20108125
101 | 19	44188469	44188614
102 | 19	46433526	46433634
103 | 19	47244959	47245624
104 | 2	19856490	19856580
105 | 2	32530629	32530765
106 | 2	32818811	32818940
107 | 2	37371852	37376123
108 | 2	38852996	38861688
109 | 2	85260294	85260437
110 | 2	144091715	144091952
111 | 2	158182160	158182290
112 | 2	158184096	158184208
113 | 2	158185534	158185665
114 | 2	158203958	158204090
115 | 2	166888973	166889065
116 | 2	166890498	166890586
117 | 2	171131668	171131931
118 | 3	24231968	24232474
119 | 3	30494268	30494679
120 | 3	68889027	68889369
121 | 3	85904456	85904577
122 | 3	88497852	88497979
123 | 3	96023788	96024231
124 | 3	96024284	96024803
125 | 3	96025044	96027661
126 | 3	96042033	96043050
127 | 3	96065627	96067234
128 | 3	96073644	96082924
129 | 3	96218360	96218782
130 | 3	108357256	108357669
131 | 3	128243290	128243398
132 | 3	149735554	149736584
133 | 4	43505660	43505930
134 | 4	116826432	116828848
135 | 4	129285575	129291501
136 | 4	131825995	131826128
137 | 4	132394298	132394462
138 | 4	133723723	133723810
139 | 5	92858811	92858954
140 | 5	101260300	101260433
141 | 5	111121068	111121200
142 | 5	147644466	147648608
143 | 5	149957300	149957397
144 | 6	8451236	8451356
145 | 6	39372288	39372418
146 | 6	52589228	52589349
147 | 6	71832551	71832687
148 | 6	124665250	124665520
149 | 6	132606975	132607862
150 | 6	132727197	132728180
151 | 6	136750074	136752952
152 | 7	104670318	104670426
153 | 7	106628073	106628217
154 | 7	106631295	106631442
155 | 7	116662661	116665881
156 | 7	117166724	117166856
157 | 7	117189878	117190061
158 | 7	118219574	118219741
159 | 7	125296994	125297124
160 | 7	134671388	134671517
161 | 7	148633269	148637484
162 | 7	150717299	150717425
163 | 8	13876097	13876226
164 | 8	60028572	60028685
165 | 8	72266761	72298785
166 | 8	73418621	73421342
167 | 8	98269960	98270095
168 | 8	113447016	113447163
169 | 8	124190528	124190606
170 | 9	3352657	3352786
171 | 9	15110658	15116548
172 | 9	15118246	15118376
173 | 9	15119289	15119425
174 | 9	15120933	15121032
175 | 9	64021194	64026369
176 | 9	120037898	120038053
177 | X	33378122	33378396
178 | X	90410241	90410323
179 | X	118421826	118421949
180 | X	152890542	152890638
181 | 


--------------------------------------------------------------------------------
/REF/extra-input-files/RNA.SpikeIn.ERCC.fasta.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/REF/extra-input-files/RNA.SpikeIn.ERCC.fasta.gz


--------------------------------------------------------------------------------
/REF/extra-input-files/URLs:
--------------------------------------------------------------------------------
 1 | Human:
 2 | http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/wgEncodeDacMapabilityConsensusExcludable.txt.gz
 3 | 
 4 | ERCC:
 5 | ftp://ftp.ncbi.nlm.nih.gov/repository/acedb/SEQC_Reference_Targets/RNA.SpikeIn.ERCC.fasta.gz
 6 | 
 7 | === Ensembl Base FTP ===
 8 | 
 9 | mm10: (mouse)
10 | ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/dna/
11 | 
12 | hg19: (human, popular)
13 | ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/
14 | 
15 | hg38: (human, most recent)
16 | ftp://ftp.ensembl.org/pub/release-81/fasta/homo_sapiens/dna/
17 | 


--------------------------------------------------------------------------------
/bin/DESeq2Constructor.R:
--------------------------------------------------------------------------------
 1 | # Build a DESeq2 data set from a set of IRFinder result tables.
 2 | #
 3 | # filePaths      : paths of the IRFinder result files, one per sample. Every
 4 | #                  file must list the same introns in the same order, because
 5 | #                  intron names are derived from the first file only.
 6 | # designMatrix   : sample description table; its first column labels the
 7 | #                  count columns and the table (stacked twice) is the colData.
 8 | # designFormula  : design passed unchanged to DESeqDataSetFromMatrix.
 9 | # irratio_thr    : an intron is kept when depth/(depth+maxSplice) reaches this
10 | #                  threshold in at least one sample.
11 | # warning_filter : regular expression tested against column 21; an intron is
12 | #                  discarded as soon as one sample matches it.
13 | #
14 | # Returns a named list: "DESeq2Object" (filtered, rounded counts, size factors
15 | # fixed to 1) plus the unfiltered "IntronDepth", "SpliceDepth" and "MaxSplice"
16 | # matrices (one column per file).
17 | DESeqDataSetFromIRFinder = function(filePaths,designMatrix,designFormula,irratio_thr=0, warning_filter="^$" ){
18 |     res=c()
19 |     libsz=c()
20 |     spl=c()
21 |     irtest=read.table(filePaths[1])
22 |     if (irtest[1,1]=="Chr"){irtest=irtest[-1,]}
23 |     # Intron identifier: name/chr:start-end:strand (columns 4, 1, 2, 3, 6).
24 |     irnames=unname(apply(as.matrix(irtest),1,FUN=function(x){return(paste0(x[4],"/",x[1],":",x[2],"-",x[3],":",x[6]))}))
25 |     n=1
26 |     warns=c()
27 |     ratio_mask=c()
28 |     for (i in filePaths){
29 |         print(paste0("processing file ",n," at ",i))
30 |         irtab=read.table(i)
31 |         if (irtab[1,1]=="Chr"){irtab=irtab[-1,]}
32 |         # Rows are matched by position only, so a table of a different size
33 |         # would be silently recycled or misaligned by cbind below.
34 |         if (nrow(irtab) != length(irnames)){
35 |             stop(paste0("File ",i," has ",nrow(irtab)," rows but ",filePaths[1]," has ",length(irnames)))
36 |         }
37 |         tmp1=as.numeric(as.vector(irtab[,9]))    # reported as "IntronDepth"
38 |         tmp2=as.numeric(as.vector(irtab[,19]))   # reported as "SpliceDepth"
39 |         tmp4=as.numeric(as.vector(irtab[,17]))
40 |         tmp5=as.numeric(as.vector(irtab[,18]))
41 |         tmp6=pmax(tmp4,tmp5, na.rm=T)            # reported as "MaxSplice"
42 |         res=cbind(res,tmp1)
43 |         libsz=cbind(libsz,tmp2)
44 |         spl=cbind(spl,tmp6)
45 |         # warns stays TRUE only for introns that no sample flags.
46 |         if (length(warns) == 0){
47 |           warns= ! grepl(as.character(irtab[,21]), pattern = warning_filter )  
48 |         } else {
49 |           warns=warns & ! grepl(as.character(irtab[,21]), pattern = warning_filter )
50 |         }
51 |         ratios=(tmp1 / (tmp6+tmp1))
52 |         rmsk=(! is.nan(ratios)) & ratios >= irratio_thr
53 |         # ratio_mask becomes TRUE once any sample reaches the threshold.
54 |         if (length(ratio_mask) == 0 ){
55 |           ratio_mask = rmsk
56 |         } else {
57 |           ratio_mask = ratio_mask | rmsk
58 |         }
59 |         n=n+1
60 |     }
61 |     print(warning_filter)
62 |     print(irratio_thr)
63 |     print(paste0("Warning removed: ", sum(! warns)))
64 |     print(paste0("Ratio removed: ", sum(! ratio_mask)))
65 |     warns=warns & ratio_mask
66 |     print(paste0("Combined removed: ", sum(! warns)))
67 |     # drop=FALSE keeps the matrix shape when a single intron or a single
68 |     # sample remains; otherwise the colnames/rownames assignments fail.
69 |     res.rd=round(res)[warns,,drop=FALSE]
70 |     libsz.rd=round(libsz)[warns,,drop=FALSE]
71 |     spl.rd=round(spl)[warns,,drop=FALSE]
72 |     colnames(res.rd)=paste("intronDepth",as.vector(designMatrix[,1]),sep=".")
73 |     rownames(res.rd)=irnames[warns]
74 |     colnames(libsz.rd)=paste("totalSplice",as.vector(designMatrix[,1]),sep=".")
75 |     rownames(libsz.rd)=irnames[warns]
76 |     colnames(spl.rd)=paste("maxSplice",as.vector(designMatrix[,1]),sep=".")
77 |     rownames(spl.rd)=irnames[warns]
78 |     
79 |     # Each sample appears twice in the count matrix: once as "IR" (intron
80 |     # depth) and once as "Splice" (max splice), with "Splice" as reference.
81 |     ir=c(rep("IR",dim(designMatrix)[1]),rep("Splice",dim(designMatrix)[1]))
82 |     group=rbind(designMatrix,designMatrix)
83 |     group$IRFinder=ir
84 |     group$IRFinder=factor(group$IRFinder,levels=c("Splice","IR"))
85 |     
86 |     counts.IRFinder=cbind(res.rd,spl.rd)
87 |     
88 |     dd = DESeqDataSetFromMatrix(countData = counts.IRFinder, colData = group, design = designFormula)
89 |     sizeFactors(dd)=rep(1,dim(group)[1])
90 |     rownames(dd)=irnames[warns]
91 |     final=list(dd,res,libsz,spl)
92 |     names(final)=c("DESeq2Object","IntronDepth","SpliceDepth","MaxSplice")
93 |     return(final)
94 | }
73 | 


--------------------------------------------------------------------------------
/bin/IRFinder:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util
 4 | source ${LIBEXEC}/bash_utils.sh
 5 | 
 6 | RUNMODES="FastQ|Long|BAM|BuildRef|BuildRefDownload|BuildRefProcess|BuildRefFromSTARRef|Diff"
 7 | 
 8 | function usage() {
 9 |     # Write the general help (version, synopsis, run modes, global flags)
10 |     # to stderr with a single printf, then terminate with status 1.
11 |     printf '%s\n' \
12 |         "" \
13 |         "IRFinder version: $VERSION" \
14 |         "Usage: IRFinder [${RUNMODES}]" \
15 |         "" \
16 |         "Possible RunModes:" \
17 |         "" \
18 |         "             BuildRef: Builds IRFinder reference from Ensembl FTP site. Requires Internet" \
19 |         "     BuildRefDownload: Only downloads FASTA and GTF files from Ensembl FTP site, " \
20 |         "                         without building IRFinder reference. Requires Internet" \
21 |         "      BuildRefProcess: Builds IRFinder reference from local FASTA and GTF files" \
22 |         "  BuildRefFromSTARRef: Builds IRFinder reference from a local STAR reference" \
23 |         "                FastQ: Quantifies intron retention from FASTQ file (Default)" \
24 |         "                 Long: Quantifies intron retention from FASTQ file of long reads" \
25 |         "                  BAM: Quantifies intron retention from a BAM file" \
26 |         "                 Diff: Compare IRrates from two conditions using SUPPA2 algorithm" \
27 |         "" \
28 |         "    -v|--version  Show version number of current IRFinder ( when no RunMode is given )." \
29 |         "    -h|--help     Show this usage information. Dedicated usage informations are given if a RunMode is selected." \
30 |         "" >&2
31 |     exit 1
32 | }
31 | 
32 | function isRunMode() {
33 |     if [[ $1 =~ ${RUNMODES} ]]; then
34 |         return 0
35 |     else
36 |         return 1
37 |     fi
38 | }
39 | 
40 | 
41 | 
42 | # === Defaults ===
43 | 
44 | RUNMODE=""
45 | export START_MESSAGE=0
46 | EXECDIR=$(dirname "$(readlink -nf "$BASH_SOURCE")")
47 | # Arguments forwarded to the run-mode script; an array keeps values that
48 | # contain whitespace (e.g. paths) as single arguments.
49 | args=()
50 | 
51 | if [[ $# -eq 0 || $1 == "-h" || $1 == "--help" ]]; then
52 |     usage
53 | fi
54 | 
55 | if [[ ( $# -eq 1 && $1 == "-v" ) || $( echo "$@" | grep -c "\-\-version" ) == "1"  ]]; then
56 |    echo "IRFinder version: $VERSION" 
57 |    exit
58 | fi
59 | 
60 | if [[ $1 =~ ^[^-] ]] ; then
61 |     # Positional form "IRFinder <RunMode> ...": awk maps the name onto its
62 |     # spelling in RUNMODES; unknown names pass through and are rejected below.
63 |     RUNMODE=$(echo "$1" | awk -v runm="${RUNMODES}" '{IGNORECASE=1; split(runm, arr, "|"); out=$1; for ( k in arr) { if ( arr[k] == $1 ) { out=arr[k] }  }; print out }')
64 |     shift
65 |     args=("$@")
66 | else
67 |     # Option form "IRFinder ... -m <RunMode> ...": every "-m <value>" pair is
68 |     # removed from the forwarded arguments; the last known run mode wins.
69 |     while [[ $# -gt 0 ]]; do
70 |         if [[ $1 == "-m" ]]; then
71 |             shift
72 |             if [[ $# -gt 0 ]]; then
73 |                 candidate=$(printf '%s\n' "$1" | awk -v runm="${RUNMODES}" '{IGNORECASE=1; out=""; split(runm, arr, "|"); for ( k in arr ) { if ( arr[k] == $1 ) { out=arr[k] } }; print out }')
74 |                 if [[ "${candidate}" != "" ]]; then
75 |                     RUNMODE=${candidate}
76 |                 fi
77 |                 shift
78 |             fi
79 |         else
80 |             args+=("$1")
81 |             shift
82 |         fi
83 |     done
84 | fi
85 | 
86 | if [[ "${RUNMODE}" == "" ]]; then
87 |     echo "Possible runmodes: $RUNMODES"
88 |     exit 1
89 | fi
90 | 
91 | # Hand over to the script of the selected run mode, or fail with a non-zero
92 | # status so that callers can detect the rejected run mode.
93 | if isRunMode "$RUNMODE"; then
94 |     "$EXECDIR/IRFinder${RUNMODE}" "${args[@]}"
95 | else
96 |     echo "RunMode $RUNMODE not recognized." >&2
97 |     echo "Valid options for Mode are: BuildRef, BuildRefDownload, BuildRefProcess, BuildRefFromSTARRef, BAM, FastQ, Long, Diff. Default: FastQ" >&2
98 |     exit 1
99 | fi
81 | 
82 | 
83 | 


--------------------------------------------------------------------------------
/bin/IRFinderBAM:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util
  4 | source $LIBEXEC/bash_utils.sh 
  5 | 
  6 | 
  7 | # Print the help text of the BAM run mode on stderr and exit with status 1.
  8 | # Every line goes to stderr, including the first "[Un]sorted.bam" line, which
  9 | # previously leaked to stdout.
 10 | # NOTE(review): the option parser of this script also accepts -t and -i, which
 11 | # are not described here - confirm whether they should be documented.
 12 | function usage() {
 13 |     echo "" >&2
 14 |     echo "IRFinder version: $VERSION" >&2
 15 |     echo "Usage: IRFinder BAM -r ReferenceDir [Un]sorted.bam " >&2
 16 |     echo "" >&2
 17 |     echo "  required:" >&2
 18 |     echo "    [Un]sorted.bam: the target bam file. If paired end and sorted by coordinates, " >&2
 19 |     echo "                    the process will be slightly slower and more memory consuming." >&2
 20 |     # TODO - we cannot currently accept fasta input to the trimmer (only fastq), probably should, believe STAR ignores quality anyway, and we strip it on output.
 21 |     echo "    -r ReferenceDir: As built by the 'BuildRef' option." >&2
 22 |     echo "" >&2
 23 |     echo "  optional:" >&2
 24 |     echo "    -d string : Output Directory. Default is the current directory." >&2
 25 |     echo "    -l Long reads flag." >&2
 26 |     echo "    -j Jitter, consider the position around the splice sites to compensate sequencing errors ( Long reads only, integer number )." >&2
 27 |     echo "    -v Verbose." >&2
 28 |     echo "  additional :" >&2
 29 |     echo "    -R double : Minimum IRratio accepted to consider the intron for the CNN validation. Default: 0.05 " >&2
 30 |     echo "    -w int : Warning level accepted to consider the intron for the CNN validation. Default: 1" >&2
 31 |     echo "         0: Disabled " >&2
 32 |     echo "         1: Only without warning " >&2
 33 |     echo "         2: Include NonUniformIntronCover  " >&2
 34 |     echo "         3: Include also MinorIsoform" >&2
 35 |     echo "         4: Include also LowSplicing" >&2
 36 |     echo "         5: Include also LowCover ( consider all )" >&2
 37 |     echo "" >&2
 38 |     exit 1
 39 | }
 35 | 
 36 | 
 37 | # === Defaults ===
 38 | OUTPUTDIR=.
 39 | THREADS=0
 40 | REF=
 41 | VERBOSE=0
 42 | RETRO=0
 43 | READ_TYPE="SR"
 44 | AI_WARN=1
 45 | AI_INTRON=1
 46 | AI_RATIO="0.05"
 47 | JITTER="3"
 48 | 
 49 | if [ $# -eq 0 ] || [[ $1 == "--help"  ]] ; then
 50 |     usage
 51 | fi
 52 | 
 53 | # Abort with message $2 on stderr unless $1 consists only of decimal digits.
 54 | function requireIntegerArg() {
 55 |     if [[ ! $1 =~ ^[0-9]+$ ]]; then
 56 |         echo "$2" >&2
 57 |         exit 1
 58 |     fi
 59 | }
 60 | 
 61 | # Option parsing. The leading ':' puts getopts in silent mode, so unknown
 62 | # options and missing arguments are reported by the '\?' and ':' branches.
 63 | while getopts ":r:t:d:i:w:R:j:vhl" opt; do
 64 |     case $opt in
 65 |         r)
 66 |             # Reference directory.
 67 |             REF=$OPTARG
 68 |             ;;
 69 |         t)
 70 |             ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.) IRFinder is single core for now.
 71 |             requireIntegerArg "$OPTARG" "Argument error: -t $OPTARG, number of threads must be an integer."
 72 |             THREADS=$OPTARG
 73 |             ;;
 74 |         i)
 75 |             # The messages of -i and -w used to be swapped.
 76 |             requireIntegerArg "$OPTARG" "Argument error: -i $OPTARG, intron depth must be an integer."
 77 |             AI_INTRON=$OPTARG
 78 |             ;;
 79 |         R)
 80 |             # Only values of the form 0.<digits> are accepted.
 81 |             if [[ ! $OPTARG =~ ^0\.[0-9]+$ ]]; then
 82 |                 echo "Argument error: -R $OPTARG, ratio must be a float number between 0 and 1 not included." >&2
 83 |                 exit 1
 84 |             fi
 85 |             AI_RATIO=$OPTARG
 86 |             ;;
 87 |         w)
 88 |             requireIntegerArg "$OPTARG" "Argument error: -w $OPTARG, warning level must be an integer."
 89 |             AI_WARN=$OPTARG
 90 |             ;;
 91 |         j)
 92 |             requireIntegerArg "$OPTARG" "Argument error: -j $OPTARG, jitter must be an integer."
 93 |             JITTER=$OPTARG
 94 |             ;;
 95 |         d)
 96 |             OUTPUTDIR=$OPTARG
 97 |             ;;
 98 |         v)
 99 |             VERBOSE=1
100 |             ;;
101 |         l)
102 |             READ_TYPE="LR"
103 |             ;;
104 |         h)
105 |             usage
106 |             ;;
107 |         \?)
108 |             echo "Invalid option: -$OPTARG" >&2
109 |             exit 1
110 |             ;;
111 |         :)
112 |             echo "Option -$OPTARG requires an argument." >&2
113 |             exit 1
114 |             ;;
115 |     esac
116 | done
117 | shift $(($OPTIND - 1))
118 | 
119 | 
120 | checkRef "$REF"
121 | checkOutDir "$OUTPUTDIR"
122 | checkSamtools
123 | setThreads
124 | 
125 | if [ ! $# -eq 1 ]; then
126 |     echo "Argument error: in run mode BAM, provide a single BAM as input. $# arguments found." >&2    
127 |     exit 1
128 | fi
129 | BAM=$1
130 | 
131 | if [[ "${IRF_RUNMODE}" == "" ]]; then 
132 |     logger init
133 | fi
134 | 
135 | RUNMODE="BAM" startMessage "$@"
136 | 
137 | logger "[ " $(date) " ] Processing the BAM file with IRFinder" 
138 | logger "---"
139 | 
140 | # BAM check: the header must be readable by samtools.
141 | samtools view -H "$BAM" > /dev/null || exit 1
142 | # Sort check 
143 | if [ $(samtools view -H "$BAM" | grep -c "SO:coordinate" ) -eq 1 ]; then  
144 |     # PE check: count paired reads (flag 1) among the first 1000 alignments.
145 |     if [ $( { samtools view -H "$BAM" ; samtools view "$BAM" | head -n 1000 ; } | samtools view -c -f 1 ) -gt 0 ]; then
146 |         logger "The given bam file is sorted by coordinate and is paired."
147 |     fi 
148 | fi
149 | 
150 | # Single definition of the irfinder command line shared by both branches.
151 | IRF_CMD=("${LIBEXEC}/irfinder" "${OUTPUTDIR}"
152 |     "${REF}/IRFinder/ref-cover.bed"
153 |     "${REF}/IRFinder/ref-sj.ref"
154 |     "${REF}/IRFinder/ref-read-continues.ref"
155 |     "${REF}/IRFinder/ref-ROI.bed" "${READ_TYPE}" "${AI_WARN}:${AI_INTRON}:${AI_RATIO}" "${JITTER}" "$BAM")
156 | 
157 | if [ $VERBOSE -eq 1 ];then
158 |     "${IRF_CMD[@]}" 2>> "$OUTPUTDIR/logs/irfinder.stderr" | tee -a "$OUTPUTDIR/logs/irfinder.stdout"
159 |     # 'set -e' only sees the status of tee; read irfinder's own status so a
160 |     # failed run stops the script here as it does in the non-verbose branch.
161 |     IRF_STATUS=${PIPESTATUS[0]}
162 |     cat "$OUTPUTDIR/logs/irfinder.stderr"
163 |     if [ "$IRF_STATUS" -ne 0 ]; then
164 |         exit "$IRF_STATUS"
165 |     fi
166 | else
167 |     "${IRF_CMD[@]}" >> "$OUTPUTDIR/logs/irfinder.stdout" 2>> "$OUTPUTDIR/logs/irfinder.stderr"
168 | fi
169 | 
170 | logger "---"
171 | logger "[ " $(date) " ] IRFinder BAM analysis completed " 
172 | logger "---"
173 | 
174 | "$LIBEXEC/warnings" "$OUTPUTDIR"
175 | 
176 | # Number of lines in the WARNINGS file of the output directory.
177 | N_WARNINGS=$(wc -l "$OUTPUTDIR/WARNINGS" | awk '{print $1}' )
178 | if [ $N_WARNINGS -gt 0 ]; then
179 |     logger "Process completed with warnings. Check $OUTPUTDIR/WARNINGS " >&2
180 | fi
181 | 
182 | # The CNN validator only runs when this file exists; the intermediate
183 | # *-AI.txt files are removed only if the validator succeeds.
184 | if [[ -f $OUTPUTDIR/IRFinder-IR-nondir-AI.txt ]]; then
185 |     logger "---"
186 |     logger "[ " $(date) " ] Running CNN validator " 
187 |     logger "---"
188 |     "${LIBEXEC}/irfinder_cnn" -d "${OUTPUTDIR}" -m "${LIBEXEC}/model/" && rm "${OUTPUTDIR}"/*-AI.txt
189 |     logger "---"
190 |     logger "[ " $(date) " ] CNN validator completed" 
191 |     logger "---"
192 | fi
187 | 
188 | 
189 | 
190 | 


--------------------------------------------------------------------------------
/bin/IRFinderBuildRef:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util
  4 | source $LIBEXEC/bash_utils.sh 
  5 | 
  6 | # Print the help text of the BuildRef run mode on stderr and exit with
  7 | # status 1. The synopsis and defaults follow this script's option parser:
  8 | # -M takes the mapability file, -n the read length (default 100), -S the
  9 | # STAR executable and -L the STAR RAM limit.
 10 | function usage() {
 11 |     echo "" >&2
 12 |     echo "IRFinder version: $VERSION" >&2
 13 |     echo "Usage: IRFinder BuildRef [-v][-h][-t INT][-j INT][-S STAR][-e ExtraGenomeRef.fa][-b Blacklist.bed][-R ROI.bed][-M Mapability.bed][-n INT][-L INT] -r ReferenceDir URL" >&2
 14 |     echo "" >&2
 15 |     echo "  required:" >&2
 16 |     echo "    URL A base Ensembl URL to a (gzipped) gtf file. For example: ftp://ftp.ensembl.org/pub/release-100/gtf/homo_sapiens/Homo_sapiens.GRCh38.100.gtf.gz" >&2
 17 |     echo "    -r ReferenceDir: Directory should not yet exist, will be created." >&2
 18 |     echo "" >&2
 19 |     echo "  optional:" >&2
 20 |     echo "    -t Threads: The number of physical CPUs to use by IRFinder. When omitted (default), IRFinder will use all physical CPUs." >&2
 21 |     echo "    -j INTEGER: An integer that is passed to '--sjdbOverhang' under STAR 'genomeGenerate' mode. Default: 150." >&2
 22 |     echo "    -S STAR: Path to the STAR executable. Default: STAR" >&2
 23 |     echo "    -M Mapability: A precomputed bed file containing the low mapability areas. Can also be an empty file." >&2
 24 |     echo "    -n MapabilityReadLength: The length of the reads used to compute the mapability. Default: 100" >&2
 25 |     echo "    -e ExtraGenomeRef.fasta.gz: Typically an ERCC reference." >&2
 26 |     echo "    -b Blacklist.bed.gz: BED of regions to be excluded from analysis." >&2
 27 |     echo "    -R ROI.bed.gz: A non-overlapping BED file of additional Regions of Interest for read counts." >&2
 28 |     echo "    -v Show version number of current IRFinder." >&2
 29 |     echo "    -L STAR limitGenomeGenerateRAM argument. Default: 31000000000" >&2
 30 |     echo "    -h Show this usage information." >&2
 31 |     echo "" >&2
 32 |     exit 1
 33 | }
 29 | 
 30 | 
 31 | # === Defaults ===
 32 | THREADS=0
 33 | REF=
 34 | SJOH=150
 35 | BUILDERCCFILE=
 36 | BUILDROI=
 37 | BUILDBLACK=
 38 | STAREXEC=STAR
 39 | MAPABILITY_FILE=
 40 | MAPABILITY_LEN=100
 41 | GENOMERAM=31000000000
 42 | 
 43 | if [ $# -eq 0 ] || [[ $1 == "--help"  ]] ; then
 44 |     usage
 45 | fi
 46 | 
 47 | # Abort with message $2 on stderr unless $1 consists only of decimal digits.
 48 | function requireIntegerArg() {
 49 |     if [[ ! $1 =~ ^[0-9]+$ ]]; then
 50 |         echo "$2" >&2
 51 |         exit 1
 52 |     fi
 53 | }
 54 | 
 55 | # Abort with message $2 on stderr unless $1 is an existing regular file.
 56 | function requireFileArg() {
 57 |     if [ ! -f "$1" ]; then
 58 |         echo "$2" >&2
 59 |         exit 1
 60 |     fi
 61 | }
 62 | 
 63 | # Option parsing. The leading ':' puts getopts in silent mode, so unknown
 64 | # options and missing arguments are reported by the '\?' and ':' branches.
 65 | while getopts ":r:j:t:S:e:b:R:n:M:L:hv" opt; do
 66 |     case $opt in
 67 |         r)
 68 |             # Reference directory.
 69 |             REF=$OPTARG
 70 |             if [ -d "$REF" ]; then
 71 |                 echo "Argument error: -r $REF. Reference directory must not exist, BuildRef will create it." >&2
 72 |                 exit 1
 73 |             fi
 74 |             ;;
 75 |         j) # STAR's --sjdbOverhang
 76 |             requireIntegerArg "$OPTARG" "Argument error: -j $OPTARG. '$OPTARG' is not an integer."
 77 |             SJOH=$OPTARG
 78 |             ;;
 79 |         t)
 80 |             ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.)
 81 |             requireIntegerArg "$OPTARG" "Argument error: -t $OPTARG, number of threads must be an integer."
 82 |             THREADS=$OPTARG
 83 |             ;;
 84 |         L)
 85 |             ## Value for STAR's limitGenomeGenerateRAM (must be an integer).
 86 |             requireIntegerArg "$OPTARG" "Argument error: -L $OPTARG, limitGenomeGenerateRAM must be an integer."
 87 |             GENOMERAM=$OPTARG
 88 |             ;;
 89 |         S)
 90 |             ## STAR executable. (must be executable!)
 91 |             if [[ -x "$OPTARG" && ! -d "$OPTARG" ]]; then
 92 |                 STAREXEC=$OPTARG
 93 |             else
 94 |                 echo "Argument error: -S $OPTARG. STAR executable must be an executable program." >&2
 95 |                 exit 1
 96 |             fi
 97 |             ;;
 98 |         e)
 99 |             #ERCC file. (must be a file)
100 |             requireFileArg "$OPTARG" "Argument error: -e $OPTARG. Specified ERCC/extra-reference file does not exist."
101 |             BUILDERCCFILE=$OPTARG
102 |             ;;
103 |         b)
104 |             #Blacklist local file (must be a file)
105 |             requireFileArg "$OPTARG" "Argument error: -b $OPTARG. Specified blacklist file does not exist."
106 |             BUILDBLACK=$OPTARG
107 |             ;;
108 |         R)
109 |             #ROI local file. (must be a file)
110 |             requireFileArg "$OPTARG" "Argument error: -R $OPTARG. Specified ROI file does not exist."
111 |             BUILDROI=$OPTARG
112 |             ;;
113 |         M)
114 |             #Precomputed mapability file (must be a file); the message names -M, the option actually parsed.
115 |             requireFileArg "$OPTARG" "Argument error: -M $OPTARG. Specified Mapability file does not exist."
116 |             MAPABILITY_FILE=$OPTARG
117 |             ;;
118 |         n)
119 |             requireIntegerArg "$OPTARG" "Argument error: -n $OPTARG, must be an integer."
120 |             MAPABILITY_LEN=$OPTARG
121 |             ;;
122 |         h)
123 |             usage
124 |             ;;
125 |         v)
126 |             versionAlert
127 |             ;;
128 |         \?)
129 |             echo "Invalid option: -$OPTARG" >&2
130 |             exit 1
131 |             ;;
132 |         :)
133 |             echo "Option -$OPTARG requires an argument." >&2
134 |             exit 1
135 |             ;;
136 |     esac
137 | done
138 | shift $(($OPTIND - 1))
146 | 
147 | # From here on "$@" holds only the positional arguments left by getopts.
148 | 
149 | # A target reference directory is mandatory.
150 | if [[ -z "$REF" ]]; then
151 |     echo "Argument error: -r is required." >&2
152 |     exit 1	
153 | fi
154 | 
155 | # checkStar is only called when no mapability file was supplied.
156 | if [[ -z "${MAPABILITY_FILE}" ]]; then
157 |     checkStar $STAREXEC
158 | fi
159 | setThreads
160 | 
161 | # Exactly one positional argument (the URL) is expected.
162 | if [[ $# -ne 1 ]]; then
163 |     echo "Argument error: in run mode BuildRef, provide a single ftp URL. $# arguments found." >&2
164 |     exit 1
165 | fi
166 | 
167 | BUILDHINT=$1
168 | case "$BUILDHINT" in
169 |     ftp*)
170 |         ;;
171 |     *)
172 |         echo "Argument error: A single ftp url is required to find and download genome fasta and gtf files. eg: ftp://ftp.ensembl.org/pub/release-78/fasta/mus_musculus/dna/." >&2
173 |         exit 1
174 |         ;;
175 | esac
176 | 
177 | echo "Launching reference build process. The full build might take hours."
178 | 
179 | "$LIBEXEC/IRFinder-BuildRefFromEnsembl" BuildRef "$THREADS" "$STAREXEC" "$BUILDHINT" "$REF" "$BUILDERCCFILE" "$BUILDROI" "$BUILDBLACK" "$SJOH" "$MAPABILITY_FILE" "$MAPABILITY_LEN" "$GENOMERAM"
175 | 
176 | 
177 | 


--------------------------------------------------------------------------------
/bin/IRFinderBuildRefDownload:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | 
 4 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util
 5 | source $LIBEXEC/bash_utils.sh 
 6 | 
 7 | 
 8 | function usage() {
 9 |     # Write the BuildRefDownload help to stderr with a single printf, then
10 |     # terminate with status 1.
11 |     printf '%s\n' \
12 |         "" \
13 |         "IRFinder version: $VERSION" \
14 |         "Usage: IRFinder BuildRefDownload [-v][-h] -r ReferenceDir URL" \
15 |         "" \
16 |         "  required:" \
17 |         "    URL A base Ensembl URL to a (gzipped) gtf file. For example: ftp://ftp.ensembl.org/pub/release-100/gtf/homo_sapiens/Homo_sapiens.GRCh38.100.gtf.gz" \
18 |         "    -r ReferenceDir: Directory should not yet exist, will be created." \
19 |         "" \
20 |         "  optional:" \
21 |         "    -v Show version number of current IRFinder." \
22 |         "    -h Show this usage information." \
23 |         "" >&2
24 |     exit 1
25 | }
23 | 
24 | 
25 | # === Defaults ===
26 | REF=
27 | STAREXEC=STAR
28 | 
29 | # No argument at all, or an explicit --help, shows the help and stops.
30 | if [[ $# -eq 0 || $1 == "--help" ]]; then
31 |     usage
32 | fi
33 | 
34 | # Silent-mode getopts: the '\?' and ':' branches report bad options.
35 | while getopts ":m:r:hv" opt; do
36 |     case $opt in
37 |         m)
38 |             # Accepted by the option string but has no effect.
39 |             ;;
40 |         r)
41 |             # Reference directory; it must not exist yet.
42 |             REF=$OPTARG
43 |             if [[ -d "$REF" ]]; then
44 |                 echo "Argument error: -r $REF. Reference directory must not exist, BuildRef will create it." >&2
45 |                 exit 1
46 |             fi
47 |             ;;
48 |         h)
49 |             usage
50 |             ;;
51 |         v)
52 |             versionAlert
53 |             ;;
54 |         \?)
55 |             echo "Invalid option: -$OPTARG" >&2
56 |             exit 1
57 |             ;;
58 |         :)
59 |             echo "Option -$OPTARG requires an argument." >&2
60 |             exit 1
61 |             ;;
62 |     esac
63 | done
64 | shift $((OPTIND - 1))
65 | 
66 | # From here on "$@" holds only the positional arguments.
67 | if [[ -z "$REF" ]]; then
68 |     echo "Argument error: -r is required." >&2
69 |     exit 1	
70 | fi
71 | 
72 | if [[ $# -ne 1 ]]; then
73 |     echo "Argument error: in run mode BuildRefDownload, provide a single ftp URL. $# arguments found." >&2
74 |     exit 1
75 | fi
76 | 
77 | BUILDHINT=$1
78 | case "$BUILDHINT" in
79 |     ftp*)
80 |         ;;
81 |     *)
82 |         echo "Argument error: A single ftp url is required to find and download genome fasta and gtf files. eg: ftp://ftp.ensembl.org/pub/release-78/fasta/mus_musculus/dna/." >&2
83 |         exit 1
84 |         ;;
85 | esac
86 | 
87 | echo "Launching reference build process. The full build might take hours."
88 | 
89 | "$LIBEXEC/IRFinder-BuildRefFromEnsembl" BuildRefDownload "1" "" "$BUILDHINT" "$REF" "" "" "" ""
87 | 
88 | 
89 | 


--------------------------------------------------------------------------------
/bin/IRFinderBuildRefFromSTARRef:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | set -e
  4 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util
  5 | source $LIBEXEC/bash_utils.sh 
  6 | 
  7 | 
  8 | # Print the help text of the BuildRefFromSTARRef run mode on stderr and exit
  9 | # with status 1. The synopsis lists the options described below it (-M for
 10 | # the mapability file, -n for its read length, -f/-g, -l) and the stated
 11 | # read-length default matches MAPABILITY_LEN (100).
 12 | # NOTE(review): '-j' was dropped from the synopsis because the getopts string
 13 | # of this script does not accept it - confirm against the option handlers.
 14 | function usage() {
 15 |   echo "" >&2
 16 |   echo "IRFinder version: $VERSION" >&2
 17 |   echo "Usage: IRFinder BuildRefFromSTARRef [-v][-h][-l][-t INT][-e ExtraGenomeRef.fa][-b Blacklist.bed][-R ROI.bed][-M Mapability.bed][-n INT][-f GENOME.fa][-g TRANSCRIPTS.gtf] -r ReferenceDir -x STARRefDir" >&2
 18 |   echo "" >&2
 19 |   echo "  required:" >&2
 20 |   echo "    -r ReferenceDir: Directory should not yet exist, will be created." >&2
 21 |   echo "    -x STARRefDir. An existing STAR reference folder." >&2
 22 |   echo "       Please note: By default, BuildRefFromSTARRef mode automatically looks for the original FASTA and GTF files used to generate STARRefDir." >&2
 23 |   echo "       Specifically, IRFinder investigates 'genomeParameters.txt' in STARRefDir." >&2
 24 |   echo "       If both files can be located, IRFinder will continue to generate reference, ignoring '-f' and '-g' options." >&2
 25 |   echo "       If either file is missing, IRFinder will quit and you have to re-run it by giving both '-f' and '-g' options." >&2
 26 |   echo "" >&2
 27 |   echo "  optional:" >&2
 28 |   echo "    -t Threads: The number of physical CPUs to use by IRFinder. When omitted (default), IRFinder will use all physical CPUs." >&2
 29 |   echo "    -M Mapability: A precomputed bed file containing the low mapability areas. Can also be an empty file." >&2
 30 |   echo "    -n MapabilityReadLength: The length of the reads used to compute the mapability. Default: 100" >&2
 31 |   echo "    -f GENOME.fa: This MUST be the same FASTA file used to generate STARRefDir. Ignored when IRFinder can automatically locate the original file." >&2
 32 |   echo "    -g TRANSCRIPTS.gtf: This MUST be the same GTF file used to generate STARRefDir. Ignored when IRFinder can automatically locate the original file." >&2
 33 |   echo "    -e ExtraGenomeRef.fasta.gz: Typically an ERCC reference." >&2
 34 |   echo "    -b Blacklist.bed.gz: BED of regions to be excluded from analysis." >&2
 35 |   echo "    -R ROI.bed.gz: A non-overlapping BED file of additional Regions of Interest for read counts." >&2
 36 |   echo "    -l Don't copy the STAR reference and the other files but create symbolic links." >&2
 37 |   echo "    -v Show version number of current IRFinder." >&2
 38 |   echo "    -h Show this usage information." >&2
 39 |   echo "" >&2
 40 |   exit 1
 41 | }
 36 | 
 37 | 
 38 | # === Defaults ===
 39 | THREADS=0
 40 | REF=
 41 | SJOH=150
 42 | BUILDERCCFILE=
 43 | BUILDROI=
 44 | BUILDBLACK=
 45 | STAREXEC=STAR
 46 | MYFASTA="NULL"
 47 | MYGTF="NULL"
 48 | LINK=0
 49 | MAPABILITY_FILE=
 50 | MAPABILITY_LEN=100
 51 | 
 52 | if [ $# -eq 0 ] || [[ $1 == "--help"  ]] ; then
 53 |   usage
 54 | fi
 55 | 
 56 | 
 57 | while getopts ":r:t:S:e:b:R:x:f:g:m:M:n:hvl" opt; do  # leading ':' = silent mode, errors handled in the \? and : arms
 58 |   case $opt in
 59 |     r)
 60 |       # Reference directory; it is created later, so it must not exist yet.
 61 |       REF=$OPTARG
 62 |       if [ -d "$REF" ]; then
 63 |         echo "Argument error: -r $REF. Reference directory must not exist, BuildRef will create it." >&2
 64 |         exit 1
 65 |       fi
 66 |       ;;
 67 |     t)
 68 |       ## Number of threads to use. (must be a non-negative integer. Zero ok, means auto-detect.)
 69 |       if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
 70 |         echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2
 71 |         exit 1
 72 |       fi
 73 |       THREADS=$OPTARG
 74 |       ;;
 75 |     S)
 76 |       ## STAR executable. (must be executable!)
 77 |       if [ -x "$OPTARG" -a ! -d "$OPTARG" ]; then
 78 |         STAREXEC=$OPTARG
 79 |       else
 80 |         echo "Argument error: -S $OPTARG. STAR executable must be an executable program." >&2
 81 |         exit 1
 82 |       fi
 83 |       ;;
 84 |     e)
 85 |       #ERCC file. (must be a file)
 86 |       if [ ! -f "$OPTARG" ]; then
 87 |         echo "Argument error: -e $OPTARG. Specified ERCC/extra-reference file does not exist." >&2
 88 |         exit 1
 89 |       fi
 90 |       BUILDERCCFILE=$OPTARG
 91 |       ;;
 92 |     b)
 93 |       #Blacklist local file (must be a file)
 94 |       if [ ! -f "$OPTARG" ]; then
 95 |         echo "Argument error: -b $OPTARG. Specified blacklist file does not exist." >&2
 96 |         exit 1
 97 |       fi
 98 |       BUILDBLACK=$OPTARG
 99 |       ;;
100 |     R)
101 |       #ROI local file. (must be a file)
102 |       if [ ! -f "$OPTARG" ]; then
103 |         echo "Argument error: -R $OPTARG. Specified ROI file does not exist." >&2
104 |         exit 1
105 |       fi
106 |       BUILDROI=$OPTARG
107 |       ;;
108 |     m|M)  # Precomputed mapability BED. -m was accepted but silently dropped before; it now behaves like -M.
109 |       if [ ! -f "$OPTARG" ]; then
110 |         echo "Argument error: -$opt $OPTARG. Specified Mapability file does not exist." >&2
111 |         exit 1
112 |       fi
113 |       MAPABILITY_FILE=$OPTARG
114 |       ;;
115 |     n)
116 |       if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
117 |         echo "Argument error: -n $OPTARG, must be an integer." >&2
118 |         exit 1
119 |       fi
120 |       MAPABILITY_LEN=$OPTARG
121 |       ;;
122 |     x)
123 |       # Existing STAR reference folder; quoted so that paths containing spaces survive.
124 |       STARREF=$(realpath "$OPTARG")
125 |       ## Must be a directory (verified after option parsing).
126 |       ;;
127 |     f)
128 |       # The original FASTA file to generate STAR reference.
129 |       MYFASTA=$(realpath "$OPTARG")
130 |       checkFile "$MYFASTA"
131 |       ;;
132 |     g)
133 |       # The original GTF file to generate STAR reference.
134 |       MYGTF=$(realpath "$OPTARG")
135 |       checkFile "$MYGTF"
136 |       ;;
137 |     h)
138 |       usage
139 |       ;;
140 |     v)
141 |       versionAlert
142 |       ;;
143 |     l)
144 |       LINK=1
145 |       ;;
146 |     \?)
147 |       echo "Invalid option: -$OPTARG" >&2
148 |       exit 1
149 |       ;;
150 |     :)
151 |       echo "Option -$OPTARG requires an argument." >&2
152 |       exit 1
153 |       ;;
154 |   esac
155 | done
156 | shift $(($OPTIND - 1))
157 | 
158 | #echo $@  #The remaining arguments.
159 | #echo $#  #The number of remaining arguments. 
160 | 
161 | if [ ! "$REF" ]; then
162 |   echo "Argument error: -r is required." >&2
163 |   exit 1
164 | fi
165 | 
166 | if [ -d "$REF" ]; then
167 |     echo "Argument error: -r $REF. Reference directory must not exist, BuildRef will create it." >&2
168 |     exit 1
169 | fi
170 | # STAR itself is only checked when no precomputed mapability file was supplied.
171 | if [[ "${MAPABILITY_FILE}" == "" ]]; then
172 |     checkStar "$STAREXEC"
173 | fi
174 | 
175 | setThreads
176 | 
177 | if [ ! "$STARREF" ]; then
178 |   echo "Argument error: -x is required. Must provide an exisiting STAR reference folder for BuildRefProcess mode." >&2
179 |   exit 1
180 | fi
181 | if [ ! -d "$STARREF" ]; then
182 |   echo "Error: STAR reference at $STARREF does not exist." >&2
183 |   exit 1
184 | fi
185 | 
186 | # When -f and/or -g is missing, take the path from the first line of the STAR reference's genomeParameters.txt.
187 | if [[ "${MYFASTA}" == "NULL" ]] || [[ "${MYGTF}" == "NULL" ]] ; then
188 |     if [ ! -f "$STARREF/genomeParameters.txt" ] ; then
189 |       echo "Error: Cannot locate the original FASTA and GTF files used to generate STAR reference at $STARREF." >&2
190 |       echo "       Please provide these two files through '-f' and '-g' options respectively." >&2
191 |       echo "       Or retry to build IRFinder reference in other modes." >&2
192 |       echo "       Run 'IRFinder -h' for more details." >&2
193 |       exit 1
194 |     fi
195 |     STARLINE=$(head -n 1 "$STARREF/genomeParameters.txt")   # quoted: the STAR folder path may contain spaces
196 |     STARTMP1=(${STARLINE#*--genomeFastaFiles })   # unquoted on purpose: split into words, element 0 is the FASTA path
197 |     STARTMP2=(${STARLINE#*--sjdbGTFfile })        # same trick for the GTF path
198 |     ORIFASTA=${STARTMP1[0]}
199 |     ORIGTF=${STARTMP2[0]}
200 |     if [[ "${MYFASTA}" == "NULL" ]]; then
201 |         MYFASTA="${ORIFASTA}"
202 |     fi
203 |     if [[ "${MYGTF}" == "NULL" ]]; then
204 |         MYGTF="${ORIGTF}"
205 |     fi
206 | fi
207 | 
208 | 
209 | #get the original fasta and gtf file used to generate STAR reference using the parameters saved in 'genomeParameters.txt'
210 | if [ ! -f "$MYFASTA" ] || [ ! -f "$MYGTF" ]; then
211 |     echo "Error: Cannot locate the original FASTA and GTF files used to generate the STAR reference $STARREF" >&2
212 |     echo "       at the following locations:" >&2
213 |     echo "       FASTA: $MYFASTA" >&2
214 |     echo "       GTF: $MYGTF" >&2
215 |     echo "       Please locate these two files through '-f' and '-g' options respectively." >&2
216 |     echo "       Or retry to build IRFinder reference in other modes." >&2
217 |     echo "       Run 'IRFinder -h' for more details." >&2
218 |     exit 1
219 | fi
220 | # -l selects symbolic links instead of copies; an array keeps the command words intact.
221 | if [ $LINK -eq 1 ]; then
222 |     CP_CMD=(ln -s)
223 | else
224 |     CP_CMD=(cp)
225 | fi
226 | # Resolve to absolute paths; quoted so that paths containing spaces are not split.
227 | MYFASTA=$(realpath "$MYFASTA")
228 | MYGTF=$(realpath "$MYGTF")
229 | REF=$(realpath "$REF")
230 | 
231 | echo "Launching reference build process. The full build might take hours."
232 | echo "<Phase 1: STAR Reference Preparation>"
233 | mkdir "$REF"
234 | date +"%b %d %T ... copying the genome FASTA file..."
235 | "${CP_CMD[@]}" "$MYFASTA" "$REF/genome.fa"
236 | date +"%b %d %T ... copying the transcriptome GTF file..."
237 | "${CP_CMD[@]}" "$MYGTF" "$REF/transcripts.gtf"
238 | date +"%b %d %T ... copying the STAR reference folder..."
239 | "${CP_CMD[@]}" -r "$STARREF" "$REF/STAR"
240 | 
241 | "$LIBEXEC/IRFinder-BuildRefFromEnsembl" "BuildRefFromSTARRef" "$THREADS" "$STAREXEC" "$BUILDHINT" "$REF" "$BUILDERCCFILE" "$BUILDROI" "$BUILDBLACK" "$SJOH" "$MAPABILITY_FILE" "$MAPABILITY_LEN"
242 | 
243 | 
244 | 
245 | 
246 | 


--------------------------------------------------------------------------------
/bin/IRFinderBuildRefProcess:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | set -e
  4 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util
  5 | source $LIBEXEC/bash_utils.sh 
  6 | 
  7 | 
  8 | function usage() {	# Print the BuildRefProcess help text on stderr and exit with status 1.
  9 | 	echo "" >&2
 10 | 	echo "IRFinder version: $VERSION" >&2
 11 | 	echo "Usage: IRFinder BuildRefProcess [-v][-h][-t INT][-j INT][-n INT][-L INT][-e ExtraGenomeRef.fa][-b Blacklist.bed][-R ROI.bed][-M Mapability.bed] -r ReferenceDir " >&2
 12 | 	echo "" >&2
 13 | 	echo "Parameters for BuildRefProcess mode:" >&2
 14 | 	echo "  required:" >&2
 15 | 	echo "    -r ReferenceDir. Directory should already contain EXACT files named 'genome.fa' and 'transcripts.gtf' (case-sensitive) for genome and transcriptome annotations respectively." >&2
 16 | 	echo "  optional:" >&2
 17 | 	echo "    -t Threads: The number of physical CPUs to use by IRFinder. When omitted (default), IRFinder will use all physical CPUs." >&2
 18 | 	echo "    -j INTEGER: an integer that is parsed to '--sjdbOverhang' under STAR 'genomeGenerate' mode. Default: 150." >&2
 19 | 	echo "    -e ExtraGenomeRef.fasta.gz: Typically an ERCC reference." >&2
 20 | 	echo "    -b Blacklist.bed.gz: BED of regions to be excluded from analysis." >&2
 21 | 	echo "    -R ROI.bed.gz: A non-overlapping BED file of additional Regions of Interest for read counts." >&2
 22 | 	echo "    -M Mapability: A precomputed bed file containing the low mapability areas. Can also be an empty file." >&2
 23 | 	echo "    -n MapabilityReadLength: The length of the reads used to compute the mapability. Default: 100" >&2
 24 | 	echo "    -L STAR limitGenomeGenerateRAM argument. Default: 31000000000" >&2
 25 | 	echo "    -h Show this usage information." >&2
 26 | 	echo "" >&2
 27 | 	exit 1
 28 | }
 29 | 
 30 | 
 31 | # === Defaults ===
 32 | THREADS=0
 33 | REF=
 34 | SJOH=150
 35 | BUILDERCCFILE=
 36 | BUILDROI=
 37 | BUILDBLACK=
 38 | STAREXEC=STAR
 39 | MAPABILITY_FILE=
 40 | MAPABILITY_LEN=100
 41 | GENOMERAM=31000000000
 42 | 
 43 | if [ $# -eq 0 ] || [[ $1 == "--help"  ]] ; then
 44 | 	usage
 45 | fi
 46 | 
 47 | 
 48 | while getopts ":m:r:j:t:S:e:b:R:M:n:L:l:hv" opt; do	# option string kept as-is so formerly accepted flags still parse
 49 | 	case $opt in
 50 | 		r)
 51 | 			# Reference directory.
 52 | 			REF=$OPTARG
 53 | 			;;
 54 | 		j) # STAR's --sjdbOverhang
 55 | 			if [[ $OPTARG =~ ^[0-9]+$ ]] ; then
 56 | 				SJOH=$OPTARG
 57 | 			else
 58 | 				echo "Argument error: -j $OPTARG. '$OPTARG' is not an integer." >&2
 59 | 				exit 1
 60 | 			fi
 61 | 			;;
 62 | 		t)
 63 | 			## Number of threads to use. (must be a non-negative integer. Zero ok, means auto-detect.)
 64 | 			if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
 65 | 				echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2
 66 | 				exit 1
 67 | 			fi
 68 | 			THREADS=$OPTARG
 69 | 			;;
 70 | 		L)
 71 | 			## Value handed on as STAR's limitGenomeGenerateRAM (must be a non-negative integer).
 72 | 			if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
 73 | 				echo "Argument error: -L $OPTARG, limitGenomeGenerateRAM must be an integer." >&2
 74 | 				exit 1
 75 | 			fi
 76 | 			GENOMERAM=$OPTARG
 77 | 			;;
 78 | 		S)
 79 | 			## STAR executable. (must be executable!)
 80 | 			if [ -x "$OPTARG" -a ! -d "$OPTARG" ]; then
 81 | 				STAREXEC=$OPTARG
 82 | 			else
 83 | 				echo "Argument error: -S $OPTARG. STAR executable must be an executable program." >&2
 84 | 				exit 1
 85 | 			fi
 86 | 			;;
 87 | 		e)
 88 | 			#ERCC file. (must be a file)
 89 | 			if [ ! -f "$OPTARG" ]; then
 90 | 				echo "Argument error: -e $OPTARG. Specified ERCC/extra-reference file does not exist." >&2
 91 | 				exit 1
 92 | 			fi
 93 | 			BUILDERCCFILE=$OPTARG
 94 | 			;;
 95 | 		b)
 96 | 			#Blacklist local file (must be a file)
 97 | 			if [ ! -f "$OPTARG" ]; then
 98 | 				echo "Argument error: -b $OPTARG. Specified blacklist file does not exist." >&2
 99 | 				exit 1
100 | 			fi
101 | 			BUILDBLACK=$OPTARG
102 | 			;;
103 | 		R)
104 | 			#ROI local file. (must be a file)
105 | 			if [ ! -f "$OPTARG" ]; then
106 | 				echo "Argument error: -R $OPTARG. Specified ROI file does not exist." >&2
107 | 				exit 1
108 | 			fi
109 | 			BUILDROI=$OPTARG
110 | 			;;
111 | 		m|M)	# Precomputed mapability BED. -m used to be parsed and then silently dropped; it now behaves like -M.
112 | 			if [ ! -f "$OPTARG" ]; then
113 | 				echo "Argument error: -$opt $OPTARG. Specified Mapability file does not exist." >&2
114 | 				exit 1
115 | 			fi
116 | 			MAPABILITY_FILE=$OPTARG
117 | 			;;
118 | 		n)
119 | 			if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
120 | 				echo "Argument error: -n $OPTARG, must be an integer." >&2
121 | 				exit 1
122 | 			fi
123 | 			MAPABILITY_LEN=$OPTARG
124 | 			;;
125 | 		h)
126 | 			usage
127 | 			;;
128 | 		v)
129 | 			versionAlert
130 | 			;;
131 | 		\?)
132 | 			echo "Invalid option: -$OPTARG" >&2
133 | 			exit 1
134 | 			;;
135 | 		:)
136 | 			echo "Option -$OPTARG requires an argument." >&2
137 | 			exit 1
138 | 			;;
139 | 	esac
140 | done
141 | shift $(($OPTIND - 1))
142 | 
143 | #echo $@  #The remaining arguments.
144 | #echo $#  #The number of remaining arguments. 
145 | 
146 | if [ ! "$REF" ]; then
147 | 	echo "Argument error: -r is required." >&2
148 | 	exit 1
149 | fi
150 | # Quoted so that a STAR executable path containing spaces reaches checkStar as one argument.
151 | checkStar "$STAREXEC"
152 | setThreads
153 | 
154 | if [ ! -f "$REF/genome.fa" ] || [ ! -f "$REF/transcripts.gtf" ]; then
155 | 	echo "Argument error: -r $REF. Reference directory must exist and contain genome.fa and transcripts.gtf files. Use the BuildRefDownload run mode to create these." >&2
156 | 	exit 1
157 | fi
158 | # Refuse to run on top of the output of an earlier build.
159 | if [ -d "$REF/STAR" ] || [ -d "$REF/Mapability" ] || [ -d "$REF/IRFinder" ]; then
160 | 	echo "Argument error: -r $REF. Will not overwrite. It appears BuildRefProcess has already been run for this reference. Reference directory must not contain STAR, Mapability or IRFinder directories." >&2
161 | 	exit 1
162 | fi
163 | 
164 | 
165 | echo "Launching reference build process. The full build might take hours."
166 | "$LIBEXEC/IRFinder-BuildRefFromEnsembl" "BuildRefProcess" "$THREADS" "$STAREXEC" "" "$REF" "$BUILDERCCFILE" "$BUILDROI" "$BUILDBLACK" "$SJOH" "$MAPABILITY_FILE" "$MAPABILITY_LEN" "$GENOMERAM"
167 | 
168 | 
169 | 
170 | 


--------------------------------------------------------------------------------
/bin/IRFinderLong:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util
  4 | 
  5 | source $LIBEXEC/bash_utils.sh 
  6 | 
  7 | 
  8 | function usage() {	# Write the 'IRFinder Long' help text to stderr (one printf argument per line), then exit 1.
  9 | 	printf '%s\n' "" \
 10 | 		"IRFinder version: $VERSION" \
 11 | 		"Usage: IRFinder Long -r ReferenceDir raw_reads_1.fast[q|a][.gz] [raw_reads_2.fast[q|a][.gz]...]" \
 12 | 		"" \
 13 | 		"  required:" \
 14 | 		"    raw_reads_1.fast[q|a][.gz]: one or more long reads fastq or fasta files, compressed or not." \
 15 | 		"    -r ReferenceDir. As built by the 'BuildRef' option." \
 16 | 		"" \
 17 | 		"  optional:" \
 18 | 		"    -d Output Directory: Default is the current directory." \
 19 | 		"    -M Sort memory. Maximum memory to use for sort for each thread, in MB. Default: 768." \
 20 | 		"    -x Minimap2 preset: splice (default), map-[pb|ont], ava-[pb|ont], asm[5|10|20], sr. see minimap2.1 for details." \
 21 | 		"    -E Minimap2 executable: Default is 'minimap2'." \
 22 | 		"    -t Threads: The number of physical CPUs to use by IRFinder. When ommited (default), IRFinder will use all physical CPUs." \
 23 | 		"    -u Unsorted output: Do not sort the read fragment BAM file." \
 24 | 		"    -v Verbose ( Default: print the log only in the output/logs/irfinder.std[out|err] )" \
 25 | 		"    -y STRING: an extra string that is parsed to Minimap2 for reads alignment. Default: '-uf -k14' " \
 26 | 		"    -j Jitter, consider the position around the splice sites to compensate sequencing errors ( integer number )." \
 27 | 		"" >&2
 28 | 	exit 1
 29 | }
 30 | 
 31 | # === Defaults ===
 32 | OUTPUTDIR=.
 33 | THREADS=0
 34 | REF=
 35 | MINIMAP_PRESET="splice"
 36 | DOSORT=1
 37 | MINIMAP_EXTRA="-uf -k14"
 38 | MINIMAP_EXEC=minimap2
 39 | VERBOSE=0
 40 | SORTMEM=768
 41 | AI_WARN=1
 42 | AI_INTRON=1
 43 | JITTER=3
 44 | 
 45 | if [ $# -eq 0 ] || [[ $1 == "--help"  ]] ; then
 46 | 	usage
 47 | fi
 48 | 
 49 | while getopts ":m:j:r:t:d:E:x:uM:y:i:w:vh" opt; do	# option string kept as-is so formerly accepted flags still parse
 50 | 	case $opt in
 51 | 		r)
 52 | 			# Reference directory.
 53 | 			REF=$OPTARG
 54 | 			;;
 55 | 		t)
 56 | 			## Number of threads to use. (must be a non-negative integer. Zero ok, means auto-detect.)
 57 | 			if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
 58 | 				echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2
 59 | 				exit 1
 60 | 			fi
 61 | 			THREADS=$OPTARG
 62 | 			;;
 63 | 		d)
 64 | 			OUTPUTDIR=$OPTARG
 65 | 			;;
 66 | 		x)
 67 | 			## Minimap preset. asm(5|10|20) is a group; the former [5|10|20] was a character class that rejected asm10/asm20.
 68 | 			if [[ $OPTARG =~ ^splice$|^map-(pb|ont)$|^ava-(pb|ont)$|^asm(5|10|20)$|^sr$ ]]; then
 69 | 				MINIMAP_PRESET=$OPTARG
 70 | 			else
 71 | 				echo "Argument error: -x $OPTARG. Valid options for Minimap presets are: " >&2
 72 | 				echo "  splice (default), map-pb, map-ont, ava-pb, ava-ont, asm5, asm10, asm20 or sr" >&2
 73 | 				exit 1
 74 | 			fi
 75 | 			;;
 76 | 		j)
 77 | 			if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
 78 | 				echo "Argument error: -j $OPTARG, jitter must be an integer." >&2
 79 | 				exit 1
 80 | 			fi
 81 | 			JITTER=$OPTARG
 82 | 			;;
 83 | 		E)
 84 | 			## Minimap executable. (must be executable!)
 85 | 			if [ -x "$OPTARG" -a ! -d "$OPTARG" ]; then
 86 | 				MINIMAP_EXEC=$OPTARG
 87 | 			else
 88 | 				echo "Argument error: -E $OPTARG. Minimap2 executable must be an executable program." >&2
 89 | 				exit 1
 90 | 			fi
 91 | 			;;
 92 | 		u)
 93 | 			DOSORT=0
 94 | 			;;
 95 | 		M)
 96 | 			#Max sort memory in MB.
 97 | 			if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
 98 | 				echo "Argument error: -M $OPTARG, maximum sort RAM in MB must be an integer." >&2
 99 | 				exit 1
100 | 			fi
101 | 			SORTMEM=$OPTARG
102 | 			;;
103 | 		y)
104 | 			MINIMAP_EXTRA=$OPTARG
105 | 			;;
106 | 		v)
107 | 			VERBOSE=1
108 | 			;;
109 | 		h)
110 | 			usage
111 | 			;;
112 | 		\?)
113 | 			echo "Invalid option: -$OPTARG" >&2
114 | 			exit 1
115 | 			;;
116 | 		:)
117 | 			echo "Option -$OPTARG requires an argument." >&2
118 | 			exit 1
119 | 			;;
120 | 	esac
121 | done
122 | shift $(($OPTIND - 1))
123 | 
124 | if [ $# -eq 0 ];then
125 |     echo "ERROR! No fasta or fastq file provided." >&2
126 |     exit 1
127 | fi
128 | 
129 | INPUT_FILES=""
130 | for f in "$@"; do
131 |     if [ -f "$f" ]; then
132 |         INPUT_FILES="${INPUT_FILES} ${f}"
133 |     else
134 |         echo "ERROR! File ${f} doesn't exists." >&2
135 |         exit 1
136 |     fi
137 | done
138 | 
139 | checkRef "$REF"
140 | checkOutDir "$OUTPUTDIR"
141 | checkMinimap "$MINIMAP_EXEC"
142 | checkSamtools
143 | setThreads
144 | 
145 | logger init
146 | 
147 | RUNMODE="Long" startMessage "$@"
148 | 
149 | # Align all read files with minimap2 and store the alignments as an unsorted BAM.
150 | # $MINIMAP_EXTRA stays unquoted on purpose: it carries several space-separated minimap2 options.
151 | logger "[ " $(date) " ] Minimap2 is starting with $THREADS threads"
152 | 
153 | "$MINIMAP_EXEC" -a -t $THREADS -x $MINIMAP_PRESET $MINIMAP_EXTRA "$REF/genome.fa" "$@"  2> "$OUTPUTDIR/logs/minimap2.log" | samtools view -b > "$OUTPUTDIR/Unsorted.bam"  || exit 1
154 | 
155 | logger "---"
156 | logger "[ " $(date) " ] Minimap2 mapping completed"
157 | logger "---"
158 | 
159 | VERBOSE_FLAG=""
160 | if [[ "${VERBOSE}" == "1" ]]; then
161 |     VERBOSE_FLAG=" -v "
162 | fi
163 | 
164 | IRF_RUNMODE="Long" "$(dirname "$(readlink -nf "$BASH_SOURCE")")/IRFinderBAM" $VERBOSE_FLAG -l -r "$REF" -t $THREADS -j $JITTER -d "$OUTPUTDIR" "$OUTPUTDIR/Unsorted.bam" || exit 1
165 | 
166 | # Each samtools step aborts the script on failure; chaining the steps with '&&' used to hide such failures from 'set -e'.
167 | if [ $DOSORT -eq 1 ]; then
168 |     logger "---"
169 |     logger "[ " $(date) " ] Sorting the bam file"
170 |     echo "---- samtools sort  -@ $THREADS -m ${SORTMEM}M -o $OUTPUTDIR/Sorted.bam $OUTPUTDIR/Unsorted.bam  ---"  >> "$OUTPUTDIR/logs/samtools.log"
171 |     samtools sort  -@ $THREADS -m ${SORTMEM}M -o "$OUTPUTDIR/Sorted.bam" "$OUTPUTDIR/Unsorted.bam" &>> "$OUTPUTDIR/logs/samtools.log" || exit 1
172 |     logger "---"
173 |     logger "[ " $(date) " ] Indexing the sorted bam file"
174 |     echo "---- samtools index -@ $THREADS $OUTPUTDIR/Sorted.bam ---" >> "$OUTPUTDIR/logs/samtools.log"
175 |     samtools index  -@ $THREADS  "$OUTPUTDIR/Sorted.bam"  &>> "$OUTPUTDIR/logs/samtools.log" || exit 1
176 |     rm "$OUTPUTDIR/Unsorted.bam"
177 | fi
178 | logger "---"
179 | logger "[ " $(date) " ] IRFinder Long completed."
180 | logger "---"
181 | 
182 | 
183 | 


--------------------------------------------------------------------------------
/bin/TrimBAM4IGV:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | export LC_ALL=C
  4 | export LANG=C
  5 | 
  6 | set -e
  7 | 
  8 | function usage() {	# Write the TrimBAM4IGV help text to stderr (one printf argument per line), then exit 1.
  9 | 	printf '%s\n' "Usage: TrimBAM4IGV -o OUTPUTDIR INPUT.bam" \
 10 | 		"Output: OUTPUTDIR/INPUT.trimmed.bam; OUTPUTDIR/INPUT.trimmed.bam.bai" \
 11 | 		"" \
 12 | 		"    -o OUTPUTDIR : required. Directory to save trimmed BAM." \
 13 | 		"    -r region : optional. A string to guide Samtools extracting reads in the corresponding region." \
 14 | 		"    -t NUM_THREADS : optional. Number of threads to use. Default: the number of physical CPUs." \
 15 | 		"    -h Show this usage information." >&2
 16 | 	exit 1
 17 | }
 18 | 
 19 | 
 20 | # === Defaults ===
 21 | THREADS=0
 22 | 
 23 | if [ $# -eq 0 ]; then
 24 | 	usage
 25 | fi
 26 | 
 27 | while getopts ":o:r:t:h" opt; do
 28 | 	case $opt in
 29 | 		o)
 30 | 			# Output directory that receives the trimmed BAM and its index.
 31 | 			OUTPUTDIR=$OPTARG
 32 | 			;;
 33 | 		t)
 34 | 			## Number of threads to use. (must be a non-negative integer. Zero ok, means auto-detect.)
 35 | 			if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
 36 | 				echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2
 37 | 				exit 1
 38 | 			fi
 39 | 			THREADS=$OPTARG
 40 | 			;;
 41 | 		r)
 42 | 			REGIONS=$OPTARG
 43 | 			;;
 44 | 		h)
 45 | 			usage
 46 | 			;;
 47 | 		\?)
 48 | 			echo "Invalid option: -$OPTARG" >&2
 49 | 			exit 1
 50 | 			;;
 51 | 		:)
 52 | 			echo "Option -$OPTARG requires an argument." >&2
 53 | 			exit 1
 54 | 			;;
 55 | 	esac
 56 | done
 57 | shift $(($OPTIND - 1))
 58 | 
 59 | #echo $@  #The remaining arguments.
 60 | #echo $#  #The number of remaining arguments. 
 61 | 
 62 | STVERSTR=`samtools --version`
 63 | STVER=$(echo $STVERSTR|cut -d" " -f2)
 64 | STVERMAIN=$(echo $STVER|cut -d"." -f1)
 65 | STVERMINOR=$(echo $STVER|cut -d"." -f2)
 66 | if [[ ! "$STVERMAIN" -ge 1 ]]; then
 67 | 	echo "Error: Samtools $STVER: version too old (>=1.4 required)." >&2
 68 | 	exit 1
 69 | elif [[ ! "$STVERMINOR" -ge 4 ]]; then
 70 | 	echo "Error: Samtools $STVER: version too old (>=1.4 required)." >&2
 71 | 	exit 1
 72 | fi
 73 | 
 74 | if [ ! "$OUTPUTDIR" ]; then
 75 | 	echo "Argument error: -o is required." >&2
 76 | 	usage	
 77 | fi
 78 | 
 79 | if [ ! -d "$OUTPUTDIR" ]; then
 80 | 	mkdir "$OUTPUTDIR"
 81 | fi
 82 | 
 83 | # Auto detect CPUs.
 84 | if [[ $THREADS == 0 ]]; then
 85 | 	THREADS=`awk 'BEGIN {FS=":"} ($0 ~ /^physical id/ ) { printf $2 " --"} ($0 ~ /^core id/) {print $2}' < /proc/cpuinfo | sort -u | wc -l`
 86 | 	if [ ! -n $THREADS -o $THREADS -eq 0 ]; then
 87 |     	    	# If physical CPU detection doesn't work for some reason, detect virtual CPUs (includes hyperthreading instances).
 88 | 		THREADS=`grep -c ^processor /proc/cpuinfo`
 89 | 	fi
 90 | fi
 91 | 
 92 | SAMPLE=$(echo $1|awk 'BEGIN{FS=".bam"}{print $1}')
 93 | 
 94 | if [ -f "$1"".bai" ]; then
 95 | 	if [ ! "$REGIONS" ]; then
 96 | 		samtools view -h "$1"|awk 'BEGIN{FS=OFS="\t"}(substr($0,1,1)=="@"){print $0}(substr($0,1,1)!="@"){print $1,$2,$3,$4,$5,$6,$7,$8,$9,"*","*"}' > "$OUTPUTDIR/tmp_sorted.trimmed.sam"
 97 | 	else
 98 | 		samtools view -h "$1" $REGIONS|awk 'BEGIN{FS=OFS="\t"}(substr($0,1,1)=="@"){print $0}(substr($0,1,1)!="@"){print $1,$2,$3,$4,$5,$6,$7,$8,$9,"*","*"}' > "$OUTPUTDIR/tmp_sorted.trimmed.sam"
 99 | 	fi
100 | else
101 | 	echo "Warning: BAM index not found: the input BAM is treated as name-sorted and will be sorted by coordinate first. This might take a while." >&2
102 | 	echo "         If the input BAM has already been sorted by coordinate, please index it and re-run this command." >&2
103 | 	samtools sort -@ "$THREADS" "$1" > "$OUTPUTDIR/tmp_sorted.bam"
104 | 	samtools index -@ "$THREADS" "$OUTPUTDIR/tmp_sorted.bam"
105 | 	if [ ! "$REGIONS" ]; then
106 | 		samtools view -h "$OUTPUTDIR/tmp_sorted.bam"|awk 'BEGIN{FS=OFS="\t"}(substr($0,1,1)=="@"){print $0}(substr($0,1,1)!="@"){print $1,$2,$3,$4,$5,$6,$7,$8,$9,"*","*"}' > "$OUTPUTDIR/tmp_sorted.trimmed.sam"
107 | 	else
108 | 		samtools view -h "$OUTPUTDIR/tmp_sorted.bam" $REGIONS|awk 'BEGIN{FS=OFS="\t"}(substr($0,1,1)=="@"){print $0}(substr($0,1,1)!="@"){print $1,$2,$3,$4,$5,$6,$7,$8,$9,"*","*"}' > "$OUTPUTDIR/tmp_sorted.trimmed.sam"
109 | 	fi
110 | fi 
111 | 
112 | samtools view -S -b "$OUTPUTDIR/tmp_sorted.trimmed.sam" > "$OUTPUTDIR/""$SAMPLE"".trimmed.bam"
113 | samtools index -@ "$THREADS" "$OUTPUTDIR/""$SAMPLE"".trimmed.bam"
114 | 
115 | 
116 | rm "$OUTPUTDIR"/tmp_sorted*


--------------------------------------------------------------------------------
/bin/analysisWithLowReplicates.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | use Data::Dumper;
  4 | use List::Util qw(max min);
  5 | use FindBin qw($RealBin);
  6 | 
  7 | my $bundledWinflat = "$RealBin/util/winflat";
  8 | # Prefer the winflat shipped in util/ next to this script; otherwise rely on the PATH.
  9 | my $winflatExec = ( -x $bundledWinflat ) ? $bundledWinflat : "winflat";
 10 | if ( $winflatExec eq "winflat" ) {
 11 | 	# 'which' exits non-zero when no winflat executable can be found through the PATH.
 12 | 	my $whichStatus = system('which winflat >/dev/null');
 13 | 	unless ($whichStatus == 0) {
 14 | 		print STDERR "FATAL: winflat utiltiy not found.\nSearched at:\n  $RealBin/util/winflat\n  and on the PATH\n";
 15 | 		exit 1;
 16 | 	}
 17 | }
 18 | 
 19 | 
 20 | sub arrayEqual {    # True when both arrays have the same length and their first $limit entries are equal as strings.
 21 |     my ($left, $right, $limit) = @_;
 22 |     return if scalar(@$left) != scalar(@$right);
 23 |     foreach my $idx (0 .. $#$left) {
 24 |         last if $idx >= $limit;
 25 |         return if $left->[$idx] ne $right->[$idx];
 26 |     }
 27 |     return 1;
 28 | }
 29 | 
 30 | sub separatedAB {
 31 |     my ($arrayref, $aCount, $bCount) = @_;
 32 | 	## $arrayref holds $aCount "A" values followed by $bCount "B" values.
 33 | 	## Returns 1 when every A value lies strictly on the same side (all above
 34 | 	## or all below) of every B value, and 0 otherwise.
 35 | 
 36 | 	my $firstA = $arrayref->[0];
 37 | 	my $firstB = $arrayref->[$aCount];
 38 | 	return 0 if $firstA == $firstB; #neither > or <.
 39 | 
 40 | 	## The leading pair fixes the direction every other A/B pair has to follow.
 41 | 	my $aAbove = $firstA > $firstB;
 42 | 	foreach my $i (0 .. $aCount - 1) {
 43 | 		my $aVal = $arrayref->[$i];
 44 | 		foreach my $j ($aCount .. $aCount + $bCount - 1) {
 45 | 			my $bVal = $arrayref->[$j];
 46 | 			my $ordered = $aAbove ? ($aVal > $bVal) : ($aVal < $bVal);
 47 | 			return 0 unless $ordered;
 48 | 		}
 49 | 	}
 50 | 	return 1;
 51 | }
 52 | 
 53 | 
 54 | 
 55 | my $current = "";
 56 | #Filehandles.
 57 | my $poolA;
 58 | my $poolAname;
 59 | my $poolB;
 60 | my $poolBname;
 61 | my @reps;
 62 | my @repsFileNames;
 63 | my $repsA = 0;
 64 | my $repsB = 0;
 65 | my @output;
 66 | 
 67 | while (scalar @ARGV) {
 68 |   my $param = shift @ARGV;
 69 |   if ($param =~ m/^\-/) {
 70 |     if ($param eq '-A') {
 71 |       $current = 'A';
 72 |     }elsif ($param eq '-B') {
 73 |       $current = 'B';
 74 |     }else{
 75 |       print STDERR "Invalid parameter: $param\n";
 76 |       exit 1;
 77 |     }
 78 |   }else{
 79 |     if ($current eq "") {
 80 |       print STDERR "Invalid parameters. eg: -A pooledA/IR-nondir.txt repA1/IR-nondir.txt repA2/IR-nondir.txt -B pooledB/IR-nondir.txt repB1/IR-nondir.txt repB2/IR-nondir.txt\n";
 81 |       exit 1;
 82 |     }elsif ($current eq "A") {
 83 |       if ($poolA) {
 84 |         #open $repsA[scalar @repsA], '<', $param or die "Can't open file $param for reading";
 85 |         ## Insert an element into the array @reps, after the last A element.
 86 | 		splice(@repsFileNames, $repsA, 0, $param);
 87 | 		splice(@reps, $repsA, 0, undef);
 88 | 		open $reps[$repsA], '<', $param or die "Can't open file $param for reading";
 89 | 
 90 | 		$repsA++;
 91 |       }else{
 92 |         open $poolA, '<', $param or die "Can't open file $param for reading";
 93 |         $poolAname = $param;
 94 |       }
 95 |     }elsif ($current eq "B") {
 96 |       if ($poolB) {
 97 |         ## Add an element to the very end of the array @reps (ie: after all the A elements, and all the B elements)
 98 | 		@repsFileNames[scalar @repsFileNames]=$param;
 99 |         open $reps[scalar @reps], '<', $param or die "Can't open file $param for reading";
100 |         $repsB++
101 |       }else{
102 |         open $poolB, '<', $param or die "Can't open file $param for reading";
103 |         $poolBname = $param;
104 |       }
105 |     }else{
106 |       print STDERR "error in code\n";
107 |       exit 2;
108 |     }
109 |   }
110 | }
111 | 
112 | ( $repsA >= 2 ) or die "For condition A, must provide a pooled data file and at least 2 replicate files.";
113 | ( $repsB >= 2 ) or die "For condition B, must provide a pooled data file and at least 2 replicate files.";
114 | 
115 | #print Dumper(\@repsFileNames);
116 | #print Dumper(\@reps);
117 | 
118 | my @repsHeader;
119 | my $counter = 1;
120 | foreach(@repsFileNames[0 .. $repsA-1]) {
121 | 	$_ = '#Condition A replicate ' . $counter . ': ' . $_;
122 | 	push @repsHeader, "A$counter-IRratio";
123 | 	$counter++;
124 | }
125 | $counter = 1;
126 | foreach(@repsFileNames[$repsA .. scalar @repsFileNames - 1]) {
127 | 	$_ = '#Condition B replicate ' . $counter . ': ' . $_;
128 | 	push @repsHeader, "B$counter-IRratio";
129 | 	$counter++;
130 | }
131 | 
132 | print "#Condition A combined: $poolAname\n";
133 | print join("\n",@repsFileNames[0 .. $repsA-1]), "\n";
134 | print "#Condition B combined: $poolBname\n";
135 | print join("\n",@repsFileNames[$repsA .. scalar @repsFileNames - 1]), "\n";
136 | 
137 | print join("\t",
138 | 	"Chr", "Start", "End", "Intron-GeneName/GeneID","-","Direction","ExcludedBases",
139 | 	"p-diff","p-increased","p-decreased",
140 | 	"A-IRratio","A-IRok","A-IntronCover","A-IntronDepth","A-SplicesMax","A-SplicesExact",
141 | 	"B-IRratio","B-IRok","B-IntronCover","B-IntronDepth","B-SplicesMax","B-SplicesExact",
142 | 	"replicates", @repsHeader
143 | 	),"\n";
144 | 
145 | 
146 | my $lineNumber = 0;
147 | while(<$poolA>) {
148 |   my $pA = $_;
149 |   chomp $pA;
150 |   my $pB = <$poolB>;
151 |   chomp $pB;
152 |   $lineNumber++;
153 | 
154 |   my @pA = split /\t/, $pA;
155 |   my @pB = split /\t/, $pB;
156 |   ## Both pooled files must agree on the first 7 columns (the ones copied to the output as Chr .. ExcludedBases).
157 |   if (!( arrayEqual( \@pA, \@pB, 7) )) {
158 |     print STDERR "FATAL: Files do not list records in the same order with identical number of lines.\n";
159 |     print join("\t", @pA[0 .. 6]), "\n";
160 |     print join("\t", @pB[0 .. 6]), "\n";
161 | 
162 |     exit 1;
163 |   }
164 | 
165 |   ## Read the matching line from every replicate file and collect its IR ratio (field index 19).
166 |   my @repsIR;
167 |   foreach(@reps) {
168 |   	my @fields = split /\t/, <$_>;
169 |   	if (!( arrayEqual( \@pA, \@fields, 7) )) {
170 | 	    print STDERR "FATAL: Files do not list records in the same order with identical number of lines.\n";
171 | 	    print join("\t", @fields[0 .. 6]), "\n";
172 | 	    print join("\t", @pA[0 .. 6]), "\n";
173 | 	    exit 1;
174 |   	}
175 |   	push @repsIR, $fields[19];
176 |   }
177 | 
178 |   ## Do the maths, are the replicates OK?
179 |   my $ok = ($pA[20] eq "-" || $pB[20] eq "-") && ($pA[8] >= 1 || $pB[8] >= 1) && ($pA[19] >= 0.01 || $pB[19] >= 0.01) && separatedAB(\@repsIR, $repsA, $repsB);
180 | 
181 |   my $pValUp = 99;
182 |   my $pValDown = 99;
183 | 
184 |   if ($ok) {
185 |     ## Check if both are sufficiently expressed (either the intron, or the splices)
186 |     if (( $pA[8] >= 1 || max($pA[16],$pA[17]) >= 10 ) && ( $pB[8] >= 1 || max($pB[16],$pB[17]) >= 10 )) {
187 | 	  	## calculate the winflat p-value of the difference (from the pooled IRdepth & junctionDepth).
188 | 		#print $lineNumber, "\n";
189 | 	    open my $winflat, '-|', $winflatExec, '-xvalue', int($pA[8]+0.5), '-yvalue', int($pB[8]+0.5), '-diff', max($pA[16],$pA[17])+int($pA[8]+0.5), max($pB[16],$pB[17])+int($pB[8]+0.5) or die "Can't run $winflatExec: $!";
190 | 		my @winflat = <$winflat>;
191 | 		close $winflat;
192 | 	    foreach (@winflat) {chomp; s/^.*\).*= *//; s/\W*$//};
193 | 		$pValDown = $winflat[0];
194 | 		$pValUp = $winflat[1];
195 | 	}else{
196 | 		## Properly expressed in only one of the samples. Flag as interesting, but not differential IR.
197 | 		$pValUp = 33;
198 | 		$pValDown = 33;
199 | 	}
200 |   }
201 |   my $pValDiff = min($pValUp, $pValDown);
202 | 
203 |   if ($ok) {
204 |     push @output, [@pA[0 .. 6],
205 |     	$pValDiff, $pValUp, $pValDown,
206 |     	$pA[19], $pA[20], $pA[7], $pA[8], max($pA[16],$pA[17]), $pA[18],
207 |     	$pB[19], $pB[20], $pB[7], $pB[8], max($pB[16],$pB[17]), $pB[18],
208 |     	"reps", @repsIR];
209 |   }
210 | 
211 |   ## Input columns used above, counted from 1 (array index + 1):
212 |   ## Max SJ - 17/18
213 |   ## Exact SJ - 19
214 |   ## IRRatio = 20
215 |   ## ok 21
216 |   ## coverage 8
217 |   ## trimmedMean 9
218 | }
219 | 
220 | foreach ( sort { $a->[7] <=> $b->[7] } @output ) {
221 | 	print join("\t", @$_), "\n";
222 | }
223 | 


--------------------------------------------------------------------------------
/bin/analysisWithNoReplicates.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | use Data::Dumper;
  4 | use List::Util qw(max min);
  5 | use FindBin qw($RealBin);
  6 | use sort 'stable';
  7 | 
  8 | 
  9 | my $winflatExec = "winflat";
 10 | if ( -x "$RealBin/util/winflat" ) {
 11 | 	$winflatExec = "$RealBin/util/winflat";
 12 | }else{
 13 | 	#system('which','winflat', '>/dev/null');
 14 | 	system('which winflat >/dev/null');
 15 | 	if ($? != 0) {
 16 | 		print STDERR "FATAL: winflat utiltiy not found.\nSearched at:\n  $RealBin/util/winflat\n  and on the PATH\n";
 17 | 		exit 1;
 18 | 	}
 19 | }
 20 | 
 21 | 
 22 | sub arrayEqual {
 23 |     my ($xref, $yref, $maxCompare) = @_;
 24 |     return unless  @$xref == @$yref;
 25 | 
 26 | 	for (my $i = 0; $i < $maxCompare && $i < scalar @$xref; $i++) {
 27 | 		return unless $xref->[$i] eq $yref->[$i];
 28 | 	}
 29 |     return 1;
 30 | }
 31 | 
 32 | sub separatedAB {
 33 |     my ($arrayref, $aCount, $bCount) = @_;
 34 | 	## An array with aCount elements followed by bCount elements.
 35 | 	## All of the A elements need to be < or > all of the B elements.
 36 | 
 37 | 	if ($arrayref->[0] == $arrayref->[$aCount]) {
 38 | 		return 0; #neither > or <.
 39 | 	}elsif ($arrayref->[0] > $arrayref->[$aCount]) {
 40 | 		for (my $a = 0; $a < $aCount; $a++) {
 41 | 			for (my $b = $aCount; $b < $aCount+$bCount; $b++) {
 42 | 				return 0 if (!($arrayref->[$a] > $arrayref->[$b]));
 43 | 			}
 44 | 		}
 45 | 	}else{
 46 | 		for (my $a = 0; $a < $aCount; $a++) {
 47 | 			for (my $b = $aCount; $b < $aCount+$bCount; $b++) {
 48 | 				return 0 if (!($arrayref->[$a] < $arrayref->[$b]));
 49 | 			}
 50 | 		}	
 51 | 	}
 52 | 	return 1;
 53 | }
 54 | 
 55 | 
 56 | 
## Command-line state.
## $current remembers which condition flag (-A or -B) was seen last; every
## following non-flag argument is a file belonging to that condition.
my $current = "";
#Filehandles.
my $poolA;
my $poolAname;
my $poolB;
my $poolBname;
# @reps holds the replicate filehandles: all A replicates first, then all B
# replicates. @repsFileNames holds the matching file names in the same order.
my @reps;
my @repsFileNames;
my $repsA = 0;
my $repsB = 0;
# Rows (array refs) collected for the final table.
my @output;

## Consume the arguments left to right. The first file after -A / -B is opened
## as that condition's pooled file; any further file is opened as a replicate.
while (scalar @ARGV) {
  my $param = shift @ARGV;
  if ($param =~ m/^\-/) {
    # Anything starting with "-" must be one of the two condition flags.
    if ($param eq '-A') {
      $current = 'A';
    }elsif ($param eq '-B') {
      $current = 'B';
    }else{
      print STDERR "Invalid parameter: $param\n";
      exit 1;
    }
  }else{
    if ($current eq "") {
      # A file name was given before any -A / -B flag.
      print STDERR "Invalid parameters. eg: -A pooledA/IR-nondir.txt repA1/IR-nondir.txt repA2/IR-nondir.txt -B pooledB/IR-nondir.txt repB1/IR-nondir.txt repB2/IR-nondir.txt\n";
      exit 1;
    }elsif ($current eq "A") {
      if ($poolA) {
        #open $repsA[scalar @repsA], '<', $param or die "Can't open file $param for reading";
        ## Insert an element into the array @reps, after the last A element.
		splice(@repsFileNames, $repsA, 0, $param);
		splice(@reps, $repsA, 0, undef);
		open $reps[$repsA], '<', $param or die "Can't open file $param for reading";

		$repsA++;
      }else{
        # First file of condition A: the pooled file.
        open $poolA, '<', $param or die "Can't open file $param for reading";
        $poolAname = $param;
      }
    }elsif ($current eq "B") {
      if ($poolB) {
        ## Add an element to the very end of the array @reps (ie: after all the A elements, and all the B elements)
		@repsFileNames[scalar @repsFileNames]=$param;
        open $reps[scalar @reps], '<', $param or die "Can't open file $param for reading";
        $repsB++
      }else{
        # First file of condition B: the pooled file.
        open $poolB, '<', $param or die "Can't open file $param for reading";
        $poolBname = $param;
      }
    }else{
      # Unreachable unless $current is assigned something other than A or B.
      print STDERR "error in code\n";
      exit 2;
    }
  }
}

## This variant accepts exactly one file per condition: both pooled files are
## required and any replicate file is rejected.
( $poolA ) or die "For condition A, must provide a file.";
( $poolB ) or die "For condition B, must provide a file.";
( $repsA == 0 ) or die "For condition A, must provide a single file only.";
( $repsB == 0 ) or die "For condition B, must provide a single file only.";
118 | 
119 | 
120 | #print Dumper(\@repsFileNames);
121 | #print Dumper(\@reps);
122 | 
## Comment preamble and column header of the result table.
print "#Condition A: $poolAname\n";
print "#Condition B: $poolBname\n";


print join("\t",
	"Chr", "Start", "End", "Intron-GeneName/GeneID","-","Direction","ExcludedBases",
	"p-diff","p-increased","p-decreased",
	"A-IRratio","A-IRok","A-IntronCover","A-IntronDepth","A-SplicesMax","A-SplicesExact",
	"B-IRratio","B-IRok","B-IntronCover","B-IntronDepth","B-SplicesMax","B-SplicesExact",
	),"\n";


## Walk both files in lock-step (one intron per line), keep the introns that
## pass the filters, attach winflat p-values and finally print the kept rows
## ordered by p-diff.
##
## Column indexes used below (0-based):
##   0..6  intron identity (must match in both files)
##   7     coverage            8  intron depth (trimmed mean)
##   16/17 splice left/right   18 exact splice junction count
##   19    IRratio             20 warnings ("-" means no warning)
my $lineNumber = 0;
while (my $pA = <$poolA>) {
  chomp $pA;
  my $pB = <$poolB>;
  if (!defined $pB) {
    # File B ran out of lines before file A did.
    print STDERR "FATAL: Files do not list records in the same order with identical number of lines.\n";
    print STDERR "Condition B file ended after $lineNumber lines.\n";
    exit 1;
  }
  chomp $pB;
  $lineNumber++;

  my @pA = split /\t/, $pA;
  my @pB = split /\t/, $pB;

  if (!( arrayEqual( \@pA, \@pB, 7) )) {
    # Diagnostics go to STDERR so they never end up inside the result table.
    print STDERR "FATAL: Files do not list records in the same order with identical number of lines.\n";
    print STDERR join("\t", @pA[0 .. 6]), "\n";
    print STDERR join("\t", @pB[0 .. 6]), "\n";
    exit 1;
  }

  # No replicates. Still do a form of check -- is this intron interesting?
  my $ok = ($pA[20] eq "-" || $pB[20] eq "-") && ($pA[8] >= 1 || $pB[8] >= 1) && (max($pA[16],$pA[17]) >= 10 || max($pB[16],$pB[17]) >= 10) && ($pA[19] >= 0.01 || $pB[19] >= 0.01);

  # 99 = not tested; 33 = expressed in only one condition (see below).
  my $pValUp = 99;
  my $pValDown = 99;

  if ($ok) {
    ## Check if both are sufficiently expressed (either the intron, or the splices)
    if (( $pA[8] >= 1 || max($pA[16],$pA[17]) >= 10 ) && ( $pB[8] >= 1 || max($pB[16],$pB[17]) >= 10 )) {
      ## calculate the winflat p-value of the difference (from the pooled IRdepth & junctionDepth).
      open(my $winflat, '-|', $winflatExec, '-xvalue', int($pA[8]+0.5), '-yvalue', int($pB[8]+0.5), '-diff', max($pA[16],$pA[17])+int($pA[8]+0.5), max($pB[16],$pB[17])+int($pB[8]+0.5))
        or die "FATAL: could not run $winflatExec: $!\n";
      my @winflat = <$winflat>;
      close $winflat;
      # Keep only the numeric value that follows "... ) ... = " on each line.
      foreach (@winflat) {chomp; s/^.*\).*= *//; s/\W*$//};
      $pValDown = $winflat[0];
      $pValUp = $winflat[1];
    }else{
      ## Properly expressed in only one of the samples. Flag as interesting, but not differential IR.
      $pValUp = 33;
      $pValDown = 33;
    }
  }
  my $pValDiff = min($pValUp, $pValDown);

  if ($ok) {
    # [  ] pushes an array ref onto the array.
    # SplicesMax is the larger of the two flanking splice counts (columns 16
    # and 17), the same value that was fed to winflat above.
    push @output, [@pA[0 .. 6],
      $pValDiff, $pValUp, $pValDown,
      $pA[19], $pA[20], $pA[7], $pA[8], max($pA[16],$pA[17]), $pA[18],
      $pB[19], $pB[20], $pB[7], $pB[8], max($pB[16],$pB[17]), $pB[18]
      ];
  }
}

## Most significant introns first.
foreach ( sort { $a->[7] <=> $b->[7] } @output ) {
	print join("\t", @$_), "\n";
}
217 | #print Dumper (\@output);
218 | 


--------------------------------------------------------------------------------
/bin/util/IntronExclusion.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | 
  4 | my %genes;
  5 | 
  6 | sub intronNumber
  7 | {
  8 |   my ($gene) = shift;
  9 |   $genes{$gene} ++;
 10 |   return $genes{$gene};
 11 | }
 12 | 
 13 | sub processIntron {
 14 |   my ($intron, $overlaps) = @_;
 15 |   my ($chr, $start, $end, $gene, $score, $dir) = split /\t/, $intron;
 16 | 
 17 |   my $len = $end-$start;
 18 |   my $excl = 0;
 19 |   my $antisense_dirty = 0;
 20 |   my $excluded_by_exon = 0;
 21 | 
 22 |   my @intron_seg=();
 23 |   push @intron_seg, {'start' => $start, 'end' => $end};
 24 | 
 25 |   foreach my $overlap (@$overlaps) {
 26 |     # $overlap->{start, end, type}
 27 |     if ($overlap->{'type'} eq 'A') {
 28 |       $antisense_dirty = 2 if ($antisense_dirty < 2);
 29 |       #ignore anti-sense, but mark dirty
 30 |     }elsif ($overlap->{'type'} eq 'AE') {
 31 |       $antisense_dirty = 1 if ($antisense_dirty < 1);
 32 |       #ignore anti-sense, but mark dirty
 33 |     }elsif ($overlap->{'type'} eq 'E' && $overlap->{'start'} < $start && $overlap->{'end'} > $end) {
 34 |       #print STDERR "Found an exon/feature entirely covering this intron, skpping\n";
 35 |       $excluded_by_exon = 1;
 36 |     }else{
 37 |       # We want to exclude this segment from our intron.
 38 | 
 39 |       foreach my $seg (@intron_seg) {
 40 |         if ($seg->{'end'}==0) {
 41 |           # do nothing, this segment has already been deleted.
 42 |         }elsif ($overlap->{'end'} <= $seg->{'start'}) {
 43 |           # end is before the start, skip it
 44 |         }elsif ($overlap->{'start'} >= $seg->{'end'}) {
 45 |           # start is after the end, skip it
 46 |         }elsif ($overlap->{'start'} <= $seg->{'start'} && $overlap->{'end'} >= $seg->{'end'}) {
 47 |           # exclude entirely covers a segment (equality or beyond) then remove it
 48 |           $seg->{'start'}=0; 
 49 |           $seg->{'end'}=0; 
 50 |         }elsif ($overlap->{'start'} <= $seg->{'start'}) {
 51 |           # start is before the start, trim the start
 52 |           $seg->{'start'} = $overlap->{'end'};
 53 |         }elsif ($overlap->{'end'} >= $seg->{'end'}) {
 54 |           # end is after the end, trim the end
 55 |           $seg->{'end'} = $overlap->{'start'};
 56 |         }else{
 57 |           # start inside, end inside - split it
 58 |           push @intron_seg, {'start'=>$overlap->{'end'},'end'=>$seg->{'end'}};
 59 |           $seg->{'end'}=$overlap->{'start'};
 60 |         }
 61 |       }
 62 |     }
 63 |   }
 64 |   # Procesed all overlaps.
 65 |   # result fragments, in no specific order, in @intron_seg.
 66 |   #print $intron, "\n";
 67 |   my $newlen = 0;
 68 |   my $newstart;
 69 |   my $newend = 0;
 70 |   my @sizes;
 71 |   my @starts;
 72 |   foreach my $seg (sort {$a->{'start'} <=> $b->{'start'}} @intron_seg) {
 73 |     if ($seg->{'end'} != 0) {
 74 |       $newstart = $seg->{'start'} if (!$newstart);
 75 |       $newend = $seg->{'end'} if ($seg->{'end'} > $newend);
 76 |       push @starts, $seg->{'start'} - $newstart;
 77 |       push @sizes, $seg->{'end'}-$seg->{'start'};
 78 |     }
 79 |     #print join("\t", "", $seg->{'start'}, $seg->{'end'}), "\n";
 80 |     $newlen += $seg->{'end'}-$seg->{'start'};
 81 |   }
 82 |   if ($newlen > 40 && ($newlen/$len) >= 0.7) {
 83 |     my $antisense_text = 'clean';
 84 |     if ($excluded_by_exon >= 1) {
 85 | 	$antisense_text = 'known-exon';
 86 |         $antisense_text .= '+anti-near' if ($antisense_dirty >= 1);
 87 |         $antisense_text .= '+anti-over' if ($antisense_dirty >= 2);
 88 |     }else{
 89 |         $antisense_text = 'anti-near' if ($antisense_dirty >= 1);
 90 |         $antisense_text = 'anti-over' if ($antisense_dirty >= 2);
 91 |     }
 92 |     print join("\t", $chr, $newstart, $newend, join("/",$gene,intronNumber($gene),$start,$end,$len,$len-$newlen,$antisense_text), $score, $dir, $newstart, $newend, "255,0,0", scalar @sizes, join(",",@sizes), join(",",@starts)), "\n"; 
 93 | 
 94 |     if ($len >= 110) {
 95 |       print OF50 join("\t", $chr, $start+5, $start+55, "S", 0, $dir, $start+5, $start+55, "255,0,0", 1, 50, 0), "\n";
 96 |       print OF50 join("\t", $chr, $end-55, $end-5, "S", 0, $dir, $end-55, $end-5, "255,0,0", 1, 50, 0), "\n";
 97 |     }
 98 | #    if ($len >= 210) {
 99 | #      print OF50 join("\t", $chr, $start+55, $start+105, "E", 0, $dir, $start+55, $start+105, "255,0,0", 1, 50, 0), "\n";
100 | #      print OF50 join("\t", $chr, $end-105, $end-55, "E", 0, $dir, $end-105, $end-55, "255,0,0", 1, 50, 0), "\n";
101 | #    }
102 |     print OF1 join("\t", $chr, $start, $dir), "\n";
103 |     print OF1 join("\t", $chr, $end, $dir), "\n";
104 |   }
105 | }
106 | 
107 | 
108 | 
109 | #### MAIN ####
110 | 
if (scalar @ARGV != 2) {
  print STDERR "Usage: cat inputBedIntersection | ./thisTool.pl out2 out3 > out1\n";
  exit(1);
}

open(OF50, '>', $ARGV[0]) or die "Can't open file $ARGV[0] for writing: $!\n";
open(OF1, '>', $ARGV[1]) or die "Can't open file $ARGV[1] for writing: $!\n";

## STDIN is a BED intersection: the first six columns describe the intron, one
## column is skipped, then come the overlap start, end and type. All lines of
## one intron are expected to be consecutive.
##
## Directional example
#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       134895  135807  E       0       -       5
#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       135230  136040  X       0       -       238
#1       736543  741178  RP11-206L10.8/ENSG00000230092/- 0       -       1       736253  736548  E       0       -       5
##
## Non-Directional example
#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       134895  135807  E       5
#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       135230  136040  X       238

my $lastintron = '';
my @overlaps;
while (my $line = <STDIN>) {
  chomp $line;

  my ($intron, $overlapstart, $overlapend, $overlaptype) = $line =~ /^([^\t]+\t[^\t]+\t[^\t]+\t[^\t]+\t[^\t]+\t[^\t]+)\t[^\t]+\t([^\t]+)\t([^\t]+)\t([^\t]+)/;
  # Skip blank or malformed lines instead of treating them as a new intron.
  next unless defined $intron;

  if ($lastintron ne $intron) {
    # A new intron starts: flush the previous one with ITS overlaps only.
    # The overlap of the current line is added afterwards, so it can no
    # longer leak into the previous intron.
    if ($lastintron ne '') {
      processIntron($lastintron, \@overlaps);
    }
    @overlaps = ();
  }
  push @overlaps, {'start'=>$overlapstart, 'end'=>$overlapend, 'type'=>$overlaptype};

  $lastintron = $intron;
}
# Flush the final intron (nothing to do when the input was empty).
processIntron($lastintron, \@overlaps) if ($lastintron ne '');

close(OF50) or die "Error closing $ARGV[0]: $!\n";
close(OF1) or die "Error closing $ARGV[1]: $!\n";
162 | 


--------------------------------------------------------------------------------
/bin/util/Mapability:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
# Builds MapabilityExclusion.bed.gz in the current directory: genome fragments
# are generated, aligned back to the genome with STAR, and the regions covered
# by fewer than 5 of the kept fragments are written out as the exclusion list.

# Raise the soft limit on user processes and force the C locale.
ulimit -Su 4000
export LANG=C
export LC_ALL=C

# Abort on the first failing command.
set -e

# Positional arguments:
#   $1 STAR executable          $2 STAR genome directory
#   $3 genome FASTA             $4 directory holding generateReadsError.pl
#   $5 number of threads        $6 fragment (read) length
STAREXEC=$1
STARGENOME=$2
FA=$3
LIBEXEC=$4
THREADS=$5
READ_LENGTH=$6

# Scratch directory (named after this shell's PID) for per-chromosome BED files.
TMPBED=tmp_$$

mkdir "$TMPBED"

# Compressor for the scratch files: lzop when installed at /usr/bin/lzop, gzip otherwise.
TMPCMP=gzip
TMPEXT=gz
if [ -x /usr/bin/lzop ]; then
  TMPCMP=/usr/bin/lzop
  TMPEXT=lzo
fi


#echo ""
echo "<Phase 2: Mapability Calculation>"
date +"%b %d %T ... mapping genome fragments of length $READ_LENGTH back to genome..."

# NOTE(review): STAR_THREADS is computed here but never used; the STAR call
# below passes $THREADS to --runThreadN. Confirm which one is intended.
if [ $THREADS -eq 1 ]; then
    STAR_THREADS=1
else
    STAR_THREADS=$(( THREADS - 1 ))
fi

# Align the fragments produced by generateReadsError.pl (fed through process
# substitution) and write the alignments as SAM to genome_fragments.sam.
# --outFilterMultimapNmax 1 makes STAR drop reads that map to more than one locus.
"$STAREXEC" \
--genomeDir "$STARGENOME" \
--genomeLoad NoSharedMemory \
--runThreadN $THREADS --outStd SAM --outSAMmode NoQS \
--outSAMattributes None \
--outFilterMultimapNmax 1 \
--readFilesIn <("$LIBEXEC/generateReadsError.pl" $READ_LENGTH 10 < "$FA") \
> genome_fragments.sam

date +"%b %d %T ... sorting aligned genome fragments..."

samtools sort -@ "$THREADS" genome_fragments.sam > genome_fragments.bam

date +"%b %d %T ... indexing aligned genome fragments..."

samtools index -@ "$THREADS" genome_fragments.bam

date +"%b %d %T ... filtering aligned genome fragments by chromosome/scaffold..."

## prevent histexpand for the character '!'
set +o histexpand

# One samtools|awk pipeline per name in chrName.txt, up to $THREADS at a time.
# awk splits each record on tab or '!', keeps records where field 8 equals
# "<READ_LENGTH>M" and fields 2/3 equal fields 5/6, and pipes
# "field5, field6-1, field6+69" into the compressor, which writes
# $TMPBED/<field5>.bed.<ext>.
# Presumably the read name is "<something>!<chr>!<pos>" so that fields 2/3 are
# the fragment's origin and 5/6 the aligned position -- verify against
# generateReadsError.pl.
# NOTE(review): the interval end uses a fixed +69 rather than a value derived
# from READ_LENGTH -- confirm this is intended for read lengths other than 70.
cat "$STARGENOME/chrName.txt" | \
    xargs --max-args 1 --max-procs "${THREADS}" -I{} bash -c "samtools view genome_fragments.bam {}|awk -v read_length=\"${READ_LENGTH}M\" -v tmpdir=\"${TMPBED}\" -v tmpcmp=\"${TMPCMP}\" -v tmpext=\"${TMPEXT}\" 'BEGIN{FS=\"[\\t!]\"; OFS=\"\\t\"}{if ((\$8 == read_length ) && (\$3 == \$6) && (\$2 == \$5)) {print \$5, \$6-1, \$6+69 | (tmpcmp \" -c1 > \" tmpdir \"/\" \$5 \".bed.\" tmpext ) }}END{close( (tmpcmp \" -c1 > \" tmpdir \"/\" \$5 \".bed.\" tmpext ))}'"

date +"%b %d %T ... merging filtered genome fragments..."

# Decompress every scratch BED file and append it to one unsorted BED file.
if [ "$TMPEXT" == "gz" ]; then
    find "$TMPBED" -type f -name "*.bed.""$TMPEXT"|xargs --max-args 1 zcat >> genome_fragments.unsorted.bed
elif [ "$TMPEXT" == "lzo" ]; then
    find "$TMPBED" -type f -name "*.bed.""$TMPEXT"|xargs --max-args 1 lzop -cdf >> genome_fragments.unsorted.bed
fi

date +"%b %d %T ... calculating regions for exclusion..."


# Per-base coverage of the kept fragments (-bga also reports zero-coverage
# intervals). The first awk updates chr whenever column 1 changes and then
# prints the line, so it passes every line through. The second awk keeps the
# intervals whose coverage (column 4) is below 5; these are merged, sorted and
# gzipped into MapabilityExclusion.bed.gz.
bedtools genomecov -i genome_fragments.unsorted.bed -bga -g "$STARGENOME/chrNameLength.txt" | \
    awk 'BEGIN{FS=OFS="\t";chr="random"}($1!=chr){chr=$1}($1==chr){print}' | \
    awk 'BEGIN {FS=OFS="\t"} ($4 < 5) {print $1, $2, $3}' | \
    bedtools merge -i stdin | \
    sort -S5G -k1,1 -k2,2n -k3,3n| \
    gzip > MapabilityExclusion.bed.gz

#ls "$TMPBED"/*.bed."$TMPEXT" | xargs --max-args 1 --max-procs "$THREADS" -I{} bash -c "\"$TMPCMP\" -cd < {} | bedtools genomecov -i stdin -bga -g \"$CHRLEN\"| awk 'NR==1{chr=\$1;print}\$1==chr{print}' | awk 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"} (\$4 < 5) {print \$1, \$2, \$3}' | bedtools merge -i stdin > \"$TMPEXCL/\"{}.exclusion"

#find "$TMPBED" -type f -name "*.bed.""$TMPEXT"|cut -d"/" -f3|xargs --max-args 1 --max-procs "$THREADS" -I{} bash -c "\"$TMPCMP\" -cd < \"$TMPBED\"/{} | bedtools genomecov -i stdin -bga -g \"$CHRLEN\"| awk 'NR==1{chr=\$1;print}\$1==chr{print}' | awk 'BEGIN {FS=OFS=\"\\t\"} (\$4 < 5) {print \$1, \$2, \$3}' | bedtools merge -i stdin > \"$TMPEXCL\"/{}.exclusion"

#cat "$TMPEXCL"/*.exclusion | sort -S5G -k1,1 -k2,2n -k3,3n | gzip > MapabilityExclusion.bed.gz

date +"%b %d %T ... cleaning temporary files..."

# Remove the scratch BED files, the intermediate alignments and the STAR
# by-products (Log.*, SJ.out.tab) from the current directory.
find "$TMPBED" -type f -name "*.bed.""$TMPEXT"|xargs --max-args 1 --max-procs "$THREADS" rm
rm genome_fragments.*
rm Log.*
rm SJ.out.tab
rmdir "$TMPBED"
94 | 


--------------------------------------------------------------------------------
/bin/util/adjust.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | args <- commandArgs(trailingOnly = T)
 3 | dat=read.table(args[1], stringsAsFactors = F, header = T)
 4 | cols=colnames(dat)
 5 | for (cn in cols[grepl(pattern ="p.val", x = cols)]  ){
 6 |   dat[,paste0(cn,"_BH_adjusted")]=p.adjust(dat[,cn], method = "BH")
 7 | }
 8 | write.table(x=dat, file = paste0(args[1], "_adjusted.tsv"), row.names = F, col.names = T, quote = F, sep="\t")
 9 | 
10 | 


--------------------------------------------------------------------------------
/bin/util/bash_utils.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | ## Useful functions for IRFinder's utils
  3 | 
  4 | export IRFINDER_BASH_UTILS_IMPORTED=1
  5 | export VERSION=2.0.1
  6 | export LC_ALL=C
  7 | export LANG=C
  8 | 
  9 | 
 10 | function versionAlert(){
 11 |    echo "IRFinder version: $VERSION" 
 12 |    exit
 13 | }
 14 | 
 15 | function checkFile() {
 16 |     if [ ! -f "${1}" ]; then
 17 |         echo "Error: file $1 doesn't exists" >&2
 18 |         exit 1
 19 |     fi
 20 | }
 21 | 
 22 | function checkSamtools() {
 23 |     STVERSTR=`samtools --version`
 24 | 	STVER=$(echo $STVERSTR|cut -d" " -f2)
 25 | 	STVERMAIN=$(echo $STVER|cut -d"." -f1)
 26 | 	STVERMINOR=$(echo $STVER|cut -d"." -f2)
 27 | 	if [[ ! "$STVERMAIN" -ge 1 ]]; then
 28 | 		echo "Error: Samtools $STVER: version too old (>=1.4 required)." >&2
 29 | 		exit 1
 30 | 	elif [[ ! "$STVERMINOR" -ge 4 ]]; then
 31 | 		echo "Error: Samtools $STVER: version too old (>=1.4 required)." >&2
 32 | 		exit 1
 33 | 	fi
 34 | }
 35 | 
 36 | function getMem(){
 37 |     local MEMK=`awk '($1 ~ /^MemTotal:/) {print $2}' < /proc/meminfo`
 38 |     echo $(( $MEMK/1000 ))
 39 | }
 40 | 
 41 | 
 42 | function checkStar(){
 43 |     MEMM=$(getMem)
 44 |     if [ "${MEMM}" -lt 32000 ]; then
 45 |         echo "System limitation: Minimum required RAM is 32GB. This software uses STAR for RNA mapping. RAM requirement is approximately 30GB for the human genome." >&2
 46 |         echo "  RunModes: BAM and BuildRefDownload, may be completed on servers with more RAM." >&2
 47 |         exit 2
 48 |     fi
 49 |     if [[ "$1" != "" ]]; then
 50 |         STAREXEC="$1"
 51 |     fi 
 52 |     if [[ "${STAREXEC}" == "" ]]; then
 53 |         STAREXEC="STAR"
 54 |     fi
 55 |     "$STAREXEC" --version &>/dev/null
 56 |     if [ ! $? -eq 0 ]; then
 57 |         echo "Error: STAR version is too old. --version parameter returns an error. Minimum version of 2.4.0 required." >&2
 58 |         exit 2
 59 |     fi
 60 | }
 61 | 
 62 | 
 63 | function checkMinimap(){
 64 |     if [[ "$1" != "" ]]; then
 65 |         MINIMAP_EXEC="$1"
 66 |     fi 
 67 |     if [[ "${MINIMAP_EXEC}" == "" ]]; then
 68 |         MINIMAP_EXEC="minimap2"
 69 |     fi
 70 |     if ! which $MINIMAP_EXEC > /dev/null 2> /dev/null ; then
 71 |       echo "minimap2 not found ( executable: $MINIMAP_EXEC ). To use the RunMode Long, install it: https://github.com/lh3/minimap2 " >&2
 72 |       exit 1
 73 |     fi
 74 |     MINIMAP_VERSION=$("$MINIMAP_EXEC" --version)
 75 |     if [[ $(echo ${MINIMAP_VERSION/-*/} | awk '{if ( $1 > 2.0 ) {print "ok" } else { print "no" }}') != "ok" ]]; then
 76 |         echo "Error: Minimap version is too old. Minimum version of 2.0.0 required. ${MINIMAP_VERSION} detected" >&2
 77 |         exit 2
 78 |     fi
 79 | }
 80 | 
 81 | 
 82 | function checkSuppa(){
 83 |     if ! which suppa.py >/dev/null 2>/dev/null ; then
 84 |         echo "SUPPA2 not found ( executable: suppa.py ). To use the RunMode Diff, install it: https://github.com/comprna/SUPPA " >&2
 85 |         exit 1
 86 |     fi
 87 | }
 88 | 
 89 | function checkDeseq(){
 90 | 	if ! which Rscript > /dev/null 2>/dev/null; then
 91 | 		echo "Rscript not found."
 92 | 		exit 1
 93 | 	fi
 94 | 	DESeqVersion=$(Rscript -e 'installed.packages()' | awk  'BEGIN {v=0} $1=="Version" {v=1; } v==1 && $1 == "DESeq2" { gsub("\"", ""); print $2;v=0 } ' )
 95 | 	
 96 | 	if [[ "${DESeqVersion}" == "" ]]; then
 97 | 		DESeqVersion=$(Rscript -e 'installed.packages()' | awk  'BEGIN {v=0} $NF=="Version" {v=1; } v==1 && $1 == "DESeq2" { gsub("\"", ""); print $NF;v=0 } ' )
 98 | 		if [[ "${DESeqVersion}" == "" ]]; then
 99 | 			echo "DESeq2 not installed. "
100 | 			exit 1
101 | 		fi
102 | 	fi
103 | 	logger "DESeq2 version $DESeqVersion"
104 | }
105 | 
# Make sure the global THREADS holds a positive number.
# A non-empty, non-zero THREADS is left untouched. Otherwise the value is
# detected, trying in order: "processor" lines in /proc/cpuinfo, distinct
# physical-id/core-id pairs in /proc/cpuinfo, getconf _NPROCESSORS_ONLN,
# and finally 1.
function setThreads(){
    if [[ "${THREADS}" == "" || "${THREADS}" == 0 ]]; then
        THREADS=$(grep -c ^processor /proc/cpuinfo 2>/dev/null)
        if ! _setThreadsValid "$THREADS"; then
            THREADS=$(awk 'BEGIN {FS=":"} ($0 ~ /^physical id/ ) { printf $2 " --"} ($0 ~ /^core id/) {print $2}' /proc/cpuinfo 2>/dev/null | sort -u | wc -l)
            # Some wc implementations pad the count with blanks.
            THREADS=${THREADS//[[:space:]]/}
            if ! _setThreadsValid "$THREADS"; then
                THREADS=$(getconf _NPROCESSORS_ONLN 2>/dev/null)
                if ! _setThreadsValid "$THREADS"; then
                    THREADS=1
                fi
            fi
        fi
    fi
}

# True when $1 is a decimal integer greater than zero.
function _setThreadsValid(){
    [[ "$1" =~ ^[0-9]+$ ]] && (( 10#$1 > 0 ))
}
117 | 
118 | 
# Validate the reference directory given in $1.
# Exits with status 1 when $1 is empty or when $1/IRFinder/ref-cover.bed is
# not a regular file.
function checkRef(){
    local cover_file="$1/IRFinder/ref-cover.bed"
    if [[ -z "$1" ]]; then
        echo "Argument error: -r is required." >&2
        exit 1
    fi
    if [[ ! -f "$cover_file" ]]; then
        echo "Argument error: -r $1, Does not appear to be a valid IRFinder reference. Could not find $1/IRFinder/ref-cover.bed" >&2
        exit 1
    fi
}
129 | 
# Prepare the output directory given in $1 and create its logs/ subdirectory.
# Exits with status 1 when the directory already holds IRFinder-IR-nondir.txt
# (results of an earlier run are never overwritten) or when a missing
# directory could not be created.
function checkOutDir(){
    local OUTPUTDIR=$1

    # Directory absent: create it (with logs/) and confirm that it now exists.
    if [ ! -d "$OUTPUTDIR" ]; then
        mkdir -p "$OUTPUTDIR/logs/"
        if [ ! -d "$OUTPUTDIR" ]; then
            echo "Argument error: Output directory $OUTPUTDIR does not exist, and could not be created." >&2
            exit 1
        fi
        return 0
    fi

    # Directory present: refuse to reuse it when it already contains results.
    if [ -e "$OUTPUTDIR/IRFinder-IR-nondir.txt" ]; then
        echo "Argument error: -d $OUTPUTDIR, output directory contains files from a previous IRFinder run. Will not overwrite." >&2
        exit 1
    fi
    mkdir -p "$OUTPUTDIR/logs/"
}
147 | 
# Append a message to the IRFinder log file.
#   logger init     truncates the log and writes the version banner
#   logger WORDS... appends the words, joined by single spaces
# The log is $OUTPUTDIR/logs/irfinder.stdout when OUTPUTDIR is set (the logs
# directory is created on demand) and ./irfinder.stdout otherwise. With
# VERBOSE=1 the message is also echoed to stdout.
# Every path is quoted so that directories containing spaces work.
# Sets the globals LOGOUT and LOG_MESSAGE.
function logger() {
    LOGOUT="./irfinder.stdout"
    if [[ "$OUTPUTDIR" != "" ]]; then
        if [ ! -d "${OUTPUTDIR}/logs/" ]; then
            mkdir -p "${OUTPUTDIR}/logs/"
        fi
        LOGOUT="$OUTPUTDIR/logs/irfinder.stdout"
    fi
    if [[ "$1" == "init" ]] && [[ $# == 1 ]]; then
        > "$LOGOUT"
        LOG_MESSAGE="\n --------------------\n|  IRFinder v. $VERSION | \n --------------------\n"
    else
        LOG_MESSAGE="$*"
    fi
    # echo -e expands the \n sequences of the banner.
    if [[ "${VERBOSE}" == "1"  ]] ; then
        echo -e "${LOG_MESSAGE}" | tee -ai "$LOGOUT"
    else
        echo -e "${LOG_MESSAGE}" >> "$LOGOUT"
    fi
}
168 | 
# Write the run header (version, start time, run mode, user, working
# directory, reference and one line per input file) to the log via logger.
#   $@  input files, logged one per line
# Nothing is logged when IRF_RUNMODE is already set, i.e. when the BAM mode is
# invoked after the FastQ or Long analysis. Sets the global START_MESSAGE=1
# once the header has been written.
function startMessage(){
    ## Check if the startMessage was called by the BAM mode after the FastQ or Long analysis
    if [[ "${IRF_RUNMODE}" == "" ]]; then
        local n=1
        local f
        logger "---"
        logger "IRFinder version: $VERSION "
        logger "IRFinder start: $(date)"
        logger "IRFinder runmode: $RUNMODE"
        logger "IRFinder user@host: $USER @ $HOSTNAME"
        logger "IRFinder working dir: $(pwd)"
        logger "IRFinder reference: $REF"
        # "$@" keeps a file name containing spaces as one entry.
        for f in "$@"; do
            logger "IRFinder file ${n}: $f"
            n=$((n+1))
        done
        logger "---"
        START_MESSAGE=1
    fi
}
188 | 


--------------------------------------------------------------------------------
/bin/util/bed-to-intron+exon.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | #0	1	2	3						4	5	6	7	8	9	10		11
 4 | #1      11868   14409   ENST00000456328/processed_transcript/DDX11L1    0       +       11868   14409   0       3       359,109,1189,   0,744,1352,
 5 | 
 6 | 
 7 | open EXON, '>', $ARGV[0];
 8 | open INTRON, '>', $ARGV[1];
 9 | 
10 | while (<STDIN>) {
11 |   chomp;
12 |   @f = split /\t/;
13 | 
14 |   $trans_start = $f[1];
15 | 
16 |   @length = split /,/, $f[10];
17 |   @start = split /,/, $f[11];
18 |   $chr = $f[0];
19 |   ($gene_id,$gene_name) = $f[3] =~ /\/([^\/]*)\/([^\/]*)$/;
20 |   $dir = $f[5];
21 | 
22 |   $last_end = undef;
23 |   while (@length) {
24 |     $start = shift @start;
25 |     $length = shift @length;
26 |     if (defined($last_end)) {
27 |         #only output if the intron has length.
28 |         if (($last_end+1) < ($start-1)) {
29 |           print INTRON join("\t", $chr, $trans_start+$last_end, $trans_start+$start, "$gene_name/$gene_id/$dir", $f[4], $f[5]), "\n";
30 |         }
31 |     }
32 |     #print EXON "$chr\t" . ($trans_start+$start) . "\t" . ($trans_start+$start+$length) . "\t$name\n";
33 |     print EXON join("\t", $chr, $trans_start+$start, $trans_start+$start+$length, "$gene_name/$gene_id/$dir", $f[4], $f[5]), "\n";
34 |     $last_end = $start+$length;
35 |   }
36 | }
37 | 
38 | close INTRON;
39 | close EXON;
40 | 


--------------------------------------------------------------------------------
/bin/util/deseq2.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | library(DESeq2)
 3 | library(ggplot2)
### Load DESeq2Constructor
# Locate this script on disk so the companion file can be sourced relative to it.
# commandArgs(trailingOnly = FALSE) includes the interpreter's own arguments; the
# entry starting with "--file=" is used here as the path of the running script.
initial.options <- commandArgs(trailingOnly = FALSE)
file.arg.name <- "--file="
# Strip the "--file=" prefix from the matching argument to get the script path.
script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
script.basename <- dirname(script.name)
# DESeq2Constructor.R is expected one directory above the one holding this script.
other.name <- file.path(script.basename, "/../DESeq2Constructor.R")
# NOTE(review): if no "--file=" argument is present (e.g. when run interactively),
# script.name is empty and this source() call will fail - confirm intended usage.
source(other.name)
11 | # source("~/git/IRFinder/bin/DESeq2Constructor.R")
12 | ### Read args
13 | # setwd("~/test/IRFinder2/Diff/sing/")
14 | # setwd("/media/lorencl/f4e6cecd-2fb8-4aa6-991c-620f450fd511/works/IRFinder2/Diff/test_9")
15 | # args=c("./groups.tsv", "0.05", "0" , "0" ,"0")
16 | 
# Positional command-line arguments:
#   1: path to the groups table (header row; read with read.table); results
#      are written next to it
#   2: IR-ratio threshold (numeric)
#   3: warning filter, passed through unchanged
#   4: "1" enables the Cook's cutoff, anything else disables it
#   5: "1" enables independent filtering, anything else disables it
args <- commandArgs(trailingOnly = TRUE)

groups <- read.table(args[1], stringsAsFactors = FALSE, header = TRUE)
out_folder <- dirname(args[1])
IRratio_thr <- as.numeric(args[2])
warning_filter <- args[3]

cooks_cutoff <- args[4] == "1"
print(if (cooks_cutoff) "cooks_cutoff enabled" else "cooks_cutoff disabled")

independentFiltering <- args[5] == "1"
print(if (independentFiltering) "independentFiltering enabled" else "independentFiltering disabled")
35 | 
# Build the DESeq2 data set from the IRFinder result files listed in `groups`
# and derive a per-sample / per-condition IR-ratio table (`global_dat`).
paths = as.vector(groups$Files)
experiment = groups[, c("SampleName", "Condition")]

experiment$Condition = factor(experiment$Condition)
rownames(experiment) = NULL

metaList = DESeqDataSetFromIRFinder(filePaths=paths, designMatrix=experiment, designFormula=~1, irratio_thr=IRratio_thr, warning_filter=warning_filter)

dds = metaList$DESeq2Object
# The data set is created with a placeholder design (~1); the real design is set here.
design(dds) = ~Condition + Condition:IRFinder
conditions = levels(experiment$Condition)
dds = DESeq(dds)
resultsNames(dds)

# Raw (non-normalized) counts; columns are addressed below as
# "intronDepth.<sample>" and "maxSplice.<sample>".
nn_counts = counts(dds, normalized=FALSE)
global_dat = data.frame(intron = rownames(nn_counts))

# Per-sample IR ratio = intronDepth / (intronDepth + maxSplice); 0/0 is reported as 0.
# seq_len() keeps the loop empty when there are no samples (1:0 would iterate twice).
for (i in seq_len(nrow(experiment))) {
  s_name = experiment$SampleName[i]
  intron_depth = nn_counts[, paste0("intronDepth.", s_name)]
  max_splice = nn_counts[, paste0("maxSplice.", s_name)]
  ir_ratio = intron_depth / (intron_depth + max_splice)
  ir_ratio[is.na(ir_ratio)] = 0
  global_dat[, paste0("IRratio.", s_name)] = ir_ratio
}

# Per-condition mean IR ratio.
for (i in seq_along(conditions)) {
  cond_cols = paste0("IRratio.", experiment$SampleName[experiment$Condition == conditions[i]])
  # drop = FALSE keeps a data frame even when the condition has a single sample;
  # without it the selection collapses to a vector and rowMeans() fails.
  global_dat[, paste0(conditions[i], ".Mean.IRratio")] = rowMeans(global_dat[, cond_cols, drop = FALSE])
}
61 | 
62 | 
# Test every pair of conditions for differential intron retention, append the
# DESeq2 statistics to `global_dat`, write one table (and, when there are
# significant introns, one PDF of box plots) per contrast, then write the
# combined table and quit.
padj_cutoff = 0.05                                  # significance threshold used for plotting
nn_counts = counts(dds, normalized=FALSE)           # raw counts, computed once for all contrasts
n_conditions = length(conditions)

# With fewer than two conditions there is nothing to compare. seq_len() yields
# an empty loop in that case, whereas 1:(n-1) would count down from 1 to 0 and
# index non-existent condition levels.
for (i in seq_len(max(n_conditions - 1, 0))) {
  for (j in (i + 1):n_conditions) {
    contrast_name = paste0(conditions[i], "_", conditions[j])
    # Compare the two condition-specific interaction coefficients.
    res = results(dds,
                  contrast=list(paste0("Condition", conditions[i], ".IRFinderIR"),
                                paste0("Condition", conditions[j], ".IRFinderIR")),
                  cooksCutoff=cooks_cutoff,
                  independentFiltering=independentFiltering)
    # Adjusted p-values that DESeq2 left as NA are reported as 1.
    res$padj[is.na(res$padj)] = 1
    global_dat[, paste0("DESeq2.padj.", contrast_name)] = res$padj
    global_dat[, paste0("DESeq2.baseMean.", contrast_name)] = res$baseMean
    global_dat[, paste0("DESeq2.log2FoldChange.", contrast_name)] = res$log2FoldChange

    significant = rownames(res)[res$padj < padj_cutoff]
    if (length(significant) > 0) {
      pdf(paste0(out_folder, "/", contrast_name, "_plot.pdf"))
      # One page per significant intron: IR ratio of every sample, grouped by condition.
      for (name in significant) {
        dat = data.frame(name = experiment$SampleName, grp = experiment$Condition,
                         intron_depth = nn_counts[name, paste0("intronDepth.", experiment$SampleName)],
                         max_splice = nn_counts[name, paste0("maxSplice.", experiment$SampleName)])
        dat$IRratio = dat$intron_depth / (dat$intron_depth + dat$max_splice)
        print(ggplot(dat) + geom_boxplot(aes(x=grp, fill=grp, y=IRratio)) + ggtitle(paste0(name, "\n", res[name, "padj"])))
      }
      dev.off()
    }
    write.table(res, file = paste0(out_folder, "/", contrast_name, "_DESeq2.tsv"), sep="\t", quote = FALSE)
  }
}

# Use the intron identifier as row name and drop the now redundant first column.
rownames(global_dat) = global_dat$intron
global_dat = global_dat[, -1]
write.table(global_dat, file = paste0(out_folder, "/all_results_DESeq2.tsv"), sep="\t", quote = FALSE)

quit(save = "no")
91 | 
92 | 
93 | 
94 | 


--------------------------------------------------------------------------------
/bin/util/generateReadsError.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | #use Fcntl;
  4 | 
  5 | #fcntl(stdout, F_SETPIPE_SZ, 1048576);
  6 | #fcntl(fileno(STDOUT), 1031, 1048576);
  7 | 
  8 | my $generatedCount = 0;
  9 | 
 10 | my $readLen = $ARGV[0];
 11 | my $stride = $ARGV[1];
 12 | 
 13 | my $lastDirection = 0;
 14 | 
 15 | sub reverse_complement {
 16 |         my $dna = shift;
 17 | 
 18 |         # reverse the DNA sequence
 19 |         my $revcomp = reverse($dna);
 20 | 
 21 |         # complement the reversed DNA sequence
 22 |         $revcomp =~ tr/ACGTacgt/TGCAtgca/;
 23 |         return $revcomp;
 24 | }  
 25 | 
 26 | 
 27 | my @error = (
 28 | {'A' => 'G', 'G' => 'T', 'T' => 'C', 'C' => 'A', 'N' => 'N' },
 29 | {'A' => 'T', 'G' => 'C', 'T' => 'A', 'C' => 'G', 'N' => 'N' },
 30 | {'A' => 'C', 'G' => 'A', 'T' => 'G', 'C' => 'T', 'N' => 'N' }
 31 | );
 32 | 
 33 | my $readCount = 0;
 34 | 
 35 | sub processRead( $ $ $ ) {
 36 |   my $read = shift;
 37 |   my $pos = shift;
 38 |   my $chr = shift;
 39 | 
 40 |   $readCount++;
 41 | 
 42 |   my $numN = $read =~ tr/N/N/;
 43 |   if ($numN * 2 < $readLen) {
 44 |     #only output reads where less than half of the read will be NNNNN
 45 | 
 46 |     # generate a single base error in a deterministic manner.
 47 |     substr($read,35,1) = $error[$readCount % 3]{substr($read,35,1)};
 48 | 
 49 |     if ($lastDirection == 0) {
 50 |       print ">RF!$chr!$pos\n";
 51 |       print "$read\n";
 52 |       $lastDirection = 1;
 53 |     }else{
 54 |       print ">RR!$chr!$pos\n";
 55 |       print reverse_complement($read) . "\n";
 56 |       $lastDirection = 0;
 57 |     }
 58 | 
 59 |   }
 60 | }
 61 | 
 62 | sub processBuffer( $ $ $ ) {
 63 |   my $b = shift;
 64 |   my $pos = shift;
 65 |   my $chr = shift;
 66 | 
 67 |   #while (length($$b) >= $readLen + $stride) {
 68 |   while (length($$b) >= $readLen) {
 69 |     processRead(substr($$b,0,$readLen), $pos, $chr);
 70 |     #my $thisread = substr($$b,0,$readLen);
 71 |     $$b = substr($$b,$stride);
 72 |     $pos = $pos + $stride;
 73 |   }
 74 |   return $pos;
 75 | }
 76 | 
 77 | my $count = 0;
 78 | my $chr = '';
 79 | my $pos = 1;
 80 | my $buffer = '';
 81 | 
 82 | while(<STDIN>) {
 83 |   chomp;
 84 |   $count ++;
 85 |   if (m/^>/) {
 86 |     s/ .*$//;
 87 |     s/^>//;
 88 |     $chr = $_;
 89 |     $pos = 1;
 90 |     $buffer = '';
 91 |   }
 92 |   else{
 93 |     # Should allow into the buffer only valid letters.
 94 |     $_ = uc($_);
 95 |     s/[^ATCGN]/N/g;
 96 |     $buffer .= $_;
 97 |     $pos = processBuffer(\$buffer, $pos, $chr);
 98 |   }
 99 | #  if ($count > 10000) { exit; }
100 | }
101 | 


--------------------------------------------------------------------------------
/bin/util/gtf2bed-custom.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | # Copyright (c) 2011 Erik Aronesty (erik@q32.com)
  4 | # 
  5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | # of this software and associated documentation files (the "Software"), to deal
  7 | # in the Software without restriction, including without limitation the rights
  8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | # copies of the Software, and to permit persons to whom the Software is
 10 | # furnished to do so, subject to the following conditions:
 11 | # 
 12 | # The above copyright notice and this permission notice shall be included in
 13 | # all copies or substantial portions of the Software.
 14 | # 
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 21 | # THE SOFTWARE.
 22 | # 
 23 | # ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT.
 24 | #
 25 | # https://code.google.com/p/ea-utils/source/browse/trunk/clipper/gtf2bed
 26 | 
 27 | use Data::Dumper;
 28 | use sort 'stable';
 29 | use if $]<5.028, "sort", '_mergesort';  # Note the hash function is not stable on later versions of PERL. Must sort a hash on relevant values if stability is desired.
 30 | 
 31 | $in = shift @ARGV;
 32 | 
 33 | open IN, ($in =~ /\.gz$/ ? "gunzip -c $in" : $in =~ /\.zip$/ ? "unzip -p $in" : "$in");
# Read the annotation line by line and collect, per transcript id, the list of
# exon records (%exons) and one representative record (%trans).
while (<IN>) {
	# A "##gff-version" header switches parsing from GTF to GFF2/GFF3 mode.
	$gff = 2 if /^##gff-version 2/;
	$gff = 3 if /^##gff-version 3/;
	# Comment lines are skipped explicitly only in GFF mode.
	next if /^#/ && $gff;

	# strip trailing whitespace (including the line ending)
	s/\s+$//;
	# 0-chr 1-src 2-feat 3-beg 4-end 5-scor 6-dir 7-fram 8-attr
	my @f = split /\t/;
	if ($gff) {
        # most ver 2's stick gene names in the id field
		($id) = $f[8]=~ /\bID="([^"]+)"/;
        # most ver 3's stick unquoted names in the name field
		($id) = $f[8]=~ /\bName=([^";]+)/ if !$id && $gff == 3;
	} else {
		# GTF: records are grouped by the transcript_id attribute
		($id) = $f[8]=~ /transcript_id "([^"]+)"/;
	}

	# Skip lines without an id or without a chromosome. The test is on
	# truthiness, so an id or chromosome named "0" is skipped as well.
	next unless $id && $f[0];

	if ($f[2] eq 'exon') {
		die "no position at exon on line $." if ! $f[3];
        # gff3 puts :\d in exons sometimes
        $id =~ s/:\d+$// if $gff == 3;
		push @{$exons{$id}}, \@f;
		# save lowest start
		# NOTE(review): this keeps the FIRST exon record seen for the id, which
		# is the lowest start only if exons are listed in ascending order.
		$trans{$id} = \@f if !$trans{$id};
	}# elsif ($f[2] eq 'start_codon') {
	#	#optional, output codon start/stop as "thick" region in bed
	#	$sc{$id}->[0] = $f[3];
	#}# elsif ($f[2] eq 'CDS') {
	#	#optional, output codon start/stop as "thick" region in bed
	#	push @{$cds{$id}}, \@f;
	#	# save lowest start
	#	$cdx{$id} = \@f if !$cdx{$id};
	#} elsif ($f[2] eq 'stop_codon') {
	#	$sc{$id}->[1] = $f[4];
	#}# elsif ($f[2] eq 'miRNA' ) {
	#	$trans{$id} = \@f if !$trans{$id};
	#	push @{$exons{$id}}, \@f;
	#}
}
 75 | 
 76 | for $id ( 
 77 | 	# sort by chr then pos
 78 | 	sort {
 79 | 		$trans{$a}->[0] eq $trans{$b}->[0] ? 
 80 | 		$trans{$a}->[3] <=> $trans{$b}->[3] : 
 81 | 		$trans{$a}->[0] cmp $trans{$b}->[0]
 82 | 	} (keys(%trans)) ) {
 83 | 		my ($chr, undef, undef, undef, undef, undef, $dir, undef, $attr, undef, $cds, $cde) = @{$trans{$id}};
 84 |         my ($cds, $cde);
 85 |         ($cds, $cde) = @{$sc{$id}} if $sc{$id};
 86 | 		my ($gene_name) = $attr=~ /gene_name "([^"]+)"/;
 87 | 		my ($gene_id) = $attr=~ /gene_id "([^"]+)"/;
 88 | 		my ($trans_type) = $attr=~ /transcript_biotype "([^"]+)"/;
 89 | 		if (!( $trans_type && length($trans_type)>0)) {
 90 | 			($trans_type) = $attr=~ /gene_biotype "([^"]+)"/;
 91 | 		}
 92 |                 if (!( $trans_type && length($trans_type)>0)) {
 93 |                         ($trans_type) = $attr=~ /transcript_type "([^"]+)"/;
 94 |                 }
 95 |                 if (!( $trans_type && length($trans_type)>0)) {
 96 |                         ($trans_type) = $attr=~ /gene_type "([^"]+)"/;
 97 |                 }
 98 | 		# sort by pos
 99 | 		my @ex = sort {
100 | 			$a->[3] <=> $b->[3]
101 | 		} @{$exons{$id}};
102 | 
103 | 		my $beg = $ex[0][3];
104 | 		my $end = $ex[-1][4];
105 | 		
106 | 		if ($dir eq '-') {
107 | 			# swap
108 | 			$tmp=$cds;
109 | 			$cds=$cde;
110 | 			$cde=$tmp;
111 | 			$cds -= 2 if $cds;
112 | 			$cde += 2 if $cde;
113 | 		}
114 | 
115 | 		# not specified, just use exons
116 | 		$cds = $beg if !$cds;
117 | 		$cde = $end if !$cde;
118 | 
119 | 		# adjust start for bed
120 | 		--$beg; --$cds;
121 | 	
122 | 		my $exn = @ex;												# exon count
123 | 		my $exst = join ",", map {$_->[3]-$beg-1} @ex;				# exon start
124 | 		my $exsz = join ",", map {$_->[4]-$_->[3]+1} @ex;			# exon size
125 | 
126 | #		if (($trans_type eq 'protein_coding') || ($trans_type eq 'processed_transcript')) {
127 | 		#if (!(($trans_type eq 'protein_coding') || ($trans_type eq 'processed_transcript'))) {
128 | 			# added an extra comma to make it look exactly like ucsc's beds
129 | 			print "$chr\t$beg\t$end\t$id/$trans_type/$gene_id/$gene_name\t0\t$dir\t$cds\t$cde\t0\t$exn\t$exsz,\t$exst,\n";
130 | #		}
131 | }
132 | 
133 | 
134 | close IN;
135 | 


--------------------------------------------------------------------------------
/bin/util/irfinder:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/irfinder


--------------------------------------------------------------------------------
/bin/util/irfinder_cnn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/irfinder_cnn


--------------------------------------------------------------------------------
/bin/util/model/best_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/model/best_model.h5


--------------------------------------------------------------------------------
/bin/util/model/best_model.tflite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/model/best_model.tflite


--------------------------------------------------------------------------------
/bin/util/model/model_info.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "Image directory": "/work/ritchieteam/EMT_irfinder/trainingshortread/",
 3 |   "Output directory": "depth15_ir0.1_noir0.cov50long_.01cov25allshortnondir_congruant_2_2021-05-20_12_01",
 4 |   "Validation split": 0.1,
 5 |   "Epochs": 500,
 6 |   "Batch size": 50,
 7 |   "Model json": null,
 8 |   "Image size": 256,
 9 |   "Number of colors": 0,
10 |   "Seed": 123,
11 |   "Threads": 5,
12 |   "Dataset": {
13 |     "counts": [
14 |       [
15 |         1662,
16 |         185
17 |       ],
18 |       [
19 |         8164,
20 |         907
21 |       ]
22 |     ],
23 |     "class_names": [
24 |       "hIR",
25 |       "noIR"
26 |     ]
27 |   }
28 | }


--------------------------------------------------------------------------------
/bin/util/trim:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/trim


--------------------------------------------------------------------------------
/bin/util/winflat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/winflat


--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | if [[ "$1" != "local" ]] && [[ "$1" != "" ]] && [[ "$1" != "remove" ]]; then
  4 |     echo -e "Usage: \nGlobal installation:\tsudo install.sh\nLocal installation:\tinstall.sh local" >&2
  5 |     echo -e "Uninstall all:\tsudo install.sh remove\nUninstall local:\tinstall.sh remove local\nUninstall local:\tsudo install.sh remove global" >&2
  6 |     exit 1
  7 | fi
  8 | 
  9 | function checkDependencies(){
 10 |     local distro=$(cat /proc/version )
 11 |     local deps=0
 12 |     echo "Checking dependencies..."
 13 |     for pkg in $@ ; do
 14 |         if [[ "${distro}" =~ Ubuntu|Debian ]]  ; then
 15 |             if ! dpkg -s $pkg >/dev/null 2>/dev/null ; then 
 16 |                 echo "Dependency $pkg not found." >&2
 17 |                 deps=1
 18 |             fi
 19 |         else
 20 |             if ! rpm -q $pkg >/dev/null 2>/dev/null ; then
 21 |                 echo "Dependency $pkg not found." >&2 
 22 |                 deps=1
 23 |             fi
 24 |         fi
 25 |     done
 26 |     if [ $deps -eq 1 ]; then
 27 |         exit 1
 28 |     fi
 29 | }
 30 | 
 31 | 
 32 | if [[ $1 == "remove" ]]; then
 33 |     if [[ "$2" != "global" ]] && [[ "$2" != "local" ]] && [[ "$2" != "" ]] ; then
 34 |         echo "Error: $2 not recognized. Use 'local' or 'global' or leave empty" >&2
 35 |         exit 1
 36 |     fi
 37 |     if [[ "$2" == "" || "$2" == "global" ]]; then
 38 |         if [ "$EUID" -ne 0 ]; then 
 39 |             echo "Please run as root"
 40 |             exit
 41 |         fi
 42 |         if [ -d /usr/local/IRFinder ]; then
 43 |             rm -fr /usr/local/IRFinder /usr/bin/IRFinder
 44 |             echo "Removed system installation"
 45 |         else
 46 |             echo "Global installation of IRFinder not found"
 47 |         fi
 48 |     fi
 49 |     if [[ "$2" == "" || "$2" == "local" ]] ;then
 50 |         if [ -d ~/.local/IRFinder ] ; then
 51 |             rm -fr ~/.local/IRFinder ~/.local/bin/IRFinder
 52 |             echo "Removed local installation"
 53 |         else
 54 |             echo "Local installation of IRFinder not found."
 55 |         fi
 56 |     fi
 57 |     exit
 58 | fi
 59 | 
 60 | 
 61 | if [[ "${1}" != "local" ]]; then
 62 |     if [ "$EUID" -ne 0 ]; then 
 63 |         echo "Please run as root or to install IRFinder locally call"
 64 |         echo "./install.sh local"
 65 |         echo ""
 66 |         exit 1
 67 |     fi
 68 | fi
 69 | 
 70 | checkDependencies "make bedtools samtools gzip gawk libboost-iostreams-dev zlib1g"
 71 | 
 72 | 
 73 | ORIGINAL_FOLDER=$(realpath $PWD)
 74 | BASE_FOLDER=$(dirname "$(readlink -nf "$BASH_SOURCE")")
 75 | 
 76 | cd $BASE_FOLDER/src/trim/
 77 | make clean
 78 | make
 79 | cp ./trim $BASE_FOLDER/bin/util/trim
 80 | make clean
 81 | cd ../winflat
 82 | make clean
 83 | make
 84 | cp ./winflat $BASE_FOLDER/bin/util/winflat
 85 | cd ../irfinder/Release
 86 | make clean
 87 | make
 88 | cp ./irfinder $BASE_FOLDER/bin/util/irfinder
 89 | make clean
 90 | cd $BASE_FOLDER
 91 | chmod -R a+x ./bin
 92 | if [[ "${1}" == "local" ]];then
 93 |     if [ -d ~/.local/IRFinder ]; then
 94 |         rm -fr ~/.local/IRFinder ~/.local/bin/IRFinder
 95 |     fi
 96 |     cp -r $BASE_FOLDER ~/.local/IRFinder
 97 |     ln -s $(realpath ~/.local/IRFinder/bin/IRFinder) ~/.local/bin/IRFinder
 98 | else
 99 |     if [ -d /usr/local/IRFinder ]; then
100 |         rm -fr /usr/local/IRFinder /usr/bin/IRFinder
101 |     fi
102 |     cp -r $BASE_FOLDER /usr/local/IRFinder 
103 |     ln -s /usr/local/IRFinder/bin/IRFinder /usr/bin/IRFinder
104 | fi
105 | 
106 | cd $ORIGINAL_FOLDER
107 | 
108 | 
# Optional external tools: the installation does not fail without them, the
# user is only told which run mode needs each one.
which suppa.py >/dev/null 2>&1 ||
  echo "SUPPA2 not found. To use the RunMode Diff, install it: https://github.com/comprna/SUPPA " >&2

which STAR >/dev/null 2>&1 ||
  echo "STAR not found. To use the RunMode FastQ and to produce your own mapability files during the reference build, install it: https://github.com/alexdobin/STAR " >&2

which minimap2 >/dev/null 2>&1 ||
  echo "minimap2 not found. To use the RunMode Long, install it: https://github.com/lh3/minimap2 " >&2
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 


--------------------------------------------------------------------------------
/src/cnnfilter/cnnfilter/actions/resultgraph.py:
--------------------------------------------------------------------------------
 1 | import sys,os
 2 | import pandas as pd
 3 | import numpy as np
#EMT5p
# The first command-line argument selects which group of replicate names to use.
# NOTE(review): if it matches none of the three values below, `shortnames` is
# never defined and the loop further down raises NameError.
if sys.argv[1] == "EMT5m":

    shortnames = [["T5moins_rep1", "T5moins_rep2", "T5moins_rep3"]]

if sys.argv[1] == "EMT5p":

    shortnames = [["T5plus_rep1", "T5plus_rep2", "T5plus_rep3"]]

if sys.argv[1] == "EMT1p":

    shortnames = [["T1plus_rep1", "T1plus_rep2", "T1plus_rep3"]]


# NOTE(review): the hard-coded path is immediately overwritten by argv[1], and
# `folder` is not used by any live statement below.
folder = "/work/sylvain/IntronScanner/test/training/EMT_training/depth15_ir0.05_noir0.cov50long_.023allshort_rmALLnondirnoncongruant2021-05-07_16_53/"
folder = sys.argv[1]

# Predictions are read from the current directory. The separator is a regular
# expression (tab, '-' or ':'); the first line is skipped and columns are
# numbered because header=None.
pred = pd.read_csv("prediction_for_EMT_test.tsv",sep="\t|-|:",skiprows=1,header=None)
# Columns 2 and 3 are shifted inwards by 15 before the id string is built.
# presumably this undoes a 15-base margin added to the regions - TODO confirm
pred[2]=pred[2]+15
pred[3]=pred[3]-15
# id has the form "<col1>:<col2>-<col3>", the same layout built for `data` below.
pred["id"]=pred[1].apply(str)+":"+pred[2].apply(str)+"-"+pred[3].apply(str)
pred["truelab"]="no"
for sit, sname in enumerate(shortnames[0]):
    # One tab-separated table per replicate, read from the current directory.
    data = pd.read_csv(sname+".tsv",delimiter="\t")
    data["id"]=data["Chr"].apply(str)+":"+data["Start"].apply(str)+"-"+data["End"].apply(str)
    # NOTE(review): the next two expressions are evaluated and discarded.
    pred.loc[pred[0]==sname].loc[pred['id'].isin(data["id"])]
    data.loc[data['id'].isin(pred.loc[pred[0] == sname]["id"])]
    # NOTE(review): chained indexing - the assignment may go to a temporary
    # copy, leaving pred["truelab"] unchanged; confirm the intent.
    pred.loc[pred[0] == sname]["truelab"]=data.loc[data['id'].isin(pred.loc[pred[0] == sname]["id"])]["Warnings"]
    # NOTE(review): the next two expressions are also evaluated and discarded.
    (pred[1]==data["Chr"]) & (pred[2]==data["Start"]) & (pred[3]==data["End"])  & (pred[0]==sname)
    pred[pred[0]==sname]
34 | # for sit, sname in enumerate(shortnames[0]):
35 | #     txt = numpy.loadtxt(os.path.join(folder, sname, "IRFinder-IR-nondir.txt"), delimiter="\t)
36 | #     data = np.genfromtxt(fname=os.path.join(folder, sname, "IRFinder-IR-nondir.txt"), delimiter="\t", skip_header=1)
37 | #     sarray = open(os.path.join(folder, sname, "IRFinder-IR-nondir-AI.txt"), "rt")
38 | #     print(os.path.join(folder, sname, "IRFinder-IR-nondir-AI.txt"))
39 | #
40 | #
41 | #     line = _array.readline()
42 | #
43 | #     while True:
44 | #         line = _array.readline()
45 | #         if not line:
46 | #             break
47 | #
48 | #             irratio = float(a[19])
49 | #             classpred= a[20]
50 | #             irratio=float(a[3])


--------------------------------------------------------------------------------
/src/cnnfilter/cnnfilter/main.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # encoding: utf-8
  3 | '''
  4 | intron_scanner.intron_scanner -- shortdesc
  5 | 
  6 | intron_scanner.intron_scanner is a description
  7 | 
  8 | It defines classes_and_methods
  9 | 
 10 | @author:     user_name
 11 | 
 12 | @copyright:  2020 organization_name. All rights reserved.
 13 | 
 14 | @license:    license
 15 | 
 16 | @contact:    user_email
 17 | @deffield    updated: Updated
 18 | '''
 19 | 
 20 | import sys
 21 | import os
 22 | 
 23 | LIBRARY_LOCATION = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
 24 | sys.path.append(LIBRARY_LOCATION)
 25 | 
 26 | from optparse import OptionParser
 27 | 
 28 | 
 29 | def title(title):
 30 |     bar = '-' * 50
 31 |     white = ' ' * round((50 - len(title)) / 2)
 32 |     white_b = ' ' * 5
 33 |     title = "\n " + white_b + bar + "\n" + white_b + "|" + white + title + white + "|\n " + white_b + bar + "\n\n"
 34 |     print(title)
 35 | 
 36 | 
 37 | def print_help():
 38 |     print("Usage: intron_scanner action [options]\n\nPossible actions:")
 39 |     print("\t- extract:\t exctract genomic regions (bed file) from a bam file as images.")
 40 |     print(
 41 |         "\t- a2i:    \t exctract images from an array produced by the extract command and organize in according to given labels")
 42 |     print("\t- train:  \t train a tensorflow model on a given set of images")
 43 |     print("\t- test:  \t use a trained model to predict the class of a given set of images")
 44 |     print("\n")
 45 | 
 46 | 
 47 | def main(argv=None):
 48 |     '''Command line options.'''
 49 |     program_name = os.path.basename(sys.argv[0])
 50 |     title("Intron scanner")
 51 |     action = "help"
 52 |     if len(sys.argv) > 1:
 53 |         action = sys.argv[1]
 54 | 
 55 |     if argv is None:
 56 |         argv = sys.argv[2:]
 57 |     try:
 58 | 
 59 |         if action == "train":
 60 |             program_name = program_name + " " + 'training process'
 61 |             parser = OptionParser(usage="Usage: %prog train ",
 62 |                                   description="Train a neural network model on a given set of images")
 63 |             parser.add_option("-d", "--img-dir", dest="dir",
 64 |                               help="The directory containing the images, divided in subdirectories in according to the classes. Example: ./training/ -> ./training/labelA/ ./training/labelB/ ",
 65 |                               metavar="DIR", type="string")
 66 |             parser.add_option("-o", "--out", dest="outdir", help="Output directory [default: %default]", metavar="DIR",
 67 |                               type="string")
 68 |             parser.add_option("-b", "--batch", dest="batch", help="Number of images to load. [default: %default]",
 69 |                               metavar="INT", type="int")
 70 |             parser.add_option("-s", "--image-size", dest="size", help="Images size [default: %default]", metavar="INT",
 71 |                               type="int")
 72 |             # parser.add_option("-c", "--color-number", dest="colorN",
 73 |             #                   help="Number of color's dimension : 1 -> grey (read)\n\t\t\t 2 ->(read and annotation ??),\n\t\t\t3 -> 3colors  [default: %default]",
 74 |             #                   metavar="INT", type="int")
 75 |             parser.add_option("-S", "--seed", dest="seed", help="Seed for the validation split [default: %default]",
 76 |                               metavar="INT", type="int")
 77 |             parser.add_option("-t", "--threads", dest="threads", help="Number of threads to use. [default: %default]",
 78 |                               metavar="INT", type="int")
 79 |             parser.add_option("-V", "--validation-split", dest="vsplit", metavar="FLOAT",
 80 |                               help="Fraction of the dataset to use for the validation [default: %default]",
 81 |                               type="float")
 82 |             parser.add_option("-e", "--epochs", dest="epochs", metavar="INT",
 83 |                               help="Number of training epoch [default: %default]", type="int")
 84 |             parser.add_option("-E", "--earlystop", dest="earlystop", metavar="INT",
 85 |                               help="Number of patience epoch for earlystop , -1 for no earlystop  [default: 0.1*epochs (10 percent of the total number of epochs]",
 86 |                               type="int")
 87 |             parser.add_option("-m", "--json-model", dest="model", metavar="FILE",
 88 |                               help="Load the tensorflow model from a json file [default: %default]", type="string")
 89 |             parser.add_option("-v", "--verbose", dest="verbose", action="count", help="Set tensorflow verbosity level")
 90 |             # set defaults
 91 |             parser.set_defaults(outdir="./model/", size=256, colorN=0, verbose=0, epoch=10, earlystop=-200, ext="png",
 92 |                                 model=None, vsplit=0.20, batch=50, seed=123, threads=None, epochs=10)
 93 |             # process options
 94 |             (opts, _) = parser.parse_args(argv)
 95 |             required = "dir ".split()
 96 |             for r in required:
 97 |                 if opts.__dict__[r] is None:
 98 |                     parser.error("Parameter %s required\n\nUse --help to get more information\n" % r)
 99 |             from cnnfilter.actions.models import IntronModeller
100 |             modeller = IntronModeller(opts.verbose)
101 |             # modeller.train(opts.dir, opts.outdir, opts.size, opts.batch, opts.vsplit, opts.seed, opts.epochs, opts.threads, opts.model, opts.colorN, opts.earlystop)
102 |             modeller.train_from_array(opts.dir, opts.outdir, opts.size, opts.batch, opts.vsplit, opts.seed, opts.epochs,
103 |                                       opts.threads, opts.model, opts.colorN, opts.earlystop)
104 | 
105 |         elif action == "test":
106 |             program_name = program_name + " " + 'test process'
107 |             parser = OptionParser(usage="Usage: %prog test ",
108 |                                   description="Test a neural network model on a given set of images")
109 |             parser.add_option("-d", "--img-dir", dest="dir",
110 |                               help="The directory containing the images to predict. If they are in subdirectories, the subfolder name is used as true label.",
111 |                               metavar="DIR", type="string")
112 |             parser.add_option("-a", "--array-file", dest="array", metavar="FILE",
113 |                               help="Use a file conaining the image information, produced by the extract process",
114 |                               type="string")
115 |             parser.add_option("-b", "--bed-file", dest="bed", metavar="FILE",
116 |                               help="bed file associated to the array (-a). Can be a general tsv file. The last column is used as true label",
117 |                               type="string")
118 |             parser.add_option("-m", "--model-dir", dest="model", metavar="DIR",
119 |                               help="Folder containing the model. It has to contain the files best_model.h5 and model_info.json [default: %default]",
120 |                               type="string")
121 |             # parser.add_option("-c", "--color-number", dest="colorN",
122 |             #                   help="Number of color's dimension : 1 -> grey (read)\n\t\t\t 2 ->(read and annotation ??),\n\t\t\t3 -> 3colors  [default: %default]",
123 |             #                   metavar="INT", type="int")
124 |             parser.add_option("-o", "--out", dest="out", help="Output file [default: %default]", metavar="FILE",
125 |                               type="string")
126 |             parser.add_option("-v", "--verbose", dest="verbose", action="count", help="Set tensorflow verbosity level")
127 |             # set defaults
128 |             parser.set_defaults(out="./predictions.tsv", verbose=0, dir=None, array=None, bed=None,
129 |                                 model="./model/")
130 |             # process options
131 |             (opts, _) = parser.parse_args(argv)
132 |             if (opts.dir != None) == (opts.array != None):
133 |                 parser.error(
134 |                     "Parameters -a and -d are mutual exclusive and at least one is required\n\nUse --help to get more information\n")
135 |             from cnnfilter.actions.models import IntronModeller
136 |             modeller = IntronModeller(opts.verbose)
137 |             modeller.test(opts.dir, opts.array, opts.bed, opts.model, opts.out)
138 |         elif action == "help" or action == "-h" or action == "--help":
139 |             print_help()
140 |         else:
141 |             raise ValueError("Action %s not recognized." % action)
142 | 
143 |     except Exception as e:
144 |         print(program_name + ": " + repr(e) + "\n")
145 |         print("\n\nFor help use --help\n\n")
146 |         print(e)
147 |         if __debug__:
148 |             raise e
149 |         return 2
150 | 
151 | 
152 | if __name__ == "__main__":
153 |     sys.exit(main())
154 | 
155 | 
156 | #json,gzip,time,tensorflow,matplotlib,numpy,sklearn,re,progress


--------------------------------------------------------------------------------
/src/cnnfilter/cnnfilter/model/best_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/src/cnnfilter/cnnfilter/model/best_model.h5


--------------------------------------------------------------------------------
/src/cnnfilter/cnnfilter/model/model_info.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "Image directory": "/work/ritchieteam/EMT_irfinder/trainingshortread/",
 3 |   "Output directory": "depth15_ir0.1_noir0.cov50long_.01cov25allshortnondir_congruant_2_2021-05-20_12_01",
 4 |   "Validation split": 0.1,
 5 |   "Epochs": 500,
 6 |   "Batch size": 50,
 7 |   "Model json": null,
 8 |   "Image size": 256,
 9 |   "Number of colors": 0,
10 |   "Seed": 123,
11 |   "Threads": 5,
12 |   "Dataset": {
13 |     "counts": [
14 |       [
15 |         1662,
16 |         185
17 |       ],
18 |       [
19 |         8164,
20 |         907
21 |       ]
22 |     ],
23 |     "class_names": [
24 |       "hIR",
25 |       "noIR"
26 |     ]
27 |   }
28 | }


--------------------------------------------------------------------------------
/src/cnnfilter/cnnfilter/utils/reader.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import gzip
 3 | 
 4 | class ImageArray:
 5 |     def __init__(self, raw_line):
 6 |         raw_line=raw_line.split("\t")
 7 |         raw_name=raw_line[0].split(":")
 8 |         self.name=raw_name[0]+":"+raw_name[1]
 9 |         self.strand=raw_name[2]
10 |         self.region=json.loads(raw_line[1])
11 |         self.is_valid= len(self.region) > 1
12 |         if self.is_valid:
13 |             self.is_valid=max([sum(i) for i in self.region]) > 0
14 |     
15 | 
class ImageArchive:
    """Iterator over an array file, optionally paired line-by-line with a BED file.

    Each iteration step yields ``(bed_fields, ImageArray)``. ``bed_fields`` is
    the tab-split BED line sharing the same line number as the array record,
    or ``["NA"]`` when no BED file was given. Files whose name ends in
    ``.gz`` are read through ``gzip``.
    """

    def _open_file(self, fname):
        """Open ``fname`` in text mode, transparently handling ``.gz`` files."""
        if fname.endswith(".gz"):
            return gzip.open(fname, "rt")
        return open(fname, "rt")

    def _count_lines(self, fname):
        """Return the number of lines in ``fname``."""
        # The context manager guarantees the handle is released even when
        # reading fails part-way (e.g. a corrupt gzip stream).
        with self._open_file(fname) as handle:
            return sum(1 for _ in handle)

    def __init__(self, bed_file, array_file):
        """Open ``array_file`` and, when given, the matching ``bed_file``.

        Raises:
            AssertionError: when both files are given but differ in line count.
        """
        # Define every attribute up front so that close() / __del__ are safe
        # even if one of the steps below raises.
        self._array = None
        self._bed = None
        self._has_bed = False
        self._index = -1
        self._len = self._count_lines(array_file)
        if bed_file is not None:
            if self._count_lines(bed_file) != self._len:
                raise AssertionError("Files {} and {} have not the same number of lines!".format(array_file, bed_file))
            self._bed = self._open_file(bed_file)
            self._has_bed = True
        self._array = self._open_file(array_file)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next ``(bed_fields, ImageArray)`` pair."""
        self._index += 1
        if self._index >= self._len:
            raise StopIteration
        if self._has_bed:
            bed_fields = self._bed.readline().strip().split("\t")
        else:
            bed_fields = ["NA"]
        return bed_fields, ImageArray(self._array.readline())

    def __len__(self):
        return self._len

    def __del__(self):
        self.close()

    def close(self):
        """Close the underlying files; safe to call repeatedly or after a failed __init__."""
        for attr in ("_array", "_bed"):
            handle = getattr(self, attr, None)
            if handle is not None:
                handle.close()

    def getIndex(self):
        """Return the zero-based index of the last record returned (-1 before the first)."""
        return self._index
72 |     
73 | 


--------------------------------------------------------------------------------
/src/cnnfilter/testCNN/actions/extract.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from scipy.interpolate import interp1d
 3 | 
 4 | 
 5 | 
 6 | 
 7 | 
 8 | def getImageArrayFromRegion(region, img_size=None):
 9 |     '''
10 |     Return the numpy array representing the image from a given region
11 |     '''
12 | 
13 | 
14 |     read_img, ann_img = generateImagesArrayGreyFromRegion(region,img_size)
15 | 
16 |     return read_img
17 | 
18 | 
19 | 
def generateImagesArrayGreyFromRegion(region, img_size=None):
    '''
    Return the arrays composing an image from a given region.

    ``region`` is a sequence of rows; columns 0 and 1 of each row are the two
    tracks that get drawn. Values are scaled so that the largest row sum maps
    to 255. The first and last 15 rows are copied unchanged, while the rows in
    between are resampled to ``img_size - 30`` points ("nearest" interpolation
    when the region is shorter than the image, "zero" otherwise).

    Parameters:
        region   -- list (or array) of numeric rows with at least two columns
        img_size -- number of rows of the produced image; must be > 30

    Returns:
        ``(image, None)`` where ``image`` is a float32 array of shape
        ``(img_size, 2, 1)``. ``image[:, 1, 0]`` is track 0 and
        ``image[:, 0, 0]`` is the sum of track 0 and track 1.

    Raises:
        ArithmeticError -- every row of ``region`` sums to zero
        ValueError      -- ``img_size`` is missing or not greater than 30
    '''
    region_size = len(region)

    depth = max([sum(i) for i in region])
    if depth == 0:
        raise ArithmeticError("Error! trying to generate an image with zero depth.")
    # 30 rows are reserved for the two untouched 15-row flanks, so a smaller
    # (or missing) size cannot produce a meaningful image.
    if img_size is None or img_size <= 30:
        raise ValueError("img_size must be a number greater than 30, got {!r}".format(img_size))
    reads_img = (np.array(region)[:, :] / depth) * 255

    kindinterp = "nearest" if region_size < img_size else "zero"

    # Interpolators over the inner part of the region (flanks excluded).
    inner_x = np.arange(0, region_size - 30)
    f0 = interp1d(inner_x, reads_img[15:-15, 0], kind=kindinterp)
    f1 = interp1d(inner_x, reads_img[15:-15, 1], kind=kindinterp)

    # Sample positions shared by both tracks; the scale keeps the last sample
    # strictly inside the interpolation domain [0, region_size - 31].
    sample_x = np.arange(0, (img_size - 30)) * ((region_size - 31) / (img_size - 30))
    inner_track0 = f0(sample_x)

    track0 = np.concatenate((reads_img[0:15, 0], inner_track0, reads_img[-15:, 0]))
    stacked = np.concatenate((
        reads_img[0:15, 1] + reads_img[0:15, 0],
        f1(sample_x) + inner_track0,
        reads_img[-15:, 1] + reads_img[-15:, 0],
    ))

    reads_img2 = np.array([track0, stacked])
    # Rotate so that rows index positions, then add a trailing channel axis.
    reads_img2 = np.expand_dims(np.rot90(np.round(reads_img2).astype("float32"), k=3), axis=2)

    return reads_img2, None
60 | 
61 | 


--------------------------------------------------------------------------------
/src/cnnfilter/testCNN/actions/models.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 3 | 
 4 | import tflite_runtime.interpreter as tflite
 5 | from scipy.special import softmax
 6 | 
 7 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 8 | 
 9 | import json
10 | import numpy as np
11 | 
12 | from utils.reader import ImageArchive
13 | from actions.extract import getImageArrayFromRegion
14 | 
15 | 
class IntronModeller():
    """Score IRFinder results with a TensorFlow Lite model.

    The model folder must contain ``best_model.tflite`` and
    ``model_info.json``; the JSON is expected to provide ``"Image size"`` and
    ``"Dataset" -> "class_names"`` (both are read while predicting).
    """

    def __init__(self, verbosity=0):
        """Store the verbosity (capped at 3) and map it onto TF_CPP_MIN_LOG_LEVEL."""
        if verbosity > 3:
            verbosity = 3
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = "{}".format(3 - verbosity)
        self.verbosity = verbosity

    def test(self, img_dir=None, model_dir="./model/", colorN=3, imagemode=0):
        """Predict every IRFinder result set found in ``img_dir``.

        For each of ``IRFinder-IR-dir`` and ``IRFinder-IR-nondir`` whose
        ``<name>-AI.txt`` file exists, the matching ``<name>.txt`` is read and
        the retained rows are written to ``<name>-val.txt`` in the same folder.

        ``colorN`` and ``imagemode`` are accepted for interface compatibility
        but are not used.

        Raises:
            ValueError: ``img_dir`` is None.
            FileNotFoundError: the model folder is incomplete.
        """
        if img_dir is None:
            raise ValueError("img_dir is required.")
        self._load_model(model_dir)
        self._model_dir = model_dir

        for filesdir in ("IRFinder-IR-dir", "IRFinder-IR-nondir"):
            arr_file = os.path.join(img_dir, filesdir + "-AI.txt")
            if not os.path.isfile(arr_file):
                continue
            bed_file = os.path.join(img_dir, filesdir + ".txt")
            output_file = os.path.join(img_dir, filesdir + "-val.txt")
            self._out_f = open(output_file, "wt")
            try:
                print("Processing "+filesdir+".txt")
                self._test_irfinder_result(arr_file, bed_file)
                print("Done.")
            finally:
                # Always release the output handle, even when prediction fails.
                self._out_f.close()

    def _predict(self, arr):
        """Run the interpreter on one image array and return the first output row."""
        interpreter = self.model["Model"]
        interpreter.reset_all_variables()
        interpreter.set_tensor(self.model["InputDetails"][0]['index'], [arr])
        interpreter.invoke()
        return interpreter.get_tensor(self.model["OutputDetails"][0]['index'])[0]

    def _test_irfinder_result(self, arr_file, bed_file):
        """Write the header plus every row predicted as ``hIR`` to ``self._out_f``.

        Column 5 (index 4) of the header is renamed ``CNN_IRscore`` and, for
        each kept row, replaced by the softmax score of the predicted class.
        """
        with open(bed_file, "rt") as ori_res:
            header = ori_res.readline().split("\t")
        header[4] = "CNN_IRscore"
        self._out_f.write(("\t").join(header))
        class_names = self.model["Dataset"]["class_names"]
        arch = ImageArchive(bed_file, arr_file)
        try:
            for bed, arr in arch:
                if not arr.is_valid:
                    continue
                pred = self._predict(getImageArrayFromRegion(arr.region, self.model["Image size"]))
                score = softmax(pred)
                idx_max = np.argmax(score)
                if class_names[idx_max] == "hIR":
                    # Use the winning class's own score rather than assuming
                    # that "hIR" is stored at index 0.
                    bed[4] = str(score[idx_max])
                    self._out_f.write(("\t").join(bed)+"\n")
        finally:
            arch.close()
        return

    def _load_model(self, model_dir):
        """Load ``model_info.json`` and the TFLite interpreter from ``model_dir``.

        Raises:
            FileNotFoundError: one of the two required files is missing.
        """
        print("Loading the best_model in {}".format(model_dir))
        model_info_file = os.path.join(model_dir, "model_info.json")
        model_file = os.path.join(model_dir, "best_model.tflite")
        if not os.path.exists(model_info_file) or not os.path.exists(model_file):
            raise FileNotFoundError("Error! files model_info.json and best_model.tflite have to be in the model folder! ")
        with open(model_info_file, "rt") as fp:
            self.model = json.load(fp)
        self.model["Model"] = tflite.Interpreter(model_path=model_file)
        self.model["Model"].allocate_tensors()
        self.model["InputDetails"] = self.model["Model"].get_input_details()
        self.model["OutputDetails"] = self.model["Model"].get_output_details()
        print("Done.")
        return
89 | 
90 | 
91 | 
92 | 
93 | 
94 | 
95 | 


--------------------------------------------------------------------------------
/src/cnnfilter/testCNN/irfinder_cnn.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # encoding: utf-8
 3 | '''
irfinder_cnn -- command-line front end of the IRFinder CNN filter

Parses the command-line options and runs a trained model on the
IRFinder results found in a directory (see the IntronModeller class
in actions/models.py).
 9 | 
10 | @author:     user_name
11 | 
12 | @copyright:  2020 organization_name. All rights reserved.
13 | 
14 | @license:    license
15 | 
16 | @contact:    user_email
17 | @deffield    updated: Updated
18 | '''
19 | 
20 | import sys
21 | import os
22 | import json
23 | import gzip
24 | 
25 | LIBRARY_LOCATION = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
26 | sys.path.append(LIBRARY_LOCATION)
27 | 
28 | from optparse import OptionParser
29 | 
30 | 
31 | 
32 | 
33 | 
def title(title):
    """Print ``title`` inside a banner made of two 50-dash rules.

    The text is padded on both sides with ``round((50 - len(title)) / 2)``
    spaces and wrapped in ``|`` characters.
    """
    rule = '-' * 50
    indent = ' ' * 5
    padding = ' ' * round((50 - len(title)) / 2)
    pieces = [
        "\n ", indent, rule, "\n",
        indent, "|", padding, title, padding, "|\n ",
        indent, rule, "\n\n",
    ]
    print("".join(pieces))
40 | 
41 | 
def print_help():
    """Print the short usage text listing the available actions."""
    help_lines = (
        "Usage: intron_scanner action [options]\n\nPossible actions:",
        "\t- test:  \t use a trained model to predict the class of a given set of images",
        "\n",
    )
    for text in help_lines:
        print(text)
47 | 
48 | 
def main(argv=None):
    '''Command line entry point of the CNN filter.

    Parses ``argv`` (``sys.argv`` when omitted), requires ``-d/--img-dir`` and
    runs ``IntronModeller.test`` on that directory with the selected model
    folder.

    Returns ``None`` on success. On failure the error is printed and then
    re-raised when ``__debug__`` is true; otherwise ``2`` is returned. A
    missing ``-d`` is reported through ``parser.error``.
    '''
    program_name = os.path.basename(sys.argv[0])
    title("CNN filter")
    if argv is None:
        argv = sys.argv
    try:
        program_name = program_name + " " + 'test process'
        parser = OptionParser(usage="Usage: %prog test ",
                              description="Test a neural network model on a given set of images")
        parser.add_option("-d", "--img-dir", dest="dir",
                          help="The directory containing the IRFinder results to predict. ",
                          metavar="DIR", type="string")

        parser.add_option("-m", "--model-dir", dest="model", metavar="DIR",
                          help="Folder containing the model. It has to contain the files best_model.tflite and model_info.json [default: %default]",
                          type="string")
        # NOTE(review): IntronModeller.test has no output parameter; the option
        # is still parsed so that existing command lines keep working.
        parser.add_option("-o", "--out", dest="out",
                          help="Currently ignored: predictions are written inside the input directory as IRFinder-IR-*-val.txt [default: %default]",
                          metavar="FILE", type="string")
        parser.add_option("-v", "--verbose", dest="verbose", action="count", help="Set tensorflow verbosity level")
        # set defaults
        parser.set_defaults(out="./predictions.tsv", verbose=0, colorN=3, dir=None, array=None, bed=None,
                            model="./model/")
        # process options
        (opts, _) = parser.parse_args(argv)
        # No option sets "array", so the only real requirement is -d.
        if opts.dir is None:
            parser.error(
                "Parameter -d is required\n\nUse --help to get more information\n")
        from actions.models import IntronModeller
        modeller = IntronModeller(opts.verbose)
        # The third positional parameter of test() is colorN, not an output
        # path, so opts.out must not be forwarded here.
        modeller.test(opts.dir, opts.model)
    except Exception as e:
        print(program_name + ": " + repr(e) + "\n")
        print("\n\nFor help use --help\n\n")
        print(e)
        if __debug__:
            raise e
        return 2
87 | 
88 | 
89 | 
90 | 
91 | if __name__ == "__main__":
92 |     sys.exit(main())
93 | 
94 | 
95 | #json,gzip,time,tensorflow,matplotlib,numpy,sklearn,re,progress


--------------------------------------------------------------------------------
/src/cnnfilter/testCNN/model/best_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/src/cnnfilter/testCNN/model/best_model.h5


--------------------------------------------------------------------------------
/src/cnnfilter/testCNN/model/model_info.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "Image directory": "/work/ritchieteam/EMT_irfinder/trainingshortread/",
 3 |   "Output directory": "depth15_ir0.1_noir0.cov50long_.01cov25allshortnondir_congruant_2_2021-05-20_12_01",
 4 |   "Validation split": 0.1,
 5 |   "Epochs": 500,
 6 |   "Batch size": 50,
 7 |   "Model json": null,
 8 |   "Image size": 256,
 9 |   "Number of colors": 0,
10 |   "Seed": 123,
11 |   "Threads": 5,
12 |   "Dataset": {
13 |     "counts": [
14 |       [
15 |         1662,
16 |         185
17 |       ],
18 |       [
19 |         8164,
20 |         907
21 |       ]
22 |     ],
23 |     "class_names": [
24 |       "hIR",
25 |       "noIR"
26 |     ]
27 |   }
28 | }


--------------------------------------------------------------------------------
/src/cnnfilter/testCNN/utils/reader.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import gzip
 3 | 
 4 | class ImageArray:
 5 |     def __init__(self, raw_line):
 6 |         raw_line=raw_line.split("\t")
 7 |         raw_name=raw_line[0].split(":")
 8 |         self.name=raw_name[0]+":"+raw_name[1]
 9 |         pos=raw_name[1].split("-")
10 |         self.intron_name="{}:{}-{}".format(raw_name[0],int(pos[0])+15,int(pos[1])-15)
11 |         self.strand=raw_name[2]
12 |         self.region=json.loads(raw_line[1])
13 |         self.is_valid= len(self.region) > 1
14 |         if self.is_valid:
15 |             self.is_valid=max([sum(i) for i in self.region]) > 0
16 |     
17 | 
class ImageArchive:
    """Iterator over an array file, optionally matched against a BED file.

    Each iteration step yields ``(bed_fields, ImageArray)``. When a BED file
    is given, its lines are consumed in order until one whose
    ``<col0>:<col1>-<col2>`` equals the record's ``intron_name``; lines that
    do not match are skipped. Without a BED file ``bed_fields`` is ``["NA"]``.
    Files whose name ends in ``.gz`` are read through ``gzip``.
    """

    def _open_file(self, fname):
        """Open ``fname`` in text mode, transparently handling ``.gz`` files."""
        if fname.endswith(".gz"):
            return gzip.open(fname, "rt")
        return open(fname, "rt")

    def _count_lines(self, fname):
        """Return the number of lines in ``fname``."""
        # The context manager guarantees the handle is released even when
        # reading fails part-way.
        with self._open_file(fname) as handle:
            return sum(1 for _ in handle)

    def __init__(self, bed_file, array_file):
        """Open ``array_file`` and, when given, ``bed_file``."""
        # Define every attribute up front so that close() / __del__ are safe
        # even if one of the steps below raises.
        self._array = None
        self._bed = None
        self._has_bed = False
        self._index = -1
        self._len = self._count_lines(array_file)
        if bed_file is not None:
            self._bed = self._open_file(bed_file)
            self._has_bed = True
        self._array = self._open_file(array_file)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next ``(bed_fields, ImageArray)`` pair.

        Raises:
            StopIteration: every array record has been returned.
            IndexError: the BED file ends (or contains a line with fewer than
                three fields) before a matching entry is found.
        """
        self._index += 1
        if self._index >= self._len:
            raise StopIteration
        if not self._has_bed:
            return ["NA"], ImageArray(self._array.readline())
        img_array = ImageArray(self._array.readline())
        wanted = img_array.intron_name
        while True:
            raw = self._bed.readline()
            if not raw:
                # readline() returns "" only at end of file; report which
                # record is missing instead of "list index out of range".
                raise IndexError(
                    "No BED line matching {} was found before the end of the BED file".format(wanted))
            bed_line = raw.strip().split("\t")
            if bed_line[0] + ":" + bed_line[1] + "-" + bed_line[2] == wanted:
                return bed_line, img_array

    def __len__(self):
        return self._len

    def __del__(self):
        self.close()

    def close(self):
        """Close the underlying files; safe to call repeatedly or after a failed __init__."""
        for attr in ("_array", "_bed"):
            handle = getattr(self, attr, None)
            if handle is not None:
                handle.close()

    def getIndex(self):
        """Return the zero-based index of the last record returned (-1 before the first)."""
        return self._index
76 |     
77 | 


--------------------------------------------------------------------------------
/src/irfinder/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 | 	<name>irfinder</name>
 4 | 	<comment></comment>
 5 | 	<projects>
 6 | 	</projects>
 7 | 	<buildSpec>
 8 | 		<buildCommand>
 9 | 			<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
10 | 			<triggers>clean,full,incremental,</triggers>
11 | 			<arguments>
12 | 			</arguments>
13 | 		</buildCommand>
14 | 		<buildCommand>
15 | 			<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
16 | 			<triggers>full,incremental,</triggers>
17 | 			<arguments>
18 | 			</arguments>
19 | 		</buildCommand>
20 | 	</buildSpec>
21 | 	<natures>
22 | 		<nature>org.eclipse.cdt.core.cnature</nature>
23 | 		<nature>org.eclipse.cdt.core.ccnature</nature>
24 | 		<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
25 | 		<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
26 | 	</natures>
27 | </projectDescription>
28 | 


--------------------------------------------------------------------------------
/src/irfinder/.settings/language.settings.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 2 | <project>
 3 |     	
 4 |     <configuration id="cdt.managedbuild.config.gnu.exe.release.1147039463" name="Release">
 5 |         		
 6 |         <extension point="org.eclipse.cdt.core.LanguageSettingsProvider">
 7 |             			
 8 |             <provider copy-of="extension" id="org.eclipse.cdt.ui.UserLanguageSettingsProvider"/>
 9 |             			
10 |             <provider-reference id="org.eclipse.cdt.core.ReferencedProjectsLanguageSettingsProvider" ref="shared-provider"/>
11 |             			
12 |             <provider-reference id="org.eclipse.cdt.managedbuilder.core.MBSLanguageSettingsProvider" ref="shared-provider"/>
13 |             			
14 |             <provider class="org.eclipse.cdt.managedbuilder.language.settings.providers.GCCBuiltinSpecsDetector" console="false" env-hash="-373090648061809668" id="org.eclipse.cdt.managedbuilder.core.GCCBuiltinSpecsDetector" keep-relative-paths="false" name="CDT GCC Built-in Compiler Settings" parameter="${COMMAND} ${FLAGS} -E -P -v -dD &quot;${INPUTS}&quot;" prefer-non-shared="true">
15 |                 				
16 |                 <language-scope id="org.eclipse.cdt.core.gcc"/>
17 |                 				
18 |                 <language-scope id="org.eclipse.cdt.core.g++"/>
19 |                 			
20 |             </provider>
21 |             		
22 |         </extension>
23 |         	
24 |     </configuration>
25 |     
26 | </project>
27 | 


--------------------------------------------------------------------------------
/src/irfinder/.settings/org.eclipse.cdt.core.prefs:
--------------------------------------------------------------------------------
1 | doxygen/doxygen_new_line_after_brief=true
2 | doxygen/doxygen_use_brief_tag=false
3 | doxygen/doxygen_use_javadoc_tags=true
4 | doxygen/doxygen_use_pre_tag=false
5 | doxygen/doxygen_use_structural_commands=false
6 | eclipse.preferences.version=1
7 | 


--------------------------------------------------------------------------------
/src/irfinder/Release/makefile:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Automatically-generated file. Do not edit!
 3 | ################################################################################
 4 | 
 5 | -include ../makefile.init
 6 | 
 7 | RM := rm -rf
 8 | 
 9 | # All of the sources participating in the build are defined here
10 | -include sources.mk
11 | -include src/Utils/subdir.mk
12 | -include src/ReadBlock/subdir.mk
13 | -include src/Blocks/subdir.mk
14 | -include src/subdir.mk
15 | -include subdir.mk
16 | -include objects.mk
17 | 
18 | ifneq ($(MAKECMDGOALS),clean)
19 | ifneq ($(strip $(CC_DEPS)),)
20 | -include $(CC_DEPS)
21 | endif
22 | ifneq ($(strip $(C++_DEPS)),)
23 | -include $(C++_DEPS)
24 | endif
25 | ifneq ($(strip $(C_UPPER_DEPS)),)
26 | -include $(C_UPPER_DEPS)
27 | endif
28 | ifneq ($(strip $(CXX_DEPS)),)
29 | -include $(CXX_DEPS)
30 | endif
31 | ifneq ($(strip $(CPP_DEPS)),)
32 | -include $(CPP_DEPS)
33 | endif
34 | ifneq ($(strip $(C_DEPS)),)
35 | -include $(C_DEPS)
36 | endif
37 | endif
38 | 
39 | -include ../makefile.defs
40 | 
41 | # Add inputs and outputs from these tool invocations to the build variables 
42 | 
43 | # All Target
44 | all: irfinder
45 | 
46 | # Tool invocations
47 | irfinder: $(OBJS) $(USER_OBJS)
48 | 	@echo 'Building target: $@'
49 | 	@echo 'Invoking: GCC C++ Linker'
50 | 	g++  -o "irfinder" $(OBJS) $(USER_OBJS) $(LIBS)
51 | 	@echo 'Finished building target: $@'
52 | 	@echo ' '
53 | 
54 | # Other Targets
55 | clean:
56 | 	-$(RM) $(CC_DEPS)$(C++_DEPS)$(EXECUTABLES)$(C_UPPER_DEPS)$(CXX_DEPS)$(OBJS)$(CPP_DEPS)$(C_DEPS) irfinder
57 | 	-@echo ' '
58 | 
59 | .PHONY: all clean dependents
60 | 
61 | -include ../makefile.targets
62 | 


--------------------------------------------------------------------------------
/src/irfinder/Release/objects.mk:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Automatically-generated file. Do not edit!
3 | ################################################################################
4 | 
5 | USER_OBJS :=
6 | 
7 | LIBS := -lboost_iostreams
8 | 
9 | 


--------------------------------------------------------------------------------
/src/irfinder/Release/sources.mk:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Automatically-generated file. Do not edit!
 3 | ################################################################################
 4 | 
 5 | C_UPPER_SRCS := 
 6 | CXX_SRCS := 
 7 | C++_SRCS := 
 8 | OBJ_SRCS := 
 9 | CC_SRCS := 
10 | ASM_SRCS := 
11 | CPP_SRCS := 
12 | C_SRCS := 
13 | O_SRCS := 
14 | S_UPPER_SRCS := 
15 | CC_DEPS := 
16 | C++_DEPS := 
17 | EXECUTABLES := 
18 | C_UPPER_DEPS := 
19 | CXX_DEPS := 
20 | OBJS := 
21 | CPP_DEPS := 
22 | C_DEPS := 
23 | 
24 | # Every subdirectory with source files must be described here
25 | SUBDIRS := \
26 | src/Blocks \
27 | src \
28 | src/ReadBlock \
29 | src/Utils \
30 | 
31 | 


--------------------------------------------------------------------------------
/src/irfinder/Release/src/Blocks/subdir.mk:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Automatically-generated file. Do not edit!
 3 | ################################################################################
 4 | 
 5 | # Add inputs and outputs from these tool invocations to the build variables 
 6 | CPP_SRCS += \
 7 | ../src/Blocks/BAM2blocks.cpp \
 8 | ../src/Blocks/CoverageBlock.cpp \
 9 | ../src/Blocks/FragmentBlocks.cpp 
10 | 
11 | OBJS += \
12 | ./src/Blocks/BAM2blocks.o \
13 | ./src/Blocks/CoverageBlock.o \
14 | ./src/Blocks/FragmentBlocks.o 
15 | 
16 | CPP_DEPS += \
17 | ./src/Blocks/BAM2blocks.d \
18 | ./src/Blocks/CoverageBlock.d \
19 | ./src/Blocks/FragmentBlocks.d 
20 | 
21 | 
22 | # Each subdirectory must supply rules for building sources it contributes
23 | src/Blocks/%.o: ../src/Blocks/%.cpp
24 | 	@echo 'Building file: $<'
25 | 	@echo 'Invoking: GCC C++ Compiler'
26 | 	g++ -O3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@)" -o "$@" "$<"
27 | 	@echo 'Finished building: $<'
28 | 	@echo ' '
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/src/irfinder/Release/src/ReadBlock/subdir.mk:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Automatically-generated file. Do not edit!
 3 | ################################################################################
 4 | 
 5 | # Add inputs and outputs from these tool invocations to the build variables 
 6 | CPP_SRCS += \
 7 | ../src/ReadBlock/CoverageBlocks.cpp \
 8 | ../src/ReadBlock/ReadBlockProcessor.cpp 
 9 | 
10 | OBJS += \
11 | ./src/ReadBlock/CoverageBlocks.o \
12 | ./src/ReadBlock/ReadBlockProcessor.o 
13 | 
14 | CPP_DEPS += \
15 | ./src/ReadBlock/CoverageBlocks.d \
16 | ./src/ReadBlock/ReadBlockProcessor.d 
17 | 
18 | 
19 | # Each subdirectory must supply rules for building sources it contributes
20 | src/ReadBlock/%.o: ../src/ReadBlock/%.cpp
21 | 	@echo 'Building file: $<'
22 | 	@echo 'Invoking: GCC C++ Compiler'
23 | 	g++ -O3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@)" -o "$@" "$<"
24 | 	@echo 'Finished building: $<'
25 | 	@echo ' '
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/src/irfinder/Release/src/Utils/subdir.mk:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Automatically-generated file. Do not edit!
 3 | ################################################################################
 4 | 
 5 | # Add inputs and outputs from these tool invocations to the build variables 
 6 | CPP_SRCS += \
 7 | ../src/Utils/crc32.cpp 
 8 | 
 9 | OBJS += \
10 | ./src/Utils/crc32.o 
11 | 
12 | CPP_DEPS += \
13 | ./src/Utils/crc32.d 
14 | 
15 | 
16 | # Each subdirectory must supply rules for building sources it contributes
17 | src/Utils/%.o: ../src/Utils/%.cpp
18 | 	@echo 'Building file: $<'
19 | 	@echo 'Invoking: GCC C++ Compiler'
20 | 	g++ -O3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@)" -o "$@" "$<"
21 | 	@echo 'Finished building: $<'
22 | 	@echo ' '
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/src/irfinder/Release/src/subdir.mk:
--------------------------------------------------------------------------------
 1 | ################################################################################
 2 | # Automatically-generated file. Do not edit!
 3 | ################################################################################
 4 | 
 5 | # Add inputs and outputs from these tool invocations to the build variables 
 6 | CPP_SRCS += \
 7 | ../src/IRFinder2.cpp 
 8 | 
 9 | OBJS += \
10 | ./src/IRFinder2.o 
11 | 
12 | CPP_DEPS += \
13 | ./src/IRFinder2.d 
14 | 
15 | 
16 | # Each subdirectory must supply rules for building sources it contributes
17 | src/%.o: ../src/%.cpp
18 | 	@echo 'Building file: $<'
19 | 	@echo 'Invoking: GCC C++ Compiler'
20 | 	g++ -O3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@)" -o "$@" "$<"
21 | 	@echo 'Finished building: $<'
22 | 	@echo ' '
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/src/irfinder/src/Blocks/BAM2blocks.h:
--------------------------------------------------------------------------------
  1 | #ifndef CODE_BAM2BLOCKS
  2 | #define CODE_BAM2BLOCKS
  3 | 
  4 | #include "FragmentBlocks.h"
  5 | #include <boost/iostreams/filtering_streambuf.hpp>
  6 | #include <boost/iostreams/copy.hpp>
  7 | #include <boost/iostreams/filter/gzip.hpp>
  8 | 
  9 | /* Little Endian .. for big endian each group of 4 bytes needs to be reversed before individual members are accessed. */
// C11 allows anonymous structs and unions. ISO C++ only has anonymous unions; anonymous structs are a widely supported compiler extension, so pedantic warning levels may flag them as non-portable.
 11 | 
 12 | 
 13 | 
// Declares the state and operations used to read BAM input and pass the
// resulting FragmentBlocks / chromosome lists to registered callbacks.
// NOTE(review): the member functions are implemented outside this header; the
// comments below describe only what the declarations themselves establish.
class BAM2blocks {

	// TODO -- are structs best hidden inside the class? Does doing so push them into namespace of the class only?
	// Buffer for one alignment record. The first union overlays a raw 36-byte
	// array with named fields (4+4+4+1+1+2+2+2+4+4+4+4 = 36 bytes), so the same
	// bytes can be accessed either way.
	struct bam_read_core {
		union {
		  char c[36];
		  struct {
			int32_t block_size;
			int32_t refID;
			int32_t pos;
			uint8_t l_read_name;
			uint8_t mapq;
			uint16_t bin;
			uint16_t n_cigar_op;
			uint16_t flag;
			int32_t l_seq;
			int32_t next_refID;
			int32_t next_pos;
			int32_t tlen;
		  }; // anonymous struct to allow easy access to members.
		};
		// l_read_name is a uint8_t (at most 255), so 256 bytes always suffice.
		char read_name[256];
		// Same 20000 bytes viewed either as raw chars or as 5000 int32 values.
		union {
		  char cigar_buffer[20000];
		  int32_t cigar[5000];
		};
	};
 
	// 8-byte overlay: 4 magic bytes followed by a 32-bit l_text value.
	union bam_header {
		char c[8];
		struct {
		  char magic[4];
		  int32_t l_text;
		};
	};

	// 4-byte overlay used to view four raw chars as one int32.
	union stream_int32 {
		char c[4];
		int32_t i;
	};

	// Sizes in bytes; they equal the array sizes of bam_header::c,
	// bam_read_core::c and bam_read_core::cigar_buffer declared above.
	static const int BAM_HEADER_BYTES = 8;
	static const int BAM_READ_CORE_BYTES = 36;
	static const int BAM_READ_CORE_MAX_CIGAR = 20000;

	FragmentBlocks oBlocks;

	// Callbacks added through the two registerCallback* methods below.
	std::vector< std::function<void(const std::vector<std::string> &)> > callbacksChrMappingChange;
	std::vector< std::function<void(const FragmentBlocks &)> > callbacksProcessBlocks;

	// Statistics.
	ulong cShortPairs;
	ulong cIntersectPairs;
	ulong cLongPairs;
	ulong cSingleReads;
	ulong cPairedReads;
	ulong cErrorReads;
	ulong cSkippedReads;
	uint64_t totalNucleotides;
	// NOTE(review): keyed by a 16-bit value, presumably the read flag -- confirm in the .cpp.
	std::map<uint16_t, uint> skippedReason;

	// NOTE(review): presumably reads waiting for their mate, keyed by read name -- confirm in the .cpp.
	std::map<std::string, std::vector<char>> tmp_reads;
	bam_read_core tmp_read;
	bam_read_core tmp_mate;
	uint64_t current_read=0;

	bool getNextReadHead(bam_read_core &);
	void errorMessage();
	void getReadBody(bam_read_core &);
	void handlePairs(bam_read_core &, bam_read_core &);
	std::string getName(bam_read_core &);
	void setMate(std::vector<char> & mate);
	void saveMate();
	std::istream * IN;
	std::istream instream;
	void cigar2block(int32_t * cigar, uint16_t n_cigar_op, std::vector<int> &starts, std::vector<int> &lens, int &ret_genome_len);

	unsigned int processPair(bam_read_core * read1, bam_read_core * read2);
	unsigned int processSingle(bam_read_core * read1);

	std::vector<unsigned char> stream_buffer;
	void fillBuffer();
	std::ifstream file;
	boost::iostreams::filtering_streambuf<boost::iostreams::input> inbuf;
	bool coord_sorted=false;
public:
  	BAM2blocks();
	// Two ways to attach the input: an already-open stream or a file name.
  	void openFile(std::istream * _IN);
  	void openFile(std::string in_file);
  	void readBamHeader();  // implied by openFile. So perhaps should be private.
  	int processAll();

	void registerCallbackChrMappingChange( std::function<void(const std::vector<std::string> &)> callback );
	void registerCallbackProcessBlocks( std::function<void(const FragmentBlocks &)> callback );

	std::string samHeader;
	std::vector<std::string> chr_names;   //tab terminated chromosome names.
	std::vector<int32_t> chr_lens;	//length of each chromosome (not used when reading, used if optionally outputting an altered BAM file)
};
113 | 
114 | 
115 | #endif
116 | 


--------------------------------------------------------------------------------
/src/irfinder/src/Blocks/CoverageBlock.cpp:
--------------------------------------------------------------------------------
  1 | #include "CoverageBlock.h"
  2 | // using namespace std;
  3 | 
  4 | CoverageBlock::CoverageBlock(uint start, uint end) {
  5 | 	blockStart = start;
  6 | 	blockEnd = end;
  7 | 	firstDepth[0] = 0;
  8 | 	firstDepth[1] = 0;
  9 | 	blockExtents = NULL;
 10 | 	blockExtentsL = NULL;
 11 | }
 12 | 
 13 | //direction -- 0=False/Neg, 1=True/Pos.
 14 | void CoverageBlock::RecordCover(uint readStart, uint readEnd, bool dir) {
 15 | 
 16 | 	if (readStart <= blockStart && readEnd > blockStart) {
 17 | 		firstDepth[dir]++;
 18 | 	} else if (readStart < blockEnd) {
 19 | 		// Need to increment the starts vector.
 20 | 		uint inc_index = readStart - blockStart - 1;
 21 | 		if (blockExtentsL) { //already an int vector
 22 | 			blockExtentsL->at(inc_index).start[dir]++;
 23 | 		} else if (!blockExtents) { //don't have a char vector either - create first.
 24 | 			blockExtents = new std::vector<start_stops>(vectorLen());
 25 | 			blockExtents->at(inc_index).start[dir]++;
 26 | 		} else {
 27 | 			if (blockExtents->at(inc_index).start[dir] == 254) {
 28 | 				blockExtentsL = new std::vector<start_stopsL>(
 29 | 						blockExtents->begin(), blockExtents->end());
 30 | 				delete blockExtents;
 31 | 				blockExtents = NULL;
 32 | 				blockExtentsL->at(inc_index).start[dir]++;
 33 | 			} else {
 34 | 				blockExtents->at(inc_index).start[dir]++;
 35 | 			}
 36 | 		}
 37 | 	} else {
 38 | 		return;
 39 | 	}
 40 | 
 41 | 	if (readEnd >= blockEnd) {
 42 | 		return;
 43 | 	} else {
 44 | 		// Need to increment the ends vector.
 45 | 		uint inc_index = readEnd - blockStart - 1;
 46 | 
 47 | 		if (blockExtentsL) { //already an int vector
 48 | 			blockExtentsL->at(inc_index).end[dir]++;
 49 | 		} else if (!blockExtents) { //don't have a char vector either - create first.
 50 | 			blockExtents = new std::vector<start_stops>(vectorLen());
 51 | 			blockExtents->at(inc_index).end[dir]++;
 52 | 		} else {
 53 | 			if (blockExtents->at(inc_index).end[dir] == 254) {
 54 | 				blockExtentsL = new std::vector<start_stopsL>(
 55 | 						blockExtents->begin(), blockExtents->end());
 56 | 				delete blockExtents;
 57 | 				blockExtents = NULL;
 58 | 				blockExtentsL->at(inc_index).end[dir]++;
 59 | 			} else {
 60 | 				blockExtents->at(inc_index).end[dir]++;
 61 | 			}
 62 | 		}
 63 | 	}
 64 | 	// Can Throw: Out of range exception.
 65 | }
 66 | 
 67 | void CoverageBlock::updateCoverageHist(std::map<uint, uint> &hist, uint start,
 68 | 		uint end) const {
 69 | 
 70 | 	if (!blockExtentsL && !blockExtents) {
 71 | 		// how many bases in this block?
 72 | 		hist[firstDepth[0] + firstDepth[1]] += std::min(blockEnd, end)
 73 | 				- std::max(blockStart, start);
 74 | 	} else {
 75 | 		// There are read starts and ends -- need to walk the positions from the start of this block
 76 | 		//  even if not in the region of interest.
 77 | 
 78 | 		//special handling for the first base -- the one before the vector starts.
 79 | 		uint depth = firstDepth[0] + firstDepth[1];
 80 | 		if (start <= blockStart) {
 81 | 			// use the first depth, before commencing in the vector.
 82 | 			hist[depth]++;
 83 | 		}
 84 | 
 85 | 		uint startindex = std::max(blockStart + 1, start) - blockStart - 1;
 86 | 		uint endindex = std::min(blockEnd, end) - blockStart - 1;
 87 | 		if (blockExtents) {
 88 | 			for (uint i = 0; i < endindex; i++) {
 89 | 				depth += -(*blockExtents)[i].end[0] - (*blockExtents)[i].end[1]
 90 | 						+ (*blockExtents)[i].start[0]
 91 | 						+ (*blockExtents)[i].start[1];
 92 | 				if (i >= startindex) {
 93 | 					hist[depth]++;
 94 | 				}
 95 | 			}
 96 | 		} else {
 97 | 			for (uint i = 0; i < endindex; i++) {
 98 | 				depth += -(*blockExtentsL)[i].end[0]
 99 | 						- (*blockExtentsL)[i].end[1]
100 | 						+ (*blockExtentsL)[i].start[0]
101 | 						+ (*blockExtentsL)[i].start[1];
102 | 				if (i >= startindex) {
103 | 					hist[depth]++;
104 | 				}
105 | 			}
106 | 		}
107 | 		//  When in the region of interest, update the hist each step.
108 | 	}
109 | }
110 | 
111 | void CoverageBlock::updateCoverageHist(std::map<uint, uint> &hist, uint start,
112 | 		uint end, bool dir) const {
113 | 	if (!blockExtentsL && !blockExtents) {
114 | 		// how many bases in this block?
115 | 		hist[firstDepth[dir]] += std::min(blockEnd, end)
116 | 				- std::max(blockStart, start);
117 | 	} else {
118 | 		//special handling for the first base -- the one before the vector starts.
119 | 		uint depth = firstDepth[dir];
120 | 		if (start <= blockStart) {
121 | 			// use the first depth, before commencing in the vector.
122 | 			hist[depth]++;
123 | 		}
124 | 
125 | 		uint startindex = std::max(blockStart + 1, start) - blockStart - 1;
126 | 		uint endindex = std::min(blockEnd, end) - blockStart - 1;
127 | 		if (blockExtents) {
128 | 			for (uint i = 0; i < endindex; i++) {
129 | 				depth += -(*blockExtents)[i].end[dir]
130 | 						+ (*blockExtents)[i].start[dir];
131 | 				if (i >= startindex) {
132 | 					hist[depth]++;
133 | 				}
134 | 			}
135 | 		} else {
136 | 			for (uint i = 0; i < endindex; i++) {
137 | 				depth += -(*blockExtentsL)[i].end[dir]
138 | 						+ (*blockExtentsL)[i].start[dir];
139 | 				if (i >= startindex) {
140 | 					hist[depth]++;
141 | 				}
142 | 			}
143 | 		}
144 | 	}
145 | }
146 | 
147 | void CoverageBlock::updateCoverageArray(std::vector<uint> &arr,
148 | 		std::vector<bool> &covered, uint start, uint end) const {
149 | 	uint depth = firstDepth[0] + firstDepth[1],
150 | 			startindex = std::max( blockStart, start-1) - blockStart,
151 | 			endindex = std::min(blockEnd, end) - blockStart,
152 | 			startarray = std::max(blockStart+1, start ) - start ,
153 | 			endarray = std::min(blockEnd, end) - start ;
154 | 
155 | 	if (!blockExtentsL && !blockExtents) {
156 | 		for (uint i = startindex; i < endindex && startarray < endarray;
157 | 				i++, startarray++) {
158 | 			arr[startarray] += depth;
159 | 			covered[startarray] = true;
160 | 		}
161 | 	} else {
162 | 		// There are read starts and ends -- need to walk the positions from the start of this block
163 | 		//  even if not in the region of interest.
164 | 		if (blockExtents) {
165 | 			for (uint i = 0; i < endindex && startarray < endarray; i++) {
166 | 				depth += -(*blockExtents)[i].end[0] - (*blockExtents)[i].end[1]
167 | 						+ (*blockExtents)[i].start[0]
168 | 						+ (*blockExtents)[i].start[1];
169 | 				if (i >= startindex) {
170 | 					arr[startarray] += depth;
171 | 					covered[startarray] = true;
172 | 					startarray++;
173 | 				}
174 | 			}
175 | 		} else {
176 | 			for (uint i = 0; i < endindex && startarray < endarray; i++) {
177 | 				depth += -(*blockExtentsL)[i].end[0]
178 | 						- (*blockExtentsL)[i].end[1]
179 | 						+ (*blockExtentsL)[i].start[0]
180 | 						+ (*blockExtentsL)[i].start[1];
181 | 				if (i >= startindex) {
182 | 					arr[startarray] += depth;
183 | 					covered[startarray] = true;
184 | 					startarray++;
185 | 				}
186 | 			}
187 | 		}
188 | 		//  When in the region of interest, update the hist each step.
189 | 	}
190 | }
191 | 
192 | void CoverageBlock::updateCoverageArray(std::vector<uint> &arr,
193 | 		std::vector<bool> &covered, uint start, uint end, bool dir) const {
194 | 
195 | 	uint depth = firstDepth[0] + firstDepth[1],
196 | 				startindex = std::max( blockStart, start-1) - blockStart,
197 | 				endindex = std::min(blockEnd, end) - blockStart,
198 | 				startarray = std::max(blockStart+1, start ) - start ,
199 | 				endarray = std::min(blockEnd, end) - start ;
200 | 	if (!blockExtentsL && !blockExtents) {
201 | 		for (uint i = startindex; i < endindex && startarray < endarray;
202 | 				i++, startarray++) {
203 | 			arr[startarray] += depth;
204 | 			covered[startarray] = true;
205 | 		}
206 | 	} else {
207 | 		// There are read starts and ends -- need to walk the positions from the start of this block
208 | 		//  even if not in the region of interest.
209 | 		if (blockExtents) {
210 | 			for (uint i = 0; i < endindex && startarray < endarray; i++) {
211 | 				depth += -(*blockExtents)[i].end[dir]
212 | 						+ (*blockExtents)[i].start[dir];
213 | 				if (i >= startindex) {
214 | 					arr[startarray] += depth;
215 | 					covered[startarray] = true;
216 | 					startarray++;
217 | 				}
218 | 			}
219 | 		} else {
220 | 			for (uint i = 0; i < endindex && startarray < endarray; i++) {
221 | 				depth += -(*blockExtentsL)[i].end[dir]
222 | 						+ (*blockExtentsL)[i].start[dir];
223 | 				if (i >= startindex) {
224 | 					arr[startarray] += depth;
225 | 					covered[startarray] = true;
226 | 					startarray++;
227 | 				}
228 | 			}
229 | 		}
230 | 		//  When in the region of interest, update the hist each step.
231 | 	}
232 | }
233 | 
234 | void CoverageBlock::print(std::ostream &os) const {
235 | 	os << "Coverage block " << blockStart << " - " << blockEnd << "\n";
236 | 	os << "First depth 0 : " << firstDepth[0] << "\n";
237 | 	os << "First depth 1 : " << firstDepth[0] << "\n";
238 | 	uint i=0;
239 | 	if (blockExtents) {
240 | 		os << "BlockExtents: \n";
241 | 		for (auto &a : (*blockExtents)) {
242 | 			os << i+blockStart << " " << (uint) a.start[0] << ":" << (uint) a.start[1] << " - " << (uint) a.end[0] <<  ":"<< (uint) a.end[1] << "\n";
243 | 			i++;
244 | 		}
245 | 	}
246 | 	if (blockExtentsL) {
247 | 		os << "BlockExtentsL: \n";
248 | 		for (auto &a : (*blockExtentsL)) {
249 | 			os << i+blockStart << " " << (uint) a.start[0] << ":" << (uint) a.start[1] << " - " << (uint) a.end[0] <<  ":"<< (uint) a.end[1] << "\n";
250 | 			i++;
251 | 		}
252 | 	}
253 | }
254 | 
255 | std::ostream& operator<<(std::ostream &os, const CoverageBlock &cb) {
256 | 	cb.print(os);
257 | 	return os;
258 | }
259 | 


--------------------------------------------------------------------------------
/src/irfinder/src/Blocks/CoverageBlock.h:
--------------------------------------------------------------------------------
 1 | #ifndef CODE_COVERAGEBLOCK
 2 | #define CODE_COVERAGEBLOCK
 3 | 
 4 | #include "../Utils/includedefine.h"
 5 | 
 6 | class start_stops {
 7 | 	public:
 8 | 		unsigned char start[2];
 9 | 		unsigned char end[2];
10 | 
11 | 		start_stops() {
12 | 			start[0]=0;
13 | 			start[1]=0;
14 | 			end[0]=0;
15 | 			end[1]=0;
16 | 		};
17 | };
18 | 
19 | class start_stopsL {
20 | 	public:
21 | 		unsigned int start[2];
22 | 		unsigned int end[2];
23 | 
24 | 		start_stopsL() {
25 | 			start[0]=0;
26 | 			start[1]=0;
27 | 			end[0]=0;
28 | 			end[1]=0;
29 | 		};
30 | 		start_stopsL(const start_stops &copy) {
31 | 			start[0]=copy.start[0];
32 | 			start[1]=copy.start[1];
33 | 			end[0]=copy.end[0];
34 | 			end[1]=copy.end[1];
35 | 		};
36 | 
37 | };
38 | 
39 | 
40 | class CoverageBlock {
41 | 	private:
42 | 	    uint blockStart;
43 | 	    uint blockEnd;
44 | 	    uint firstDepth[2];
45 | 		std::vector<start_stops>* blockExtents;
46 | 		std::vector<start_stopsL>* blockExtentsL;
47 | 
48 | 		inline uint vectorLen() {
49 | 			return (blockEnd - blockStart - 1);
50 | 		};
51 | 	public:
52 | 		uint getLength() { return blockEnd - blockStart;}
53 | 		uint getStart(){return blockStart;}
54 | 		uint getEnd(){return blockEnd;}
55 | 		CoverageBlock(uint start, uint end);
56 | 		void RecordCover(uint start, uint end, bool dir);
57 | 		//RetrieveCover(..);
58 | 		void print(std::ostream& os) const;
59 | 
60 | 		//First form, non-directional. Second form, directional with "dir" specifiying whether sense or anti-sense.
61 | 		void updateCoverageHist(std::map<uint,uint> &hist, uint start, uint end) const;
62 | 		void updateCoverageHist(std::map<uint,uint> &hist, uint start, uint end, bool dir) const;
63 | 		void updateCoverageArray(std::vector<uint> &arr,std::vector<bool> & covered, uint start, uint end) const;
64 | 		void updateCoverageArray(std::vector<uint> &arr,std::vector<bool> & covered, uint start, uint end, bool dir) const;
65 | 		inline bool posIsAfterStart(const uint &compareval) const {
66 | 		  return (compareval > blockStart);
67 | 		};
68 | 
69 | 		// http://www.learncpp.com/cpp-tutorial/94-overloading-the-comparison-operators/
70 | 		// http://en.cppreference.com/w/cpp/language/operator_comparison
71 | 		inline bool operator<(const CoverageBlock &b) const {
72 | 			return (blockEnd < b.blockEnd);
73 | 		};
74 | 		inline bool operator<(const uint &b) const {
75 | 			return (blockEnd < b);  //a is the object.
76 | 		};
77 | 		friend inline bool operator<(const uint &a, const CoverageBlock &b) {
78 | 			return (a < b.blockEnd);  //a is a uint.
79 | 		};
80 | };
81 | 
82 | std::ostream& operator<<( std::ostream& os, const CoverageBlock& cb);
83 | 
84 | #endif
85 | 


--------------------------------------------------------------------------------
/src/irfinder/src/Blocks/FragmentBlocks.cpp:
--------------------------------------------------------------------------------
 1 | #include "FragmentBlocks.h"
 2 | 
 3 | 
 4 | // This class is an information storage container only -- pretty much a struct.
 5 | // It allows all the relevant information relating to an interpreted fragment to be passed
 6 | // to the variety of callback watchers that require fragment blocks to update their stats.
 7 | 
 8 | FragmentBlocks::FragmentBlocks() {
 9 | 	rStarts[0].reserve(initial_alloc);
10 | 	rLens[0].reserve(initial_alloc);
11 | 	rStarts[1].reserve(initial_alloc);
12 | 	rLens[1].reserve(initial_alloc);
13 | 	readName.reserve(max_read_name);
14 | 	readCount = 0;
15 | }
16 | 
17 | // Return a string representation of the Chromosome name.
18 | const std::string FragmentBlocks::chrName() const {
19 | 	return chr_names.at(chr_id);
20 | }
21 | 
22 | // Update the internal data structure with a new mapping between Chromosome ID# and Chromosome name (string).
23 | void FragmentBlocks::ChrMapUpdate(const std::vector<std::string> &chrmap) {
24 | 	chr_names = chrmap;
25 | }
26 | 


--------------------------------------------------------------------------------
/src/irfinder/src/Blocks/FragmentBlocks.h:
--------------------------------------------------------------------------------
 1 | #ifndef CODE_FRAGMENTBLOCKS
 2 | #define CODE_FRAGMENTBLOCKS
 3 | 
 4 | #include "../Utils/includedefine.h"
 5 | 
 6 | /* A class to store up to 2 reads belonging to a single fragment.
 7 |  * It is a storage class, almost a struct, it does not perform processing itself.
 8 |  * Read1 is always valid.
 9 |  * Read2 is only valid if readCount == 2.
10 |  *
11 |  * There may only be a single read if:
12 |  *  - the sequencing is single end rather than paired end..
13 |  *  - the sequencing is paired end, but the two reads overlapped and have been combined
14 |  *    into a single synthetic read / block of coverage.
15 |  */
// Storage for the (up to two) reads of one fragment; performs no processing.
// Index 0 of each array is read 1, index 1 is read 2 (valid only when
// readCount == 2).
class FragmentBlocks {
	private:
		static const int initial_alloc = 100;  // reserved capacity of each block vector.
		static const int max_read_name = 300;  // reserved capacity of readName.
		// Chromosome ID# -> name table; set by ChrMapUpdate(), read by chrName().
		std::vector<std::string> chr_names;
	public:
		FragmentBlocks();
		const std::string chrName() const;
		void ChrMapUpdate(const std::vector<std::string>& chrmap);

		std::string readName;
		std::vector<int> rStarts[2];
		std::vector<int> rLens[2];
		// The constructor only sets readCount; give the remaining scalars a
		// defined starting value so a fresh object never exposes
		// indeterminate data (e.g. through chrName()).
		uint readStart[2] = {0, 0};
		uint readEnd[2] = {0, 0};
		int readCount;
		uint chr_id = 0; // Assumption that both r1 & r2 are on the same chromosome?
					//   if they aren't we shouldn't process them as a single fragment.
					//   perhaps a sanity check in pairing, only treat them as a pair
					//   if the name of the reads matches and the Chr matches.
		bool direction = false;
};
38 | 
39 | #endif
40 | 


--------------------------------------------------------------------------------
/src/irfinder/src/ReadBlock/CoverageBlocks.h:
--------------------------------------------------------------------------------
 1 | #ifndef CODE_READBLOCKPROCESSOR_COVERAGEBLOCKS
 2 | #define CODE_READBLOCKPROCESSOR_COVERAGEBLOCKS
 3 | 
 4 | #include "../Blocks/CoverageBlock.h"
 5 | #include "ReadBlockProcessor.h"
 6 | #include "../Blocks/FragmentBlocks.h"
 7 | 
// One region of interest kept by CoverageBlocks (see the BEDrecords member).
// NOTE(review): presumably filled from one blocked-BED line by
// CoverageBlocks::loadRef -- confirm in CoverageBlocks.cpp.
struct BEDrecord {
	std::string chrName;   // chromosome the record lies on.
	std::string name;      // record name.
	uint start;            // record start coordinate.
	uint end;              // record end coordinate.
	bool direction;        // strand flag; NOTE(review): which value means '+' is not shown here.
	std::vector<std::pair<uint,uint>> blocks;  // sub-intervals of the record, as (uint, uint) pairs.
};
16 | 
17 | 
18 | class CoverageBlocks : public ReadBlockProcessor {
19 | 	//Store the Blocked BED record for each ROI/intron. This won't be referred to again until the end.
20 | 	//XX Create the temporary vectors (per Chr) which simply list the blocks sequentially as read.
21 | 	//XX Sort the temporary vectors
22 | 	//XX Build the final vectors of "blocks of interest"
23 | 	//xx Delete the temporary vectors
24 | 	//xx Create the parallel vectors with counter objects. (do these as a batch at the end, once vector size is known - for best memory layout)
25 | 	//xx Process fragments against the counter structure. (have I already written a class/object for this?)
26 | 	
27 | 	//Produce summary statistical output for each Blocked BED record, using the counter structure.
28 | 
29 | 	protected:
30 | 
31 | 		// Coverage depth data-structures.
32 | 		std::map<std::string, std::vector<CoverageBlock>> chrName_CoverageBlocks;
33 | 		std::map<std::string, std::vector<CoverageBlock>> chrName_FlankCoverageBlocks;
34 | 		// Shortcut pointers to depth data-structures.
35 | 		std::vector<std::vector<CoverageBlock>*> chrID_CoverageBlocks;
36 | 		std::vector<std::vector<CoverageBlock>*> chrID_FlankCoverageBlocks;
37 | 
38 | 		// TODO: what is optimal for speed & memory usage?
39 | //		static const uint coverage_block_max_length = 5000;
40 | 		static const uint coverage_block_max_length = 500;
41 | 
42 | 
43 | 		std::vector<BEDrecord> BEDrecords;
44 | 		bool long_read=false;
45 | 		int jitter = 3;
46 | 
47 | 	public:
48 | 		CoverageBlocks(std::string read_type) {
49 | 			long_read = read_type == "LR";
50 | 		}
51 | 		void setJitter(int j){jitter=j;};
52 | 		void ProcessBlocks(const FragmentBlocks &fragblock);
53 | 		void ChrMapUpdate(const std::vector<std::string> &chrmap);
54 | 		void loadRef(std::istream &IN);
55 | 		int WriteOutput(std::ostream *os) const;
56 | 		
57 | 		void fillHist(std::map<uint,uint> &hist, const std::string &chrName, const std::vector<std::pair<uint,uint>> &blocks) const;
58 | 		void fillHist(std::map<uint,uint> &hist, const std::string &chrName, const std::vector<std::pair<uint,uint>> &blocks, bool direction) const;
59 | 		void getCoverageArray(std::vector<uint> &coverages,
60 | 				std::vector<bool> & covered,
61 | 				const std::string &chrName,
62 | 				const uint arr_start, const uint arr_end) const;
63 | 		void getCoverageArray(std::vector<uint> &coverages,
64 | 				std::vector<bool> & covered,
65 | 				const std::string &chrName,
66 | 				const uint arr_start, const uint arr_end,
67 | 				bool direction) const;
68 | 		double meanFromHist(const std::map<uint,uint> &hist) const;
69 | 		double coverageFromHist(const std::map<uint,uint> &hist) const;
70 | 		double percentileFromHist(const std::map<uint,uint> &hist, uint percentile) const;
71 | 		double trimmedMeanFromHist(const std::map<uint,uint> &hist, uint centerPercent) const;
72 | };
73 | 
74 | class CoverageBlocksIRFinder : public CoverageBlocks {
75 | 	private:
76 | 		uint AI_warn=0;
77 | 		uint AI_intron=1;
78 | 		double AI_ratio=0.05;
79 | 	public:
80 | 
81 | 	CoverageBlocksIRFinder(std::string read_type) : CoverageBlocks(read_type){
82 | 	}
83 | 	void setAI(uint AI_warning_level, uint AI_min_intron_coverage, double AI_IRratio){
84 | 		AI_warn=AI_warning_level;
85 | 		AI_intron=AI_min_intron_coverage;
86 | 		AI_ratio=AI_IRratio;
87 | 	}
88 | 		int WriteOutput(std::ostream *os, std::ostream *osAI, const JunctionCount &JC, const SpansPoint &SP, int directionality = 0) const;
89 | 
90 | };
91 | 
92 | 
93 | #endif
94 | 


--------------------------------------------------------------------------------
/src/irfinder/src/ReadBlock/ReadBlockProcessor.h:
--------------------------------------------------------------------------------
  1 | #ifndef CODE_READBLOCKPROCESSOR
  2 | #define CODE_READBLOCKPROCESSOR
  3 | 
  4 | #include "../Blocks/FragmentBlocks.h"
  5 | 
  6 | /*
  7 | The code can be finished faster if we force a requirement that all input files are coordinate sorted by the start of each block.
  8 | ie: sort -k2,2n (for BED files).
  9 | Chromosome sorted or not won't matter, as these get split into different vectors in all cases.
 10 | */
 11 | 
 12 | 
 13 | 
 14 | class ReadBlockProcessor {
 15 | 	public:
 16 | 		virtual void ProcessBlocks(const FragmentBlocks &fragblock) = 0;
 17 | 		virtual void ChrMapUpdate(const std::vector<std::string> &chrmap) = 0; //Maybe some of these funcs shouldn't be pure virtual - overloadable if needed, but default often ok.
 18 | };
 19 | 
 20 | 
 21 | class BED12Output : public ReadBlockProcessor {
 22 | 	private:
 23 | 		std::vector<std::string> chr_names;
 24 | 		std::ostream* out;
 25 | 	public:
 26 | 		void ProcessBlocks(const FragmentBlocks &fragblock);
 27 | 		void ChrMapUpdate(const std::vector<std::string> &chrmap);
 28 | 		void SetOutputStream(std::ostream *os);
 29 | };
 30 | 
 31 | 
 32 | class JunctionCount : public ReadBlockProcessor {
 33 | 	private:
 34 | 		std::map<std::string, std::map<std::pair<uint,uint>,uint[3]>> chrName_junc_count;
 35 | 		std::vector<std::map<std::pair<uint,uint>,uint[3]>*> chrID_junc_count;
 36 | 		//uint[3] - 0, neg strand count; 1, pos strand count; 2 = expected direction from ref: 0=unknown, 1=neg, 2=pos.
 37 | 
 38 | 		std::map<std::string, std::map<uint,uint[2]>> chrName_juncLeft_count;
 39 | 		std::vector<std::map<uint,uint[2]>*> chrID_juncLeft_count;
 40 | 
 41 | 		std::map<std::string, std::map<uint,uint[2]>> chrName_juncRight_count;
 42 | 		std::vector<std::map<uint,uint[2]>*> chrID_juncRight_count;
 43 | 		  //chrID_... stores a fast access pointer to the appropriate structure in chrName_... 
 44 | 	public:
 45 | 		void ProcessBlocks(const FragmentBlocks &fragblock);
 46 | 		void ChrMapUpdate(const std::vector<std::string> &chrmap);
 47 | 		int WriteOutput(std::ostream *os) const;
 48 | 		void loadRef(std::istream &IN); //loadRef is optional, it allows directional detection to determine not just non-dir vs dir, but also which direction.
 49 | 
 50 | 		int Directional() const;
 51 | 		
 52 | 		uint lookup(std::string ChrName, uint left, uint right, bool direction) const;
 53 | 		uint lookup(std::string ChrName, uint left, uint right) const;
 54 | 		uint lookupLeft(std::string ChrName, uint left, bool direction) const;
 55 | 		uint lookupLeft(std::string ChrName, uint left) const;
 56 | 		uint lookupRight(std::string ChrName, uint right, bool direction) const;
 57 | 		uint lookupRight(std::string ChrName, uint right) const;
 58 | 
 59 | // Ideally we would read the XS junction strand attribute from the BAM if we want to count junctions from non-directional sequencing.
 60 | //   that will require BAM2blocks to be informed it should read the optional attributes looking for that attrib in that case.
 61 | // -- or we can just ignore direction -- the splice start/end information effectively determines the XS info (by ref to the reference)
 62 | };
 63 | 
 64 | 
 65 | class SpansPoint : public ReadBlockProcessor {
 66 | 	private:
 67 | 		std::map<std::string, std::vector<uint>> chrName_pos;
 68 | 		std::map<std::string, std::vector<uint>> chrName_count[2];
 69 | 		std::vector<std::vector<uint>*> chrID_pos;
 70 | 		std::vector<std::vector<uint>*> chrID_count[2];
 71 | 		char overhangLeft;
 72 | 		char overhangRight;
 73 | 		char overhangTotal;
 74 | 		//chrID_... stores a fast access pointer to the appropriate structure in chrName_... 
 75 | 	public:
 76 | 		void setSpanLength(uint overhang_left, uint overhang_right);
 77 | 		void loadRef(std::istream &IN);
 78 | 		void ProcessBlocks(const FragmentBlocks &fragblock);
 79 | 		void ChrMapUpdate(const std::vector<std::string> &chrmap);
 80 | 		//void SetOutputStream(std::ostream *os);
 81 | 		int WriteOutput(std::ostream *os) const;
 82 | 		uint lookup(std::string ChrName, uint pos, bool direction) const;
 83 | 		uint lookup(std::string ChrName, uint pos) const;
 84 | };
 85 | 
 86 | class FragmentsInChr : public ReadBlockProcessor {
 87 | 	// Counts the number of fragments in each Chromosome. (for both + & - strands).
 88 | 	private:
 89 | 		std::map<std::string, std::vector<uint>> chrName_count; //only expecting 2 items in our vector.
 90 | 		std::vector<std::vector<uint>*> chrID_count;
 91 | 	public:
 92 | 		void ProcessBlocks(const FragmentBlocks &blocks);
 93 | 		void ChrMapUpdate(const std::vector<std::string> &chrmap);
 94 | 		int WriteOutput(std::ostream *os) const;		
 95 | };
 96 | 
 97 | 
 98 | class FragmentsInROI : public ReadBlockProcessor {
 99 | 	// Counts the number of fragments fully contained within a ROI.
100 | 	//   the ROIs may not overlap. Direction ignored for overlap detect.
101 | 	private:
102 | 		std::map<std::string, ulong> RegionID_counter[2];
103 |  
104 | 		std::map<std::string, std::vector<std::pair<uint,uint>>> chrName_ROI;
105 | 		std::map<std::string, std::vector<ulong*>> chrName_count[2];
106 | 
107 | 		std::vector<std::vector<std::pair<uint,uint>>*> chrID_ROI;
108 | 		std::vector<std::vector<ulong*>*> chrID_count[2];
109 | 
110 | 		// Perhaps we want to store some text relating to each record too? Easy to do if the input is pre-sorted (at least within each Chr).
111 | 		//   if pre-sorted, it may be easier to check for no overlapping blocks on read .. or can do this immediately after read with a single nested-walk.
112 | 		std::map<std::string, std::vector<std::string>> chrName_ROI_text;
113 | 	public:
114 | 		void ProcessBlocks(const FragmentBlocks &blocks);
115 | 		void ChrMapUpdate(const std::vector<std::string> &chrmap);
116 | 		void loadRef(std::istream &IN);
117 | 		int WriteOutput(std::ostream *os) const;		
118 | };
119 | 
120 | 
121 | /*
122 | class CoverageBlocks : public ReadBlockProcessor { ... }
// In its own file (CoverageBlocks.h) -- bigger code.
124 | */
125 | 
126 | #endif
127 | 


--------------------------------------------------------------------------------
/src/irfinder/src/Utils/crc32.h:
--------------------------------------------------------------------------------
 1 | // //////////////////////////////////////////////////////////
 2 | // crc32.h
 3 | // Copyright (c) 2014 Stephan Brumme. All rights reserved.
 4 | // see http://create.stephan-brumme.com/disclaimer.html
 5 | //
 6 | 
 7 | #pragma once
 8 | 
 9 | //#include "hash.h"
10 | #include <string>
11 | 
12 | // define fixed size integer types
13 | #ifdef _MSC_VER
14 | // Windows
15 | typedef unsigned __int8  uint8_t;
16 | typedef unsigned __int32 uint32_t;
17 | #else
18 | // GCC
19 | #include <stdint.h>
20 | #endif
21 | 
22 | 
23 | /// compute CRC32 hash, based on Intel's Slicing-by-8 algorithm
24 | /** Usage:
25 |     CRC32 crc32;
26 |     std::string myHash  = crc32("Hello World");     // std::string
27 |     std::string myHash2 = crc32("How are you", 11); // arbitrary data, 11 bytes
28 | 
29 |     // or in a streaming fashion:
30 | 
31 |     CRC32 crc32;
32 |     while (more data available)
33 |       crc32.add(pointer to fresh data, number of new bytes);
34 |     std::string myHash3 = crc32.getHash();
35 |   */
// Incremental CRC32 calculator: feed data with add() (or use operator() for a
// one-shot computation), then read the result with getHash()/getRawHash().
class CRC32 //: public Hash
{
public:
  /// same as reset()
  CRC32();

  /// compute CRC32 of a memory block
  std::string operator()(const void* data, size_t numBytes);
  /// compute CRC32 of a string, excluding final zero
  std::string operator()(const std::string& text);

  /// add arbitrary number of bytes
  void add(const void* data, size_t numBytes);

  /// return latest hash as 16 hex characters
  // NOTE(review): m_hash is 32 bits, which is 8 hex digits -- confirm the
  // actual string length against the implementation.
  std::string getHash();

  /// return latest hash as a raw 32 bit integer
  uint32_t getRawHash();

  /// restart
  void reset();

private:
  /// hash
  uint32_t m_hash;
};
63 | 


--------------------------------------------------------------------------------
/src/irfinder/src/Utils/includedefine.h:
--------------------------------------------------------------------------------
 1 | #ifndef INCLUDEDEFINE_DEF
 2 | #define INCLUDEDEFINE_DEF
 3 | 
 4 | #include <cstring>
 5 | #include <time.h>
 6 | #include <iostream>
 7 | #include <iomanip>
 8 | #include <fstream>
 9 | #include <sstream>
10 | #include <limits>
11 | #include <sys/types.h>
12 | #include <vector>
13 | #include <map>
14 | #include <algorithm> // std::sort
15 | #include <functional> // std::function
16 | #include <cmath>
17 | 
18 | 
19 | //__asm__(".symver memcpy,memcpy@GLIBC_2.2.5");
20 | 
21 | 
22 | 
23 | 
24 | #define DEF_lineLengthMax 10000
25 | #define DEF_adaptLengthMax 500
26 | 
27 | 
28 | // infix_iterator.h 
29 | // 
// Lifted from Jerry Coffin's prefix_ostream_iterator
// Output iterator that writes each assigned item to a stream, inserting the
// delimiter BETWEEN items only (nothing before the first item, nothing after
// the last). With no delimiter the items are written back to back.
//
// The five iterator member types are declared directly rather than inherited
// from std::iterator, which is deprecated since C++17; std::iterator_traits
// sees exactly the same types as before.
template <class T,
          class charT=char,
          class traits=std::char_traits<charT> >
class infix_ostream_iterator
{
    std::basic_ostream<charT,traits> *os;
    charT const* delimiter;   // 0 means "no delimiter".
    bool first_elem;          // true until the first item has been written.
public:
    typedef std::output_iterator_tag iterator_category;
    typedef void value_type;
    typedef void difference_type;
    typedef void pointer;
    typedef void reference;

    typedef charT char_type;
    typedef traits traits_type;
    typedef std::basic_ostream<charT,traits> ostream_type;
    infix_ostream_iterator(ostream_type& s)
        : os(&s),delimiter(0), first_elem(true)
    {}
    infix_ostream_iterator(ostream_type& s, charT const *d)
        : os(&s),delimiter(d), first_elem(true)
    {}
    infix_ostream_iterator<T,charT,traits>& operator=(T const &item)
    {
        // Here's the only real change from ostream_iterator:
        // Normally, the '*os << item;' would come before the 'if'.
        if (!first_elem && delimiter != 0)
            *os << delimiter;
        *os << item;
        first_elem = false;
        return *this;
    }
    // Dereference and increment are no-ops, as usual for output iterators:
    // all the work happens in operator=.
    infix_ostream_iterator<T,charT,traits> &operator*() {
        return *this;
    }
    infix_ostream_iterator<T,charT,traits> &operator++() {
        return *this;
    }
    infix_ostream_iterator<T,charT,traits> &operator++(int) {
        return *this;
    }
};
70 | 
71 | 
72 | 
73 | #endif
74 | 


--------------------------------------------------------------------------------
/src/trim/Makefile:
--------------------------------------------------------------------------------
 1 | OBJECTS := TrimReads.o sequenceTools.o trim.o 
 2 | SOURCES=$(wildcard *.cpp)
 3 | LDFLAGS :=
 4 | LDFLAGS_static := -static -static-libgcc
 5 | LDFLAGS_GDB :=
 6 | OPTIMFLAGS :=
 7 | OPTIMFLAGS1 :=
 8 | # below flags make little difference.
 9 | #OPTIMFLAGS=-ffast-math
10 | #OPTIMFLAGS=-fforce-addr -funsafe-loop-optimizations -ftree-loop-linear -ftree-vectorize 
11 | #OPTIMFLAGS=-fforce-addr -funsafe-loop-optimizations -ftree-vectorize 
12 | # unroll-loops slows the program ~6%.
13 | #OPTIMFLAGS1=-funroll-loops  -fprefetch-loop-arrays
14 | SVNDEF := -D'SVN_VERSION_COMPILED="magictrim0.1"'
15 | COMPTIMEPLACE := -D'COMPILATION_TIME_PLACE="$(shell echo `date` `hostname --fqdn`:`pwd`)"'
16 | CCFLAGS_MAIN := -pipe -std=c++0x -O3    -Wall -Wextra -fopenmp $(SVNDEF) $(COMPTIMEPLACE) $(OPTIMFLAGS) $(OPTIMFLAGS1)
17 | CCFLAGS_GDB := -pipe  -std=c++0x -g -O0 -Wall -Wextra -fopenmp $(SVNDEF) $(COMPTIMEPLACE)
18 | CC :=g++
19 | GCC:=gcc
20 | 
21 | %.o : %.cpp
22 | 	$(CC) -c $(CCFLAGS) $<
23 | 
24 | all: trimstatic
25 | 
26 | clean :
27 | 	rm -f *.o trim Depend.list
28 | 
29 | ifneq ($(MAKECMDGOALS),clean)
30 | Depend.list: $(SOURCES)
31 | 	/bin/rm -f ./Depend.list
32 | 	$(CC) $(CCFLAGS_MAIN) -MM $^ >> Depend.list
33 | include Depend.list
34 | endif
35 | 
36 | trim : CCFLAGS=$(CCFLAGS_MAIN)
37 | trim : $(OBJECTS)
38 | 	$(CC) -o trim $(CCFLAGS) $(OBJECTS) $(LDFLAGS)
39 | 
40 | trimstatic : CCFLAGS=$(CCFLAGS_MAIN)
41 | trimstatic : $(OBJECTS)
42 | 	$(CC) -o trim $(CCFLAGS) $(OBJECTS) $(LDFLAGS_static)
43 | 
44 | gdb : CCFLAGS= $(CCFLAGS_GDB)
45 | gdb : $(OBJECTS)
46 | 	$(CC) -o trim $(CCFLAGS_GDB) $(OBJECTS) $(LDFLAGS_GDB) 
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------
/src/trim/TrimReads.cpp:
--------------------------------------------------------------------------------
  1 | #include "TrimReads.h"
  2 | 
  3 | 
  4 | TrimReads::TrimReads (istream * _IF1, istream * _IF2, ostream * _OF1, ostream * _OF2, ostream * _DebugLog, char * Adapt1, char * Adapt2, char _debug)
  5 | {
  6 |   IF1 = _IF1;
  7 |   IF2 = _IF2;
  8 |   OF1 = _OF1;
  9 |   OF2 = _OF2;
 10 |   OFdebug = _DebugLog;
 11 |   debug = _debug;
 12 | 
 13 |   time(&startTime);
 14 |   cout << "Started trimming at: " << timeMonthDayTime(startTime) << endl;
 15 | 
 16 | 
 17 |   Compare1 = new char[DEF_lineLengthMax + DEF_adaptLengthMax];
 18 |   Compare2 = new char[DEF_lineLengthMax + DEF_adaptLengthMax];
 19 | 
 20 |   lAdapt1 = strlen(Adapt1);
 21 |   lAdapt2 = strlen(Adapt2);
 22 | 
 23 |   seqToNumRevComp(Adapt2, Compare1, lAdapt2);
 24 |   seqToNumRev(Adapt1, Compare2, lAdapt1);
 25 |   
 26 |   Compare1Seq = Compare1 + lAdapt2; //Set pointer inside Compare1 just after the adapter - copy Sequence into this location each time.
 27 |   Compare2Seq = Compare2 + lAdapt1;
 28 | 
 29 |   lAdapt1prefix = 0;
 30 |   while (lAdapt1prefix <= lAdapt1) {
 31 |   	if (Compare2[lAdapt1-1-lAdapt1prefix] != 0) break;
 32 |   	lAdapt1prefix++;
 33 |   }
 34 |   lAdapt2prefix = 0;
 35 |   while (lAdapt2prefix <= lAdapt2) {
 36 |   	if (Compare1[lAdapt2-1-lAdapt2prefix] != 0) break;
 37 |   	lAdapt2prefix++;
 38 |   }
 39 | 
 40 |   minSpan = lAdapt1 + lAdapt2 - lAdapt1prefix - lAdapt2prefix;
 41 | 
 42 |   chunk1 = new char[DEF_lineLengthMax*4];
 43 |   chunk2 = new char[DEF_lineLengthMax*4];
 44 | 
 45 |   countInputReads = 0;
 46 |   countOutputTrimmed = 0;
 47 |   countOutputUntrimmed = 0;
 48 |   countOutputTooShort = 0;
 49 | 
 50 | };
 51 | 
 52 | int TrimReads::trimAll()
 53 | {
 54 | //  *OFdebug << "trimAll" << endl;
 55 |   while(IF1->peek() == '@') {
 56 | 
 57 |   countInputReads++;
 58 | 
 59 |   uint chunkLen1 = 0;
 60 |   uint chunkLen2 = 0;
 61 | 
 62 |   IF1->getline(chunk1, DEF_lineLengthMax);
 63 |   chunkLen1 += IF1->gcount();
 64 |   chunk1[chunkLen1-1] = '\n';
 65 |   Seq1 = chunk1 + chunkLen1;
 66 |   uint lName1 = IF1->gcount() - 1;
 67 | 
 68 |   IF1->getline(chunk1 + chunkLen1, DEF_lineLengthMax);
 69 |   chunkLen1 += IF1->gcount();
 70 |   chunk1[chunkLen1-1] = '\n';
 71 |   uint lR1 = IF1->gcount() - 1;
 72 | 
 73 |   IF1->getline(chunk1 + chunkLen1, DEF_lineLengthMax);
 74 |   chunkLen1 += IF1->gcount();
 75 |   chunk1[chunkLen1-1] = '\n';
 76 |   uint lQName1 = IF1->gcount() - 1;
 77 | 
 78 |   IF1->getline(chunk1 + chunkLen1, DEF_lineLengthMax);
 79 |   chunkLen1 += IF1->gcount();
 80 |   chunk1[chunkLen1-1] = '\n';
 81 |   uint lQual1 = IF1->gcount() - 1;
 82 | 
 83 | 
 84 |   IF2->getline(chunk2, DEF_lineLengthMax);
 85 |   chunkLen2 += IF2->gcount();
 86 |   chunk2[chunkLen2-1] = '\n';
 87 |   Seq2 = chunk2 + chunkLen2;
 88 |   uint lName2 = IF2->gcount() - 1;
 89 | 
 90 |   IF2->getline(chunk2 + chunkLen2, DEF_lineLengthMax);
 91 |   chunkLen2 += IF2->gcount();
 92 |   chunk2[chunkLen2-1] = '\n';
 93 |   uint lR2 = IF2->gcount() - 1;
 94 | 
 95 |   IF2->getline(chunk2 + chunkLen2, DEF_lineLengthMax);
 96 |   chunkLen2 += IF2->gcount();
 97 |   chunk2[chunkLen2-1] = '\n';
 98 |   uint lQName2 = IF2->gcount() - 1;
 99 | 
100 |   IF2->getline(chunk2 + chunkLen2, DEF_lineLengthMax);
101 |   chunkLen2 += IF2->gcount();
102 |   chunk2[chunkLen2-1] = '\n';
103 |   uint lQual2 = IF2->gcount() - 1;
104 | 
105 | 
106 |   if (lR1 != lQual1 || lR2 != lQual2) {
107 |     //*OFdebug << "FATAL: a quality line has different length to Sequence line - corrupt input file." << endl;
108 |     cout << "FATAL: a quality line has different length to Sequence line - corrupt input file." << endl;
109 |     cout << "Error at read number: " << countInputReads << endl;
110 |     return(1);
111 |   }
112 | 
113 |   seqToNum(Seq1, Compare1Seq, lR1);
114 |   seqToNumComp(Seq2, Compare2Seq, lR2);
115 | 
116 |   uint maxOverlap = minSpan+max(lR1+lAdapt2prefix, lR2+lAdapt1prefix);
117 | //  uint overlapPos = localAlign(Compare1, lR1+lAdapt2, Compare2, lR2+lAdapt1, lAdapt1+lAdapt2, maxOverlap);
118 |   uint overlapPos = localAlign(Compare1, lR1+lAdapt2, Compare2, lR2+lAdapt1, minSpan, maxOverlap);
119 | 
120 |   uint InsertLen = overlapPos-lAdapt1-lAdapt2+lAdapt1prefix+lAdapt2prefix;
121 | 
122 |   if (insertLenDistribution.size() <= InsertLen) {
123 |     insertLenDistribution.resize( InsertLen+1 ,0);
124 |   }
125 | 
126 |   insertLenDistribution.at(InsertLen)++;
127 | 
128 |   if (maxOverlap == overlapPos) {
129 |     //No trimming, output exactly what we read - as a single fast block.
130 |     OF1->write(chunk1, chunkLen1);
131 |     OF2->write(chunk2, chunkLen2);
132 |     countOutputUntrimmed++;
133 |   }else if (InsertLen < 30) {
134 |     // Don't output, too short.
135 |     countOutputTooShort++;
136 |   }else{
137 |   	if (!debug) {
138 | 		chunk1[lName1] = '\0';
139 | 		uint firstSpace = strcspn(chunk1," ");
140 | 		chunk1[lName1] = '\n';
141 | 		OF1->write(chunk1, firstSpace);
142 | 		*OF1 << "_INS_" << InsertLen;
143 | 		OF1->write(chunk1+firstSpace, lName1+1-firstSpace+min(InsertLen-lAdapt2prefix,lR1));
144 | 		OF1->write(chunk1+lName1+lR1+1, lQName1+2+min(InsertLen-lAdapt2prefix,lR1));
145 | 		OF1->write("\n",1);
146 | 
147 | 		chunk2[lName2] = '\0';
148 | 		firstSpace = strcspn(chunk2," ");
149 | 		chunk2[lName2] = '\n';
150 | 		OF2->write(chunk2, firstSpace);
151 | 		*OF2 << "_INS_" << InsertLen;
152 | 		OF2->write(chunk2+firstSpace, lName2+1-firstSpace+min(InsertLen-lAdapt1prefix,lR2));
153 | 		OF2->write(chunk2+lName2+lR2+1, lQName2+2+min(InsertLen-lAdapt1prefix,lR2));
154 | 		OF2->write("\n",1);
155 | 	}else{
156 | 		chunk1[lName1] = '\0';
157 | 		uint firstSpace = strcspn(chunk1," ");
158 | 		chunk1[lName1] = '\n';
159 | 		OF1->write(chunk1, firstSpace);
160 | 		*OF1 << "_INS_" << InsertLen;
161 | 
162 | 		for(int i = lName1+1+min(InsertLen-lAdapt2prefix,lR1); i<(lName1+1+lR1); i++){
163 | 		  chunk1[i] = tolower(chunk1[i]);
164 | 		}		
165 | 		OF1->write(chunk1+firstSpace, lName1+1-firstSpace+lR1);
166 | 		OF1->write(chunk1+lName1+lR1+1, lQName1+2+lR1);
167 | 		OF1->write("\n",1);
168 | 
169 | 		chunk2[lName2] = '\0';
170 | 		firstSpace = strcspn(chunk2," ");
171 | 		chunk2[lName2] = '\n';
172 | 		OF2->write(chunk2, firstSpace);
173 | 		*OF2 << "_INS_" << InsertLen;
174 | 
175 | 		for(int i = lName2+1+min(InsertLen-lAdapt1prefix,lR2); i<(lName2+1+lR2); i++){
176 | 		  chunk2[i] = tolower(chunk2[i]);
177 | 		}		
178 | 		OF2->write(chunk2+firstSpace, lName2+1-firstSpace+lR2);
179 | 		OF2->write(chunk2+lName2+lR2+1, lQName2+2+lR2);
180 | 		OF2->write("\n",1);	
181 | 	}
182 | 
183 |     countOutputTrimmed++;
184 |   }
185 | 
186 | 
187 |   }
188 | 
189 |   // Output summary statistics.
190 |   time(&endTime);
191 | 
192 |   cout << "Completed trimming at: " << timeMonthDayTime(endTime) << endl;
193 |   cout << double(countInputReads)/1e6/difftime(endTime,startTime)*3600 << "\tTrimming speed, million reads per hour" << endl;
194 | 
195 |   ios::fmtflags old_output_settings = cout.flags();
196 |   cout << fixed << setprecision(4);
197 |   cout << (countOutputTrimmed+countOutputTooShort)/double(countInputReads)*100 << "\t% with adaptor" << endl;
198 |   cout.flags(old_output_settings);
199 |   cout << countInputReads << "\tTotal input reads" << endl;
200 |   cout << (countOutputUntrimmed+countOutputTrimmed) << "\tTotal output reads" << endl;
201 |   cout << countOutputUntrimmed << "\tTotal unmodified reads output" << endl;
202 |   cout << countOutputTrimmed << "\tTotal trimmed reads output" << endl;
203 |   cout << countOutputTooShort << "\tTotal trimmed reads too short" << endl;
204 |   cout << endl;
205 |   cout << "------ Insert length distribution ------" << endl;
206 |   cout << "Length\tCount" << endl;
207 | 
208 |   for (uint i=0; i<insertLenDistribution.size() ; i++) {
209 |     cout << i << "\t" << insertLenDistribution.at(i) << endl;
210 |   }
211 | 
212 |   return 0;
213 | };
214 | 


--------------------------------------------------------------------------------
/src/trim/TrimReads.h:
--------------------------------------------------------------------------------
 1 | #ifndef CODE_TRIMREADS
 2 | #define CODE_TRIMREADS
 3 | 
 4 | #include "includedefine.h"
 5 | #include "sequenceTools.h"
 6 | 
 7 | class TrimReads {
 8 |   char* chunk1;
 9 |   char* chunk2;
10 |   char* Seq1;
11 |   char* Seq2;
12 |   char* Compare1;
13 |   char* Compare1Seq;
14 |   char* Compare2;
15 |   char* Compare2Seq;
16 | 
17 |   int lAdapt1;
18 |   int lAdapt2;
19 |   int lAdapt1prefix;
20 |   int lAdapt2prefix;
21 |   int minSpan;
22 | 
23 |   unsigned long int countInputReads;
24 |   unsigned long int countOutputTrimmed;
25 |   unsigned long int countOutputUntrimmed;
26 |   unsigned long int countOutputTooShort;
27 | 
28 |   std::vector<unsigned long int> insertLenDistribution;
29 | 
30 |   time_t startTime, endTime;
31 | 
32 |   istream* IF1;
33 |   istream* IF2;
34 |   ostream* OF1;
35 |   ostream* OF2;
36 |   ostream* OFdebug;
37 |   char debug;
38 | 
39 |   public:
40 |     TrimReads(istream * _IF1, istream * _IF2, ostream * _OF1, ostream * _OF2, ostream * _DebugLog, char * Adapt1, char * Adapt2, char _debug = 0);
41 |     int trimAll();
42 | };
43 | 
44 | #endif
45 | 


--------------------------------------------------------------------------------
/src/trim/includedefine.h:
--------------------------------------------------------------------------------
 1 | #ifndef INCLUDEDEFINE_DEF
 2 | #define INCLUDEDEFINE_DEF
 3 | 
 4 | #include <cstring>
 5 | #include <time.h>
 6 | #include <iostream>
 7 | #include <iomanip>
 8 | #include <fstream>
 9 | #include <sys/types.h>
10 | #include <vector>
11 | 
12 | using namespace std;
13 | 
14 | 
15 | #define DEF_lineLengthMax 10000
16 | #define DEF_adaptLengthMax 500
17 | 
18 | #endif
19 | 


--------------------------------------------------------------------------------
/src/trim/sequenceTools.cpp:
--------------------------------------------------------------------------------
  1 | #include "sequenceTools.h"
  2 | 
  3 | uint localAlign(const char *x, uint nx, const char *y, uint ny, uint minspan, uint maxspan)
  4 | {
  5 |   // Expecting x, a numeric Seq string. y, a pre-complemented numeric Seq string. Both are expected in their original direction.
  6 |   uint nMatch;
  7 |   int nMMcounter;
  8 |   double nScore;
  9 |   double nScoreBest=0;
 10 |   uint spanBest=maxspan;
 11 |   uint ixbegin;
 12 |   uint ixlimit;
 13 |   // Min score = 0.8 therefore, at worst position (integer comparison):
 14 |   //uint maxMismatch = (maxspan/10) + 1;
 15 |   uint maxMismatch;
 16 | 
 17 |   for (uint span=minspan; span<=maxspan; span++ ) {
 18 |     nMatch=0;
 19 |     ixbegin = max(0,int(span)-int(ny));
 20 |     ixlimit = min(span,nx);
 21 |     maxMismatch = ((ixlimit-ixbegin)/10) + 1;
 22 |     nMMcounter=maxMismatch;
 23 |     for (uint ix=ixbegin; ix<ixlimit; ix++) {
 24 |       char cy = y[span-ix-1];
 25 |       if (x[ix] == 0 || cy == 0) continue;
 26 | 
 27 |       if (x[ix] == cy) {
 28 |         nMatch++;
 29 |       }else{
 30 |         nMMcounter--;
 31 |       }
 32 | 
 33 |       if (nMMcounter < 0) break;
 34 |     }
 35 |     if (nMMcounter >= 0) {
 36 |       nScore = ((double)(nMatch - maxMismatch + nMMcounter))/(ixlimit-ixbegin);
 37 |       if (nScore >= 0.8 && nScore > nScoreBest) {
 38 |         nScoreBest = nScore;
 39 |         spanBest=span;
 40 |       }
 41 |     }
 42 |   }
 43 | 
 44 |   return spanBest;
 45 | }
 46 | 
 47 | void seqToNum(const char* in, char* out, uint nin) // do we really need length, or just run until \0?
 48 | {
 49 |   for (uint jj=0;jj<nin;jj++) {
 50 |     switch(in[jj]){
 51 | //       case ('N'): case ('n'): case ('.'):  out[jj]=char(0);break;
 52 |        case ('A'): case ('a'):  out[jj]=char(1);break;
 53 |        case ('T'): case ('t'): case ('U'): case ('u'):  out[jj]=char(2);break;
 54 |        case ('C'): case ('c'):  out[jj]=char(3);break;
 55 |        case ('G'): case ('g'):  out[jj]=char(4);break;
 56 |        default:  out[jj]=char(0);
 57 |     }
 58 | /*    switch(in[jj]){
 59 |        case ('N'): case ('n'): case ('.'):  out[jj]=char(0);break;
 60 |        case ('A'): case ('a'):  out[jj]='A';break;
 61 |        case ('T'): case ('t'):  out[jj]='T';break;
 62 |        case ('C'): case ('c'):  out[jj]='C';break;
 63 |        case ('G'): case ('g'):  out[jj]='G';break;
 64 |        default: out[jj]=char(99);
 65 |     }*/
 66 |   }
 67 | }
 68 | 
 69 | void seqToNumComp(const char* in, char* out, uint nin)
 70 | {
 71 |   for (uint jj=0;jj<nin;jj++) {
 72 |     switch(in[jj]){
 73 | //       case ('N'): case ('n'): case ('.'):  out[jj]=char(0);break;
 74 |        case ('A'): case ('a'):  out[jj]=char(2);break;
 75 |        case ('T'): case ('t'): case ('U'): case ('u'):  out[jj]=char(1);break;
 76 |        case ('C'): case ('c'):  out[jj]=char(4);break;
 77 |        case ('G'): case ('g'):  out[jj]=char(3);break;
 78 |        default: out[jj]=char(0);
 79 |     }
 80 | /*    switch(in[jj]){
 81 |        case ('N'): case ('n'): case ('.'):  out[jj]=char(0);break;
 82 |        case ('A'): case ('a'):  out[jj]='T';break;
 83 |        case ('T'): case ('t'):  out[jj]='A';break;
 84 |        case ('C'): case ('c'):  out[jj]='G';break;
 85 |        case ('G'): case ('g'):  out[jj]='C';break;
 86 |        default: out[jj]=char(99);
 87 |     }*/
 88 |   }
 89 | }
 90 | 
 91 | void seqToNumRevComp(const char* in, char* out, uint nin)
 92 | {
 93 |   for (uint jj=0;jj<nin;jj++) {
 94 |     switch(in[jj]){
 95 | //       case ('N'): case ('n'): case ('.'):  out[nin-jj-1]=char(0);break;
 96 |        case ('A'): case ('a'):  out[nin-jj-1]=char(2);break;
 97 |        case ('T'): case ('t'): case ('U'): case ('u'):  out[nin-jj-1]=char(1);break;
 98 |        case ('C'): case ('c'):  out[nin-jj-1]=char(4);break;
 99 |        case ('G'): case ('g'):  out[nin-jj-1]=char(3);break;
100 |        default: out[nin-jj-1]=char(0);
101 |     }
102 | /*    switch(in[jj]){
103 |        case ('N'): case ('n'): case ('.'):  out[nin-jj-1]=char(0);break;
104 |        case ('A'): case ('a'):  out[nin-jj-1]='T';break;
105 |        case ('T'): case ('t'):  out[nin-jj-1]='A';break;
106 |        case ('C'): case ('c'):  out[nin-jj-1]='G';break;
107 |        case ('G'): case ('g'):  out[nin-jj-1]='C';break;
108 |        default: out[nin-jj-1]=char(99);
109 |     }*/
110 |   }
111 | }
// Encodes the first `nin` characters of `in` in reverse order:
// in[k] is written to out[nin-1-k] as A -> 1, T or U -> 2, C -> 3, G -> 4
// (either case), anything else -> 0.
void seqToNumRev(const char* in, char* out, uint nin)
{
  for (uint pos = 0; pos < nin; ++pos) {
    const char base = in[pos];
    char code = 0;

    if (base == 'A' || base == 'a') {
      code = 1;
    } else if (base == 'T' || base == 't' || base == 'U' || base == 'u') {
      code = 2;
    } else if (base == 'C' || base == 'c') {
      code = 3;
    } else if (base == 'G' || base == 'g') {
      code = 4;
    }

    out[nin - pos - 1] = code;
  }
}
133 | 
// Returns the current local time formatted as "Mon DD HH:MM:SS"
// (strftime "%b %d %H:%M:%S"), or an empty string if the time cannot be
// converted or formatted.
std::string timeMonthDayTime() {
    time_t rawTime;
    time(&rawTime);

    // localtime() may return a null pointer; passing that to strftime is undefined.
    const struct tm *parts = localtime(&rawTime);
    if (parts == 0) return std::string();

    char timeChar[100];
    // A zero return means nothing usable was written to the buffer.
    if (strftime(timeChar, sizeof(timeChar), "%b %d %H:%M:%S", parts) == 0) return std::string();

    return std::string(timeChar);
}
143 | 
// Returns `rawTime` as local time formatted "Mon DD HH:MM:SS"
// (strftime "%b %d %H:%M:%S"), or an empty string if the value cannot be
// converted or formatted. `rawTime` is only read.
std::string timeMonthDayTime(time_t &rawTime) {
    // localtime() may return a null pointer (e.g. unrepresentable time);
    // passing that to strftime is undefined.
    const struct tm *parts = localtime(&rawTime);
    if (parts == 0) return std::string();

    char timeChar[100];
    // A zero return means nothing usable was written to the buffer.
    if (strftime(timeChar, sizeof(timeChar), "%b %d %H:%M:%S", parts) == 0) return std::string();

    return std::string(timeChar);
}
151 | 
152 | 


--------------------------------------------------------------------------------
/src/trim/sequenceTools.h:
--------------------------------------------------------------------------------
 1 | #ifndef CODE_SEQUENCETOOLS
 2 | #define CODE_SEQUENCETOOLS
 3 | 
 4 | #include "includedefine.h"
 5 | 
// Tries every span from minspan to maxspan and returns the best-scoring one
// (score of at least 0.8), or maxspan if none qualifies. Both inputs are
// numeric sequences as produced by the seqToNum* functions below.
uint localAlign(const char *, uint, const char *, uint ny, uint minspan, uint maxspan);
// Sequence encoders: read `uint` characters from the first argument and write
// one code per character to the second. A/T(U)/C/G map to 1/2/3/4, anything
// else to 0. "Comp" swaps A<->T and C<->G codes; "Rev" writes in reverse order.
void seqToNum(const char*, char*, uint); // do we really need length, or just run until \0?
void seqToNumComp(const char*, char*, uint);
void seqToNumRevComp(const char*, char*, uint);
void seqToNumRev(const char*, char*, uint);

// Local time formatted by strftime as month name, day, HH:MM:SS.
// The first overload uses the current time, the second the given time.
std::string timeMonthDayTime();
std::string timeMonthDayTime(time_t &rawTime);
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/src/trim/trim.cpp:
--------------------------------------------------------------------------------
 1 | #include "includedefine.h"
 2 | 
 3 | #include "sequenceTools.h"
 4 | #include "TrimReads.h"
 5 | 
 6 | 
 7 | int main(int argc, char * argv[])
 8 | {
 9 |   char debug = 0;
10 |   
11 |   if (argc != 7) {
12 |   	if (argc == 8 && strcmp(argv[7], "debug") == 0) {
13 |   		debug = 1;
14 |   	}else{
15 |     	cerr << "Usage: cmd in_1.fastq in_2.fastq out_1.fastq out_2.fastq adapt1 adapt2 [debug]" << endl;
16 |     	exit(1);
17 |     } 
18 |   }
19 | 
20 |   ifstream IN1;
21 |   IN1.open (argv[1], ifstream::in);
22 |   ifstream IN2;
23 |   IN2.open (argv[2], ifstream::in);
24 |   ofstream OUT1;
25 |   OUT1.open (argv[3], ifstream::out);
26 |   ofstream OUT2;
27 |   OUT2.open (argv[4], ifstream::out);
28 | 
29 |   TrimReads * TR = new TrimReads(&IN1, &IN2, &OUT1, &OUT2, &cerr, argv[5], argv[6], debug);
30 |   int success = TR->trimAll();
31 | 
32 |   OUT1.flush();
33 |   OUT2.flush();
34 | 
35 |   IN1.close();
36 |   IN2.close();
37 | 
38 |   OUT1.close();
39 |   OUT2.close();
40 | 
41 |   exit(success);
42 | //  f1InStream.getline
43 | };
44 | 


--------------------------------------------------------------------------------
/src/winflat/Makefile:
--------------------------------------------------------------------------------
 1 | all: winflat test 
 2 | 
 3 | winflat: winflat_with_beta.c
 4 | 	$(CC) -o winflat winflat_with_beta.c -lm 
 5 | 
 6 | test:	winflat
 7 | 	sh runtest.sh 2> /dev/null
 8 | 
 9 | clean:
10 | 	rm -f *.o winflat


--------------------------------------------------------------------------------
/src/winflat/README:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/src/winflat/README


--------------------------------------------------------------------------------
/src/winflat/runtest.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Prints, for x = 0..20, the y window reported by ./winflat at two
# significance levels, one row per x.
#
# printf replaces `echo -e`: POSIX sh does not define -e, and shells such as
# dash print the "-e" literally. Quoting the results also keeps values like
# "*--7" from being expanded as file name patterns.

SIG_A=0.01
SIG_B=0.05
# NOTE(review): the header used to read "2E=0.005" for the second column while
# the command passed -sig 0.05. The label now shows the value actually used;
# confirm whether 0.005 was the intended significance.

printf '  \t 2E=%s \t 2E=%s\n' "$SIG_A" "$SIG_B"
printf 'x \t Ymin--Ymax \t Ymin--Ymax\n'
for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
do
  # Second field of winflat's stdout line is the "ymin--ymax" window.
  window_a=`./winflat -xvalue $i -sig $SIG_A | awk '{print $2}'`
  window_b=`./winflat -xvalue $i -sig $SIG_B | awk '{print $2}'`
  printf '%s \t %s \t\t %s\n' "$i" "$window_a" "$window_b"
done
exit 0 
9 | 


--------------------------------------------------------------------------------
/src/winflat/winflat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/src/winflat/winflat


--------------------------------------------------------------------------------
/src/winflat/winflat_with_beta.c:
--------------------------------------------------------------------------------
  1 | /* To compile cc -o winflat_with_beta winflat_with_beta.c -lm */ 
  2 | /* Copyright Stephane Audic 2003 */ 
  3 | /* With code included from Numerical Recipes in C */
  4 | 
  5 | 
  6 | 
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
 10 | 
 11 | 
 12 | 
 13 | #define MAXIT 500
 14 | #define EPS 3.0e-30
 15 | #define FPMIN 1.0e-30
 16 | 
 17 | double betacf(double a, double b, double x);
 18 | double gammln(double xx);
 19 | double betai(double a, double b, double x);
 20 | 
/* Iteratively evaluates a continued fraction in a, b and x and returns its
 * value h. Presumably the continued fraction used by Numerical Recipes for
 * the incomplete beta function (the file header credits that book) --
 * confirm against the reference.
 *
 * Runs at most MAXIT iterations; each one applies an even and an odd step.
 * c and d are clamped to FPMIN whenever their magnitude drops below it, so
 * the divisions stay defined. Iteration stops once the last correction
 * factor `del` differs from 1 by less than EPS.
 *
 * If MAXIT iterations pass without reaching that tolerance, a message is
 * written to stderr and the process exits with status 1.
 */
double betacf(double a, double b, double x)
{
  /*	void nrerror(char error_text[]); */
	int m,m2;
	double aa,c,d,del,h,qab,qam,qap;
	
	/* Combinations of a and b reused in every coefficient. */
	qab=a+b;
	qap=a+1.0;
	qam=a-1.0;
	/* Initial values of the recurrence. */
	c=1.0;
	d=1.0-qab*x/qap;
	if (fabs(d) < FPMIN) d=FPMIN;
	d=1.0/d;
	h=d;
	for (m=1;m<=MAXIT;m++) {
		m2=2*m;
		/* Even step. */
		aa=m*(b-m)*x/((qam+m2)*(a+m2));
		d=1.0+aa*d;
		if (fabs(d) < FPMIN) d=FPMIN;
		c=1.0+aa/c;
		if (fabs(c) < FPMIN) c=FPMIN;
		d=1.0/d;
		h *= d*c;
		/* Odd step. */
		aa = -(a+m)*(qab+m)*x/((a+m2)*(qap+m2));
		d=1.0+aa*d;
		if (fabs(d) < FPMIN) d=FPMIN;
		c=1.0+aa/c;
		if (fabs(c) < FPMIN) c=FPMIN;
		d=1.0/d;
		del=d*c;
		h *= del;
		/* Converged: the last factor no longer changes h. */
		if (fabs(del-1.0) < EPS) break;
	}
	/* m only exceeds MAXIT when the loop ran out without the break above. */
	if (m > MAXIT){
	  fprintf( stderr , "a or b too big, or MAXIT too small in betacf");
	  exit(1) ; 
	}
	return h;
}
 60 | #undef MAXIT
 61 | #undef EPS
 62 | #undef FPMIN
 63 | 
 64 | 
 65 | double gammln(double xx)
 66 | {
 67 | 	double x,y,tmp,ser;
 68 | 	static double cof[6]={76.18009172947146,-86.50532032941677,
 69 | 		24.01409824083091,-1.231739572450155,
 70 | 		0.1208650973866179e-2,-0.5395239384953e-5};
 71 | 	int j;
 72 | 
 73 | 	y=x=xx;
 74 | 	tmp=x+5.5;
 75 | 	tmp -= (x+0.5)*log(tmp);
 76 | 	ser=1.000000000190015;
 77 | 	for (j=0;j<=5;j++) ser += cof[j]/++y;
 78 | 	return -tmp+log(2.5066282746310005*ser/x);
 79 | }
 80 | 
 81 | 
 82 | 
 83 | double betai(double a, double b, double x)
 84 | {
 85 | 	double bt;
 86 | 
 87 | 	if (x < 0.0 || x > 1.0) { 
 88 | 	  fprintf( stderr , "Bad x in routine betai") ; 
 89 | 	  exit(1) ; 
 90 | 	}
 91 | 	if (x == 0.0 || x == 1.0) bt=0.0;
 92 | 	else
 93 | 		bt=exp(gammln(a+b)-gammln(a)-gammln(b)+a*log(x)+b*log(1.0-x));
 94 | 	if (x < (a+1.0)/(a+b+2.0))
 95 | 		return bt*betacf(a,b,x)/a;
 96 | 	else
 97 | 		return 1.0-bt*betacf(b,a,1.0-x)/b;
 98 | }
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 
/* Writes the command-line help text to stderr. */
void usage(){
    /* One entry per output line; the text is emitted verbatim.
       (A "-show" mode was once drafted here but is not offered.) */
    static const char *const help_lines[] = {
        "usage: two behaviours can be invoqued. \n",
        "  winflat -xvalue x -sig significance [ -diff n1 n2 ]\n",
        "  will return the lower and upper y value at the given significance level\n",
        "\n",
        "  winflat -xvalue x -yvalue y  [ -diff n1 n2 ]\n",
        "  will return the probability of over or underexpression \n",
        "If the number of clones in the two libraries is \n",
        "different, use -diff n1 n2\n",
        "\n"
    };
    const int line_count = (int) (sizeof help_lines / sizeof help_lines[0]);
    int i;

    for (i = 0; i < line_count; i++) {
        fputs( help_lines[i] , stderr ) ;
    }
}
127 | 
/* Command-line entry point.
 *
 *   winflat -xvalue x -yvalue y [ -diff n1 n2 ]
 *       prints P(y <= Y | x) and P(y >= Y | x) on stdout.
 *   winflat -xvalue x -sig s [ -diff n1 n2 ]
 *       searches y = 0, 1, 2, ... and prints "x ymin--ymax" on stdout
 *       ("x *--ymax" when no lower bound was found); the probabilities at
 *       the two bounds go to stderr.
 *
 * Any malformed or incomplete command line prints the usage text and exits
 * with status 0 (the status this program has always used for usage errors).
 */
int main( int argc , char **argv){
  double thisproba , thisproba2 ;
  int x = -1 ;            /* -1 = not given */
  int y = -1 ;            /* -1 = not given */
  int n1 = 1  , n2 = 1  ;
  int ymin , ymax ;
  int argcount = 1 ;
  double sig = 0.0 ;
  int have_sig = 0 ;      /* set once -sig has been parsed */
  double p ;
#define WNMIN -1 
#define WNMAX  10000

  if( argc < 4){
    usage() ; 
    exit(0) ; 
  }

  while( argcount < argc ){
    /* Number of arguments left after the option name. An option whose
       values are missing falls through to the usage branch instead of
       reading past the end of argv. */
    int remaining = argc - argcount - 1 ;

    if( !strcmp( argv[argcount] , "-xvalue") && remaining >= 1 ){
      x = atoi( argv[argcount+1] ) ; 
      argcount += 2 ; 
    } else if( !strcmp( argv[argcount] , "-yvalue") && remaining >= 1 ){
      y = atoi( argv[argcount+1] ) ; 
      argcount += 2 ; 
    } else if( !strcmp( argv[ argcount ] , "-diff") && remaining >= 2 ){
      /* correction in the case where the number of est's drawn 
	 from the samples  is different */
      n1 = atoi( argv[ argcount + 1 ] ) ; 
      n2 = atoi( argv[ argcount + 2 ] ) ; 
      argcount += 3 ; 
    } else if( !strcmp( argv[ argcount ] , "-sig" ) && remaining >= 1 ){
      sig = (double) atof( argv[ argcount + 1 ] ) ; 
      have_sig = 1 ; 
      argcount += 2 ; 
    } else {
      usage() ; 
      exit(0) ; 
    }
  }

  /* p = n1 / (n1 + n2) is undefined when the sizes cancel out. */
  if( n1 + n2 == 0 ){
    usage() ; 
    exit(0) ; 
  }
  p = (double) ( n1 ) / (double) ( n1 + n2 )  ; 

  /* Check arguments and invoke the right procedure */ 

  if( x > -1 && y > -1 ){
    /* Both x and y are given: report the two cumulative probabilities. */
    thisproba = betai( (double) (x + 1 ) , (double)( y + 1 ) , p ) ; 
    thisproba2 = betai(  (double)  ( y + 1 )  ,  (double) (x + 1 ) , 1 - p ) ; 
    fprintf( stdout , "P( y <= %d | x = %d ) = %g \n" , y , x , thisproba ) ; 
    fprintf( stdout , "P( y >= %d | x = %d ) = %g \n" , y , x , thisproba2 ) ; 
  } else if( x > -1 && y == -1 ){
    /* Only x is given: search for the window at significance sig.
       A missing -sig used to leave sig uninitialised, and a non-positive
       one can never satisfy the stop condition below (endless loop). */
    if( !have_sig || !( sig > 0.0 ) ){
      usage() ; 
      exit(0) ; 
    }

    ymin =  WNMIN ;
    ymax =  WNMAX ;
    y = 0 ; 
    while( 1 ){
      thisproba = betai( (double) (x + 1 ) , (double)( y + 1 ) , p ) ; 
      thisproba2 = betai(  (double)  ( y + 1 )  ,  (double) (x + 1 ) , 1 - p ) ; 

      /* Largest y whose lower-tail probability is still below sig/2. */
      if( thisproba < sig / 2.0 ){
	ymin = y ; 
      }

      /* First y whose upper-tail probability drops below sig/2. */
      if( thisproba2 < sig / 2.0 ){
	ymax = y ; 
	break ; 
      }
      y++ ; 
    }
    thisproba = betai( (double) (x + 1 ) , (double)( ymin + 1 ) , p ) ; 
    thisproba2 = betai(  (double)  ( ymax + 1 )  ,  (double) (x + 1 ) , 1 - p ) ; 
    fprintf( stderr , "P( y <= %d | x = %d ) = %g \n" , ymin , x , thisproba ) ; 
    fprintf( stderr , "P( y >= %d | x = %d ) = %g \n" ,  ymax , x , thisproba2 ) ; 

    if( ymin == -1 ){
      fprintf( stdout , "%d *--%d\n" , x , ymax) ;
    } else {
      fprintf( stdout , "%d %d--%d\n" , x , ymin , ymax) ;
    }
  } else {
    /* No usable -xvalue (or a negative -yvalue): previously the program
       did nothing and said nothing. */
    usage() ; 
    exit(0) ; 
  }

  return 0 ; 
}
253 | 
254 | 
255 | 
256 | 
257 | 
258 | 
259 | 
260 | 
261 | 
262 | 
263 | 
264 | 


--------------------------------------------------------------------------------