├── .gitignore
├── .project
├── .pydevproject
├── .settings
└── org.eclipse.core.resources.prefs
├── CHANGELOG.md
├── Dockerfile
├── LICENSE
├── README.md
├── REF
├── Mapabilities
│ └── hg38
│ │ ├── MapabilityExclusion.100bp.bed.gz
│ │ ├── MapabilityExclusion.150.bed.gz
│ │ └── MapabilityExclusion.70bp.bed.gz
└── extra-input-files
│ ├── Human_hg19_nonPolyA_ROI.bed
│ ├── Human_hg19_wgEncodeDacMapabilityConsensusExcludable.bed.gz
│ ├── Human_hg38_nonPolyA_ROI.bed
│ ├── Mouse_mm10_nonPolyA_ROI.bed
│ ├── Mouse_mm9_nonPolyA_ROI.bed
│ ├── RNA.SpikeIn.ERCC.fasta.gz
│ └── URLs
├── bin
├── AdaptorDetect.pl
├── DESeq2Constructor.R
├── IRFinder
├── IRFinderBAM
├── IRFinderBuildRef
├── IRFinderBuildRefDownload
├── IRFinderBuildRefFromSTARRef
├── IRFinderBuildRefProcess
├── IRFinderDiff
├── IRFinderFastQ
├── IRFinderLong
├── TrimBAM4IGV
├── analysisWithLowReplicates.pl
├── analysisWithNoReplicates.pl
└── util
│ ├── Build-BED-refs.sh
│ ├── IRFinder-BuildRefFromEnsembl
│ ├── IntronExclusion.pl
│ ├── Mapability
│ ├── adjust.R
│ ├── bash_utils.sh
│ ├── bed-to-intron+exon.pl
│ ├── deseq2.R
│ ├── generateReadsError.pl
│ ├── gtf2bed-custom.pl
│ ├── irfinder
│ ├── irfinder_cnn
│ ├── model
│ ├── best_model.h5
│ ├── best_model.tflite
│ └── model_info.json
│ ├── trim
│ ├── warnings
│ └── winflat
├── install.sh
└── src
├── cnnfilter
├── cnnfilter
│ ├── actions
│ │ ├── extract.py
│ │ ├── models.py
│ │ ├── resultgraph.py
│ │ └── selectclass.py
│ ├── main.py
│ ├── model
│ │ ├── best_model.h5
│ │ └── model_info.json
│ └── utils
│ │ └── reader.py
└── testCNN
│ ├── actions
│ ├── extract.py
│ └── models.py
│ ├── irfinder_cnn.py
│ ├── model
│ ├── best_model.h5
│ └── model_info.json
│ └── utils
│ └── reader.py
├── irfinder
├── .cproject
├── .project
├── .settings
│ ├── language.settings.xml
│ └── org.eclipse.cdt.core.prefs
├── Release
│ ├── makefile
│ ├── objects.mk
│ ├── sources.mk
│ └── src
│ │ ├── Blocks
│ │ └── subdir.mk
│ │ ├── ReadBlock
│ │ └── subdir.mk
│ │ ├── Utils
│ │ └── subdir.mk
│ │ └── subdir.mk
└── src
│ ├── Blocks
│ ├── BAM2blocks.cpp
│ ├── BAM2blocks.h
│ ├── CoverageBlock.cpp
│ ├── CoverageBlock.h
│ ├── FragmentBlocks.cpp
│ └── FragmentBlocks.h
│ ├── IRFinder2.cpp
│ ├── ReadBlock
│ ├── CoverageBlocks.cpp
│ ├── CoverageBlocks.h
│ ├── ReadBlockProcessor.cpp
│ └── ReadBlockProcessor.h
│ └── Utils
│ ├── crc32.cpp
│ ├── crc32.h
│ └── includedefine.h
├── trim
├── Makefile
├── TrimReads.cpp
├── TrimReads.h
├── includedefine.h
├── sequenceTools.cpp
├── sequenceTools.h
└── trim.cpp
└── winflat
├── Makefile
├── README
├── runtest.sh
├── winflat
└── winflat_with_beta.c
/.gitignore:
--------------------------------------------------------------------------------
1 | img/
2 | docker_routine.sh
3 | src/irfinder/Release/irfinder
4 | src/irfinder/Release/**/*.o
5 | src/irfinder/Release/**/*.d
6 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | IRFinder
4 |
5 |
6 |
7 |
8 |
9 | org.python.pydev.PyDevBuilder
10 |
11 |
12 |
13 |
14 |
15 | org.python.pydev.pythonNature
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | python3
5 |
6 | python interpreter
7 |
8 |
9 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.core.resources.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | encoding//src/cnnfilter/cnnfilter/main.py=utf-8
3 | encoding//src/cnnfilter/testCNN/irfinder_cnn.py=utf-8
4 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 |
2 | # IRFinder Changelogs
3 |
4 | **2.0.0**
5 | 1. **Novelties**
6 | 1. New **Long** RunMode to process fast[q|a] files from long reads
7 | using Minimap2 as aligner.
8 | 2. New **-l** argument in **BAM** RunMode, to process long reads using an alternative algorithm. More information in the paper.
9 | 3. New **AI** process that uses a CNN model to detect false IR events on introns without warning in the last column of the result `IRFinder-IR-[non]dir.txt` file. It will generate a file containing only validated introns ( `IRFinder-IR-[non]dir-val.txt` )
10 | 4. New **Diff** RunMode that uses SUPPA2 ( https://github.com/comprna/SUPPA ) or DESeq2 algorithm to identify differential IR events.
11 | 5. New **CLI** with dedicated helps for each RunMode and a verbose mode.
12 | 6. New **installation script**, to check the dependencies and install or uninstall IRFinder globally and locally.
13 |
14 | 7. **Docker** and **Singularity** images available, based on Ubuntu 18 LTS ( bionic ) and containing IRFinder and all its dependencies ( latest versions of STAR, Minimap2 and SUPPA2).
15 |
16 | 2. **Major changes** ( can impact the results between different versions )
17 | 1. **NonUniformIntronCover** warning threshold simplified: now it uses the 25th/50th/75th percentile of intronic depth. Changed from:
18 | ``
19 | (max(Column13, Column14) > 2 + Column9 && max(Column13, Column14) > Column9 * 1.5 ) || (min(Column13, Column14) + 2 < Column9 && min(Column13, Column14)*1.5 < Column9 )
20 | ``
21 | to
22 | `` Column12-Column10 > Column11 ``
23 | 2. **Default Mapability read length** is now 100 instead of 70. It is no longer hard-coded and can be changed with the argument **-n** in the RunModes *BuildRef[Process|FromSTARRef]*
24 | 3. **Paired reads with one pair unmapped** are now processed as single reads instead of being removed.
25 |
26 | 3. **Minor changes** ( no impact on the results but improves the usability)
27 | 1. The mapability file can be given as argument **-M** in the RunModes *BuildRef[Process|FromSTARRef]*. Precomputed mapabilities for hg38 are available under the git subdirectory */REF/Mapabilities/hg38/* for different read lengths ( 70, 100 and 150). This drastically reduces the time needed to build the IRFinder reference.
28 | 2. New argument **-l** in *BuildRefFromSTARRef* to create a **l**ink to the existing reference STAR folder, the genome file and the annotation file, instead of copying them. This will save disk space in case of multiple IRFinder reference directories using the same STAR reference.
29 |
30 | **1.3.1**
31 | 1. IRFinder now exits immediately after error, instead of trying to complete the remaining processes.
32 | 2. Improved Perl version judgement during Phase 3 of reference preparation.
33 |
34 | **1.3.0**
35 | New features:
36 | 1. New `BuildRefFromSTARRef` mode. This allows users to use an existing STAR reference to build IRFinder reference, which significantly reduces the total preparation time. This new mode also tries to automatically figure out the original FASTA and GTF files used to generate the existing STAR reference. Call `IRFinder -h` for more details.
37 | 2. `BuildRef` and `BuildRefProcess` mode now support `-j` option to parse an integer that changes the default value of `--sjdbOverhang` argument in STAR.
38 | 3. `FASTQ` mode now supports `-y` option to feed extra STAR arguments to control alignment behaviors.
39 |
40 | Improvements:
41 | 1. `FASTQ` mode now outputs a full BAM file in "Unsorted.bam", instead of a BAM file with a trimmed QS column.
42 | 2. IRFinder does not automatically generate "unsorted.frag.bam" to save disk space and to avoid redundancy to "Unsorted.bam". Instead, IRFinder now provides a tool at `bin/TrimBAM4IGV` to generate this kind of trimmed BAM file to facilitate visualization purpose in IGV.
43 | 3. Re-design of standard output information during IRFinder reference preparation. It is now easier to recognize errors that occurred.
44 | 4. Usage information now can be viewed by `-h` option.
45 |
46 | Bug fixes:
47 | 1. The mapability calculation during the IRFinder reference preparation stage has been re-designed. The previous algorithm encountered buffer size issues when dealing with genomes with a huge amount of chromosomes/scaffolds. This has been fixed. Please note, the new algorithm requires `samtools` (>=1.4) executable binary ready in $PATH.
48 | 2. Since Perl 5.28.0, `sort '_mergesort'` is no longer supported. IRFinder now checks the Perl version and uses `sort` functions correspondingly.
49 |
50 | **1.2.6**
51 | 1. IRFinder now keeps introns with the same effective regions as separate entries in the reference.
52 | 2. IRFinder now automatically checks if the reference preparation stage generates empty reference files, which indicates process failure.
53 | 3. The R object generated by Differential IR Analysis script now includes an additional slot named "MaxSplice", which represents the maximum splice reads at either end of introns. Each value is the maximum value between Column 17 and 18 in the IR quantification output.
54 | 4. During differential IR analysis, values in "MaxSplice" are now used as the denominators in the GLM, instead of using the values of Column 19 in the IR quantification output. This makes the IR ratio in the differential IR analysis more consistent with the values of Column 20 in the IR quantification output.
55 | 5. User manual has been updated.
56 |
57 | **1.2.5**
58 | 1. Headers are now correctly added to output files `IRFinder-IR-dir.txt` and `IRFinder-IR-nondir.txt`.
59 |
60 | **1.2.4**
61 | 1. In the GLM-based method for differential IR comparison, the original matrix for DESeq2 is now made up of IR depth and correct splicing depth. In the previous versions, the latter was the sum of splicing depth and IR depth. This change is supposed to give a smoother dispersion estimation across all introns.
62 |
63 | **1.2.3:**
64 | 1. IRFinder now supports GTF attribution tags `gene_type` and `transcript_type` upon the original requirement for typical Ensembl tags `gene_biotype` and `transcript_biotype`. Either of these two pairs is required to correctly build IRFinder reference.
65 |
66 | **1.2.2:**
67 | 1. In GLM-based differential IR comparison, fixed an error caused by duplicated row names when creating DESeq2 object with a version of DESeq2 later than 1.10.
68 |
69 | **1.2.1:**
70 | 1. Improved the performance of DESeq2-based GLM analysis for differential IR. This new approach should improve the estimation of dispersion. Normal splicing from IRFinder result is now used as a variable in the GLM, instead of using the value of normal splicing as an offset. This approach is adapted from [detection of allele-specific expression](http://rpubs.com/mikelove/ase) from Michael Love. See Wiki page for details.
71 | 2. Updated some out-of-date usage information
72 |
73 | **1.2.0:**
74 | 1. IRFinder is now compatible with GLM-based analysis. This is achieved by passing IRFinder result to DESeq2 using the function in bin/DESeq2Constructor.R. See Wiki page for details
75 | 2. Fixed the conflict with latest version "bedtools complement" that used to cause failure in preparing IRFinder reference
76 | 3. Improved memory usage when passing lines to bedtools genomecov. This is also supposed to benefit reference preparation of those genomes with a lot of chromosomes contigs. Thanks for the smart solution from Andreas @andpet0101.
77 | 4. Specified the gtf file to be downloaded during reference preparation via automatic downloading. Ensembl currently holds several versions of gtf files for the same genome release. This confused IRFinder BuildRefDownload function in the previous version.
78 | 5. Added -v option to print out version number.
79 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | ARG R_VERSION=4.1.2
2 | # Base image: rocker/r-ver at the R version selected by the R_VERSION build argument above.
3 | FROM rocker/r-ver:${R_VERSION}
4 |
5 | LABEL version=v2.0.1
6 |
7 | # Environment: search /usr/local/lib for shared libraries, set PYTHONNOUSERSITE, and put /Utils/bin (where the tools below are symlinked) first on PATH.
8 | ENV LD_LIBRARY_PATH="/usr/local/lib/:$LD_LIBRARY_PATH"
9 | ENV PYTHONNOUSERSITE="true"
10 | ENV PATH="/Utils/bin/:${PATH}"
11 |
12 | ARG DEBIAN_FRONTEND=noninteractive
13 |
14 | ### System dependencies from apt (build toolchain, bedtools, samtools, git, Python 3 and pip); the apt package lists are deleted at the end of the same layer.
15 | RUN apt-get update && \
16 | apt-get -y upgrade && \
17 | export DEBIAN_FRONTEND=noninteractive && \
18 | apt-get install -qy make build-essential libxml2-dev libcurl4-openssl-dev gcc bedtools samtools git gzip \
19 | zlib1g gawk libz-dev wget libboost-iostreams-dev python3.6 apt-transport-https software-properties-common \
20 | python3-pip && \
21 | apt-get clean && apt-get purge && \
22 | rm -rf /var/lib/apt/lists/*
23 | # Python packages installed (or upgraded) with pip, without keeping pip's download cache.
24 | RUN pip3 install -U --no-cache-dir numpy pandas \
25 | scikit-learn scipy \
26 | statsmodels
27 | # R packages installed through BiocManager; the second Rscript/awk command appears to print the installed DESeq2 version to the build log.
28 | RUN Rscript -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager", force=TRUE) } ; BiocManager::install(c("BiocManager", "tximport", "readr", "RCurl", "DESeq2"), force=TRUE,ask=F, quiet=F)' && \
29 | Rscript -e 'options(warn=2); installed.packages()' | awk 'BEGIN {v=0} $1=="Version" {v=1; } v==1 && $1 == "DESeq2" { gsub("\"", ""); print $2;v=0 } '
30 |
31 |
32 | # Build STAR (tag 2.7.9a) from source and clone SUPPA; a python3 shebang line is prepended to suppa.py and it is made executable. Both are symlinked into /Utils/bin.
33 | RUN mkdir -p /Utils/bin/ && \
34 | cd /Utils/ && \
35 | git clone https://github.com/alexdobin/STAR.git && \
36 | cd ./STAR && git checkout tags/2.7.9a && \
37 | cd ./source && \
38 | make STAR && \
39 | ln -s /Utils/STAR/source/STAR /Utils/bin/STAR && \
40 | cd /Utils && \
41 | git clone https://github.com/comprna/SUPPA.git && \
42 | cd ./SUPPA && \
43 | echo '#!/usr/bin/env python3' > /Utils/SUPPA/suppa.py.tmp && \
44 | cat /Utils/SUPPA/suppa.py >> /Utils/SUPPA/suppa.py.tmp && \
45 | mv /Utils/SUPPA/suppa.py.tmp /Utils/SUPPA/suppa.py && \
46 | chmod +x /Utils/SUPPA/suppa.py && \
47 | ln -s /Utils/SUPPA/suppa.py /Utils/bin/suppa.py
48 | # Build minimap2 (tag v2.3) from source and symlink the binary into /Utils/bin.
49 | RUN cd /Utils/ && git clone https://github.com/lh3/minimap2 && \
50 | cd minimap2 && git checkout tags/v2.3 && make && \
51 | ln -s /Utils/minimap2/minimap2 /Utils/bin/minimap2
52 |
53 | # NOTE(review): presumably a cache buster - the random bytes fetched here differ on each build, so the layers below are rebuilt every time; confirm this is intended.
54 | ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache
55 | # Copy the IRFinder scripts, reference files, sources and installer into /IRFinder, then run the installer.
56 | COPY ./bin /IRFinder/bin
57 | COPY ./REF /IRFinder/REF
58 | COPY ./src /IRFinder/src
59 | COPY ./install.sh /IRFinder/
60 | RUN cd /IRFinder/ && \
61 | ./install.sh
62 |
63 |
64 |
65 | # The container starts IRFinder as its entrypoint (exec form).
66 | ENTRYPOINT ["IRFinder"]
67 |
68 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016 william ritchie
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # IRFinder-S
3 | IRFinder-S is a suite of tools to analyse and explore intron retention events in multiple samples. It comprises:
4 |
5 | - IRFinder : detect intron retention from RNA-Seq experiments. Includes an automatic CNN filter that emulates a visual inspection to validate the events.
6 | - IRBase : visualize and share IRFinder's results.
7 |
8 | To start using IRFinder, read our [wiki user manual.](https://github.com/RitchieLabIGH/IRFinder/wiki)
9 |
10 | [CHANGELOG](CHANGELOG.md)
11 |
12 | IRFinder Version 1 is still available at https://github.com/williamritchie/IRFinder but is no longer maintained.
13 | ## About IRFinder
14 |
15 | IRFinder, developed at the [Center for Genomic Medicine of Massachusetts General Hospital](https://cgm.massgeneral.org/), the [CNRS](http://www.cnrs.com) and the [Centenary Institute](https://www.centenary.org.au), implements an end-to-end analysis of intron retention (IR) from mRNA sequencing data in multiple species.
16 | IRFinder includes alignment via the STAR (for short reads) and minimap2 (for long reads) algorithms, quality controls on the samples analyzed, IR detection, quantification, convolutional neural network based validation and statistical comparison between multiple samples.
17 | IRFinder was capable of estimating IR events with low coverage or low mappability as confirmed by RT-qPCR.
18 |
19 |
20 |
21 | ## Before Start: Intron Retention Database - [IRBase](http://irbase.igh.cnrs.fr/)
22 | Before diving into the IRFinder package, users might also consider [IRBase](http://irbase.igh.cnrs.fr/). It is a database for human IR inquiry and visualization, based upon pre-calculated IRFinder results from **over 935** publicly available human cell line RNA-Seq samples.
23 | [IRBase](http://irbase.igh.cnrs.fr/) allows users to enquire, visualize and download single-gene IR results in a tissue/cell-type of interest, download transcriptome-wide IR results of a sample of interest, upload your results to compare with the public ones and share them with the community.
24 |
25 |
26 | ## Cite IRFinder
27 |
28 | Lorenzi, C., Barriere, S., Arnold, K. et al. IRFinder-S: a comprehensive suite to discover and explore intron retention. [Genome Biol 22, 307 (2021)](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02515-8). doi: [10.1186/s13059-021-02515-8](https://doi.org/10.1186/s13059-021-02515-8)
29 |
30 | Middleton R*, Gao D*, Thomas A, Singh B, Au A, Wong JJ, Bomane A, Cosson B, Eyras E, Rasko JE, Ritchie W. **IRFinder: assessing the impact of intron retention on mammalian gene expression**. [Genome Biol. 2017 Mar 15;18(1):51](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-017-1184-4). doi: 10.1186/s13059-017-1184-4. [PubMed PMID: 28298237](https://pubmed.ncbi.nlm.nih.gov/28298237/).
31 |
32 |
--------------------------------------------------------------------------------
/REF/Mapabilities/hg38/MapabilityExclusion.100bp.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/REF/Mapabilities/hg38/MapabilityExclusion.100bp.bed.gz
--------------------------------------------------------------------------------
/REF/Mapabilities/hg38/MapabilityExclusion.150.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/REF/Mapabilities/hg38/MapabilityExclusion.150.bed.gz
--------------------------------------------------------------------------------
/REF/Mapabilities/hg38/MapabilityExclusion.70bp.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/REF/Mapabilities/hg38/MapabilityExclusion.70bp.bed.gz
--------------------------------------------------------------------------------
/REF/extra-input-files/Human_hg19_nonPolyA_ROI.bed:
--------------------------------------------------------------------------------
1 | 1 28160912 28161077
2 | 1 28975112 28975245
3 | 1 32674695 32681797
4 | 1 45241536 45241615
5 | 1 45242162 45242265
6 | 1 109642815 109643241
7 | 1 149754245 149783928
8 | 1 149784826 149785236
9 | 1 149858525 149858961
10 | 1 149859019 149859466
11 | 1 155895749 155895877
12 | 1 228645065 228645560
13 | 1 228645808 228646259
14 | 1 235291118 235291252
15 | 10 101996913 101997059
16 | 11 811681 811814
17 | 11 2985001 2985123
18 | 11 8705774 8705903
19 | 11 9450320 9450501
20 | 11 10823014 10823155
21 | 11 62432894 62433042
22 | 11 75111435 75111582
23 | 11 75115465 75115610
24 | 11 93464145 93464265
25 | 11 93465527 93465665
26 | 11 93466632 93466763
27 | 11 93468277 93468402
28 | 12 6619388 6619717
29 | 12 7076500 7076769
30 | 12 14920933 14924065
31 | 12 49048165 49048301
32 | 12 62995531 62997214
33 | 12 98993413 98993661
34 | 12 132515769 132515904
35 | 13 27829538 27829663
36 | 13 45911615 45911744
37 | 14 20811207 20811844
38 | 14 21860309 21860412
39 | 14 21865451 21865560
40 | 14 95999692 95999966
41 | 14 103804186 103804311
42 | 15 66795581 66795652
43 | 16 2205024 2205106
44 | 16 58582403 58582537
45 | 17 7809440 7809578
46 | 17 37009116 37009247
47 | 17 58308877 58309007
48 | 18 51748654 51748782
49 | 19 17973397 17973529
50 | 19 57791419 57804937
51 | 2 86362993 86363129
52 | 2 234184373 234184648
53 | 2 234197322 234197586
54 | 20 17943353 17943589
55 | 20 37053843 37053979
56 | 20 37058313 37058446
57 | 20 37062508 37062641
58 | 20 47895477 47895565
59 | 20 47896856 47896946
60 | 21 33749496 33749631
61 | 3 39449880 39450030
62 | 3 39452545 39452697
63 | 3 160232695 160233024
64 | 3 169482308 169482848
65 | 3 186502585 186502653
66 | 3 186504464 186504641
67 | 3 186505089 186505220
68 | 4 53579416 53579537
69 | 5 82360023 82360156
70 | 5 111497182 111497314
71 | 5 138614470 138614667
72 | 5 140090860 140090958
73 | 5 172447731 172447931
74 | 6 26020718 26021186
75 | 6 26021907 26022278
76 | 6 26027124 26027480
77 | 6 26031817 26032288
78 | 6 26033320 26033796
79 | 6 26043455 26043885
80 | 6 26045639 26046097
81 | 6 26055968 26056699
82 | 6 26115101 26124154
83 | 6 26124373 26139344
84 | 6 26156559 26157343
85 | 6 26158349 26171577
86 | 6 26188938 26189304
87 | 6 26197068 26199521
88 | 6 26199748 26200942
89 | 6 26216428 26216872
90 | 6 26217165 26217711
91 | 6 26225383 26225844
92 | 6 26250370 26250835
93 | 6 26251879 26252303
94 | 6 26271146 26271612
95 | 6 26273144 26273622
96 | 6 27093676 27100541
97 | 6 27100832 27103070
98 | 6 27114861 27115317
99 | 6 27775257 27775709
100 | 6 27777842 27778314
101 | 6 27782112 27782607
102 | 6 27782822 27783267
103 | 6 27805658 27806117
104 | 6 27806323 27823487
105 | 6 27834570 27835359
106 | 6 27858093 27860963
107 | 6 27861203 27861669
108 | 6 116440086 116479910
109 | 6 160201282 160201413
110 | 7 45143948 45144081
111 | 7 45144505 45144641
112 | 8 99054314 99054445
113 | 8 128959126 128960591
114 | 9 35657748 35658015
115 | 9 95054743 95054875
116 | 9 125796806 125797975
117 | 9 130210780 130210909
118 |
--------------------------------------------------------------------------------
/REF/extra-input-files/Human_hg19_wgEncodeDacMapabilityConsensusExcludable.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/REF/extra-input-files/Human_hg19_wgEncodeDacMapabilityConsensusExcludable.bed.gz
--------------------------------------------------------------------------------
/REF/extra-input-files/Human_hg38_nonPolyA_ROI.bed:
--------------------------------------------------------------------------------
1 | 1 27834401 27834566
2 | 1 28648600 28648733
3 | 1 32209094 32216196
4 | 1 44775864 44775943
5 | 1 44776490 44776593
6 | 1 109100193 109100619
7 | 1 149782689 149812373
8 | 1 149813271 149813681
9 | 1 149886975 149887411
10 | 1 149887469 149887916
11 | 1 155925958 155926086
12 | 1 228457364 228457859
13 | 1 228458107 228458558
14 | 1 235127803 235127937
15 | 10 100237156 100237302
16 | 11 811681 811814
17 | 11 2963771 2963893
18 | 11 8684227 8684356
19 | 11 9428773 9428954
20 | 11 10801467 10801608
21 | 11 62665422 62665570
22 | 11 75400391 75400538
23 | 11 75404421 75404566
24 | 11 93730979 93731099
25 | 11 93732361 93732499
26 | 11 93733466 93733597
27 | 11 93735111 93735236
28 | 12 6510222 6510551
29 | 12 6967337 6967606
30 | 12 14767999 14771131
31 | 12 48654382 48654518
32 | 12 62601751 62603434
33 | 12 98599635 98599883
34 | 12 132031224 132031359
35 | 13 27255401 27255526
36 | 13 45337480 45337609
37 | 14 20343048 20343685
38 | 14 21392150 21392253
39 | 14 21397292 21397401
40 | 14 95533355 95533629
41 | 14 103337849 103337974
42 | 15 66503243 66503314
43 | 16 2155023 2155105
44 | 16 58548499 58548633
45 | 17 7906122 7906260
46 | 17 38852863 38852994
47 | 17 60231516 60231646
48 | 18 54222284 54222412
49 | 19 17862588 17862720
50 | 19 57280051 57293569
51 | 2 86135870 86136006
52 | 2 233275727 233276002
53 | 2 233288676 233288940
54 | 20 17962710 17962946
55 | 20 38425195 38425331
56 | 20 38429670 38429803
57 | 20 38433865 38433998
58 | 20 49278940 49279028
59 | 20 49280319 49280409
60 | 21 32377187 32377322
61 | 3 39408389 39408539
62 | 3 39411054 39411206
63 | 3 160514907 160515236
64 | 3 169764520 169765060
65 | 3 186784796 186784864
66 | 3 186786675 186786852
67 | 3 186787300 186787431
68 | 4 52713249 52713370
69 | 5 83064204 83064337
70 | 5 112161485 112161617
71 | 5 139278781 139278978
72 | 5 140711275 140711373
73 | 5 173020728 173020928
74 | 6 26020490 26020958
75 | 6 26021679 26022050
76 | 6 26026896 26027252
77 | 6 26031589 26032060
78 | 6 26033092 26033568
79 | 6 26043227 26043657
80 | 6 26045411 26045869
81 | 6 26055740 26056471
82 | 6 26114873 26123926
83 | 6 26124145 26139116
84 | 6 26156331 26157115
85 | 6 26158121 26171349
86 | 6 26188710 26189076
87 | 6 26196840 26199293
88 | 6 26199520 26200714
89 | 6 26216200 26216644
90 | 6 26216937 26217483
91 | 6 26225155 26225616
92 | 6 26250142 26250607
93 | 6 26251651 26252075
94 | 6 26270918 26271384
95 | 6 26272916 26273394
96 | 6 27125897 27132762
97 | 6 27133053 27135291
98 | 6 27147082 27147538
99 | 6 27807479 27807931
100 | 6 27810064 27810536
101 | 6 27814334 27814829
102 | 6 27815044 27815489
103 | 6 27837880 27838339
104 | 6 27838545 27855709
105 | 6 27866792 27867581
106 | 6 27890315 27893185
107 | 6 27893425 27893891
108 | 6 116118923 116158747
109 | 6 159780250 159780381
110 | 7 45104349 45104482
111 | 7 45104906 45105042
112 | 8 98042086 98042217
113 | 8 127946880 127948345
114 | 9 35657751 35658018
115 | 9 92292461 92292593
116 | 9 123034527 123035696
117 | 9 127448501 127448630
118 |
--------------------------------------------------------------------------------
/REF/extra-input-files/Mouse_mm10_nonPolyA_ROI.bed:
--------------------------------------------------------------------------------
1 | 1 86099026 86111970
2 | 1 87776938 87777209
3 | 1 87784556 87784820
4 | 1 127375131 127375239
5 | 1 133601223 133601355
6 | 10 11959880 11960005
7 | 10 34389981 34397085
8 | 10 91118291 91118536
9 | 10 117183335 117183411
10 | 10 125226464 125328963
11 | 11 6619808 6619940
12 | 11 6620319 6620454
13 | 11 11913621 11913748
14 | 11 20847753 20847881
15 | 11 58948911 58949532
16 | 11 58954685 58956674
17 | 11 69350518 69350661
18 | 11 69666936 69672423
19 | 11 97777527 97782437
20 | 11 99017031 99017152
21 | 11 106501245 106501377
22 | 11 116163910 116164036
23 | 12 16939994 16940115
24 | 12 31258933 31259062
25 | 12 105031075 105031349
26 | 12 111540941 111541067
27 | 13 21715763 21716143
28 | 13 21717628 21718115
29 | 13 21722098 21722478
30 | 13 21735062 21735837
31 | 13 21750194 21750505
32 | 13 21753395 21753907
33 | 13 21754123 21754503
34 | 13 21779883 21780625
35 | 13 21782915 21783397
36 | 13 21786826 21787218
37 | 13 21787461 21789213
38 | 13 21806412 21810199
39 | 13 21810465 21810944
40 | 13 21811746 21812150
41 | 13 21831767 21832196
42 | 13 21833057 21833575
43 | 13 21833743 21837530
44 | 13 22035113 22035643
45 | 13 22035821 22036299
46 | 13 22040816 22041352
47 | 13 22042479 22042949
48 | 13 22043214 22043676
49 | 13 23531044 23531519
50 | 13 23533906 23534304
51 | 13 23535422 23535951
52 | 13 23542924 23543359
53 | 13 23544052 23545055
54 | 13 23551258 23551648
55 | 13 23570517 23571220
56 | 13 23571396 23572013
57 | 13 23573736 23574196
58 | 13 23574381 23574952
59 | 13 23581598 23581990
60 | 13 23583742 23621124
61 | 13 23621755 23622502
62 | 13 23684199 23692488
63 | 13 23738807 23740366
64 | 13 23744973 23745602
65 | 13 23746734 23747202
66 | 13 23751088 23751593
67 | 13 23756937 23757427
68 | 13 23760802 23761249
69 | 13 23761853 23762386
70 | 13 24811447 24811568
71 | 13 49684301 49684433
72 | 13 51202688 51203065
73 | 13 62136762 62136831
74 | 13 62543434 62543503
75 | 13 74371426 74376566
76 | 13 75905147 75905278
77 | 13 95332618 95332749
78 | 14 11227552 11227992
79 | 14 26497655 26497785
80 | 14 32191854 32192050
81 | 14 52209785 52209894
82 | 14 57333366 57333485
83 | 14 57697809 57697937
84 | 15 71794188 71794365
85 | 15 98519716 98519853
86 | 16 23107444 23114136
87 | 16 43886682 43886814
88 | 16 52105015 52105147
89 | 16 53404086 53404193
90 | 16 54354462 54354593
91 | 16 71663788 71663918
92 | 16 90602496 90602617
93 | 17 12922790 12922917
94 | 17 24528476 24528553
95 | 17 55915403 55915870
96 | 18 33795164 33795295
97 | 18 35557032 35557227
98 | 18 36801866 36802008
99 | 19 8888538 8888685
100 | 19 20033504 20033635
101 | 19 44113979 44114124
102 | 19 46359036 46359144
103 | 19 47170469 47171134
104 | 2 19934863 19934953
105 | 2 32675109 32675245
106 | 2 32963291 32963420
107 | 2 37516332 37520603
108 | 2 38997476 39006168
109 | 2 85420137 85420280
110 | 2 144265979 144266216
111 | 2 158356424 158356554
112 | 2 158358360 158358472
113 | 2 158359798 158359929
114 | 2 158378222 158378354
115 | 2 167063473 167063565
116 | 2 167064998 167065086
117 | 2 171306168 171306431
118 | 3 24333046 24333552
119 | 3 30595346 30595757
120 | 3 69085105 69085447
121 | 3 86100534 86100655
122 | 3 88693930 88694057
123 | 3 96219865 96220308
124 | 3 96220361 96220880
125 | 3 96221121 96223738
126 | 3 96238110 96239127
127 | 3 96261704 96263311
128 | 3 96269721 96279001
129 | 3 96414437 96414859
130 | 3 108554338 108554751
131 | 3 128540372 128540480
132 | 3 150072590 150073620
133 | 4 43492788 43493058
134 | 4 117153827 117156243
135 | 4 129608331 129614257
136 | 4 132270080 132270213
137 | 4 132838383 132838547
138 | 4 134167808 134167895
139 | 5 92429785 92429928
140 | 5 100831281 100831414
141 | 5 110692049 110692181
142 | 5 146832890 146837032
143 | 5 149145724 149145821
144 | 6 8501236 8501356
145 | 6 39422289 39422419
146 | 6 52639234 52639355
147 | 6 71882557 71882693
148 | 6 124715232 124715502
149 | 6 132656957 132657844
150 | 6 132777179 132778162
151 | 6 136801553 136804431
152 | 7 97521808 97521916
153 | 7 99479563 99479707
154 | 7 99482785 99482932
155 | 7 109519147 109522367
156 | 7 110023210 110023342
157 | 7 110046364 110046547
158 | 7 111076060 111076227
159 | 7 118153480 118153610
160 | 7 127527874 127528003
161 | 7 141447370 141451585
162 | 7 143531394 143531520
163 | 8 13876097 13876226
164 | 8 57549775 57549888
165 | 8 69742862 69774886
166 | 8 70894722 70897443
167 | 8 95746060 95746195
168 | 8 110923116 110923263
169 | 8 121666628 121666706
170 | 9 3352657 3352786
171 | 9 15306214 15312104
172 | 9 15313802 15313932
173 | 9 15314845 15314981
174 | 9 15316489 15316588
175 | 9 64173387 64178562
176 | 9 120128780 120128935
177 | X 35838127 35838401
178 | X 93164902 93164984
179 | X 121308217 121308340
180 | X 156455999 156456095
181 |
--------------------------------------------------------------------------------
/REF/extra-input-files/Mouse_mm9_nonPolyA_ROI.bed:
--------------------------------------------------------------------------------
1 | 1 87995601 88008545
2 | 1 89673513 89673784
3 | 1 89681131 89681395
4 | 1 129271708 129271816
5 | 1 135497800 135497932
6 | 10 11679678 11679803
7 | 10 34109787 34116891
8 | 10 90581036 90581281
9 | 10 116620391 116620467
10 | 10 124663520 124766019
11 | 11 6519811 6519943
12 | 11 6520322 6520457
13 | 11 11813624 11813751
14 | 11 20747756 20747884
15 | 11 58762413 58763034
16 | 11 58768187 58770176
17 | 11 69164020 69164163
18 | 11 69480438 69485925
19 | 11 97638841 97643751
20 | 11 98878345 98878466
21 | 11 106362559 106362691
22 | 11 116025224 116025350
23 | 12 16946800 16946921
24 | 12 31943798 31943927
25 | 12 106269285 106269559
26 | 12 112779152 112779278
27 | 13 21807632 21808012
28 | 13 21809497 21809984
29 | 13 21813967 21814347
30 | 13 21826931 21827706
31 | 13 21842063 21842374
32 | 13 21845264 21845776
33 | 13 21845992 21846372
34 | 13 21871752 21872494
35 | 13 21874784 21875266
36 | 13 21878695 21879087
37 | 13 21879330 21881082
38 | 13 21898281 21902068
39 | 13 21902334 21902813
40 | 13 21903615 21904019
41 | 13 21923636 21924065
42 | 13 21924926 21925444
43 | 13 21925612 21929399
44 | 13 22126982 22127512
45 | 13 22127690 22128168
46 | 13 22132685 22133221
47 | 13 22134348 22134818
48 | 13 22135083 22135545
49 | 13 23622913 23623388
50 | 13 23625775 23626173
51 | 13 23627291 23627820
52 | 13 23634793 23635228
53 | 13 23635921 23636924
54 | 13 23643127 23643517
55 | 13 23662386 23663089
56 | 13 23663265 23663882
57 | 13 23665605 23666065
58 | 13 23666250 23666821
59 | 13 23673467 23673859
60 | 13 23675611 23712993
61 | 13 23713624 23714371
62 | 13 23776068 23784357
63 | 13 23830676 23832235
64 | 13 23836842 23837471
65 | 13 23838603 23839071
66 | 13 23842957 23843462
67 | 13 23848806 23849296
68 | 13 23852671 23853118
69 | 13 23853722 23854255
70 | 13 24903316 24903437
71 | 13 49779670 49779802
72 | 13 51298057 51298434
73 | 13 62238122 62238191
74 | 13 62644794 62644863
75 | 13 74508874 74514014
76 | 13 76042595 76042726
77 | 13 96102573 96102704
78 | 14 12060066 12060506
79 | 14 27317141 27317271
80 | 14 33005040 33005236
81 | 14 52829460 52829569
82 | 14 57952203 57952322
83 | 14 58316646 58316774
84 | 15 71624618 71624795
85 | 15 98350147 98350284
86 | 16 23107517 23114209
87 | 16 43886795 43886927
88 | 16 52105128 52105260
89 | 16 53404199 53404306
90 | 16 54354575 54354706
91 | 16 71664033 71664163
92 | 16 90602741 90602862
93 | 17 13115656 13115783
94 | 17 24665421 24665498
95 | 17 56054826 56055293
96 | 18 33954818 33954949
97 | 18 35716686 35716881
98 | 18 36961520 36961662
99 | 19 8963028 8963175
100 | 19 20107994 20108125
101 | 19 44188469 44188614
102 | 19 46433526 46433634
103 | 19 47244959 47245624
104 | 2 19856490 19856580
105 | 2 32530629 32530765
106 | 2 32818811 32818940
107 | 2 37371852 37376123
108 | 2 38852996 38861688
109 | 2 85260294 85260437
110 | 2 144091715 144091952
111 | 2 158182160 158182290
112 | 2 158184096 158184208
113 | 2 158185534 158185665
114 | 2 158203958 158204090
115 | 2 166888973 166889065
116 | 2 166890498 166890586
117 | 2 171131668 171131931
118 | 3 24231968 24232474
119 | 3 30494268 30494679
120 | 3 68889027 68889369
121 | 3 85904456 85904577
122 | 3 88497852 88497979
123 | 3 96023788 96024231
124 | 3 96024284 96024803
125 | 3 96025044 96027661
126 | 3 96042033 96043050
127 | 3 96065627 96067234
128 | 3 96073644 96082924
129 | 3 96218360 96218782
130 | 3 108357256 108357669
131 | 3 128243290 128243398
132 | 3 149735554 149736584
133 | 4 43505660 43505930
134 | 4 116826432 116828848
135 | 4 129285575 129291501
136 | 4 131825995 131826128
137 | 4 132394298 132394462
138 | 4 133723723 133723810
139 | 5 92858811 92858954
140 | 5 101260300 101260433
141 | 5 111121068 111121200
142 | 5 147644466 147648608
143 | 5 149957300 149957397
144 | 6 8451236 8451356
145 | 6 39372288 39372418
146 | 6 52589228 52589349
147 | 6 71832551 71832687
148 | 6 124665250 124665520
149 | 6 132606975 132607862
150 | 6 132727197 132728180
151 | 6 136750074 136752952
152 | 7 104670318 104670426
153 | 7 106628073 106628217
154 | 7 106631295 106631442
155 | 7 116662661 116665881
156 | 7 117166724 117166856
157 | 7 117189878 117190061
158 | 7 118219574 118219741
159 | 7 125296994 125297124
160 | 7 134671388 134671517
161 | 7 148633269 148637484
162 | 7 150717299 150717425
163 | 8 13876097 13876226
164 | 8 60028572 60028685
165 | 8 72266761 72298785
166 | 8 73418621 73421342
167 | 8 98269960 98270095
168 | 8 113447016 113447163
169 | 8 124190528 124190606
170 | 9 3352657 3352786
171 | 9 15110658 15116548
172 | 9 15118246 15118376
173 | 9 15119289 15119425
174 | 9 15120933 15121032
175 | 9 64021194 64026369
176 | 9 120037898 120038053
177 | X 33378122 33378396
178 | X 90410241 90410323
179 | X 118421826 118421949
180 | X 152890542 152890638
181 |
--------------------------------------------------------------------------------
/REF/extra-input-files/RNA.SpikeIn.ERCC.fasta.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/REF/extra-input-files/RNA.SpikeIn.ERCC.fasta.gz
--------------------------------------------------------------------------------
/REF/extra-input-files/URLs:
--------------------------------------------------------------------------------
1 | Human:
2 | http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/wgEncodeDacMapabilityConsensusExcludable.txt.gz
3 |
4 | ERCC:
5 | ftp://ftp.ncbi.nlm.nih.gov/repository/acedb/SEQC_Reference_Targets/RNA.SpikeIn.ERCC.fasta.gz
6 |
7 | === Ensembl Base FTP ===
8 |
9 | mm10: (mouse)
10 | ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/dna/
11 |
12 | hg19: (human, popular)
13 | ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/
14 |
15 | hg38: (human, most recent)
16 | ftp://ftp.ensembl.org/pub/release-81/fasta/homo_sapiens/dna/
17 |
--------------------------------------------------------------------------------
/bin/DESeq2Constructor.R:
--------------------------------------------------------------------------------
1 | DESeqDataSetFromIRFinder = function(filePaths,designMatrix,designFormula,irratio_thr=0, warning_filter="^$" ){
2 |   # Build a DESeq2 data set from per-sample IRFinder result tables.
3 |   #   filePaths      : one result file per sample; rows are taken positionally, so every file must list the same introns in the same order.
4 |   #   designMatrix   : one row per file; its first column supplies the sample names used in the column labels.
5 |   #   designFormula  : design passed to DESeqDataSetFromMatrix; the colData gains an 'IRFinder' factor (Splice/IR).
6 |   #   irratio_thr    : an intron is kept when intronDepth/(intronDepth+maxSplice) >= irratio_thr in at least one sample.
7 |   #   warning_filter : regex; an intron is dropped when column 21 matches it in any sample.
8 |   # Returns list(DESeq2Object, IntronDepth, SpliceDepth, MaxSplice); the three matrices are unfiltered and unrounded.
9 |   res=c()
10 |   libsz=c()
11 |   spl=c()
12 |   irtest=read.table(filePaths[1])
13 |   if (irtest[1,1]=="Chr"){irtest=irtest[-1,]}
14 |   irnames=unname(apply(as.matrix(irtest),1,FUN=function(x){return(paste0(x[4],"/",x[1],":",x[2],"-",x[3],":",x[6]))}))
15 |   n=1
16 |   warns=c()
17 |   ratio_mask=c()
18 |   for (i in filePaths){
19 |     print(paste0("processing file ",n," at ",i))
20 |     irtab=read.table(i)
21 |     if (irtab[1,1]=="Chr"){irtab=irtab[-1,]}
22 |     intron_depth=as.numeric(as.vector(irtab[,9]))
23 |     total_splice=as.numeric(as.vector(irtab[,19]))
24 |     splice_17=as.numeric(as.vector(irtab[,17]))
25 |     splice_18=as.numeric(as.vector(irtab[,18]))
26 |     max_splice=pmax(splice_17,splice_18, na.rm=T)
27 |     res=cbind(res,intron_depth)
28 |     libsz=cbind(libsz,total_splice)
29 |     spl=cbind(spl,max_splice)
30 |     if (length(warns) == 0){
31 |       warns= ! grepl(as.character(irtab[,21]), pattern = warning_filter )
32 |     } else {
33 |       warns=warns & ! grepl(as.character(irtab[,21]), pattern = warning_filter )
34 |     }
35 |     ratios=(intron_depth / (max_splice+intron_depth))
36 |     # is.na() also covers NaN (0/0); a plain NA would otherwise leak into the row mask and index NA rows.
37 |     rmsk=(! is.na(ratios)) & ratios >= irratio_thr
38 |     if (length(ratio_mask) == 0 ){
39 |       ratio_mask = rmsk
40 |     } else {
41 |       ratio_mask = ratio_mask | rmsk
42 |     }
43 |     n=n+1
44 |   }
45 |   print(warning_filter)
46 |   print(irratio_thr)
47 |   print(paste0("Warning removed: ", sum(! warns)))
48 |   print(paste0("Ratio removed: ", sum(! ratio_mask)))
49 |   warns=warns & ratio_mask
50 |   print(paste0("Combined removed: ", sum(! warns)))
51 |   # drop=FALSE keeps a matrix even with a single sample or a single surviving intron, so colnames<- below cannot fail.
52 |   res.rd=round(res)[warns,,drop=FALSE]
53 |   libsz.rd=round(libsz)[warns,,drop=FALSE]
54 |   spl.rd=round(spl)[warns,,drop=FALSE]
55 |   colnames(res.rd)=paste("intronDepth",as.vector(designMatrix[,1]),sep=".")
56 |   rownames(res.rd)=irnames[warns]
57 |   colnames(libsz.rd)=paste("totalSplice",as.vector(designMatrix[,1]),sep=".")
58 |   rownames(libsz.rd)=irnames[warns]
59 |   colnames(spl.rd)=paste("maxSplice",as.vector(designMatrix[,1]),sep=".")
60 |   rownames(spl.rd)=irnames[warns]
61 |   ir=c(rep("IR",dim(designMatrix)[1]),rep("Splice",dim(designMatrix)[1]))
62 |   group=rbind(designMatrix,designMatrix)
63 |   group$IRFinder=ir
64 |   group$IRFinder=factor(group$IRFinder,levels=c("Splice","IR"))
65 |   counts.IRFinder=cbind(res.rd,spl.rd)  # intron depth vs. maxSplice (totalSplice is computed but not used as counts)
66 |   dd = DESeqDataSetFromMatrix(countData = counts.IRFinder, colData = group, design = designFormula)
67 |   sizeFactors(dd)=rep(1,dim(group)[1])  # size factors are fixed to 1 for every column
68 |   rownames(dd)=irnames[warns]
69 |   final=list(dd,res,libsz,spl)
70 |   names(final)=c("DESeq2Object","IntronDepth","SpliceDepth","MaxSplice")
71 |   return(final)
72 | }
73 |
--------------------------------------------------------------------------------
/bin/IRFinder:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util
4 | source ${LIBEXEC}/bash_utils.sh
5 |
6 | RUNMODES="FastQ|Long|BAM|BuildRef|BuildRefDownload|BuildRefProcess|BuildRefFromSTARRef|Diff"
7 |
8 | function usage() { # Print the top-level help (version, run modes, global flags) to stderr, then exit 1.
9 | echo "" >&2
10 | echo "IRFinder version: $VERSION" >&2
11 | echo "Usage: IRFinder [${RUNMODES}]" >&2
12 | echo "" >&2
13 | echo "Possible RunModes:" >&2
14 | echo "" >&2
15 | echo " BuildRef: Builds IRFinder reference from Ensembl FTP site. Requires Internet" >&2
16 | echo " BuildRefDownload: Only downloads FASTA and GTF files from Ensembl FTP site, ">&2
17 | echo " without building IRFinder reference. Requires Internet" >&2
18 | echo " BuildRefProcess: Builds IRFinder reference from local FASTA and GTF files" >&2
19 | echo " BuildRefFromSTARRef: Builds IRFinder reference from a local STAR reference" >&2
20 | echo " FastQ: Quantifies intron retention from FASTQ file (Default)" >&2
21 | echo " Long: Quantifies intron retention from FASTQ file of long reads" >&2
22 | echo " BAM: Quantifies intron retention from a BAM file" >&2
23 | echo " Diff: Compare IRrates from two conditions using SUPPA2 algorithm" >&2
24 |
25 | echo "" >&2
26 | echo " -v|--version Show version number of current IRFinder ( when no RunMode is given )." >&2
27 | echo " -h|--help Show this usage information. Dedicated usage informations are given if a RunMode is selected." >&2
28 | echo "" >&2
29 | exit 1 # showing the help always ends the script with a non-zero status
30 | }
31 |
32 | function isRunMode() {
33 | if [[ $1 =~ ${RUNMODES} ]]; then
34 | return 0
35 | else
36 | return 1
37 | fi
38 | }
39 |
40 |
41 |
42 | # === Defaults ===
43 |
44 | RUNMODE=""
45 | export START_MESSAGE=0
46 | EXECDIR=$(dirname "$(readlink -nf "$BASH_SOURCE")")
47 |
48 | if [[ $# -eq 0 || $1 == "-h" || $1 == "--help" ]]; then
49 | usage
50 | fi
51 |
52 | if [[ ( $# -eq 1 && $1 == "-v" ) || $( echo "$@" | grep -c "\-\-version" ) == "1" ]]; then
53 | echo "IRFinder version: $VERSION"
54 | exit
55 | fi
56 |
57 | if [[ $1 =~ ^[^-] ]] ; then
58 | RUNMODE=$(echo $1 | awk -v runm="${RUNMODES}" '{IGNORECASE=1; split(runm, arr, "|"); out=$1; for ( k in arr) { if ( arr[k] == $1 ) { out=arr[k] } }; print out }')
59 | shift;
60 | args=$@
61 | elif [[ $( echo $@ | grep -c "\-m" ) == 1 ]] ; then
62 | RUNMODE=$( echo $@ | awk -v runm="${RUNMODES}" ' {out="" ; IGNORECASE=1; split(runm, arr, "|"); for ( i=1; i<= NF; i++ ) { if ($i == "-m" ) { i=i+1; for ( k in arr ) { if ( arr[k] == $i ) { out=arr[k] } } } }; print out } ' )
63 | if [[ "${RUNMODE}" != "" ]] ; then
64 | args=$( echo $@ | awk '{out=""; for ( i=1; i<=NF; i++) { if ( $i == "-m" ) { i=i+1 } else { out = out " " $i } }; print out} ' )
65 | fi
66 | fi
67 |
68 | if [[ "${RUNMODE}" == "" ]]; then
69 | echo "Possible runmodes: $RUNMODES"
70 | exit 1
71 | fi
72 |
73 |
74 |
75 | if isRunMode "$RUNMODE"; then
76 |     "$EXECDIR/IRFinder${RUNMODE}" ${args}  # args is a flat string built above; left unquoted so it splits back into words
77 | else
78 |     echo "RunMode $RUNMODE not recognized." >&2
79 |     echo "Valid options for Mode are: BuildRef, BuildRefDownload, BuildRefProcess, BuildRefFromSTARRef, BAM, FastQ, Long, Diff. Default: FastQ" >&2
80 |     exit 1  # an unknown run mode is an error; previously the script fell through and exited 0
81 | fi
82 |
83 |
--------------------------------------------------------------------------------
/bin/IRFinderBAM:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util
4 | source $LIBEXEC/bash_utils.sh
5 |
6 |
7 | function usage() { # Print the help for the BAM run mode to stderr, then exit 1.
8 |     echo "" >&2
9 |     echo "IRFinder version: $VERSION" >&2
10 |     echo "Usage: IRFinder BAM -r ReferenceDir [Un]sorted.bam " >&2
11 |     echo "" >&2
12 |     echo " required:" >&2
13 |     echo " [Un]sorted.bam: the target bam file. If paired end and sorted by coordinates, " >&2
14 |     echo " the process will be slightly slower and more memory consuming." >&2
15 |     # TODO - we cannot currently accept fasta input to the trimmer (only fastq), probably should, believe STAR ignores quality anyway, and we strip it on output.
16 |     echo " -r ReferenceDir: As built by the 'BuildRef' option." >&2
17 |     echo "" >&2
18 |     echo " optional:" >&2
19 |     echo " -d string : Output Directory. Default is the current directory." >&2
20 |     echo " -l Long reads flag." >&2
21 |     echo " -j Jitter, consider the position around the splice sites to compensate sequencing errors ( Long reads only, integer number )." >&2
22 |     echo " -v Verbose." >&2
23 |     echo " additional :" >&2
24 |     echo " -R double : Minimum IRratio accepted to consider the intron for the CNN validation. Default: 0.05 " >&2
25 |     echo " -w int : Warning level accepted to consider the intron for the CNN validation. Default: 1" >&2
26 |     echo " 0: Disabled " >&2
27 |     echo " 1: Only without warning " >&2
28 |     echo " 2: Include NonUniformIntronCover " >&2
29 |     echo " 3: Include also MinorIsoform" >&2
30 |     echo " 4: Include also LowSplicing" >&2
31 |     echo " 5: Include also LowCover ( consider all )" >&2
32 |     echo "" >&2
33 |     exit 1
34 | }
35 |
36 |
37 | # === Defaults ===
38 | OUTPUTDIR=.
39 | THREADS=0
40 | REF=
41 | VERBOSE=0
42 | RETRO=0
43 | READ_TYPE="SR"
44 | AI_WARN=1
45 | AI_INTRON=1
46 | AI_RATIO="0.05"
47 | JITTER="3"
48 |
49 | if [ $# -eq 0 ] || [[ $1 == "--help" ]] ; then
50 | usage
51 | fi
52 |
53 | while getopts ":r:t:d:i:w:R:j:vhl" opt; do # leading ':' selects silent mode, so the '\?' and ':' arms report errors themselves
54 |     case $opt in
55 |         r)
56 |             # Reference directory.
57 |             REF=$OPTARG
58 |             ;;
59 |         t)
60 |             ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.) IRFinder is single core for now.
61 |             if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
62 |                 echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2
63 |                 exit 1
64 |             fi
65 |             THREADS=$OPTARG
66 |             ;;
67 |         i)
68 |             if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
69 |                 echo "Argument error: -i $OPTARG, number of intron depth must be an integer." >&2
70 |                 exit 1
71 |             fi
72 |             AI_INTRON=$OPTARG
73 |             ;;
74 |         R)
75 |             if [[ ! $OPTARG =~ ^0\.[0-9]+$ ]]; then
76 |                 echo "Argument error: -R $OPTARG, ratio must be a float number between 0 and 1 not included." >&2
77 |                 exit 1
78 |             fi
79 |             AI_RATIO=$OPTARG
80 |             ;;
81 |         w)
82 |             if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
83 |                 echo "Argument error: -w $OPTARG, warning level must be an integer." >&2
84 |                 exit 1
85 |             fi
86 |             AI_WARN=$OPTARG
87 |             ;;
88 |         j)
89 |             if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
90 |                 echo "Argument error: -j $OPTARG, jitter must be an integer." >&2
91 |                 exit 1
92 |             fi
93 |             JITTER=$OPTARG
94 |             ;;
95 |         d)
96 |             OUTPUTDIR=$OPTARG
97 |             ;;
98 |         v)
99 |             VERBOSE=1
100 |             ;;
101 |         l)
102 |             READ_TYPE="LR"
103 |             ;;
104 |         h)
105 |             usage
106 |             ;;
107 |         \?)
108 |             echo "Invalid option: -$OPTARG" >&2
109 |             exit 1
110 |             ;;
111 |         :)
112 |             echo "Option -$OPTARG requires an argument." >&2
113 |             exit 1
114 |             ;;
115 |     esac
116 | done
117 | shift $(($OPTIND - 1)) # drop the parsed options; what remains are the positional arguments
118 |
119 |
120 | checkRef $REF
121 | checkOutDir $OUTPUTDIR
122 | checkSamtools
123 | setThreads
124 |
125 | if [ ! $# -eq 1 ]; then
126 | echo "Argument error: in run mode BAM, provide a single BAM as input. $# arguments found." >&2
127 | exit 1
128 | fi
129 |
130 | if [[ "${IRF_RUNMODE}" == "" ]]; then
131 | logger init
132 | fi
133 |
134 | RUNMODE="BAM" startMessage $@
135 |
136 | logger "[ " $(date) " ] Processing the BAM file with IRFinder"
137 | logger "---"
138 |
139 | # BAM check
140 | samtools view -H $1 > /dev/null || exit 1
141 | # Sort check
142 | if [ $(samtools view -H $1 | grep -c "SO:coordinate" ) -eq 1 ]; then
143 | # PE check
144 | if [ $( { samtools view -H $1 ; samtools view $1 | head -n 1000 ; } | samtools view -c -f 1 ) -gt 0 ]; then
145 | logger "The given bam file is sorted by coordinate and is paired."
146 | fi
147 | fi
148 |
149 |
150 | if [ $VERBOSE -eq 1 ];then
151 | ${LIBEXEC}/irfinder ${OUTPUTDIR} \
152 | ${REF}/IRFinder/ref-cover.bed \
153 | ${REF}/IRFinder/ref-sj.ref \
154 | ${REF}/IRFinder/ref-read-continues.ref \
155 | ${REF}/IRFinder/ref-ROI.bed ${READ_TYPE} "${AI_WARN}:${AI_INTRON}:${AI_RATIO}" "${JITTER}" $1 2>> $OUTPUTDIR/logs/irfinder.stderr | tee -a $OUTPUTDIR/logs/irfinder.stdout
156 | cat $OUTPUTDIR/logs/irfinder.stderr
157 | else
158 | ${LIBEXEC}/irfinder ${OUTPUTDIR} \
159 | ${REF}/IRFinder/ref-cover.bed \
160 | ${REF}/IRFinder/ref-sj.ref \
161 | ${REF}/IRFinder/ref-read-continues.ref \
162 | ${REF}/IRFinder/ref-ROI.bed ${READ_TYPE} "${AI_WARN}:${AI_INTRON}:${AI_RATIO}" "${JITTER}" $1 >> $OUTPUTDIR/logs/irfinder.stdout 2>> $OUTPUTDIR/logs/irfinder.stderr
163 | fi
164 |
165 |
166 |
167 | logger "---"
168 | logger "[ " $(date) " ] IRFinder BAM analysis completed "
169 | logger "---"
170 |
171 | "$LIBEXEC/warnings" "$OUTPUTDIR"
172 |
173 | N_WARNINGS=$(wc -l $OUTPUTDIR/WARNINGS | awk '{print $1}' )
174 | if [ $N_WARNINGS -gt 0 ]; then
175 | logger "Process completed with warnings. Check $OUTPUTDIR/WARNINGS " >&2
176 | fi
177 |
178 | if [[ -f $OUTPUTDIR/IRFinder-IR-nondir-AI.txt ]]; then
179 | logger "---"
180 | logger "[ " $(date) " ] Running CNN validator "
181 | logger "---"
182 | ${LIBEXEC}/irfinder_cnn -d ${OUTPUTDIR} -m ${LIBEXEC}/model/ && rm ${OUTPUTDIR}/*-AI.txt
183 | logger "---"
184 | logger "[ " $(date) " ] CNN validator completed"
185 | logger "---"
186 | fi
187 |
188 |
189 |
190 |
--------------------------------------------------------------------------------
/bin/IRFinderBuildRef:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util
4 | source $LIBEXEC/bash_utils.sh
5 |
6 | function usage() { # Print the help for the BuildRef run mode to stderr, then exit 1.
7 |     echo "" >&2
8 |     echo "IRFinder version: $VERSION" >&2
9 |     echo "Usage: IRFinder BuildRef [-v][-h][-t INT][-j INT][-e ExtraGenomeRef.fa][-b Blacklist.bed][-R ROI.bed][-M Mapability.bed][-n INT][-L INT] -r ReferenceDir URL" >&2
10 |     echo "" >&2
11 |     echo " required:" >&2
12 |     echo " URL A base Ensembl URL to a (gzipped) gtf file. For example: ftp://ftp.ensembl.org/pub/release-100/gtf/homo_sapiens/Homo_sapiens.GRCh38.100.gtf.gz" >&2
13 |     echo " -r ReferenceDir: Directory should not yet exist, will be created." >&2
14 |     echo "" >&2
15 |     echo " optional:" >&2
16 |     echo " -t Threads: The number of physical CPUs to use by IRFinder. When omitted (default), IRFinder will use all physical CPUs." >&2
17 |     echo " -j INTEGER: An integer that is passed to '--sjdbOverhang' under STAR 'genomeGenerate' mode. Default: 150." >&2
18 |     echo " -M Mapability: A precomputed bed file containing the low mapability areas. Can also be an empty file." >&2
19 |     echo " -n MapabilityReadLength: The length of the reads used to compute the mapability. Default: 100" >&2
20 |     echo " -e ExtraGenomeRef.fasta.gz: Typically an ERCC reference." >&2
21 |     echo " -b Blacklist.bed.gz: BED of regions to be excluded from analysis." >&2
22 |     echo " -R ROI.bed.gz: A non-overlapping BED file of additional Regions of Interest for read counts." >&2
23 |     echo " -v Show version number of current IRFinder." >&2
24 |     echo " -L STAR limitGenomeGenerateRAM argument. Default: 31000000000" >&2
25 |     echo " -h Show this usage information." >&2
26 |     echo "" >&2
27 |     exit 1
28 | }
29 |
30 |
31 | # === Defaults ===
32 | THREADS=0
33 | REF=
34 | SJOH=150
35 | BUILDERCCFILE=
36 | BUILDROI=
37 | BUILDBLACK=
38 | STAREXEC=STAR
39 | MAPABILITY_FILE=
40 | MAPABILITY_LEN=100
41 | GENOMERAM=31000000000
42 |
43 | if [ $# -eq 0 ] || [[ $1 == "--help" ]] ; then
44 | usage
45 | fi
46 |
47 |
48 | while getopts ":r:j:t:S:e:b:R:n:M:L:hv" opt; do # leading ':' selects silent mode, so the '\?' and ':' arms report errors themselves
49 |     case $opt in
50 |         r)
51 |             # Reference directory.
52 |             REF=$OPTARG
53 |             if [ -d "$REF" ]; then
54 |                 echo "Argument error: -r $REF. Reference directory must not exist, BuildRef will create it." >&2
55 |                 exit 1
56 |             fi
57 |             ;;
58 |         j) # STAR's --sjdbOverhang
59 |             if [[ $OPTARG =~ ^[0-9]+$ ]] ; then
60 |                 SJOH=$OPTARG
61 |             else
62 |                 echo "Argument error: -j $OPTARG. '$OPTARG' is not an integer." >&2
63 |                 exit 1
64 |             fi
65 |             ;;
66 |         t)
67 |             ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.)
68 |             if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
69 |                 echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2
70 |                 exit 1
71 |             fi
72 |             THREADS=$OPTARG
73 |             ;;
74 |         L)
75 |             ## Value for STAR's limitGenomeGenerateRAM. (must be a non-negative integer.)
76 |             if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
77 |                 echo "Argument error: -L $OPTARG, limitGenomeGenerateRAM must be an integer." >&2
78 |                 exit 1
79 |             fi
80 |             GENOMERAM=$OPTARG
81 |             ;;
82 |         S)
83 |             ## STAR executable. (must be executable!)
84 |             if [ -x "$OPTARG" -a ! -d "$OPTARG" ]; then
85 |                 STAREXEC=$OPTARG
86 |             else
87 |                 echo "Argument error: -S $OPTARG. STAR executable must be an executable program." >&2
88 |                 exit 1
89 |             fi
90 |             ;;
91 |         e)
92 |             #ERCC file. (must be a file)
93 |             if [ ! -f "$OPTARG" ]; then
94 |                 echo "Argument error: -e $OPTARG. Specified ERCC/extra-reference file does not exist." >&2
95 |                 exit 1
96 |             fi
97 |             BUILDERCCFILE=$OPTARG
98 |             ;;
99 |         b)
100 |             #Blacklist local file (must be a file)
101 |             if [ ! -f "$OPTARG" ]; then
102 |                 echo "Argument error: -b $OPTARG. Specified blacklist file does not exist." >&2
103 |                 exit 1
104 |             fi
105 |             BUILDBLACK=$OPTARG
106 |             ;;
107 |         R)
108 |             #ROI local file. (must be a file)
109 |             if [ ! -f "$OPTARG" ]; then
110 |                 echo "Argument error: -R $OPTARG. Specified ROI file does not exist." >&2
111 |                 exit 1
112 |             fi
113 |             BUILDROI=$OPTARG
114 |             ;;
115 |         M)
116 |             if [ ! -f "$OPTARG" ]; then
117 |                 echo "Argument error: -M $OPTARG. Specified Mapability file does not exist." >&2
118 |                 exit 1
119 |             fi
120 |             MAPABILITY_FILE=$OPTARG
121 |             ;;
122 |         n)
123 |             if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
124 |                 echo "Argument error: -n $OPTARG, must be an integer." >&2
125 |                 exit 1
126 |             fi
127 |             MAPABILITY_LEN=$OPTARG
128 |             ;;
129 |         h)
130 |             usage
131 |             ;;
132 |         v)
133 |             versionAlert
134 |             ;;
135 |         \?)
136 |             echo "Invalid option: -$OPTARG" >&2
137 |             exit 1
138 |             ;;
139 |         :)
140 |             echo "Option -$OPTARG requires an argument." >&2
141 |             exit 1
142 |             ;;
143 |     esac
144 | done
145 | shift $(($OPTIND - 1)) # drop the parsed options; what remains is the positional URL
146 |
147 | #echo $@ #The remaining arguments.
148 | #echo $# #The number of remaining arguments.
149 |
150 | if [ ! "$REF" ]; then
151 | echo "Argument error: -r is required." >&2
152 | exit 1
153 | fi
154 |
155 | if [[ "${MAPABILITY_FILE}" == "" ]]; then
156 | checkStar $STAREXEC
157 | fi
158 | setThreads
159 |
160 | if [ ! $# -eq 1 ]; then
161 | echo "Argument error: in run mode BuildRef, provide a single ftp URL. $# arguments found." >&2
162 | exit 1
163 | fi
164 |
165 | BUILDHINT=$1
166 | if [[ "$BUILDHINT" != ftp* ]]; then
167 | echo "Argument error: A single ftp url is required to find and download genome fasta and gtf files. eg: ftp://ftp.ensembl.org/pub/release-78/fasta/mus_musculus/dna/." >&2
168 | exit 1
169 | fi
170 |
171 |
172 | echo "Launching reference build process. The full build might take hours."
173 |
174 | "$LIBEXEC/IRFinder-BuildRefFromEnsembl" BuildRef "$THREADS" "$STAREXEC" "$BUILDHINT" "$REF" "$BUILDERCCFILE" "$BUILDROI" "$BUILDBLACK" "$SJOH" "$MAPABILITY_FILE" "$MAPABILITY_LEN" "$GENOMERAM"
175 |
176 |
177 |
--------------------------------------------------------------------------------
/bin/IRFinderBuildRefDownload:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util
5 | source $LIBEXEC/bash_utils.sh
6 |
7 |
8 | function usage() { # Print the help for the BuildRefDownload run mode to stderr, then exit 1.
9 | echo "" >&2
10 | echo "IRFinder version: $VERSION" >&2
11 | echo "Usage: IRFinder BuildRefDownload [-v][-h] -r ReferenceDir URL" >&2
12 | echo "" >&2
13 | echo " required:" >&2
14 | echo " URL A base Ensembl URL to a (gzipped) gtf file. For example: ftp://ftp.ensembl.org/pub/release-100/gtf/homo_sapiens/Homo_sapiens.GRCh38.100.gtf.gz" >&2
15 | echo " -r ReferenceDir: Directory should not yet exist, will be created." >&2
16 | echo "" >&2
17 | echo " optional:" >&2
18 | echo " -v Show version number of current IRFinder." >&2
19 | echo " -h Show this usage information." >&2
20 | echo "" >&2
21 | exit 1 # showing the help always ends the script with a non-zero status
22 | }
23 |
24 |
25 | # === Defaults ===
26 | REF=
27 | STAREXEC=STAR
28 |
29 |
30 | if [ $# -eq 0 ] || [[ $1 == "--help" ]] ; then
31 | usage
32 | fi
33 |
34 |
35 | while getopts ":m:r:hv" opt; do
36 | case $opt in
37 | r)
38 | # Reference directory.
39 | REF=$OPTARG
40 | if [ -d "$REF" ]; then
41 | echo "Argument error: -r $REF. Reference directory must not exist, BuildRef will create it." >&2
42 | exit 1
43 | fi
44 | ;;
45 | h)
46 | usage
47 | ;;
48 | v)
49 | versionAlert
50 | ;;
51 | \?)
52 | echo "Invalid option: -$OPTARG" >&2
53 | exit 1
54 | ;;
55 | :)
56 | echo "Option -$OPTARG requires an argument." >&2
57 | exit 1
58 | ;;
59 | esac
60 | done
61 | shift $(($OPTIND - 1))
62 |
63 | #echo $@ #The remaining arguments.
64 | #echo $# #The number of remaining arguments.
65 |
66 | if [ ! "$REF" ]; then
67 | echo "Argument error: -r is required." >&2
68 | exit 1
69 | fi
70 |
71 |
72 | if [ ! $# -eq 1 ]; then
73 | echo "Argument error: in run mode BuildRefDownload, provide a single ftp URL. $# arguments found." >&2
74 | exit 1
75 | fi
76 |
77 | BUILDHINT=$1
78 | if [[ "$BUILDHINT" != ftp* ]]; then
79 | echo "Argument error: A single ftp url is required to find and download genome fasta and gtf files. eg: ftp://ftp.ensembl.org/pub/release-78/fasta/mus_musculus/dna/." >&2
80 | exit 1
81 | fi
82 |
83 |
84 | echo "Launching reference build process. The full build might take hours."
85 |
86 | "$LIBEXEC/IRFinder-BuildRefFromEnsembl" BuildRefDownload "1" "" "$BUILDHINT" "$REF" "" "" "" ""
87 |
88 |
89 |
--------------------------------------------------------------------------------
/bin/IRFinderBuildRefFromSTARRef:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util
5 | source $LIBEXEC/bash_utils.sh
6 |
7 |
8 | function usage() { # Print the help for the BuildRefFromSTARRef run mode to stderr, then exit 1.
9 |     echo "" >&2
10 |     echo "IRFinder version: $VERSION" >&2
11 |     echo "Usage: IRFinder BuildRefFromSTARRef [-v][-h][-l][-t INT][-e ExtraGenomeRef.fa][-b Blacklist.bed][-R ROI.bed][-M Mapability.bed][-n INT][-f GENOME.fa][-g TRANSCRIPTS.gtf] -r ReferenceDir -x STARRefDir" >&2
12 |     echo "" >&2
13 |     echo " required:" >&2
14 |     echo " -r ReferenceDir: Directory should not yet exist, will be created." >&2
15 |     echo " -x STARRefDir. An existing STAR reference folder." >&2
16 |     echo " Please note: By default, BuildRefFromSTARRef mode automatically looks for the original FASTA and GTF files used to generate STARRefDir." >&2
17 |     echo " Specifically, IRFinder investigates 'genomeParameters.txt' in STARRefDir." >&2
18 |     echo " If both files can be located, IRFinder will continue to generate reference, ignoring '-f' and '-g' options." >&2
19 |     echo " If either file is missing, IRFinder will quit and you have to re-run it by giving both '-f' and '-g' options." >&2
20 |     echo "" >&2
21 |     echo " optional:" >&2
22 |     echo " -t Threads: The number of physical CPUs to use by IRFinder. When omitted (default), IRFinder will use all physical CPUs." >&2
23 |     echo " -M Mapability: A precomputed bed file containing the low mapability areas. Can also be an empty file." >&2
24 |     echo " -n MapabilityReadLength: The length of the reads used to compute the mapability. Default: 100" >&2
25 |     echo " -f GENOME.fa: This MUST be the same FASTA file used to generate STARRefDir. Ignored when IRFinder can automatically locate the original file." >&2
26 |     echo " -g TRANSCRIPTS.gtf: This MUST be the same GTF file used to generate STARRefDir. Ignored when IRFinder can automatically locate the original file." >&2
27 |     echo " -e ExtraGenomeRef.fasta.gz: Typically an ERCC reference." >&2
28 |     echo " -b Blacklist.bed.gz: BED of regions to be excluded from analysis." >&2
29 |     echo " -R ROI.bed.gz: A non-overlapping BED file of additional Regions of Interest for read counts." >&2
30 |     echo " -l Don't copy the STAR reference and the other files but create symbolic links." >&2
31 |     echo " -v Show version number of current IRFinder." >&2
32 |     echo " -h Show this usage information." >&2
33 |     echo "" >&2
34 |     exit 1
35 | }
36 |
37 |
38 | # === Defaults ===
39 | THREADS=0
40 | REF=
41 | SJOH=150
42 | BUILDERCCFILE=
43 | BUILDROI=
44 | BUILDBLACK=
45 | STAREXEC=STAR
46 | MYFASTA="NULL"
47 | MYGTF="NULL"
48 | LINK=0
49 | MAPABILITY_FILE=
50 | MAPABILITY_LEN=100
51 |
52 | if [ $# -eq 0 ] || [[ $1 == "--help" ]] ; then
53 | usage
54 | fi
55 |
56 |
57 | while getopts ":m:r:t:S:e:b:R:x:f:g:m:M:n:hvl" opt; do
58 | case $opt in
59 | r)
60 | # Reference directory.
61 | REF=$OPTARG
62 | if [ -d "$REF" ]; then
63 | echo "Argument error: -r $REF. Reference directory must not exist, BuildRef will create it." >&2
64 | exit 1
65 | fi
66 | ;;
67 | t)
68 | ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.)
69 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
70 | echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2
71 | exit 1
72 | fi
73 | THREADS=$OPTARG
74 | ;;
75 | S)
76 | ## STAR executable. (must be executable!)
77 | if [ -x "$OPTARG" -a ! -d "$OPTARG" ]; then
78 | STAREXEC=$OPTARG
79 | else
80 | echo "Argument error: -S $OPTARG. STAR executable must be an executable program." >&2
81 | exit 1
82 | fi
83 | ;;
84 | e)
85 | #ERCC file. (must be a file)
86 | if [ ! -f "$OPTARG" ]; then
87 | echo "Argument error: -e $OPTARG. Specified ERCC/extra-reference file does not exist." >&2
88 | exit 1
89 | fi
90 | BUILDERCCFILE=$OPTARG
91 | ;;
92 | b)
93 | #Blacklist local file (must be a file)
94 | if [ ! -f "$OPTARG" ]; then
95 | echo "Argument error: -b $OPTARG. Specified blacklist file does not exist." >&2
96 | exit 1
97 | fi
98 | BUILDBLACK=$OPTARG
99 | ;;
100 | R)
101 | #ROI local file. (must be a file)
102 | if [ ! -f "$OPTARG" ]; then
103 | echo "Argument error: -R $OPTARG. Specified ROI file does not exist." >&2
104 | exit 1
105 | fi
106 | BUILDROI=$OPTARG
107 | ;;
108 | M)
109 | if [ ! -f "$OPTARG" ]; then
110 | echo "Argument error: -m $OPTARG. Specified Mapability file does not exist." >&2
111 | exit 1
112 | fi
113 | MAPABILITY_FILE=$OPTARG
114 | ;;
115 | n)
116 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
117 | echo "Argument error: -n $OPTARG, must be an integer." >&2
118 | exit 1
119 | fi
120 | MAPABILITY_LEN=$OPTARG
121 | ;;
122 | x)
123 | #
124 | STARREF=$(realpath $OPTARG)
125 | ## Must be a directory
126 | ;;
127 | f)
128 | # The original FASTA file to generate STAR reference.
129 | MYFASTA=$(realpath $OPTARG)
130 | checkFile $MYFASTA
131 | ;;
132 | g)
133 | # The original GTF file to generate STAR reference.
134 | MYGTF=$(realpath $OPTARG)
135 | checkFile $MYGTF
136 | ;;
137 | h)
138 | usage
139 | ;;
140 | v)
141 | versionAlert
142 | ;;
143 | l)
144 | LINK=1
145 | ;;
146 | \?)
147 | echo "Invalid option: -$OPTARG" >&2
148 | exit 1
149 | ;;
150 | :)
151 | echo "Option -$OPTARG requires an argument." >&2
152 | exit 1
153 | ;;
154 | esac
155 | done
156 | shift $(($OPTIND - 1))
157 |
158 | #echo $@ #The remaining arguments.
159 | #echo $# #The number of remaining arguments.
160 | # Post-parse validation. -r is mandatory and must name a directory that does not exist yet.
161 | if [ ! "$REF" ]; then
162 | echo "Argument error: -r is required." >&2
163 | exit 1
164 | fi
165 | # An existing directory is rejected; "$REF" is created further down in this script with mkdir.
166 | if [ -d "$REF" ]; then
167 | echo "Argument error: -r $REF. Reference directory must not exist, BuildRef will create it." >&2
168 | exit 1
169 | fi
170 | # checkStar is defined outside this script; it is only invoked when no mapability file was given.
171 | if [[ "${MAPABILITY_FILE}" == "" ]]; then
172 | checkStar $STAREXEC
173 | fi
174 | # setThreads is an external helper -- presumably it settles the value of THREADS (TODO confirm).
175 | setThreads
176 | # -x is mandatory in this mode and must point at an existing directory.
177 | if [ ! "$STARREF" ]; then
178 | echo "Argument error: -x is required. Must provide an exisiting STAR reference folder for BuildRefProcess mode." >&2
179 | exit 1
180 | fi
181 | if [ ! -d "$STARREF" ]; then
182 | echo "Error: STAR reference at $STARREF does not exist." >&2
183 | exit 1
184 | fi
185 |
186 | # Resolve the FASTA/GTF behind the STAR reference. "NULL" is the value tested for "not given via -f/-g".
187 | if [[ "${MYFASTA}" == "NULL" ]] || [[ "${MYGTF}" == "NULL" ]] ; then
188 |     if [ ! -f "$STARREF/genomeParameters.txt" ] ; then
189 |         echo "Error: Cannot locate the original FASTA and GTF files used to generate STAR reference at $STARREF." >&2
190 |         echo " Please provide these two files through '-f' and '-g' options respectively." >&2
191 |         echo " Or retry to build IRFinder reference in other modes." >&2
192 |         echo " Run 'IRFinder -h' for more details." >&2
193 |         exit 1
194 |     fi
195 |     # Only the first line of genomeParameters.txt is inspected. The word following
196 |     # --genomeFastaFiles / --sjdbGTFfile is taken (the array assignment splits on whitespace).
197 |     STARLINE=$(head -n 1 "$STARREF/genomeParameters.txt")
198 |     STARTMP1=(${STARLINE#*--genomeFastaFiles })
199 |     STARTMP2=(${STARLINE#*--sjdbGTFfile })
200 |     ORIFASTA=${STARTMP1[0]}
201 |     ORIGTF=${STARTMP2[0]}
202 |     if [[ "${MYFASTA}" == "NULL" ]]; then
203 |         MYFASTA="${ORIFASTA}"
204 |     fi
205 |     if [[ "${MYGTF}" == "NULL" ]]; then
206 |         MYGTF="${ORIGTF}"
207 |     fi
208 | fi
209 | # Both files must exist. The message reports the paths that were actually tested.
210 | if [ ! -f "$MYFASTA" ] || [ ! -f "$MYGTF" ]; then
211 |     echo "Error: Cannot locate the original FASTA and GTF files used to generate the STAR reference $STARREF" >&2
212 |     echo " at the following locations:" >&2
213 |     echo " FASTA: $MYFASTA" >&2
214 |     echo " GTF: $MYGTF" >&2
215 |     echo " Please locate these two files through '-f' and '-g' options respectively." >&2
216 |     echo " Or retry to build IRFinder reference in other modes." >&2
217 |     echo " Run 'IRFinder -h' for more details." >&2
218 |     exit 1
219 | fi
220 | # LINK=1 (set by -l) symlinks the inputs into the reference directory instead of copying them.
221 | if [ $LINK -eq 1 ]; then
222 | CP_CMD="ln -s "
223 | else
224 | CP_CMD="cp "
225 | fi
226 | # Canonicalise the paths with realpath -- presumably so that symlinks made below stay valid from any working directory.
227 | MYFASTA=$(realpath $MYFASTA )
228 | MYGTF=$(realpath $MYGTF )
229 | REF=$(realpath $REF )
230 | # Create the reference directory and populate it with genome.fa, transcripts.gtf and STAR.
231 | echo "Launching reference build process. The full build might take hours."
232 | echo ""
233 | mkdir "$REF"
234 | date +"%b %d %T ... copying the genome FASTA file..."
235 | $CP_CMD "$MYFASTA" "$REF/genome.fa"
236 | date +"%b %d %T ... copying the transcriptome GTF file..."
237 | $CP_CMD "$MYGTF" "$REF/transcripts.gtf"
238 | date +"%b %d %T ... copying the STAR reference folder..."
239 | $CP_CMD -r "$STARREF" "$REF/STAR"
240 | # Hand over to the shared build script with the mode tag "BuildRefFromSTARRef"; all arguments are positional.
241 | "$LIBEXEC/IRFinder-BuildRefFromEnsembl" "BuildRefFromSTARRef" "$THREADS" "$STAREXEC" "$BUILDHINT" "$REF" "$BUILDERCCFILE" "$BUILDROI" "$BUILDBLACK" "$SJOH" "$MAPABILITY_FILE" "$MAPABILITY_LEN"
242 |
243 |
244 |
245 |
246 |
--------------------------------------------------------------------------------
/bin/IRFinderBuildRefProcess:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util
5 | source $LIBEXEC/bash_utils.sh
6 |
7 | # Print the BuildRefProcess help text to stderr and exit with status 1.
8 | function usage() {
9 |     echo "" >&2
10 |     echo "IRFinder version: $VERSION" >&2
11 |     echo "Usage: IRFinder BuildRefProcess [-v][-h][-t INT][-j INT][-e ExtraGenomeRef.fa][-b Blacklist.bed][-R ROI.bed][-M Mapability.bed] -r ReferenceDir " >&2
12 |     echo "" >&2
13 |     echo "Parameters for BuildRefProcess mode:" >&2
14 |     echo " required:" >&2
15 |     echo " -r ReferenceDir. Directory should already contain EXACT files named 'genome.fa' and 'transcripts.gtf' (case-sensitive) for genome and transcriptome annotations respectively." >&2
16 |     echo " optional:" >&2
17 |     echo " -t Threads: The number of physical CPUs to use by IRFinder. When ommited (default), IRFinder will use all physical CPUs." >&2
18 |     echo " -j INTEGER: an integer that is parsed to '--sjdbOverhang' under STAR 'genomeGenerate' mode. Default: 150." >&2
19 |     echo " -S STAR executable: Default is 'STAR'." >&2
20 |     echo " -e ExtraGenomeRef.fasta.gz: Typically an ERCC reference." >&2
21 |     echo " -b Blacklist.bed.gz: BED of regions to be excluded from analysis." >&2
22 |     echo " -R ROI.bed.gz: A non-overlapping BED file of additional Regions of Interest for read counts." >&2
23 |     echo " -M Mapability: A precomputed bed file containing the low mapability areas. Can also be an empty file." >&2
24 |     echo " -n MapabilityReadLength: The length of the reads used to compute the mapability. Default: 100" >&2
25 |     echo " -L STAR limitGenomeGenerateRAM argument. Default: 31000000000" >&2
26 |     echo " -h Show this usage information." >&2
27 |     echo "" >&2
28 |     exit 1
29 | }
30 |
31 | # === Defaults ===
32 | THREADS=0 # -t; zero means auto-detect
33 | REF= # -r, required
34 | SJOH=150 # -j
35 | BUILDERCCFILE= # -e
36 | BUILDROI= # -R
37 | BUILDBLACK= # -b
38 | STAREXEC=STAR # -S
39 | MAPABILITY_FILE= # -M
40 | MAPABILITY_LEN=100 # -n
41 | GENOMERAM=31000000000 # -L
42 | # With no arguments, or --help as the first one, print the help text (usage exits).
43 | if [ $# -eq 0 ] || [[ $1 == "--help" ]] ; then
44 | usage
45 | fi
46 | # Parse the command line. The optstring starts with ':' so getopts reports problems through the
47 | # '?' and ':' arms below. 'l:' is accepted by the optstring but has no arm, so -l <arg> is ignored.
48 | while getopts ":m:r:j:t:S:e:b:R:M:n:L:l:hv" opt; do
49 |     case $opt in
50 |         r)
51 |             # Reference directory.
52 |             REF=$OPTARG
53 |             ;;
54 |         j) # STAR's --sjdbOverhang
55 |             if [[ $OPTARG =~ ^[0-9]+$ ]] ; then
56 |                 SJOH=$OPTARG
57 |             else
58 |                 echo "Argument error: -j $OPTARG. '$OPTARG' is not an integer." >&2
59 |                 exit 1
60 |             fi
61 |             ;;
62 |         t)
63 |             ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.)
64 |             if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
65 |                 echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2
66 |                 exit 1
67 |             fi
68 |             THREADS=$OPTARG
69 |             ;;
70 |         L)
71 |             ## Value for STAR's limitGenomeGenerateRAM (digits only).
72 |             if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
73 |                 echo "Argument error: -L $OPTARG, limitGenomeGenerateRAM must be an integer." >&2
74 |                 exit 1
75 |             fi
76 |             GENOMERAM=$OPTARG
77 |             ;;
78 |         S)
79 |             ## STAR executable. (must be executable!)
80 |             if [ -x "$OPTARG" -a ! -d "$OPTARG" ]; then
81 |                 STAREXEC=$OPTARG
82 |             else
83 |                 echo "Argument error: -S $OPTARG. STAR executable must be an executable program." >&2
84 |                 exit 1
85 |             fi
86 |             ;;
87 |         e)
88 |             #ERCC file. (must be a file)
89 |             if [ ! -f "$OPTARG" ]; then
90 |                 echo "Argument error: -e $OPTARG. Specified ERCC/extra-reference file does not exist." >&2
91 |                 exit 1
92 |             fi
93 |             BUILDERCCFILE=$OPTARG
94 |             ;;
95 |         b)
96 |             #Blacklist local file (must be a file)
97 |             if [ ! -f "$OPTARG" ]; then
98 |                 echo "Argument error: -b $OPTARG. Specified blacklist file does not exist." >&2
99 |                 exit 1
100 |             fi
101 |             BUILDBLACK=$OPTARG
102 |             ;;
103 |         R)
104 |             #ROI local file. (must be a file)
105 |             if [ ! -f "$OPTARG" ]; then
106 |                 echo "Argument error: -R $OPTARG. Specified ROI file does not exist." >&2
107 |                 exit 1
108 |             fi
109 |             BUILDROI=$OPTARG
110 |             ;;
111 |         m|M) # Mapability file. -m used to be parsed but dropped without effect; both spellings are honoured now.
112 |             if [ ! -f "$OPTARG" ]; then
113 |                 echo "Argument error: -$opt $OPTARG. Specified Mapability file does not exist." >&2
114 |                 exit 1
115 |             fi
116 |             MAPABILITY_FILE=$OPTARG
117 |             ;;
118 |         n)
119 |             if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
120 |                 echo "Argument error: -n $OPTARG, must be an integer." >&2
121 |                 exit 1
122 |             fi
123 |             MAPABILITY_LEN=$OPTARG
124 |             ;;
125 |         h)
126 |             usage
127 |             ;;
128 |         v)
129 |             versionAlert
130 |             ;;
131 |         \?)
132 |             echo "Invalid option: -$OPTARG" >&2
133 |             exit 1
134 |             ;;
135 |         :)
136 |             echo "Option -$OPTARG requires an argument." >&2
137 |             exit 1
138 |             ;;
139 |     esac
140 | done
141 | shift $(($OPTIND - 1))
142 |
143 | #echo $@ #The remaining arguments.
144 | #echo $# #The number of remaining arguments.
145 | # -r is mandatory.
146 | if [ ! "$REF" ]; then
147 | echo "Argument error: -r is required." >&2
148 | exit 1
149 | fi
150 | # checkStar and setThreads are not defined in this script (bash_utils.sh is sourced at the top) -- TODO confirm what they verify.
151 | checkStar $STAREXEC
152 | setThreads
153 | # The reference directory must already hold the two input files ...
154 | if [ ! -f "$REF/genome.fa" ] || [ ! -f "$REF/transcripts.gtf" ]; then
155 | echo "Argument error: -r $REF. Reference directory must exist and contain genome.fa and transcripts.gtf files. Use the BuildRefDownload run mode to create these." >&2
156 | exit 1
157 | fi
158 | # ... and none of the STAR, Mapability or IRFinder sub-directories, so an earlier build is never overwritten.
159 | if [ -d "$REF/STAR" ] || [ -d "$REF/Mapability" ] || [ -d "$REF/IRFinder" ]; then
160 | echo "Argument error: -r $REF. Will not overwrite. It appears BuildRefProcess has already been run for this reference. Reference directory must not contain STAR, Mapability or IRFinder directories." >&2
161 | exit 1
162 | fi
163 | # Delegate the build to the shared script. Arguments are positional: the 4th is passed as an
164 | # empty string here and GENOMERAM (-L) is the last one.
165 | echo "Launching reference build process. The full build might take hours."
166 | "$LIBEXEC/IRFinder-BuildRefFromEnsembl" "BuildRefProcess" "$THREADS" "$STAREXEC" "" "$REF" "$BUILDERCCFILE" "$BUILDROI" "$BUILDBLACK" "$SJOH" "$MAPABILITY_FILE" "$MAPABILITY_LEN" "$GENOMERAM"
167 |
168 |
169 |
170 |
--------------------------------------------------------------------------------
/bin/IRFinderLong:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util
4 |
5 | source $LIBEXEC/bash_utils.sh
6 | # Write the 'IRFinder Long' help text to stderr, then terminate with exit status 1.
7 | function usage() {
8 |     # A single printf emits each argument on its own line.
9 |     printf '%s\n' \
10 |         "" \
11 |         "IRFinder version: $VERSION" \
12 |         "Usage: IRFinder Long -r ReferenceDir raw_reads_1.fast[q|a][.gz] [raw_reads_2.fast[q|a][.gz]...]" \
13 |         "" \
14 |         " required:" \
15 |         " raw_reads_1.fast[q|a][.gz]: one or more long reads fastq or fasta files, compressed or not." \
16 |         " -r ReferenceDir. As built by the 'BuildRef' option." \
17 |         "" \
18 |         " optional:" \
19 |         " -d Output Directory: Default is the current directory." \
20 |         " -M Sort memory. Maximum memory to use for sort for each thread, in MB. Default: 768." \
21 |         " -x Minimap2 preset: splice (default), map-[pb|ont], ava-[pb|ont], asm[5|10|20], sr. see minimap2.1 for details." \
22 |         " -E Minimap2 executable: Default is 'minimap2'." \
23 |         " -t Threads: The number of physical CPUs to use by IRFinder. When ommited (default), IRFinder will use all physical CPUs." \
24 |         " -u Unsorted output: Do not sort the read fragment BAM file." \
25 |         " -v Verbose ( Default: print the log only in the output/logs/irfinder.std[out|err] )" \
26 |         " -y STRING: an extra string that is parsed to Minimap2 for reads alignment. Default: '-uf -k14' " \
27 |         " -j Jitter, consider the position around the splice sites to compensate sequencing errors ( integer number )." \
28 |         "" >&2
29 |     exit 1
30 | }
31 | # === Defaults ===
32 | OUTPUTDIR=. # -d
33 | THREADS=0 # -t; zero means auto-detect
34 | REF= # -r, required
35 | MINIMAP_PRESET="splice" # -x
36 | DOSORT=1 # cleared by -u
37 | MINIMAP_EXTRA="-uf -k14" # -y
38 | MINIMAP_EXEC=minimap2 # -E
39 | VERBOSE=0 # set by -v
40 | SORTMEM=768 # -M, in MB
41 | AI_WARN=1 # NOTE(review): not referenced again in this script; possibly read by sourced helpers -- confirm
42 | AI_INTRON=1 # NOTE(review): same remark as for AI_WARN
43 | JITTER=3 # -j
44 | # With no arguments, or --help as the first one, print the help text (usage exits).
45 | if [ $# -eq 0 ] || [[ $1 == "--help" ]] ; then
46 | usage
47 | fi
48 | # Parse options. 'm:', 'i:' and 'w:' are accepted by the optstring but have no arm below, so they are ignored.
49 | while getopts ":m:j:r:t:d:E:x:uM:y:i:w:vh" opt; do
50 |     case $opt in
51 |         r)
52 |             # Reference directory.
53 |             REF=$OPTARG
54 |             ;;
55 |         t)
56 |             ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.)
57 |             if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
58 |                 echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2
59 |                 exit 1
60 |             fi
61 |             THREADS=$OPTARG
62 |             ;;
63 |         d)
64 |             OUTPUTDIR=$OPTARG
65 |             ;;
66 |         x)
67 |             ## Minimap preset. asm5/asm10/asm20 need a group: a bracket expression matches one character only.
68 |             if [[ $OPTARG =~ ^splice$|^map-(pb|ont)$|^ava-(pb|ont)$|^asm(5|10|20)$|^sr$ ]]; then
69 |                 MINIMAP_PRESET=$OPTARG
70 |             else
71 |                 echo "Argument error: -x $OPTARG. Valid options for Minimap presets are: " >&2
72 |                 echo " splice (default), map-pb, map-ont, ava-pb, ava-ont, asm5, asm10, asm20 or sr" >&2
73 |                 exit 1
74 |             fi
75 |             ;;
76 |         j)
77 |             if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
78 |                 echo "Argument error: -j $OPTARG, jitter must be an integer." >&2
79 |                 exit 1
80 |             fi
81 |             JITTER=$OPTARG
82 |             ;;
83 |         E)
84 |             ## Minimap executable. (must be executable!)
85 |             if [ -x "$OPTARG" -a ! -d "$OPTARG" ]; then
86 |                 MINIMAP_EXEC=$OPTARG
87 |             else
88 |                 echo "Argument error: -E $OPTARG. Minimap2 executable must be an executable program." >&2
89 |                 exit 1
90 |             fi
91 |             ;;
92 |         u)
93 |             DOSORT=0
94 |             ;;
95 |         M)
96 |             #Max sort memory in MB.
97 |             if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
98 |                 echo "Argument error: -M $OPTARG, maximum sort RAM in MB must be an integer." >&2
99 |                 exit 1
100 |             fi
101 |             SORTMEM=$OPTARG
102 |             ;;
103 |         y)
104 |             MINIMAP_EXTRA=$OPTARG
105 |             ;;
106 |         v)
107 |             VERBOSE=1
108 |             ;;
109 |         h)
110 |             usage
111 |             ;;
112 |         \?)
113 |             echo "Invalid option: -$OPTARG" >&2
114 |             exit 1
115 |             ;;
116 |         :)
117 |             echo "Option -$OPTARG requires an argument." >&2
118 |             exit 1
119 |             ;;
120 |     esac
121 | done
122 | shift $(($OPTIND - 1))
123 | # At least one read file is required, and every file named on the command line must exist.
124 | if [ $# -eq 0 ];then
125 |     echo "ERROR! No fasta or fastq file provided." >&2
126 |     exit 1
127 | fi
128 | # "$@" and "$f" are quoted so that a path containing whitespace or glob characters is tested as one file.
129 | INPUT_FILES=""
130 | for f in "$@"; do
131 |     if [ -f "$f" ]; then
132 |         INPUT_FILES="${INPUT_FILES} ${f}"
133 |     else
134 |         echo "ERROR! File ${f} doesn't exists." >&2
135 |         exit 1
136 |     fi
137 | done
138 | # Environment checks. These helpers are not defined in this script (bash_utils.sh is sourced at the top).
139 | checkRef $REF
140 | checkOutDir $OUTPUTDIR
141 | checkMinimap $MINIMAP_EXEC
142 | checkSamtools
143 | setThreads
144 | # NOTE(review): 'logger' is presumably the helper from bash_utils.sh rather than the system logger -- confirm.
145 | logger init
146 | # RUNMODE is supplied as an assignment prefix on this one call.
147 | RUNMODE="Long" startMessage $@
148 | # Align the reads with minimap2 and convert the SAM stream to BAM on the fly. The pipeline runs
149 | # in a subshell with pipefail so that a minimap2 failure is not hidden by a successful
150 | # 'samtools view'; the option therefore does not leak into the rest of the script.
151 | logger "[ " $(date) " ] Minimap2 is starting with $THREADS threads"
152 | # $MINIMAP_EXTRA stays unquoted on purpose: it carries several minimap2 options in one string.
153 | ( set -o pipefail; "$MINIMAP_EXEC" -a -t "$THREADS" -x "$MINIMAP_PRESET" $MINIMAP_EXTRA "$REF/genome.fa" "$@" 2> "$OUTPUTDIR/logs/minimap2.log" | samtools view -b > "$OUTPUTDIR/Unsorted.bam" ) || exit 1
154 |
155 | logger "---"
156 | logger "[ " $(date) " ] Minimap2 mapping completed"
157 | logger "---"
158 |
159 | VERBOSE_FLAG=""
160 | if [[ "${VERBOSE}" == "1" ]]; then
161 |     VERBOSE_FLAG=" -v "
162 | fi
163 | # Run the sibling IRFinderBAM script on the unsorted alignments ($VERBOSE_FLAG is unquoted so that it vanishes when empty).
164 | IRF_RUNMODE="Long" "$(dirname "$(readlink -nf "$BASH_SOURCE")")/IRFinderBAM" $VERBOSE_FLAG -l -r "$REF" -t "$THREADS" -j "$JITTER" -d "$OUTPUTDIR" "$OUTPUTDIR/Unsorted.bam" || exit 1
165 | # Sort and index the alignments. Each step is a separate statement: inside an '&&' chain only the
166 | # last command is subject to 'set -e', so a failed sort or index used to go unnoticed.
167 | if [ $DOSORT -eq 1 ]; then
168 |     logger "---"
169 |     logger "[ " $(date) " ] Sorting the bam file"
170 |     echo "---- samtools sort -@ $THREADS -m ${SORTMEM}M -o $OUTPUTDIR/Sorted.bam $OUTPUTDIR/Unsorted.bam ---" >> "$OUTPUTDIR/logs/samtools.log"
171 |     samtools sort -@ "$THREADS" -m "${SORTMEM}M" -o "$OUTPUTDIR/Sorted.bam" "$OUTPUTDIR/Unsorted.bam" &>> "$OUTPUTDIR/logs/samtools.log"
172 |     logger "---"
173 |     logger "[ " $(date) " ] Indexing the sorted bam file"
174 |     echo "---- samtools index -@ $THREADS $OUTPUTDIR/Sorted.bam ---" >> "$OUTPUTDIR/logs/samtools.log"
175 |     samtools index -@ "$THREADS" "$OUTPUTDIR/Sorted.bam" &>> "$OUTPUTDIR/logs/samtools.log"
176 |     rm "$OUTPUTDIR/Unsorted.bam"
177 | fi
178 | logger "---"
179 | logger "[ " $(date) " ] IRFinder Long completed."
180 | logger "---"
181 |
182 |
183 |
--------------------------------------------------------------------------------
/bin/TrimBAM4IGV:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export LC_ALL=C
4 | export LANG=C
5 |
6 | set -e
7 | # Write the TrimBAM4IGV help text to stderr, then terminate with exit status 1.
8 | function usage() {
9 |     printf '%s\n' \
10 |         "Usage: TrimBAM4IGV -o OUTPUTDIR INPUT.bam" \
11 |         "Output: OUTPUTDIR/INPUT.trimmed.bam; OUTPUTDIR/INPUT.trimmed.bam.bai" \
12 |         "" \
13 |         " -o OUTPUTDIR : required. Directory to save trimmed BAM." \
14 |         " -r region : optional. A string to guide Samtools extracting reads in the corresponding region." \
15 |         " -t NUM_THREADS : optional. Number of threads to use. Default: the number of physical CPUs." \
16 |         " -h Show this usage information." >&2
17 |     exit 1
18 | }
19 |
20 | # === Defaults ===
21 | THREADS=0
22 | # Without any argument, print the help text (usage exits).
23 | if [ $# -eq 0 ]; then
24 | usage
25 | fi
26 | # Option parsing; the leading ':' in the optstring routes errors to the '?' and ':' arms.
27 | while getopts ":o:r:t:h" opt; do
28 | case $opt in
29 | o)
30 | # Output directory.
31 | OUTPUTDIR=$OPTARG
32 | ;;
33 | t)
34 | ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.)
35 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then
36 | echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2
37 | exit 1
38 | fi
39 | THREADS=$OPTARG
40 | ;;
41 | r)
42 | REGIONS=$OPTARG # handed to 'samtools view' after the BAM name further down
43 | ;;
44 | h)
45 | usage
46 | ;;
47 | \?)
48 | echo "Invalid option: -$OPTARG" >&2
49 | exit 1
50 | ;;
51 | :)
52 | echo "Option -$OPTARG requires an argument." >&2
53 | exit 1
54 | ;;
55 | esac
56 | done
57 | shift $(($OPTIND - 1))
58 |
59 | #echo $@ #The remaining arguments.
60 | #echo $# #The number of remaining arguments.
61 |
62 | STVERSTR=`samtools --version`
63 | STVER=$(echo $STVERSTR|cut -d" " -f2)
64 | STVERMAIN=$(echo $STVER|cut -d"." -f1)
65 | STVERMINOR=$(echo $STVER|cut -d"." -f2)
66 | if [[ ! "$STVERMAIN" -ge 1 ]]; then
67 | echo "Error: Samtools $STVER: version too old (>=1.4 required)." >&2
68 | exit 1
69 | elif [[ ! "$STVERMINOR" -ge 4 ]]; then
70 | echo "Error: Samtools $STVER: version too old (>=1.4 required)." >&2
71 | exit 1
72 | fi
73 |
74 | if [ ! "$OUTPUTDIR" ]; then
75 | echo "Argument error: -o is required." >&2
76 | usage
77 | fi
78 |
79 | if [ ! -d "$OUTPUTDIR" ]; then
80 | mkdir "$OUTPUTDIR"
81 | fi
82 |
83 | # Auto detect CPUs.
84 | if [[ $THREADS == 0 ]]; then
85 | THREADS=`awk 'BEGIN {FS=":"} ($0 ~ /^physical id/ ) { printf $2 " --"} ($0 ~ /^core id/) {print $2}' < /proc/cpuinfo | sort -u | wc -l`
86 | if [ ! -n $THREADS -o $THREADS -eq 0 ]; then
87 | # If physical CPU detection doesn't work for some reason, detect virtual CPUs (includes hyperthreading instances).
88 | THREADS=`grep -c ^processor /proc/cpuinfo`
89 | fi
90 | fi
91 |
92 | SAMPLE=$(echo $1|awk 'BEGIN{FS=".bam"}{print $1}')
93 |
94 | if [ -f "$1"".bai" ]; then
95 | if [ ! "$REGIONS" ]; then
96 | samtools view -h "$1"|awk 'BEGIN{FS=OFS="\t"}(substr($0,1,1)=="@"){print $0}(substr($0,1,1)!="@"){print $1,$2,$3,$4,$5,$6,$7,$8,$9,"*","*"}' > "$OUTPUTDIR/tmp_sorted.trimmed.sam"
97 | else
98 | samtools view -h "$1" $REGIONS|awk 'BEGIN{FS=OFS="\t"}(substr($0,1,1)=="@"){print $0}(substr($0,1,1)!="@"){print $1,$2,$3,$4,$5,$6,$7,$8,$9,"*","*"}' > "$OUTPUTDIR/tmp_sorted.trimmed.sam"
99 | fi
100 | else
101 | echo "Warning: BAM index not found: the input BAM is treated as name-sorted and will be sorted by coordinate first. This might take a while." >&2
102 | echo " If the input BAM has already been sorted by coordinate, please index it and re-run this command." >&2
103 | samtools sort -@ "$THREADS" "$1" > "$OUTPUTDIR/tmp_sorted.bam"
104 | samtools index -@ "$THREADS" "$OUTPUTDIR/tmp_sorted.bam"
105 | if [ ! "$REGIONS" ]; then
106 | samtools view -h "$OUTPUTDIR/tmp_sorted.bam"|awk 'BEGIN{FS=OFS="\t"}(substr($0,1,1)=="@"){print $0}(substr($0,1,1)!="@"){print $1,$2,$3,$4,$5,$6,$7,$8,$9,"*","*"}' > "$OUTPUTDIR/tmp_sorted.trimmed.sam"
107 | else
108 | samtools view -h "$OUTPUTDIR/tmp_sorted.bam" $REGIONS|awk 'BEGIN{FS=OFS="\t"}(substr($0,1,1)=="@"){print $0}(substr($0,1,1)!="@"){print $1,$2,$3,$4,$5,$6,$7,$8,$9,"*","*"}' > "$OUTPUTDIR/tmp_sorted.trimmed.sam"
109 | fi
110 | fi
111 |
112 | samtools view -S -b "$OUTPUTDIR/tmp_sorted.trimmed.sam" > "$OUTPUTDIR/""$SAMPLE"".trimmed.bam"
113 | samtools index -@ "$THREADS" "$OUTPUTDIR/""$SAMPLE"".trimmed.bam"
114 |
115 |
116 | rm "$OUTPUTDIR"/tmp_sorted*
--------------------------------------------------------------------------------
/bin/analysisWithLowReplicates.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use strict;
3 | use Data::Dumper;
4 | use List::Util qw(max min);
5 | use FindBin qw($RealBin);
6 | # Choose the winflat binary: the copy shipped under util/ if executable, otherwise one found on the PATH.
7 | my $winflatExec = "winflat";
8 | my $bundledWinflat = "$RealBin/util/winflat";
9 | if ( -x $bundledWinflat ) {
10 |     $winflatExec = $bundledWinflat;
11 | }
12 | elsif ( system('which winflat >/dev/null') != 0 ) {
13 |     # Neither the bundled binary nor a PATH entry is available.
14 |     print STDERR "FATAL: winflat utiltiy not found.\nSearched at:\n $RealBin/util/winflat\n and on the PATH\n";
15 |     exit 1;
16 | }
17 |
18 |
19 | # arrayEqual(\@x, \@y, $max): 1 when both arrays have the same length and their first $max elements are string-equal; an empty return otherwise.
20 | sub arrayEqual {
21 |     my ($left, $right, $limit) = @_;
22 |     return if scalar(@$left) != scalar(@$right);
23 |     my $idx = 0;
24 |     while ($idx < $limit && $idx < scalar(@$left)) {
25 |         return if $left->[$idx] ne $right->[$idx];
26 |         $idx++;
27 |     }
28 |     return 1;
29 | }
30 | sub separatedAB {
31 |     my ($values, $aCount, $bCount) = @_;
32 |     ## $values holds $aCount condition-A numbers followed by $bCount condition-B numbers.
33 |     ## Returns 1 when every A value is strictly above every B value, or every A value is
34 |     ## strictly below every B value; returns 0 otherwise.
35 |
36 |     my $firstA = $values->[0];
37 |     my $firstB = $values->[$aCount];
38 |     return 0 if $firstA == $firstB;    # neither above nor below
39 |
40 |     # The first A/B pair fixes the direction that every other pair has to follow.
41 |     my $aAbove = $firstA > $firstB;
42 |     my $bEnd = $aCount + $bCount;
43 |     for (my $i = 0; $i < $aCount; $i++) {
44 |         for (my $j = $aCount; $j < $bEnd; $j++) {
45 |             my ($x, $y) = ($values->[$i], $values->[$j]);
46 |             my $sameSide = $aAbove ? ($x > $y) : ($x < $y);
47 |             return 0 unless $sameSide;
48 |         }
49 |     }
50 |     return 1;
51 | }
52 |
53 |
54 | # Command-line state. Expected form: -A pooledA repA1 repA2 ... -B pooledB repB1 repB2 ...
55 | my $current = ""; # condition ('A' or 'B') that the following file names belong to
56 | # File handles and names of the pooled file of each condition.
57 | my $poolA;
58 | my $poolAname;
59 | my $poolB;
60 | my $poolBname;
61 | my @reps; # replicate file handles: all A replicates first, then all B replicates
62 | my @repsFileNames; # replicate file names, same order as @reps
63 | my $repsA = 0; # number of A replicates
64 | my $repsB = 0; # number of B replicates
65 | my @output; # report rows, filled by the main loop
66 |
67 | while (scalar @ARGV) {
68 | my $param = shift @ARGV;
69 | if ($param =~ m/^\-/) {
70 | if ($param eq '-A') {
71 | $current = 'A';
72 | }elsif ($param eq '-B') {
73 | $current = 'B';
74 | }else{
75 | print STDERR "Invalid parameter: $param\n";
76 | exit 1;
77 | }
78 | }else{
79 | if ($current eq "") {
80 | print STDERR "Invalid parameters. eg: -A pooledA/IR-nondir.txt repA1/IR-nondir.txt repA2/IR-nondir.txt -B pooledB/IR-nondir.txt repB1/IR-nondir.txt repB2/IR-nondir.txt\n";
81 | exit 1;
82 | }elsif ($current eq "A") {
83 | if ($poolA) {
84 | ## The first file after -A is the pooled file; every later one is a replicate.
85 | ## Insert an element into the array @reps, after the last A element.
86 | splice(@repsFileNames, $repsA, 0, $param);
87 | splice(@reps, $repsA, 0, undef);
88 | open $reps[$repsA], '<', $param or die "Can't open file $param for reading";
89 |
90 | $repsA++;
91 | }else{
92 | open $poolA, '<', $param or die "Can't open file $param for reading";
93 | $poolAname = $param;
94 | }
95 | }elsif ($current eq "B") {
96 | if ($poolB) {
97 | ## Add an element to the very end of the array @reps (ie: after all the A elements, and all the B elements)
98 | @repsFileNames[scalar @repsFileNames]=$param;
99 | open $reps[scalar @reps], '<', $param or die "Can't open file $param for reading";
100 | $repsB++
101 | }else{
102 | open $poolB, '<', $param or die "Can't open file $param for reading";
103 | $poolBname = $param;
104 | }
105 | }else{
106 | print STDERR "error in code\n";
107 | exit 2;
108 | }
109 | }
110 | }
111 | # Each condition needs its pooled file plus at least two replicate files.
112 | ( $repsA >= 2 ) or die "For condition A, must provide a pooled data file and at least 2 replicate files.";
113 | ( $repsB >= 2 ) or die "For condition B, must provide a pooled data file and at least 2 replicate files.";
114 |
115 | #print Dumper(\@repsFileNames);
116 | #print Dumper(\@reps);
117 | # One "<cond><n>-IRratio" column name per replicate. Assigning to $_ rewrites the stored file name into the '#Condition ...' line printed below.
118 | my @repsHeader;
119 | my $counter = 1;
120 | foreach(@repsFileNames[0 .. $repsA-1]) {
121 | $_ = '#Condition A replicate ' . $counter . ': ' . $_;
122 | push @repsHeader, "A$counter-IRratio";
123 | $counter++;
124 | }
125 | $counter = 1;
126 | foreach(@repsFileNames[$repsA .. scalar @repsFileNames - 1]) {
127 | $_ = '#Condition B replicate ' . $counter . ': ' . $_;
128 | push @repsHeader, "B$counter-IRratio";
129 | $counter++;
130 | }
131 | # Preamble: one '#' line per input file.
132 | print "#Condition A combined: $poolAname\n";
133 | print join("\n",@repsFileNames[0 .. $repsA-1]), "\n";
134 | print "#Condition B combined: $poolBname\n";
135 | print join("\n",@repsFileNames[$repsA .. scalar @repsFileNames - 1]), "\n";
136 | # Column header of the report.
137 | print join("\t",
138 | "Chr", "Start", "End", "Intron-GeneName/GeneID","-","Direction","ExcludedBases",
139 | "p-diff","p-increased","p-decreased",
140 | "A-IRratio","A-IRok","A-IntronCover","A-IntronDepth","A-SplicesMax","A-SplicesExact",
141 | "B-IRratio","B-IRok","B-IntronCover","B-IntronDepth","B-SplicesMax","B-SplicesExact",
142 | "replicates", @repsHeader
143 | ),"\n";
144 |
145 | # Walk the pooled A and B files in lock-step (one record per line), keep the records that pass the filters, then print them sorted by p-diff.
146 | my $lineNumber = 0;
147 | while(<$poolA>) {
148 |     my $pA = $_;
149 |     chomp $pA;
150 |     my $pB = <$poolB>;
151 |     chomp $pB;
152 |     $lineNumber++;
153 |
154 |     my @pA = split /\t/, $pA;
155 |     my @pB = split /\t/, $pB;
156 |
157 |     if (!( arrayEqual( \@pA, \@pB, 7) )) {
158 |         print STDERR "FATAL: Files do not list records in the same order with identical number of lines.\n";
159 |         print join("\t", @pA[0 .. 6]), "\n";
160 |         print join("\t", @pB[0 .. 6]), "\n";
161 |         exit 1;
162 |     }
163 |
164 |     ## Read the matching line of every replicate and collect its field 19 (the IRratio columns).
165 |     ## Lines are chomped like the pooled ones so that the field counts compared by arrayEqual agree.
166 |     my @repsIR;
167 |     foreach(@reps) {
168 |         my $repLine = <$_>;
169 |         $repLine = "" unless defined $repLine;
170 |         chomp $repLine;
171 |         my @fields = split /\t/, $repLine;
172 |         if (!( arrayEqual( \@pA, \@fields, 7) )) {
173 |             print STDERR "FATAL: Files do not list records in the same order with identical number of lines.\n";
174 |             print join("\t", @fields[0 .. 6]), "\n";
175 |             print join("\t", @pA[0 .. 6]), "\n";
176 |             exit 1;
177 |         }
178 |         push @repsIR, $fields[19];
179 |     }
180 |
181 |     ## Do the maths, are the replicates OK?
182 |     my $ok = ($pA[20] eq "-" || $pB[20] eq "-") && ($pA[8] >= 1 || $pB[8] >= 1) && ($pA[19] >= 0.01 || $pB[19] >= 0.01) && separatedAB(\@repsIR, $repsA, $repsB);
183 |
184 |     my $pValUp = 99;
185 |     my $pValDown = 99;
186 |
187 |     if ($ok) {
188 |         ## Check if both are sufficiently expressed (either the intron, or the splices)
189 |         if (( $pA[8] >= 1 || max($pA[16],$pA[17]) >= 10 ) && ( $pB[8] >= 1 || max($pB[16],$pB[17]) >= 10 )) {
190 |             ## calculate the winflat p-value of the difference (from the pooled IRdepth & junctionDepth).
191 |             open my $winflat, '-|', $winflatExec, '-xvalue', int($pA[8]+0.5), '-yvalue', int($pB[8]+0.5), '-diff', max($pA[16],$pA[17])+int($pA[8]+0.5), max($pB[16],$pB[17])+int($pB[8]+0.5);
192 |             my @winflat = <$winflat>;
193 |             close $winflat;
194 |             foreach (@winflat) {chomp; s/^.*\).*= *//; s/\W*$//};
195 |             $pValDown = $winflat[0];
196 |             $pValUp = $winflat[1];
197 |         }else{
198 |             ## Properly expressed in only one of the samples. Flag as interesting, but not differential IR.
199 |             $pValUp = 33;
200 |             $pValDown = 33;
201 |         }
202 |     }
203 |     my $pValDiff = min($pValUp, $pValDown);
204 |
205 |     if ($ok) {
206 |         ## SplicesMax is the larger of fields 16 and 17, the same pair the filters above use.
207 |         push @output, [@pA[0 .. 6],
208 |             $pValDiff, $pValUp, $pValDown,
209 |             $pA[19], $pA[20], $pA[7], $pA[8], max($pA[16],$pA[17]), $pA[18],
210 |             $pB[19], $pB[20], $pB[7], $pB[8], max($pB[16],$pB[17]), $pB[18],
211 |             "reps", @repsIR];
212 |     }
213 |
214 |     ## 0-based input fields and the report columns they feed:
215 |     ## 7 -> IntronCover, 8 -> IntronDepth,
216 |     ## 16/17 -> SplicesMax (larger of the two), 18 -> SplicesExact,
217 |     ## 19 -> IRratio, 20 -> IRok
218 | }
219 |
220 | foreach ( sort { $a->[7] <=> $b->[7] } @output ) {
221 |     print join("\t", @$_), "\n";
222 | }
223 |
--------------------------------------------------------------------------------
/bin/analysisWithNoReplicates.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use strict;
3 | use Data::Dumper;
4 | use List::Util qw(max min);
5 | use FindBin qw($RealBin);
6 | use sort 'stable';
7 |
8 | # Choose the winflat binary: the copy shipped under util/ if executable, otherwise one found on the PATH.
9 | my $winflatExec = "winflat";
10 | my $bundledWinflat = "$RealBin/util/winflat";
11 | if ( -x $bundledWinflat ) {
12 |     $winflatExec = $bundledWinflat;
13 | }
14 | elsif ( system('which winflat >/dev/null') != 0 ) {
15 |     # Neither the bundled binary nor a PATH entry is available.
16 |     print STDERR "FATAL: winflat utiltiy not found.\nSearched at:\n $RealBin/util/winflat\n and on the PATH\n";
17 |     exit 1;
18 | }
19 |
20 |
21 | # arrayEqual(\@x, \@y, $max): 1 when both arrays have the same length and their first $max elements are string-equal; an empty return otherwise.
22 | sub arrayEqual {
23 |     my ($left, $right, $limit) = @_;
24 |     return if scalar(@$left) != scalar(@$right);
25 |     my $idx = 0;
26 |     while ($idx < $limit && $idx < scalar(@$left)) {
27 |         return if $left->[$idx] ne $right->[$idx];
28 |         $idx++;
29 |     }
30 |     return 1;
31 | }
32 | sub separatedAB {
33 |     my ($values, $aCount, $bCount) = @_;
34 |     ## $values holds $aCount condition-A numbers followed by $bCount condition-B numbers.
35 |     ## Returns 1 when every A value is strictly above every B value, or every A value is
36 |     ## strictly below every B value; returns 0 otherwise.
37 |
38 |     my $firstA = $values->[0];
39 |     my $firstB = $values->[$aCount];
40 |     return 0 if $firstA == $firstB;    # neither above nor below
41 |
42 |     # The first A/B pair fixes the direction that every other pair has to follow.
43 |     my $aAbove = $firstA > $firstB;
44 |     my $bEnd = $aCount + $bCount;
45 |     for (my $i = 0; $i < $aCount; $i++) {
46 |         for (my $j = $aCount; $j < $bEnd; $j++) {
47 |             my ($x, $y) = ($values->[$i], $values->[$j]);
48 |             my $sameSide = $aAbove ? ($x > $y) : ($x < $y);
49 |             return 0 unless $sameSide;
50 |         }
51 |     }
52 |     return 1;
53 | }
54 |
55 |
56 | # Command-line state. The parser accepts -A file... -B file...; the checks after the loop allow one file per condition only.
57 | my $current = ""; # condition ('A' or 'B') that the following file names belong to
58 | # File handles and names of the (single) file of each condition.
59 | my $poolA;
60 | my $poolAname;
61 | my $poolB;
62 | my $poolBname;
63 | my @reps; # handles of any extra files: all A entries first, then all B entries
64 | my @repsFileNames; # names of the extra files, same order as @reps
65 | my $repsA = 0; # number of extra A files
66 | my $repsB = 0; # number of extra B files
67 | my @output; # report rows, filled by the main loop
68 |
69 | while (scalar @ARGV) {
70 | my $param = shift @ARGV;
71 | if ($param =~ m/^\-/) {
72 | if ($param eq '-A') {
73 | $current = 'A';
74 | }elsif ($param eq '-B') {
75 | $current = 'B';
76 | }else{
77 | print STDERR "Invalid parameter: $param\n";
78 | exit 1;
79 | }
80 | }else{
81 | if ($current eq "") {
82 | print STDERR "Invalid parameters. eg: -A pooledA/IR-nondir.txt repA1/IR-nondir.txt repA2/IR-nondir.txt -B pooledB/IR-nondir.txt repB1/IR-nondir.txt repB2/IR-nondir.txt\n";
83 | exit 1;
84 | }elsif ($current eq "A") {
85 | if ($poolA) {
86 | ## The first file after -A fills $poolA; every later one is recorded as an extra A file.
87 | ## Insert an element into the array @reps, after the last A element.
88 | splice(@repsFileNames, $repsA, 0, $param);
89 | splice(@reps, $repsA, 0, undef);
90 | open $reps[$repsA], '<', $param or die "Can't open file $param for reading";
91 |
92 | $repsA++;
93 | }else{
94 | open $poolA, '<', $param or die "Can't open file $param for reading";
95 | $poolAname = $param;
96 | }
97 | }elsif ($current eq "B") {
98 | if ($poolB) {
99 | ## Add an element to the very end of the array @reps (ie: after all the A elements, and all the B elements)
100 | @repsFileNames[scalar @repsFileNames]=$param;
101 | open $reps[scalar @reps], '<', $param or die "Can't open file $param for reading";
102 | $repsB++
103 | }else{
104 | open $poolB, '<', $param or die "Can't open file $param for reading";
105 | $poolBname = $param;
106 | }
107 | }else{
108 | print STDERR "error in code\n";
109 | exit 2;
110 | }
111 | }
112 | }
113 | # Exactly one file per condition: both must be present and no extra files are allowed.
114 | ( $poolA ) or die "For condition A, must provide a file.";
115 | ( $poolB ) or die "For condition B, must provide a file.";
116 | ( $repsA == 0 ) or die "For condition A, must provide a single file only.";
117 | ( $repsB == 0 ) or die "For condition B, must provide a single file only.";
118 |
119 |
120 | #print Dumper(\@repsFileNames);
121 | #print Dumper(\@reps);
122 | # Preamble: one '#' line per input file.
123 | print "#Condition A: $poolAname\n";
124 | print "#Condition B: $poolBname\n";
125 |
126 | # Column header of the report.
127 | print join("\t",
128 | "Chr", "Start", "End", "Intron-GeneName/GeneID","-","Direction","ExcludedBases",
129 | "p-diff","p-increased","p-decreased",
130 | "A-IRratio","A-IRok","A-IntronCover","A-IntronDepth","A-SplicesMax","A-SplicesExact",
131 | "B-IRratio","B-IRok","B-IntronCover","B-IntronDepth","B-SplicesMax","B-SplicesExact",
132 | ),"\n";
133 |
134 | # Walk the A and B files in lock-step (one record per line), keep the records that pass the filters, then print them sorted by p-diff.
135 | my $lineNumber = 0;
136 | while(<$poolA>) {
137 |     my $pA = $_;
138 |     chomp $pA;
139 |     my $pB = <$poolB>;
140 |     chomp $pB;
141 |     $lineNumber++;
142 |
143 |     my @pA = split /\t/, $pA;
144 |     my @pB = split /\t/, $pB;
145 |
146 |     if (!( arrayEqual( \@pA, \@pB, 7) )) {
147 |         print STDERR "FATAL: Files do not list records in the same order with identical number of lines.\n";
148 |         print join("\t", @pA[0 .. 6]), "\n";
149 |         print join("\t", @pB[0 .. 6]), "\n";
150 |
151 |         exit 1;
152 |     }
153 |
154 | # ## Loop through replicates, fill an array. (check the ~~ 0..6)
155 | # my @repsIR;
156 | # foreach(@reps) {
157 | # my @fields = split /\t/, <$_>;
158 | # if (!( arrayEqual( \@pA, \@fields, 7) )) {
159 | # print STDERR "FATAL: Files do not list records in the same order with identical number of lines.\n";
160 | # print join("\t", @fields[0 .. 6]), "\n";
161 | # print join("\t", @pA[0 .. 6]), "\n";
162 | # exit 1;
163 | # }
164 | # push @repsIR, @fields[19];
165 | # }
166 |
167 |     ## Do the maths, are the replicates OK?
168 | # my $ok = ($pA[20] eq "ok" || $pB[20] eq "ok") && ($pA[8] >= 1 || $pB[8] >= 1) && ($pA[19] >= 0.01 || $pB[19] >= 0.01) && separatedAB(\@repsIR, $repsA, $repsB);
169 |
170 |     # No replicates. Still do a form of check -- is this intron interesting?
171 |     my $ok = ($pA[20] eq "-" || $pB[20] eq "-") && ($pA[8] >= 1 || $pB[8] >= 1) && (max($pA[16],$pA[17]) >= 10 || max($pB[16],$pB[17]) >= 10) && ($pA[19] >= 0.01 || $pB[19] >= 0.01);
172 |
173 |
174 |     my $pValUp = 99;
175 |     my $pValDown = 99;
176 |
177 |     if ($ok) {
178 |         ## Check if both are sufficiently expressed (either the intron, or the splices)
179 |         if (( $pA[8] >= 1 || max($pA[16],$pA[17]) >= 10 ) && ( $pB[8] >= 1 || max($pB[16],$pB[17]) >= 10 )) {
180 |             ## calculate the winflat p-value of the difference (from the pooled IRdepth & junctionDepth).
181 |             #print $lineNumber, "\n";
182 |             open my $winflat, '-|', $winflatExec, '-xvalue', int($pA[8]+0.5), '-yvalue', int($pB[8]+0.5), '-diff', max($pA[16],$pA[17])+int($pA[8]+0.5), max($pB[16],$pB[17])+int($pB[8]+0.5);
183 |             my @winflat = <$winflat>;
184 |             close $winflat;
185 |             foreach (@winflat) {chomp; s/^.*\).*= *//; s/\W*$//};
186 |             $pValDown = $winflat[0];
187 |             $pValUp = $winflat[1];
188 |         }else{
189 |             ## Properly expressed in only one of the samples. Flag as interesting, but not differential IR.
190 |             $pValUp = 33;
191 |             $pValDown = 33;
192 |         }
193 |     }
194 |     my $pValDiff = min($pValUp, $pValDown);
195 |
196 |     if ($ok) {
197 |         # [ ] pushes an array ref onto the array. SplicesMax is the larger of fields 16 and 17, the same pair the filters above use.
198 |         push @output, [@pA[0 .. 6],
199 |             $pValDiff, $pValUp, $pValDown,
200 |             $pA[19], $pA[20], $pA[7], $pA[8], max($pA[16],$pA[17]), $pA[18],
201 |             $pB[19], $pB[20], $pB[7], $pB[8], max($pB[16],$pB[17]), $pB[18]
202 |             ];
203 |     }
204 |
205 |     ## 0-based input fields and the report columns they feed:
206 |     ## 7 -> IntronCover
207 |     ## 8 -> IntronDepth
208 |     ## 16/17 -> SplicesMax (larger of the two)
209 |     ## 18 -> SplicesExact
210 |     ## 19 -> IRratio
211 |     ## 20 -> IRok
212 | }
213 |
214 | foreach ( sort { $a->[7] <=> $b->[7] } @output ) {
215 |     print join("\t", @$_), "\n";
216 | }
217 | #print Dumper (\@output);
218 |
--------------------------------------------------------------------------------
/bin/util/IntronExclusion.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use strict;
3 |
4 | my %genes;
5 |
# Bump the per-gene counter held in the file-level %genes hash and return the
# new value, i.e. the 1-based ordinal of this intron within its gene.
sub intronNumber
{
	my ($geneKey) = @_;
	return ++$genes{$geneKey};
}
12 |
# Process one intron together with all features that overlap it.
#
#   $intron   - tab-separated record: chr, start, end, gene, score, direction
#   $overlaps - array ref of hashes { start, end, type }
#
# Overlap types 'A' and 'AE' only mark the intron as anti-sense "dirty"; an 'E'
# feature strictly spanning the whole intron marks it as a known exon; every
# other overlap is cut out of the intron.  If more than 40 bases and at least
# 70% of the intron remain, a BED12 line is printed to STDOUT, the two intron
# coordinates are written to OF1 and (for introns >= 110 bases) two 50-base
# windows near each end are written to OF50.
sub processIntron {
	my ($intron, $overlaps) = @_;
	my ($chr, $start, $end, $gene, $score, $dir) = split /\t/, $intron;

	my $len = $end-$start;
	# An empty or malformed record can never pass the length filter below.
	return if ($len <= 0);

	my $antisense_dirty = 0;
	my $excluded_by_exon = 0;

	# Remaining pieces of the intron; a piece with end == 0 has been deleted.
	my @intron_seg=();
	push @intron_seg, {'start' => $start, 'end' => $end};

	foreach my $overlap (@$overlaps) {
		# $overlap->{start, end, type}
		if ($overlap->{'type'} eq 'A') {
			$antisense_dirty = 2 if ($antisense_dirty < 2);
			#ignore anti-sense, but mark dirty
		}elsif ($overlap->{'type'} eq 'AE') {
			$antisense_dirty = 1 if ($antisense_dirty < 1);
			#ignore anti-sense, but mark dirty
		}elsif ($overlap->{'type'} eq 'E' && $overlap->{'start'} < $start && $overlap->{'end'} > $end) {
			#print STDERR "Found an exon/feature entirely covering this intron, skpping\n";
			$excluded_by_exon = 1;
		}else{
			# We want to exclude this segment from our intron.

			# Pieces created by splitting are collected here and appended after
			# the loop: adding to an array while foreach iterates over it is
			# not supported by Perl.  The new piece starts at the overlap's end,
			# so this overlap cannot affect it anyway.
			my @split_off;
			foreach my $seg (@intron_seg) {
				if ($seg->{'end'}==0) {
					# do nothing, this segment has already been deleted.
				}elsif ($overlap->{'end'} <= $seg->{'start'}) {
					# end is before the start, skip it
				}elsif ($overlap->{'start'} >= $seg->{'end'}) {
					# start is after the end, skip it
				}elsif ($overlap->{'start'} <= $seg->{'start'} && $overlap->{'end'} >= $seg->{'end'}) {
					# exclude entirely covers a segment (equality or beyond) then remove it
					$seg->{'start'}=0;
					$seg->{'end'}=0;
				}elsif ($overlap->{'start'} <= $seg->{'start'}) {
					# start is before the start, trim the start
					$seg->{'start'} = $overlap->{'end'};
				}elsif ($overlap->{'end'} >= $seg->{'end'}) {
					# end is after the end, trim the end
					$seg->{'end'} = $overlap->{'start'};
				}else{
					# start inside, end inside - split it
					push @split_off, {'start'=>$overlap->{'end'},'end'=>$seg->{'end'}};
					$seg->{'end'}=$overlap->{'start'};
				}
			}
			push @intron_seg, @split_off;
		}
	}
	# Procesed all overlaps.
	# result fragments, in no specific order, in @intron_seg.
	#print $intron, "\n";
	my $newlen = 0;
	my $newstart;
	my $newend = 0;
	my @sizes;
	my @starts;
	foreach my $seg (sort {$a->{'start'} <=> $b->{'start'}} @intron_seg) {
		if ($seg->{'end'} != 0) {
			# "defined" rather than truthiness, so a first segment that starts
			# at coordinate 0 is not overwritten by a later one.
			$newstart = $seg->{'start'} if (!defined $newstart);
			$newend = $seg->{'end'} if ($seg->{'end'} > $newend);
			push @starts, $seg->{'start'} - $newstart;
			push @sizes, $seg->{'end'}-$seg->{'start'};
		}
		#print join("\t", "", $seg->{'start'}, $seg->{'end'}), "\n";
		# Deleted segments contribute 0 - 0 here.
		$newlen += $seg->{'end'}-$seg->{'start'};
	}
	if ($newlen > 40 && ($newlen/$len) >= 0.7) {
		my $antisense_text = 'clean';
		if ($excluded_by_exon >= 1) {
			$antisense_text = 'known-exon';
			$antisense_text .= '+anti-near' if ($antisense_dirty >= 1);
			$antisense_text .= '+anti-over' if ($antisense_dirty >= 2);
		}else{
			$antisense_text = 'anti-near' if ($antisense_dirty >= 1);
			$antisense_text = 'anti-over' if ($antisense_dirty >= 2);
		}
		print join("\t", $chr, $newstart, $newend, join("/",$gene,intronNumber($gene),$start,$end,$len,$len-$newlen,$antisense_text), $score, $dir, $newstart, $newend, "255,0,0", scalar @sizes, join(",",@sizes), join(",",@starts)), "\n";

		if ($len >= 110) {
			print OF50 join("\t", $chr, $start+5, $start+55, "S", 0, $dir, $start+5, $start+55, "255,0,0", 1, 50, 0), "\n";
			print OF50 join("\t", $chr, $end-55, $end-5, "S", 0, $dir, $end-55, $end-5, "255,0,0", 1, 50, 0), "\n";
		}
#		if ($len >= 210) {
#			print OF50 join("\t", $chr, $start+55, $start+105, "E", 0, $dir, $start+55, $start+105, "255,0,0", 1, 50, 0), "\n";
#			print OF50 join("\t", $chr, $end-105, $end-55, "E", 0, $dir, $end-105, $end-55, "255,0,0", 1, 50, 0), "\n";
#		}
		print OF1 join("\t", $chr, $start, $dir), "\n";
		print OF1 join("\t", $chr, $end, $dir), "\n";
	}
}
106 |
107 |
108 |
#### MAIN ####
# Reads a bedtools-style intersection (intron record followed by one
# overlapping feature per line) from STDIN, groups consecutive lines that
# share the same intron and hands each group to processIntron().

if (! (scalar @ARGV == 2) ) {
	print STDERR "Usage: cat inputBedIntersection | ./thisTool.pl out2 out3 > out1\n";
	exit(1);
}

open(OF50, '>', $ARGV[0]) or die "Cannot open $ARGV[0] for writing: $!\n";
open(OF1, '>', $ARGV[1]) or die "Cannot open $ARGV[1] for writing: $!\n";

my $lastintron = '';
my @overlaps;
while (<STDIN>) {
	chomp;

	## Directional
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       134895  135807  E       0       -       5
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       135135  135900  E       0       -       98
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       135230  136040  X       0       -       238
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       136070  136410  X       0       -       340
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       136440  136710  X       0       -       270
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       136750  137100  X       0       -       350
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       137140  137790  X       0       -       480
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       137615  139384  E       0       -       5
	#1       736543  741178  RP11-206L10.8/ENSG00000230092/- 0       -       1       736253  736548  E       0       -       5
	#1       736543  741178  RP11-206L10.8/ENSG00000230092/- 0       -       1       736550  736680  X       0       -       130
	#1       736543  741178  RP11-206L10.8/ENSG00000230092/- 0       -       1       736710  736840  X       0       -       130

	## Non-Directional
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       134895  135807  E       5
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       135135  135900  E       98
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       135230  136040  X       238
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       136070  136410  X       340
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       136440  136710  X       270
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       136750  137100  X       350
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       137140  137790  X       480
	#1       135802  137620  AL627309.1/ENSG00000237683/-    0       -       1       137615  139384  E       5

	# Columns 1-6 identify the intron; column 7 is skipped; columns 8-10 are
	# the overlapping feature's start, end and type.
	my ($intron, $overlapstart, $overlapend, $overlaptype) = $_ =~ /^([^\t]+\t[^\t]+\t[^\t]+\t[^\t]+\t[^\t]+\t[^\t]+)\t[^\t]+\t([^\t]+)\t([^\t]+)\t([^\t]+)/;
	# Ignore blank or malformed lines instead of processing an undefined intron.
	next unless defined $intron;

	# A new intron begins: flush the overlaps gathered for the previous one
	# BEFORE recording the current overlap, so it is attributed to the intron
	# it belongs to and stored only once.
	if ($lastintron ne $intron) {
		if ($lastintron ne '') {
			processIntron($lastintron, \@overlaps);
			undef @overlaps;
		}
	}
	push @overlaps, {'start'=>$overlapstart, 'end'=>$overlapend, 'type'=>$overlaptype};

	$lastintron = $intron;

}
# Flush the final intron (nothing to do when the input was empty).
processIntron($lastintron, \@overlaps) if ($lastintron ne '');
162 |
--------------------------------------------------------------------------------
/bin/util/Mapability:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
# Build MapabilityExclusion.bed.gz for a genome:
#   1. generate overlapping synthetic reads from the FASTA (generateReadsError.pl),
#   2. align them back to the genome with STAR,
#   3. keep reads whose CIGAR is a full-length match and whose aligned
#      chromosome/position equal the ones encoded in the read name,
#   4. report regions covered by fewer than 5 such reads.
#
# Arguments: STAR executable, STAR genome dir, genome FASTA, directory holding
#            generateReadsError.pl, thread count, read length.

ulimit -Su 4000
export LANG=C
export LC_ALL=C

set -e

if [ "$#" -lt 6 ]; then
	echo "Usage: $0 STAREXEC STARGENOME FASTA LIBEXEC THREADS READ_LENGTH" >&2
	exit 1
fi

STAREXEC=$1
STARGENOME=$2
FA=$3
LIBEXEC=$4
THREADS=$5
READ_LENGTH=$6

TMPBED=tmp_$$

mkdir "$TMPBED"

# Prefer lzop for the per-chromosome temporary files when it is available.
TMPCMP=gzip
TMPEXT=gz
if [ -x /usr/bin/lzop ]; then
	TMPCMP=/usr/bin/lzop
	TMPEXT=lzo
fi


#echo ""
echo ""
date +"%b %d %T ... mapping genome fragments of length $READ_LENGTH back to genome..."

# Leave one thread for the read generator feeding STAR, when more than one is available.
if [ "$THREADS" -eq 1 ]; then
	STAR_THREADS=1
else
	STAR_THREADS=$(( THREADS - 1 ))
fi

"$STAREXEC" \
	--genomeDir "$STARGENOME" \
	--genomeLoad NoSharedMemory \
	--runThreadN "$STAR_THREADS" --outStd SAM --outSAMmode NoQS \
	--outSAMattributes None \
	--outFilterMultimapNmax 1 \
	--readFilesIn <("$LIBEXEC/generateReadsError.pl" "$READ_LENGTH" 10 < "$FA") \
	> genome_fragments.sam

date +"%b %d %T ... sorting aligned genome fragments..."

samtools sort -@ "$THREADS" genome_fragments.sam > genome_fragments.bam

date +"%b %d %T ... indexing aligned genome fragments..."

samtools index -@ "$THREADS" genome_fragments.bam

date +"%b %d %T ... filtering aligned genome fragments by chromosome/scaffold..."

## prevent histexpand for the character '!'
set +o histexpand

# One samtools|awk pipeline per chromosome.  Read names look like
# "RF!chr!pos", so splitting on tab or '!' puts the original chr/pos in
# fields 2-3 and the aligned chr/pos in fields 5-6; field 8 is the CIGAR.
cat "$STARGENOME/chrName.txt" | \
xargs --max-args 1 --max-procs "${THREADS}" -I{} bash -c "samtools view genome_fragments.bam {}|awk -v read_length=\"${READ_LENGTH}M\" -v tmpdir=\"${TMPBED}\" -v tmpcmp=\"${TMPCMP}\" -v tmpext=\"${TMPEXT}\" 'BEGIN{FS=\"[\\t!]\"; OFS=\"\\t\"}{if ((\$8 == read_length ) && (\$3 == \$6) && (\$2 == \$5)) {print \$5, \$6-1, \$6+69 | (tmpcmp \" -c1 > \" tmpdir \"/\" \$5 \".bed.\" tmpext ) }}END{close( (tmpcmp \" -c1 > \" tmpdir \"/\" \$5 \".bed.\" tmpext ))}'"

date +"%b %d %T ... merging filtered genome fragments..."

# Start from an empty file: the merge below appends, and a file left behind by
# an earlier aborted run would otherwise be mixed into the result.
: > genome_fragments.unsorted.bed

if [ "$TMPEXT" == "gz" ]; then
	find "$TMPBED" -type f -name "*.bed.""$TMPEXT"|xargs --max-args 1 zcat >> genome_fragments.unsorted.bed
elif [ "$TMPEXT" == "lzo" ]; then
	find "$TMPBED" -type f -name "*.bed.""$TMPEXT"|xargs --max-args 1 lzop -cdf >> genome_fragments.unsorted.bed
fi

date +"%b %d %T ... calculating regions for exclusion..."


bedtools genomecov -i genome_fragments.unsorted.bed -bga -g "$STARGENOME/chrNameLength.txt" | \
	awk 'BEGIN{FS=OFS="\t";chr="random"}($1!=chr){chr=$1}($1==chr){print}' | \
	awk 'BEGIN {FS=OFS="\t"} ($4 < 5) {print $1, $2, $3}' | \
	bedtools merge -i stdin | \
	sort -S5G -k1,1 -k2,2n -k3,3n| \
	gzip > MapabilityExclusion.bed.gz

#ls "$TMPBED"/*.bed."$TMPEXT" | xargs --max-args 1 --max-procs "$THREADS" -I{} bash -c "\"$TMPCMP\" -cd < {} | bedtools genomecov -i stdin -bga -g \"$CHRLEN\"| awk 'NR==1{chr=\$1;print}\$1==chr{print}' | awk 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"} (\$4 < 5) {print \$1, \$2, \$3}' | bedtools merge -i stdin > \"$TMPEXCL/\"{}.exclusion"

#find "$TMPBED" -type f -name "*.bed.""$TMPEXT"|cut -d"/" -f3|xargs --max-args 1 --max-procs "$THREADS" -I{} bash -c "\"$TMPCMP\" -cd < \"$TMPBED\"/{} | bedtools genomecov -i stdin -bga -g \"$CHRLEN\"| awk 'NR==1{chr=\$1;print}\$1==chr{print}' | awk 'BEGIN {FS=OFS=\"\\t\"} (\$4 < 5) {print \$1, \$2, \$3}' | bedtools merge -i stdin > \"$TMPEXCL\"/{}.exclusion"

#cat "$TMPEXCL"/*.exclusion | sort -S5G -k1,1 -k2,2n -k3,3n | gzip > MapabilityExclusion.bed.gz

date +"%b %d %T ... cleaning temporary files..."

# -f: a missing intermediate file must not abort the script (set -e) after the
# result has already been written.
find "$TMPBED" -type f -name "*.bed.""$TMPEXT"|xargs --max-args 1 --max-procs "$THREADS" rm -f
rm -f genome_fragments.*
rm -f Log.*
rm -f SJ.out.tab
rmdir "$TMPBED"
94 |
--------------------------------------------------------------------------------
/bin/util/adjust.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
# Read the table named by the first command-line argument, add a
# "<column>_BH_adjusted" column (Benjamini-Hochberg correction) for every
# column whose name matches the pattern "p.val", and write the result next to
# the input as "<input>_adjusted.tsv".
cli_args <- commandArgs(trailingOnly = TRUE)
input_path <- cli_args[1]
dat <- read.table(input_path, stringsAsFactors = FALSE, header = TRUE)

# Column list is fixed before any adjusted column is appended.
pval_columns <- grep(pattern = "p.val", x = colnames(dat), value = TRUE)
for (column_name in pval_columns) {
  adjusted_name <- paste0(column_name, "_BH_adjusted")
  dat[, adjusted_name] <- p.adjust(dat[, column_name], method = "BH")
}

output_path <- paste0(input_path, "_adjusted.tsv")
write.table(x = dat, file = output_path, row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")
9 |
10 |
--------------------------------------------------------------------------------
/bin/util/bash_utils.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ## Useful functions for IRFinder's utils
3 |
4 | export IRFINDER_BASH_UTILS_IMPORTED=1
5 | export VERSION=2.0.1
6 | export LC_ALL=C
7 | export LANG=C
8 |
9 |
# Print the IRFinder version line and terminate the calling shell.
function versionAlert(){
	printf '%s\n' "IRFinder version: $VERSION"
	exit
}
14 |
# Exit with status 1 (message on stderr) unless $1 is an existing regular file.
function checkFile() {
	if [ -f "${1}" ]; then
		return 0
	fi
	echo "Error: file $1 doesn't exists" >&2
	exit 1
}
21 |
# Exit with status 1 unless `samtools --version` reports version 1.4 or newer.
# Sets the globals STVERSTR, STVER, STVERMAIN and STVERMINOR as a side effect.
function checkSamtools() {
	STVERSTR=`samtools --version`
	# First output line is "samtools <version>"; unquoted echo flattens the lines.
	STVER=$(echo $STVERSTR|cut -d" " -f2)
	STVERMAIN=$(echo $STVER|cut -d"." -f1)
	STVERMINOR=$(echo $STVER|cut -d"." -f2)

	# Compare on the leading digits only (tolerates suffixes such as "10-3-gabc");
	# anything unparsable counts as 0 and is therefore rejected.
	local major="${STVERMAIN%%[!0-9]*}"
	local minor="${STVERMINOR%%[!0-9]*}"
	major="${major:-0}"
	minor="${minor:-0}"

	# The minor version only matters within major version 1; any later major
	# version (e.g. 2.0) satisfies the >=1.4 requirement.
	if (( 10#$major < 1 || ( 10#$major == 1 && 10#$minor < 4 ) )); then
		echo "Error: Samtools $STVER: version too old (>=1.4 required)." >&2
		exit 1
	fi
}
35 |
# Print the MemTotal value from /proc/meminfo divided by 1000 (integer division).
function getMem(){
	local total_kb
	total_kb=$(awk '($1 ~ /^MemTotal:/) {print $2}' < /proc/meminfo)
	echo $(( $total_kb/1000 ))
}
40 |
41 |
# Verify that the machine and the STAR executable are usable.
#   $1 - optional path to STAR; otherwise the existing $STAREXEC is used,
#        falling back to "STAR".
# Exits with status 2 when getMem reports less than 32000, or when
# "<STAR> --version" fails.  Sets the globals MEMM and STAREXEC.
function checkStar(){
	MEMM=$(getMem)
	if [ "${MEMM}" -lt 32000 ]; then
		echo "System limitation: Minimum required RAM is 32GB. This software uses STAR for RNA mapping. RAM requirement is approximately 30GB for the human genome." >&2
		echo "  RunModes: BAM and BuildRefDownload, may be completed on servers with more RAM." >&2
		exit 2
	fi

	# Explicit argument wins; an empty STAREXEC defaults to "STAR".
	if [[ -n "$1" ]]; then
		STAREXEC="$1"
	fi
	STAREXEC="${STAREXEC:-STAR}"

	if ! "$STAREXEC" --version &>/dev/null; then
		echo "Error: STAR version is too old. --version parameter returns an error. Minimum version of 2.4.0 required." >&2
		exit 2
	fi
}
61 |
62 |
# Verify that minimap2 is available and recent enough.
#   $1 - optional executable; otherwise the existing $MINIMAP_EXEC is used,
#        falling back to "minimap2".
# Exits with status 1 when the executable cannot be found and with status 2
# when its version is below 2.0.  Sets MINIMAP_EXEC and MINIMAP_VERSION.
function checkMinimap(){
	if [[ "$1" != "" ]]; then
		MINIMAP_EXEC="$1"
	fi
	if [[ "${MINIMAP_EXEC}" == "" ]]; then
		MINIMAP_EXEC="minimap2"
	fi
	# Quoted so that a path containing spaces is looked up as a single word.
	if ! command -v "$MINIMAP_EXEC" > /dev/null 2> /dev/null ; then
		echo "minimap2 not found ( executable: $MINIMAP_EXEC ). To use the RunMode Long, install it: https://github.com/lh3/minimap2 " >&2
		exit 1
	fi
	MINIMAP_VERSION=$("$MINIMAP_EXEC" --version)
	# Strip everything from the first "-" onwards, then compare numerically.
	# ">=" because 2.0 itself is the minimum accepted version.
	if [[ $(echo ${MINIMAP_VERSION/-*/} | awk '{if ( $1 >= 2.0 ) {print "ok" } else { print "no" }}') != "ok" ]]; then
		echo "Error: Minimap version is too old. Minimum version of 2.0.0 required. ${MINIMAP_VERSION} detected" >&2
		exit 2
	fi
}
80 |
81 |
# Exit with status 1 (message on stderr) unless suppa.py is found on the PATH.
function checkSuppa(){
	if which suppa.py >/dev/null 2>/dev/null ; then
		return 0
	fi
	echo "SUPPA2 not found ( executable: suppa.py ). To use the RunMode Diff, install it: https://github.com/comprna/SUPPA " >&2
	exit 1
}
88 |
# Exit with status 1 unless Rscript is on the PATH and the DESeq2 package is
# listed by installed.packages(); otherwise log the detected version.
# Sets the global DESeqVersion.
function checkDeseq(){
	if ! which Rscript > /dev/null 2>/dev/null; then
		echo "Rscript not found."
		exit 1
	fi

	# First attempt: take the version from the second field of the DESeq2 row
	# that follows a header row starting with "Version".
	DESeqVersion=$(Rscript -e 'installed.packages()' | awk 'BEGIN {v=0} $1=="Version" {v=1; } v==1 && $1 == "DESeq2" { gsub("\"", ""); print $2;v=0 } ' )

	# Second attempt: same idea, but keyed on the last field of each row.
	if [[ -z "${DESeqVersion}" ]]; then
		DESeqVersion=$(Rscript -e 'installed.packages()' | awk 'BEGIN {v=0} $NF=="Version" {v=1; } v==1 && $1 == "DESeq2" { gsub("\"", ""); print $NF;v=0 } ' )
	fi

	if [[ -z "${DESeqVersion}" ]]; then
		echo "DESeq2 not installed. "
		exit 1
	fi
	logger "DESeq2 version $DESeqVersion"
}
105 |
# Ensure the global THREADS holds a positive thread count.
# A value that is already set and non-zero is left alone.  Otherwise the count
# is taken from the "processor" lines of /proc/cpuinfo, then from the number of
# distinct physical-id/core-id pairs, and finally defaults to 1.
function setThreads(){
	if [[ "${THREADS}" == "" || $THREADS == 0 ]]; then
		THREADS=$(grep -c ^processor /proc/cpuinfo 2>/dev/null)
		# Logical OR of two quoted tests: the original piped one test into the
		# other and left $THREADS unquoted, so an empty value was never detected.
		if [[ -z "$THREADS" || "$THREADS" -eq 0 ]]; then
			THREADS=$(awk 'BEGIN {FS=":"} ($0 ~ /^physical id/ ) { printf $2 " --"} ($0 ~ /^core id/) {print $2}' < /proc/cpuinfo 2>/dev/null | sort -u | wc -l)
			if [[ -z "$THREADS" || "$THREADS" -eq 0 ]]; then
				THREADS=1
			fi
		fi
	fi
}
117 |
118 |
# Validate the reference directory given as $1: exit with status 1 when it is
# empty or when "$1/IRFinder/ref-cover.bed" is not a regular file.
function checkRef(){
	if [ -z "$1" ]; then
		echo "Argument error: -r is required." >&2
		exit 1
	fi

	local cover_bed="$1/IRFinder/ref-cover.bed"
	if [ -f "$cover_bed" ]; then
		return 0
	fi
	echo "Argument error: -r $1, Does not appear to be a valid IRFinder reference. Could not find $1/IRFinder/ref-cover.bed" >&2
	exit 1
}
129 |
# Prepare the output directory $1 and its "logs/" sub-directory.
# Exits with status 1 when the directory already holds
# "IRFinder-IR-nondir.txt" or when it does not exist and cannot be created.
function checkOutDir(){
	local OUTPUTDIR=$1

	# Directory missing: try to create it (together with logs/).
	if [ ! -d "$OUTPUTDIR" ]; then
		mkdir -p "$OUTPUTDIR/logs/"
		if [ -d "$OUTPUTDIR" ]; then
			return 0
		fi
		echo "Argument error: Output directory $OUTPUTDIR does not exist, and could not be created." >&2
		exit 1
	fi

	# Directory present: refuse to reuse it when it holds earlier results.
	if [ -e "$OUTPUTDIR/IRFinder-IR-nondir.txt" ]; then
		echo "Argument error: -d $OUTPUTDIR, output directory contains files from a previous IRFinder run. Will not overwrite." >&2
		exit 1
	fi
	mkdir -p "$OUTPUTDIR/logs/"
}
147 |
# Append a message to the IRFinder log file, echoing it to stdout as well when
# VERBOSE is "1".
#   logger init      - truncate the log and write the version banner
#   logger WORD...   - log the arguments joined by spaces
# The log is "$OUTPUTDIR/logs/irfinder.stdout" when OUTPUTDIR is set (the
# directory is created on demand), otherwise "./irfinder.stdout".
# Sets the globals LOGOUT and LOG_MESSAGE.  Every path expansion is quoted so
# that output directories containing spaces work.
function logger() {
	LOGOUT="./irfinder.stdout"
	if [[ "$OUTPUTDIR" != "" ]]; then
		if [ ! -d "${OUTPUTDIR}/logs/" ]; then
			mkdir -p "${OUTPUTDIR}/logs/"
		fi
		LOGOUT="$OUTPUTDIR/logs/irfinder.stdout"
	fi
	if [[ "$1" == "init" ]] && [[ $# == 1 ]]; then
		# Start a fresh log file.
		> "$LOGOUT"
		LOG_MESSAGE="\n --------------------\n| IRFinder v. $VERSION | \n --------------------\n"
	else
		LOG_MESSAGE="${@}"
	fi
	# echo -e so the "\n" escapes in the banner are expanded.
	if [[ "${VERBOSE}" == "1" ]] ; then
		echo -e "${LOG_MESSAGE}" | tee -ai "$LOGOUT"
	else
		echo -e "${LOG_MESSAGE}" >> "$LOGOUT"
	fi
}
168 |
# Log the run header (version, start time, run mode, user/host, working
# directory, reference) followed by one line per input file given as arguments.
# Nothing is logged when IRF_RUNMODE is already set, i.e. when the function is
# reached from the BAM mode after a FastQ or Long analysis.
# Sets the global START_MESSAGE to 1 after logging.
function startMessage(){
	## Check if the startMessage was called by the BAM mode after the FastQ or Long analysis
	if [[ "${IRF_RUNMODE}" == "" ]]; then
		logger "---"
		logger "IRFinder version: $VERSION "
		logger "IRFinder start: " "$(date)"
		logger "IRFinder runmode: $RUNMODE"
		logger "IRFinder user@host: $USER @ $HOSTNAME"
		logger "IRFinder working dir: " "$(pwd)"
		logger "IRFinder reference: $REF"
		n=1
		# "$@" keeps each file name intact even when it contains whitespace.
		for f in "$@"; do
			logger "IRFinder file ${n}: $f"
			n=$((n+1))
		done
		logger "---"
		START_MESSAGE=1
	fi
}
188 |
--------------------------------------------------------------------------------
/bin/util/bed-to-intron+exon.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | #0 1 2 3 4 5 6 7 8 9 10 11
4 | #1 11868 14409 ENST00000456328/processed_transcript/DDX11L1 0 + 11868 14409 0 3 359,109,1189, 0,744,1352,
5 |
6 |
# Split a BED12 transcript file (read from STDIN) into exon and intron records.
#   $ARGV[0] - output file for exons
#   $ARGV[1] - output file for introns
# Each output line is: chr, start, end, "gene_name/gene_id/direction", score,
# strand.  The name column of the input must end in ".../gene_id/gene_name".
use strict;

if (scalar @ARGV != 2) {
	print STDERR "Usage: $0 exon-output.bed intron-output.bed < transcripts.bed12\n";
	exit(1);
}

open(EXON, '>', $ARGV[0]) or die "Cannot open $ARGV[0] for writing: $!\n";
open(INTRON, '>', $ARGV[1]) or die "Cannot open $ARGV[1] for writing: $!\n";

while (<STDIN>) {
	chomp;
	my @f = split /\t/;

	my $trans_start = $f[1];

	# BED12 columns 11 and 12: comma-separated block sizes and block starts
	# (relative to the transcript start).
	my @length = split /,/, $f[10];
	my @start = split /,/, $f[11];
	my $chr = $f[0];
	my ($gene_id,$gene_name) = $f[3] =~ /\/([^\/]*)\/([^\/]*)$/;
	my $dir = $f[5];

	# Relative end of the previous exon; undefined before the first exon.
	my $last_end = undef;
	while (@length) {
		my $start = shift @start;
		my $length = shift @length;
		if (defined($last_end)) {
			#only output if the intron has length.
			if (($last_end+1) < ($start-1)) {
				print INTRON join("\t", $chr, $trans_start+$last_end, $trans_start+$start, "$gene_name/$gene_id/$dir", $f[4], $f[5]), "\n";
			}
		}
		#print EXON "$chr\t" . ($trans_start+$start) . "\t" . ($trans_start+$start+$length) . "\t$name\n";
		print EXON join("\t", $chr, $trans_start+$start, $trans_start+$start+$length, "$gene_name/$gene_id/$dir", $f[4], $f[5]), "\n";
		$last_end = $start+$length;
	}
}

close INTRON;
close EXON;
40 |
--------------------------------------------------------------------------------
/bin/util/deseq2.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
# Differential intron-retention analysis with DESeq2.
#
# Command-line arguments:
#   1  groups table with (at least) the columns Files, SampleName, Condition;
#      all output is written to the directory containing this file
#   2  IR-ratio threshold handed to DESeqDataSetFromIRFinder
#   3  warning filter handed to DESeqDataSetFromIRFinder
#   4  "1" enables the Cook's cutoff in results()
#   5  "1" enables independent filtering in results()
library(DESeq2)
library(ggplot2)
### Load DESeq2Constructor
# Locate this script from the "--file=" option Rscript puts in the full
# argument list, then source DESeq2Constructor.R from its parent directory.
initial.options <- commandArgs(trailingOnly = FALSE)
file.arg.name <- "--file="
script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
script.basename <- dirname(script.name)
other.name <- file.path(script.basename, "/../DESeq2Constructor.R")
source(other.name)
# source("~/git/IRFinder/bin/DESeq2Constructor.R")
### Read args
# setwd("~/test/IRFinder2/Diff/sing/")
# setwd("/media/lorencl/f4e6cecd-2fb8-4aa6-991c-620f450fd511/works/IRFinder2/Diff/test_9")
# args=c("./groups.tsv", "0.05", "0" , "0" ,"0")

args <- commandArgs(trailingOnly = TRUE)
groups=read.table(args[1], stringsAsFactors = FALSE, header = TRUE)
out_folder=dirname(args[1])
IRratio_thr=as.numeric(args[2])
warning_filter=args[3]
cooks_cutoff=args[4]=="1"
if (cooks_cutoff ){
  print("cooks_cutoff enabled")
} else {
  print("cooks_cutoff disabled")
}

independentFiltering= args[5]=="1"
if (independentFiltering ){
  print("independentFiltering enabled")
} else {
  print("independentFiltering disabled")
}

paths = as.vector(groups$Files)
experiment = groups[,c("SampleName", "Condition")]

experiment$Condition=factor(experiment$Condition)
rownames(experiment)=NULL

metaList=DESeqDataSetFromIRFinder(filePaths=paths, designMatrix=experiment, designFormula=~1, irratio_thr=IRratio_thr, warning_filter=warning_filter )

dds = metaList$DESeq2Object
design(dds) = ~Condition + Condition:IRFinder
conditions=levels(experiment$Condition)
dds = DESeq(dds)
resultsNames(dds)

# Raw counts; the columns are named "intronDepth.<sample>" and "maxSplice.<sample>".
nn_counts = counts(dds, normalized=FALSE)
global_dat = data.frame( intron = rownames(nn_counts) );

# Per-sample IR ratio = intronDepth / (intronDepth + maxSplice); 0/0 becomes 0.
for ( i in seq_len(nrow(experiment)) ) {
  s_name =experiment$SampleName[i]
  global_dat[,paste0("IRratio.",s_name)] = nn_counts[, paste0("intronDepth.", s_name)] / (nn_counts[, paste0("intronDepth.", s_name)] + nn_counts[, paste0("maxSplice.", s_name)] )
  global_dat[,paste0("IRratio.",s_name)][is.na(global_dat[,paste0("IRratio.",s_name)])]=0
}

# Mean IR ratio per condition. drop = FALSE keeps a data frame even when a
# condition has a single sample, which rowMeans() requires.
for ( i in seq_along(conditions) ){
  global_dat[,paste0(conditions[i], ".Mean.IRratio")]= rowMeans(global_dat[,paste0("IRratio.",experiment$SampleName[experiment$Condition == conditions[i]]), drop = FALSE])

}


# Every unordered pair of conditions. seq_len() yields an empty sequence when
# there is only one condition, whereas 1:(n-1) would iterate over c(1, 0).
for ( i in seq_len(length(conditions)-1) ){
  for (j in (i+1):length(conditions)){
    contrast_name=paste0(conditions[i], "_", conditions[j])
    res = results(dds, contrast=list(paste0("Condition", conditions[i] ,".IRFinderIR"),paste0("Condition", conditions[j] ,".IRFinderIR")), cooksCutoff=cooks_cutoff, independentFiltering=independentFiltering)
    # Introns without an adjusted p-value are treated as not significant.
    res$padj[is.na(res$padj)]=1
    global_dat[,paste0("DESeq2.padj.", contrast_name)]=res$padj
    global_dat[,paste0("DESeq2.baseMean.", contrast_name)]=res$baseMean
    global_dat[,paste0("DESeq2.log2FoldChange.", contrast_name)]=res$log2FoldChange
    if ( sum(res$padj < 0.05 ) > 0 ){
      # One box plot of the per-sample IR ratio for every significant intron.
      pdf(paste0(out_folder, "/", contrast_name, "_plot.pdf"))
      for ( name in rownames(res)[res$padj < 0.05]){
        dat = data.frame( name = experiment$SampleName, grp= experiment$Condition,
                          intron_depth = nn_counts[name, paste0("intronDepth.", experiment$SampleName)] ,
                          max_splice= nn_counts[name, paste0("maxSplice.", experiment$SampleName)])
        dat$IRratio = dat$intron_depth / ( dat$intron_depth+dat$max_splice)
        print(ggplot(dat)+geom_boxplot(aes(x=grp, fill=grp, y=IRratio )) + ggtitle(paste0(name, "\n", res[name, "padj"])))
      }
      dev.off()
    }
    write.table(res, file = paste0(out_folder, "/", contrast_name, "_DESeq2.tsv") ,sep="\t", quote = FALSE)
  }
}
rownames(global_dat)=global_dat$intron
global_dat=global_dat[,-1]
write.table(global_dat, file = paste0(out_folder, "/all_results_DESeq2.tsv") ,sep="\t", quote = FALSE)

quit(save = "no")
91 |
92 |
93 |
94 |
--------------------------------------------------------------------------------
/bin/util/generateReadsError.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use strict;
3 | #use Fcntl;
4 |
5 | #fcntl(stdout, F_SETPIPE_SZ, 1048576);
6 | #fcntl(fileno(STDOUT), 1031, 1048576);
7 |
my $generatedCount = 0;

# Command line: <read length> <stride>; the genome FASTA is read from STDIN.
my $readLen = $ARGV[0];
my $stride = $ARGV[1];

# Both values must be positive integers: with a missing or zero stride the
# sequence buffer would never shrink and read generation would not terminate.
unless (defined $readLen && $readLen =~ /^[1-9][0-9]*$/
		&& defined $stride && $stride =~ /^[1-9][0-9]*$/) {
	print STDERR "Usage: $0 readLength stride < genome.fa > reads.fa\n";
	exit(1);
}

# Toggles between 0 and 1 so that forward and reverse-complement reads alternate.
my $lastDirection = 0;
14 |
# Return the reverse complement of a DNA string.  A/C/G/T are complemented in
# either case; every other character is only reversed.
sub reverse_complement {
	my ($sequence) = @_;

	my $flipped = scalar reverse($sequence);
	$flipped =~ tr/ACGTacgt/TGCAtgca/;

	return $flipped;
}
25 |
26 |
# Three substitution tables; the one used for a read is chosen by
# (read counter modulo 3).  N is always left unchanged.
my @error = (
	{'A' => 'G', 'G' => 'T', 'T' => 'C', 'C' => 'A', 'N' => 'N' },
	{'A' => 'T', 'G' => 'C', 'T' => 'A', 'C' => 'G', 'N' => 'N' },
	{'A' => 'C', 'G' => 'A', 'T' => 'G', 'C' => 'T', 'N' => 'N' }
);

my $readCount = 0;

# Emit one synthetic read as a two-line FASTA record on STDOUT.
#   $read - read sequence (expected to contain only A, C, G, T, N)
#   $pos  - position recorded in the read name
#   $chr  - chromosome recorded in the read name
# Reads in which at least half of the bases are N are dropped.  Emitted reads
# carry one deterministic substitution and alternate between forward
# (">RF!chr!pos") and reverse-complement (">RR!chr!pos") orientation.
sub processRead( $ $ $ ) {
	my $read = shift;
	my $pos = shift;
	my $chr = shift;

	$readCount++;

	my $numN = $read =~ tr/N/N/;
	if ($numN * 2 < $readLen) {
		#only output reads where less than half of the read will be NNNNN

		# generate a single base error in a deterministic manner.
		# Offset 35 is kept for reads longer than 35 bases; shorter reads get
		# the error in the middle, because assigning through substr() beyond
		# the end of the string is a fatal error.
		my $errorAt = (length($read) > 35) ? 35 : int(length($read) / 2);
		substr($read,$errorAt,1) = $error[$readCount % 3]{substr($read,$errorAt,1)};

		if ($lastDirection == 0) {
			print ">RF!$chr!$pos\n";
			print "$read\n";
			$lastDirection = 1;
		}else{
			print ">RR!$chr!$pos\n";
			print reverse_complement($read) . "\n";
			$lastDirection = 0;
		}

	}
}
61 |
# Cut as many reads as possible from the front of a sequence buffer.
#   $_[0] - reference to the buffer string (consumed in place)
#   $_[1] - position of the buffer's first base
#   $_[2] - chromosome name
# While at least $readLen bases are buffered, the leading $readLen bases are
# passed to processRead() and the buffer advances by $stride bases.
# Returns the position of the first base left in the buffer.
sub processBuffer( $ $ $ ) {
	my ($bufferRef, $position, $chromosome) = @_;

	#while (length($$bufferRef) >= $readLen + $stride) {
	until (length($$bufferRef) < $readLen) {
		my $window = substr($$bufferRef, 0, $readLen);
		processRead($window, $position, $chromosome);
		$$bufferRef = substr($$bufferRef, $stride);
		$position += $stride;
	}
	return $position;
}
76 |
# Stream the genome FASTA from STDIN.  A header line starts a new chromosome
# (position restarts at 1, buffer is emptied); sequence lines are upper-cased,
# sanitised and appended to the buffer, from which reads are generated.
my $count = 0;
my $chr = '';
my $pos = 1;
my $buffer = '';

while (<STDIN>) {
	chomp;
	$count ++;
	if (m/^>/) {
		# Chromosome name = header text up to the first space, without ">".
		s/ .*$//;
		s/^>//;
		$chr = $_;
		$pos = 1;
		$buffer = '';
	}
	else{
		# Should allow into the buffer only valid letters.
		$_ = uc($_);
		s/[^ATCGN]/N/g;
		$buffer .= $_;
		$pos = processBuffer(\$buffer, $pos, $chr);
	}
#	if ($count > 10000) { exit; }
}
101 |
--------------------------------------------------------------------------------
/bin/util/gtf2bed-custom.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | # Copyright (c) 2011 Erik Aronesty (erik@q32.com)
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in
13 | # all copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | # THE SOFTWARE.
22 | #
23 | # ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT.
24 | #
25 | # https://code.google.com/p/ea-utils/source/browse/trunk/clipper/gtf2bed
26 |
# Convert a GTF/GFF annotation into BED12, one line per transcript.
# Usage: gtf2bed-custom.pl annotation.gtf[.gz|.zip] > transcripts.bed
# The BED name column is "transcript_id/biotype/gene_id/gene_name".
use Data::Dumper;
use sort 'stable';
use if $]<5.028, "sort", '_mergesort';  # Note the hash function is not stable on later versions of PERL. Must sort a hash on relevant values if stability is desired.

$in = shift @ARGV;
defined $in or die "Usage: $0 annotation.gtf[.gz|.zip] > transcripts.bed\n";

# Compressed input is read through a decompressor.  The list form of open
# passes the file name as a single argument without involving a shell.
if ($in =~ /\.gz$/) {
	open(IN, '-|', 'gunzip', '-c', $in) or die "Can't run gunzip on $in: $!\n";
} elsif ($in =~ /\.zip$/) {
	open(IN, '-|', 'unzip', '-p', $in) or die "Can't run unzip on $in: $!\n";
} else {
	open(IN, '<', $in) or die "Can't open $in: $!\n";
}

while (<IN>) {
	$gff = 2 if /^##gff-version 2/;
	$gff = 3 if /^##gff-version 3/;
	next if /^#/ && $gff;

	s/\s+$//;
	# 0-chr 1-src 2-feat 3-beg 4-end 5-scor 6-dir 7-fram 8-attr
	my @f = split /\t/;
	if ($gff) {
		# most ver 2's stick gene names in the id field
		($id) = $f[8]=~ /\bID="([^"]+)"/;
		# most ver 3's stick unquoted names in the name field
		($id) = $f[8]=~ /\bName=([^";]+)/ if !$id && $gff == 3;
	} else {
		($id) = $f[8]=~ /transcript_id "([^"]+)"/;
	}

	next unless $id && $f[0];

	if ($f[2] eq 'exon') {
		die "no position at exon on line $." if ! $f[3];
		# gff3 puts :\d in exons sometimes
		$id =~ s/:\d+$// if $gff == 3;
		push @{$exons{$id}}, \@f;
		# remember the first exon record seen for this transcript
		$trans{$id} = \@f if !$trans{$id};
	}# elsif ($f[2] eq 'start_codon') {
	#	#optional, output codon start/stop as "thick" region in bed
	#	$sc{$id}->[0] = $f[3];
	#}# elsif ($f[2] eq 'CDS') {
	#	#optional, output codon start/stop as "thick" region in bed
	#	push @{$cds{$id}}, \@f;
	#	# save lowest start
	#	$cdx{$id} = \@f if !$cdx{$id};
	#} elsif ($f[2] eq 'stop_codon') {
	#	$sc{$id}->[1] = $f[4];
	#}# elsif ($f[2] eq 'miRNA' ) {
	#	$trans{$id} = \@f if !$trans{$id};
	#	push @{$exons{$id}}, \@f;
	#}
}

for $id (
	# sort by chr then pos
	sort {
		$trans{$a}->[0] eq $trans{$b}->[0] ?
		$trans{$a}->[3] <=> $trans{$b}->[3] :
		$trans{$a}->[0] cmp $trans{$b}->[0]
	} (keys(%trans)) ) {
		# chromosome, strand and attribute string of the stored exon record
		my ($chr, $dir, $attr) = @{$trans{$id}}[0, 6, 8];
		# Coding start/end: only set when %sc holds an entry for this
		# transcript (the code that would fill %sc is commented out above).
		my ($cds, $cde);
		($cds, $cde) = @{$sc{$id}} if $sc{$id};
		my ($gene_name) = $attr=~ /gene_name "([^"]+)"/;
		my ($gene_id) = $attr=~ /gene_id "([^"]+)"/;
		# Biotype: first non-empty of transcript_biotype, gene_biotype,
		# transcript_type, gene_type.
		my ($trans_type) = $attr=~ /transcript_biotype "([^"]+)"/;
		if (!( $trans_type && length($trans_type)>0)) {
			($trans_type) = $attr=~ /gene_biotype "([^"]+)"/;
		}
		if (!( $trans_type && length($trans_type)>0)) {
			($trans_type) = $attr=~ /transcript_type "([^"]+)"/;
		}
		if (!( $trans_type && length($trans_type)>0)) {
			($trans_type) = $attr=~ /gene_type "([^"]+)"/;
		}
		# sort by pos
		my @ex = sort {
			$a->[3] <=> $b->[3]
		} @{$exons{$id}};

		my $beg = $ex[0][3];
		my $end = $ex[-1][4];

		if ($dir eq '-') {
			# swap
			$tmp=$cds;
			$cds=$cde;
			$cde=$tmp;
			$cds -= 2 if $cds;
			$cde += 2 if $cde;
		}

		# not specified, just use exons
		$cds = $beg if !$cds;
		$cde = $end if !$cde;

		# adjust start for bed
		--$beg; --$cds;

		my $exn = @ex;												# exon count
		my $exst = join ",", map {$_->[3]-$beg-1} @ex;				# exon start
		my $exsz = join ",", map {$_->[4]-$_->[3]+1} @ex;			# exon size

#		if (($trans_type eq 'protein_coding') || ($trans_type eq 'processed_transcript')) {
		#if (!(($trans_type eq 'protein_coding') || ($trans_type eq 'processed_transcript'))) {
		# added an extra comma to make it look exactly like ucsc's beds
		print "$chr\t$beg\t$end\t$id/$trans_type/$gene_id/$gene_name\t0\t$dir\t$cds\t$cde\t0\t$exn\t$exsz,\t$exst,\n";
#		}
}


close IN;
135 |
--------------------------------------------------------------------------------
/bin/util/irfinder:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/irfinder
--------------------------------------------------------------------------------
/bin/util/irfinder_cnn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/irfinder_cnn
--------------------------------------------------------------------------------
/bin/util/model/best_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/model/best_model.h5
--------------------------------------------------------------------------------
/bin/util/model/best_model.tflite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/model/best_model.tflite
--------------------------------------------------------------------------------
/bin/util/model/model_info.json:
--------------------------------------------------------------------------------
1 | {
2 | "Image directory": "/work/ritchieteam/EMT_irfinder/trainingshortread/",
3 | "Output directory": "depth15_ir0.1_noir0.cov50long_.01cov25allshortnondir_congruant_2_2021-05-20_12_01",
4 | "Validation split": 0.1,
5 | "Epochs": 500,
6 | "Batch size": 50,
7 | "Model json": null,
8 | "Image size": 256,
9 | "Number of colors": 0,
10 | "Seed": 123,
11 | "Threads": 5,
12 | "Dataset": {
13 | "counts": [
14 | [
15 | 1662,
16 | 185
17 | ],
18 | [
19 | 8164,
20 | 907
21 | ]
22 | ],
23 | "class_names": [
24 | "hIR",
25 | "noIR"
26 | ]
27 | }
28 | }
--------------------------------------------------------------------------------
/bin/util/trim:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/trim
--------------------------------------------------------------------------------
/bin/util/winflat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/winflat
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
# Accept only an empty first argument (global install), "local" or "remove";
# anything else prints the usage text on stderr and aborts.
if [[ "$1" != "local" ]] && [[ "$1" != "" ]] && [[ "$1" != "remove" ]]; then
    echo -e "Usage: \nGlobal installation:\tsudo install.sh\nLocal installation:\tinstall.sh local" >&2
    # The last entry documents the *global* uninstall (it used to be labelled "local").
    echo -e "Uninstall all:\tsudo install.sh remove\nUninstall local:\tinstall.sh remove local\nUninstall global:\tsudo install.sh remove global" >&2
    exit 1
fi
8 |
# Verify that every package named in the arguments is installed.
# The arguments are deliberately expanded unquoted so that a single
# space-separated string is split into individual package names.
# Packages are queried with dpkg when /proc/version mentions Ubuntu or Debian
# and with rpm otherwise. Each missing package is reported on stderr and the
# script exits with status 1 if at least one is missing.
function checkDependencies(){
    local distro=$(cat /proc/version )
    local deps=0
    local query="rpm -q"
    if [[ "${distro}" =~ Ubuntu|Debian ]] ; then
        query="dpkg -s"
    fi
    echo "Checking dependencies..."
    for pkg in $@ ; do
        # ${query} is left unquoted on purpose: it holds a command plus its flag.
        ${query} $pkg >/dev/null 2>/dev/null && continue
        echo "Dependency $pkg not found." >&2
        deps=1
    done
    if [ $deps -eq 1 ]; then
        exit 1
    fi
}
30 |
31 |
# Uninstall mode: "install.sh remove [local|global]".
# With no second argument both the global and the local installation are
# removed (the global part requires root).
if [[ $1 == "remove" ]]; then
    if [[ "$2" != "global" ]] && [[ "$2" != "local" ]] && [[ "$2" != "" ]] ; then
        echo "Error: $2 not recognized. Use 'local' or 'global' or leave empty" >&2
        exit 1
    fi
    if [[ "$2" == "" || "$2" == "global" ]]; then
        if [ "$EUID" -ne 0 ]; then
            # Report the failure on stderr with a non-zero status so callers
            # can detect that nothing was removed.
            echo "Please run as root" >&2
            exit 1
        fi
        if [ -d /usr/local/IRFinder ]; then
            rm -fr /usr/local/IRFinder /usr/bin/IRFinder
            echo "Removed system installation"
        else
            echo "Global installation of IRFinder not found"
        fi
    fi
    if [[ "$2" == "" || "$2" == "local" ]] ;then
        if [ -d ~/.local/IRFinder ] ; then
            rm -fr ~/.local/IRFinder ~/.local/bin/IRFinder
            echo "Removed local installation"
        else
            echo "Local installation of IRFinder not found."
        fi
    fi
    exit
fi
59 |
60 |
# A global installation writes to /usr/local and /usr/bin, so it needs root.
if [[ "${1}" != "local" ]]; then
    if [ "$EUID" -ne 0 ]; then
        echo "Please run as root or to install IRFinder locally call"
        echo "./install.sh local"
        echo ""
        exit 1
    fi
fi

checkDependencies "make bedtools samtools gzip gawk libboost-iostreams-dev zlib1g"


ORIGINAL_FOLDER=$(realpath "$PWD")
BASE_FOLDER=$(dirname "$(readlink -nf "$BASH_SOURCE")")

# Build one tool and copy its binary into bin/util.
#   $1: directory containing the makefile
#   $2: name of the binary produced by make
# Aborts the installation if the directory is missing, the build fails or the
# binary cannot be copied, instead of silently installing a stale binary.
function buildTool(){
    local src_dir=$1
    local binary=$2
    cd "$src_dir" || { echo "Error: cannot enter $src_dir" >&2; exit 1; }
    make clean
    make || { echo "Error: build of $binary failed" >&2; exit 1; }
    cp "./$binary" "$BASE_FOLDER/bin/util/$binary" || { echo "Error: cannot copy $binary" >&2; exit 1; }
    make clean
}

buildTool "$BASE_FOLDER/src/trim" trim
buildTool "$BASE_FOLDER/src/winflat" winflat
buildTool "$BASE_FOLDER/src/irfinder/Release" irfinder

cd "$BASE_FOLDER" || exit 1
chmod -R a+x ./bin
if [[ "${1}" == "local" ]];then
    if [ -d ~/.local/IRFinder ]; then
        rm -fr ~/.local/IRFinder ~/.local/bin/IRFinder
    fi
    cp -r "$BASE_FOLDER" ~/.local/IRFinder
    # ~/.local/bin does not exist on every system; create it so the link succeeds.
    mkdir -p ~/.local/bin
    # -f/-n replace a stale link left behind by a partially removed installation.
    ln -sfn "$(realpath ~/.local/IRFinder/bin/IRFinder)" ~/.local/bin/IRFinder
else
    if [ -d /usr/local/IRFinder ]; then
        rm -fr /usr/local/IRFinder /usr/bin/IRFinder
    fi
    cp -r "$BASE_FOLDER" /usr/local/IRFinder
    ln -sfn /usr/local/IRFinder/bin/IRFinder /usr/bin/IRFinder
fi

cd "$ORIGINAL_FOLDER"
107 |
108 |
# Warn (on stderr) about optional external tools that are not on the PATH.
# These are only hints: a missing tool does not abort the installation.
#   $1: executable to look for
#   $2: message printed when it is not found
function warnIfMissing(){
    which "$1" >/dev/null 2>&1 || echo "$2" >&2
}

warnIfMissing suppa.py "SUPPA2 not found. To use the RunMode Diff, install it: https://github.com/comprna/SUPPA "

warnIfMissing STAR "STAR not found. To use the RunMode FastQ and to produce your own mapability files during the reference build, install it: https://github.com/alexdobin/STAR "

warnIfMissing minimap2 "minimap2 not found. To use the RunMode Long, install it: https://github.com/lh3/minimap2 "
120 |
121 |
122 |
123 |
124 |
125 |
126 |
--------------------------------------------------------------------------------
/src/cnnfilter/cnnfilter/actions/resultgraph.py:
--------------------------------------------------------------------------------
1 | import sys,os
2 | import pandas as pd
3 | import numpy as np
# Replicate short names for every supported condition (first CLI argument).
SHORTNAMES_BY_CONDITION = {
    "EMT5m": ["T5moins_rep1", "T5moins_rep2", "T5moins_rep3"],
    "EMT5p": ["T5plus_rep1", "T5plus_rep2", "T5plus_rep3"],
    "EMT1p": ["T1plus_rep1", "T1plus_rep2", "T1plus_rep3"],
}

# Fail with a readable message instead of an IndexError / NameError when the
# condition is missing or unknown.
if len(sys.argv) < 2 or sys.argv[1] not in SHORTNAMES_BY_CONDITION:
    sys.exit("Usage: resultgraph.py {}".format("|".join(SHORTNAMES_BY_CONDITION)))

# Kept as a list of lists: the rest of the script indexes shortnames[0].
shortnames = [SHORTNAMES_BY_CONDITION[sys.argv[1]]]

folder = sys.argv[1]

# The first field of each prediction row looks like "<sample>\t<chr>:<start>-<end>",
# hence the regular-expression separator (which requires the python engine).
pred = pd.read_csv("prediction_for_EMT_test.tsv", sep="\t|-|:", skiprows=1, header=None, engine="python")
# Shift both ends by 15 before building the id that is matched against the
# per-sample tables.
pred[2] = pred[2] + 15
pred[3] = pred[3] - 15
pred["id"] = pred[1].apply(str) + ":" + pred[2].apply(str) + "-" + pred[3].apply(str)
pred["truelab"] = "no"
for sname in shortnames[0]:
    data = pd.read_csv(sname + ".tsv", delimiter="\t")
    data["id"] = data["Chr"].apply(str) + ":" + data["Start"].apply(str) + "-" + data["End"].apply(str)
    # Rows of the prediction table that belong to this sample.
    in_sample = pred[0] == sname
    # Look the label up by region id. The previous chained indexing assigned
    # to a temporary copy, so "truelab" was never actually updated.
    warnings_by_id = data.drop_duplicates("id").set_index("id")["Warnings"]
    labels = pred.loc[in_sample, "id"].map(warnings_by_id)
    # Regions without a match keep the default "no" label.
    pred.loc[in_sample, "truelab"] = labels.fillna("no")
34 | # for sit, sname in enumerate(shortnames[0]):
35 | # txt = numpy.loadtxt(os.path.join(folder, sname, "IRFinder-IR-nondir.txt"), delimiter="\t)
36 | # data = np.genfromtxt(fname=os.path.join(folder, sname, "IRFinder-IR-nondir.txt"), delimiter="\t", skip_header=1)
37 | # sarray = open(os.path.join(folder, sname, "IRFinder-IR-nondir-AI.txt"), "rt")
38 | # print(os.path.join(folder, sname, "IRFinder-IR-nondir-AI.txt"))
39 | #
40 | #
41 | # line = _array.readline()
42 | #
43 | # while True:
44 | # line = _array.readline()
45 | # if not line:
46 | # break
47 | #
48 | # irratio = float(a[19])
49 | # classpred= a[20]
50 | # irratio=float(a[3])
--------------------------------------------------------------------------------
/src/cnnfilter/cnnfilter/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # encoding: utf-8
3 | '''
4 | intron_scanner.intron_scanner -- shortdesc
5 |
6 | intron_scanner.intron_scanner is a description
7 |
8 | It defines classes_and_methods
9 |
10 | @author: user_name
11 |
12 | @copyright: 2020 organization_name. All rights reserved.
13 |
14 | @license: license
15 |
16 | @contact: user_email
17 | @deffield updated: Updated
18 | '''
19 |
20 | import sys
21 | import os
22 |
23 | LIBRARY_LOCATION = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
24 | sys.path.append(LIBRARY_LOCATION)
25 |
26 | from optparse import OptionParser
27 |
28 |
def title(title):
    """Print ``title`` centred inside a 50-column box drawn with dashes.

    The padding on each side is ``round((50 - len(title)) / 2)`` spaces, so
    the closing bar only lines up with the rules for even-length titles.
    """
    indent = ' ' * 5
    padding = ' ' * round((50 - len(title)) / 2)
    rule = " " + indent + ('-' * 50)
    boxed = indent + "|" + padding + title + padding + "|"
    # Leading empty entry -> blank line before the box; the two trailing
    # empty entries reproduce the blank lines printed after it.
    print("\n".join(["", rule, boxed, rule, "", ""]))
35 |
36 |
def print_help():
    """Print the usage line and the actions this entry point dispatches.

    Only actions handled by ``main`` are listed: ``extract`` and ``a2i`` were
    previously advertised here although ``main`` rejects them with
    "Action ... not recognized".
    """
    print("Usage: intron_scanner action [options]\n\nPossible actions:")
    print("\t- train: \t train a tensorflow model on a given set of images")
    print("\t- test: \t use a trained model to predict the class of a given set of images")
    print("\t- help: \t show this message")
    print("\n")
45 |
46 |
def main(argv=None):
    '''Command line entry point: dispatch the "train", "test" or "help" action.

    The action is always read from ``sys.argv[1]`` (default "help").
    ``argv`` holds the options that follow the action; when it is None it
    defaults to ``sys.argv[2:]``.

    Returns None when the action completes. If an exception is caught it is
    printed and, when ``__debug__`` is true, re-raised; otherwise 2 is
    returned.
    '''
    program_name = os.path.basename(sys.argv[0])
    title("Intron scanner")
    action = "help"
    if len(sys.argv) > 1:
        action = sys.argv[1]

    if argv is None:
        argv = sys.argv[2:]
    try:

        if action == "train":
            program_name = program_name + " " + 'training process'
            parser = OptionParser(usage="Usage: %prog train ",
                                  description="Train a neural network model on a given set of images")
            parser.add_option("-d", "--img-dir", dest="dir",
                              help="The directory containing the images, divided in subdirectories in according to the classes. Example: ./training/ -> ./training/labelA/ ./training/labelB/ ",
                              metavar="DIR", type="string")
            parser.add_option("-o", "--out", dest="outdir", help="Output directory [default: %default]", metavar="DIR",
                              type="string")
            parser.add_option("-b", "--batch", dest="batch", help="Number of images to load. [default: %default]",
                              metavar="INT", type="int")
            parser.add_option("-s", "--image-size", dest="size", help="Images size [default: %default]", metavar="INT",
                              type="int")
            # parser.add_option("-c", "--color-number", dest="colorN",
            #                   help="Number of color's dimension : 1 -> grey (read)\n\t\t\t 2 ->(read and annotation ??),\n\t\t\t3 -> 3colors [default: %default]",
            #                   metavar="INT", type="int")
            parser.add_option("-S", "--seed", dest="seed", help="Seed for the validation split [default: %default]",
                              metavar="INT", type="int")
            parser.add_option("-t", "--threads", dest="threads", help="Number of threads to use. [default: %default]",
                              metavar="INT", type="int")
            parser.add_option("-V", "--validation-split", dest="vsplit", metavar="FLOAT",
                              help="Fraction of the dataset to use for the validation [default: %default]",
                              type="float")
            parser.add_option("-e", "--epochs", dest="epochs", metavar="INT",
                              help="Number of training epoch [default: %default]", type="int")
            # NOTE(review): the help text announces a default of 0.1*epochs but
            # the default set below is -200; how the modeller interprets that
            # value is not visible here - confirm.
            parser.add_option("-E", "--earlystop", dest="earlystop", metavar="INT",
                              help="Number of patience epoch for earlystop , -1 for no earlystop [default: 0.1*epochs (10 percent of the total number of epochs]",
                              type="int")
            parser.add_option("-m", "--json-model", dest="model", metavar="FILE",
                              help="Load the tensorflow model from a json file [default: %default]", type="string")
            parser.add_option("-v", "--verbose", dest="verbose", action="count", help="Set tensorflow verbosity level")
            # set defaults
            # colorN has no command line option (it is commented out above), so
            # it always keeps this default. "epoch" and "ext" are not read
            # anywhere in this function.
            parser.set_defaults(outdir="./model/", size=256, colorN=0, verbose=0, epoch=10, earlystop=-200, ext="png",
                                model=None, vsplit=0.20, batch=50, seed=123, threads=None, epochs=10)
            # process options
            (opts, _) = parser.parse_args(argv)
            # Names (dest) of the options that must be provided: only "dir".
            required = "dir ".split()
            for r in required:
                if opts.__dict__[r] is None:
                    parser.error("Parameter %s required\n\nUse --help to get more information\n" % r)
            # Imported here so the model code is only loaded when an action
            # needs it.
            from cnnfilter.actions.models import IntronModeller
            modeller = IntronModeller(opts.verbose)
            # modeller.train(opts.dir, opts.outdir, opts.size, opts.batch, opts.vsplit, opts.seed, opts.epochs, opts.threads, opts.model, opts.colorN, opts.earlystop)
            modeller.train_from_array(opts.dir, opts.outdir, opts.size, opts.batch, opts.vsplit, opts.seed, opts.epochs,
                                      opts.threads, opts.model, opts.colorN, opts.earlystop)

        elif action == "test":
            program_name = program_name + " " + 'test process'
            parser = OptionParser(usage="Usage: %prog test ",
                                  description="Test a neural network model on a given set of images")
            parser.add_option("-d", "--img-dir", dest="dir",
                              help="The directory containing the images to predict. If they are in subdirectories, the subfolder name is used as true label.",
                              metavar="DIR", type="string")
            parser.add_option("-a", "--array-file", dest="array", metavar="FILE",
                              help="Use a file conaining the image information, produced by the extract process",
                              type="string")
            parser.add_option("-b", "--bed-file", dest="bed", metavar="FILE",
                              help="bed file associated to the array (-a). Can be a general tsv file. The last column is used as true label",
                              type="string")
            parser.add_option("-m", "--model-dir", dest="model", metavar="DIR",
                              help="Folder containing the model. It has to contain the files best_model.h5 and model_info.json [default: %default]",
                              type="string")
            # parser.add_option("-c", "--color-number", dest="colorN",
            #                   help="Number of color's dimension : 1 -> grey (read)\n\t\t\t 2 ->(read and annotation ??),\n\t\t\t3 -> 3colors [default: %default]",
            #                   metavar="INT", type="int")
            parser.add_option("-o", "--out", dest="out", help="Output file [default: %default]", metavar="FILE",
                              type="string")
            parser.add_option("-v", "--verbose", dest="verbose", action="count", help="Set tensorflow verbosity level")
            # set defaults
            parser.set_defaults(out="./predictions.tsv", verbose=0, dir=None, array=None, bed=None,
                                model="./model/")
            # process options
            (opts, _) = parser.parse_args(argv)
            # Exactly one of -d / -a must be given: the two sides are equal
            # when both are set or both are missing.
            if (opts.dir != None) == (opts.array != None):
                parser.error(
                    "Parameters -a and -d are mutual exclusive and at least one is required\n\nUse --help to get more information\n")
            from cnnfilter.actions.models import IntronModeller
            modeller = IntronModeller(opts.verbose)
            modeller.test(opts.dir, opts.array, opts.bed, opts.model, opts.out)
        elif action == "help" or action == "-h" or action == "--help":
            print_help()
        else:
            raise ValueError("Action %s not recognized." % action)

    except Exception as e:
        print(program_name + ": " + repr(e) + "\n")
        print("\n\nFor help use --help\n\n")
        print(e)
        # __debug__ is true unless Python runs with -O, so in a normal run the
        # exception is re-raised and the "return 2" below is not reached.
        if __debug__:
            raise e
        return 2


# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
154 |
155 |
156 | #json,gzip,time,tensorflow,matplotlib,numpy,sklearn,re,progress
--------------------------------------------------------------------------------
/src/cnnfilter/cnnfilter/model/best_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/src/cnnfilter/cnnfilter/model/best_model.h5
--------------------------------------------------------------------------------
/src/cnnfilter/cnnfilter/model/model_info.json:
--------------------------------------------------------------------------------
1 | {
2 | "Image directory": "/work/ritchieteam/EMT_irfinder/trainingshortread/",
3 | "Output directory": "depth15_ir0.1_noir0.cov50long_.01cov25allshortnondir_congruant_2_2021-05-20_12_01",
4 | "Validation split": 0.1,
5 | "Epochs": 500,
6 | "Batch size": 50,
7 | "Model json": null,
8 | "Image size": 256,
9 | "Number of colors": 0,
10 | "Seed": 123,
11 | "Threads": 5,
12 | "Dataset": {
13 | "counts": [
14 | [
15 | 1662,
16 | 185
17 | ],
18 | [
19 | 8164,
20 | 907
21 | ]
22 | ],
23 | "class_names": [
24 | "hIR",
25 | "noIR"
26 | ]
27 | }
28 | }
--------------------------------------------------------------------------------
/src/cnnfilter/cnnfilter/utils/reader.py:
--------------------------------------------------------------------------------
1 | import json
2 | import gzip
3 |
class ImageArray:
    """One record of an array file: ``<name>:<position>:<strand>\\t<json>``.

    Attributes set by the constructor:
      name     -- the first two ':'-separated parts of the first column
      strand   -- the third ':'-separated part of the first column
      region   -- the JSON-decoded second column
      is_valid -- True when the region has more than one row and at least one
                  row has a positive sum
    """

    def __init__(self, raw_line):
        columns = raw_line.split("\t")
        label_parts = columns[0].split(":")
        self.name = "{}:{}".format(label_parts[0], label_parts[1])
        self.strand = label_parts[2]
        self.region = json.loads(columns[1])
        # A region needs at least two rows and some coverage to be usable.
        has_rows = len(self.region) > 1
        self.is_valid = has_rows and max(sum(row) for row in self.region) > 0
14 |
15 |
class ImageArchive:
    """Iterate over an array file, optionally paired line by line with a bed file.

    Each iteration yields ``(bed_fields, ImageArray)``, where ``bed_fields`` is
    the tab-split bed line, or ``["NA"]`` when no bed file was given. Both
    files may be gzip-compressed (detected from a ``.gz`` suffix).
    """

    def _open_file(self, fname):
        """Open ``fname`` in text mode, transparently handling ``.gz`` files."""
        if fname.endswith(".gz"):
            return gzip.open(fname, "rt")
        else:
            return open(fname, "rt")

    def _count_lines(self, fname):
        """Return the number of lines in ``fname``."""
        # The context manager closes the handle even if reading fails.
        with self._open_file(fname) as handle:
            return sum(1 for _ in handle)

    def __init__(self, bed_file, array_file):
        """Open the archive.

        Raises AssertionError when a bed file is given and its line count
        differs from the array file's.
        """
        # Set every attribute up front so that close() / __del__ are safe even
        # when the constructor aborts below.
        self._array = None
        self._bed = None
        self._has_bed = False
        self._index = -1
        self._len = self._count_lines(array_file)
        if bed_file is not None:
            if self._count_lines(bed_file) != self._len:
                raise AssertionError("Files {} and {} have not the same number of lines!".format(array_file, bed_file))
            self._bed = self._open_file(bed_file)
            self._has_bed = True
        self._array = self._open_file(array_file)

    def __iter__(self):
        return self

    def __next__(self):
        self._index += 1
        if self._index >= self._len:
            raise StopIteration
        if self._has_bed:
            # The bed line is read (and split) before the array line is parsed.
            bed_fields = self._bed.readline().strip().split("\t")
            return bed_fields, ImageArray(self._array.readline())
        return ["NA"], ImageArray(self._array.readline())

    def __len__(self):
        return self._len

    def __del__(self):
        self.close()

    def close(self):
        """Close the underlying files; safe to call more than once."""
        # getattr keeps this safe for objects whose __init__ never ran.
        for handle in (getattr(self, "_array", None), getattr(self, "_bed", None)):
            if handle is not None:
                handle.close()

    def getIndex(self):
        """Return the zero-based index of the last record read (-1 before the first)."""
        return self._index
72 |
73 |
--------------------------------------------------------------------------------
/src/cnnfilter/testCNN/actions/extract.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.interpolate import interp1d
3 |
4 |
5 |
6 |
7 |
def getImageArrayFromRegion(region, img_size=None):
    '''
    Return the numpy array representing the image from a given region.

    Thin wrapper around generateImagesArrayGreyFromRegion that keeps only the
    first element of the pair it returns.
    '''
    reads_image, _unused_annotation = generateImagesArrayGreyFromRegion(region, img_size)
    return reads_image
17 |
18 |
19 |
def generateImagesArrayGreyFromRegion(region, img_size=None):
    '''
    Return the arrays composing an image from a given region.

    region   -- sequence of rows; columns 0 and 1 are used
    img_size -- number of rows of the produced image

    The values are scaled to 0..255 by the largest row sum. The first and last
    15 rows are copied unchanged, while the rows in between are resampled to
    img_size - 30 rows. The second channel is the sum of columns 1 and 0.

    Returns (image, None) where image is a float32 array of shape
    (img_size, 2, 1). Raises ArithmeticError when every row sums to zero.
    '''
    n_rows = len(region)

    peak = max([sum(row) for row in region])
    if peak == 0:
        raise ArithmeticError("Error! trying to generate an image with zero depth.")
    scaled = (np.array(region)[:, :] / peak) * 255

    # Up-sampling uses nearest-neighbour, down-sampling a zero-order hold.
    kind = "nearest" if n_rows < img_size else "zero"  # "linear"

    # Interpolators over the central part of each column (flanks excluded).
    core_positions = np.arange(0, n_rows - 30)
    resample_col0 = interp1d(core_positions, scaled[15:-15, 0], kind=kind)
    resample_col1 = interp1d(core_positions, scaled[15:-15, 1], kind=kind)

    # Positions (in core coordinates) sampled for the img_size - 30 central rows.
    targets = np.arange(0, (img_size - 30)) * ((n_rows - 31) / (img_size - 30))

    channel_a = np.concatenate([scaled[0:15, 0], resample_col0(targets), scaled[-15:, 0]])

    channel_b = np.concatenate([
        scaled[0:15, 1] + scaled[0:15, 0],
        resample_col1(targets) + channel_a[15:-15],
        scaled[-15:, 1] + scaled[-15:, 0],
    ])

    stacked = np.round(np.array([channel_a, channel_b])).astype("float32")
    image = np.expand_dims(np.rot90(stacked, k=3), axis=2)

    return image, None
60 |
61 |
--------------------------------------------------------------------------------
/src/cnnfilter/testCNN/actions/models.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
3 |
4 | import tflite_runtime.interpreter as tflite
5 | from scipy.special import softmax
6 |
7 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
8 |
9 | import json
10 | import numpy as np
11 |
12 | from utils.reader import ImageArchive
13 | from actions.extract import getImageArrayFromRegion
14 |
15 |
class IntronModeller():
    """Filter IRFinder results with a TensorFlow Lite model."""

    def __init__(self, verbosity=0):
        """Store the verbosity (capped at 3) and set TF_CPP_MIN_LOG_LEVEL to 3 - verbosity."""
        if verbosity > 3:
            verbosity = 3
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = "{}".format(3 - verbosity)
        self.verbosity = verbosity

    def test(self, img_dir=None, model_dir="./model/", colorN=3, imagemode=0):
        """Score the IRFinder results found in ``img_dir``.

        For each of "IRFinder-IR-dir" and "IRFinder-IR-nondir" whose
        "<name>-AI.txt" array file exists, write "<name>-val.txt" next to it.

        ``colorN`` and ``imagemode`` are accepted for backward compatibility
        and are not used. Raises ValueError when ``img_dir`` is None.
        """
        if img_dir is None:
            raise ValueError("img_dir is required.")
        self._load_model(model_dir)
        self._model_dir = model_dir

        for filesdir in ["IRFinder-IR-dir", "IRFinder-IR-nondir"]:
            arr_file = os.path.join(img_dir, filesdir + "-AI.txt")
            if not os.path.isfile(arr_file):
                continue
            output_file = os.path.join(img_dir, filesdir + "-val.txt")
            bed_file = os.path.join(img_dir, filesdir + ".txt")
            self._out_f = open(output_file, "wt")
            try:
                print("Processing " + filesdir + ".txt")
                self._test_irfinder_result(arr_file, bed_file)
                print("Done.")
            finally:
                # Close the output even when a prediction fails half-way.
                self._out_f.close()

    def _predict(self, arr):
        """Run the interpreter on one image and return the first output row."""
        self.model["Model"].reset_all_variables()
        self.model["Model"].set_tensor(self.model["InputDetails"][0]['index'], [arr])
        self.model["Model"].invoke()
        return self.model["Model"].get_tensor(self.model["OutputDetails"][0]['index'])[0]

    def _test_irfinder_result(self, arr_file, bed_file):
        """Write the header and every row predicted as "hIR" to the open output file.

        Column 4 of the header is renamed "CNN_IRscore"; in each kept row it
        is replaced by the first softmax score.
        """
        with open(bed_file, "rt") as ori_res:
            header = ori_res.readline().split("\t")
        header[4] = "CNN_IRscore"
        # The last header field still carries its newline, so none is added.
        self._out_f.write(("\t").join(header))
        arch = ImageArchive(bed_file, arr_file)
        try:
            for bed, arr in arch:
                if not arr.is_valid:
                    continue
                pred = self._predict(getImageArrayFromRegion(arr.region, self.model["Image size"]))
                score = softmax(pred)
                pred_lab = self.model["Dataset"]["class_names"][np.argmax(score)]
                if pred_lab == "hIR":
                    bed[4] = str(score[0])
                    self._out_f.write(("\t").join(bed) + "\n")
        finally:
            arch.close()
        return

    def _load_model(self, model_dir):
        """Load model_info.json and best_model.tflite from ``model_dir`` into self.model.

        Raises FileNotFoundError when either file is missing.
        """
        print("Loading the best_model in {}".format(model_dir))
        model_info_file = "{}/model_info.json".format(model_dir)
        model_file = "{}/best_model.tflite".format(model_dir)
        if not os.path.exists(model_info_file) or not os.path.exists(model_file):
            # Name the file that is actually required (the .tflite model).
            raise FileNotFoundError("Error! files model_info.json and best_model.tflite have to be in the model folder! ")
        with open(model_info_file, "rt") as fp:
            self.model = json.load(fp)
        self.model["Model"] = tflite.Interpreter(model_path=model_file)
        self.model["Model"].allocate_tensors()
        self.model["InputDetails"] = self.model["Model"].get_input_details()
        self.model["OutputDetails"] = self.model["Model"].get_output_details()
        print("Done.")
        return
89 |
90 |
91 |
92 |
93 |
94 |
95 |
--------------------------------------------------------------------------------
/src/cnnfilter/testCNN/irfinder_cnn.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # encoding: utf-8
3 | '''
4 | intron_scanner.intron_scanner -- shortdesc
5 |
6 | intron_scanner.intron_scanner is a description
7 |
8 | It defines classes_and_methods
9 |
10 | @author: user_name
11 |
12 | @copyright: 2020 organization_name. All rights reserved.
13 |
14 | @license: license
15 |
16 | @contact: user_email
17 | @deffield updated: Updated
18 | '''
19 |
20 | import sys
21 | import os
22 | import json
23 | import gzip
24 |
25 | LIBRARY_LOCATION = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
26 | sys.path.append(LIBRARY_LOCATION)
27 |
28 | from optparse import OptionParser
29 |
30 |
31 |
32 |
33 |
def title(title):
    """Print ``title`` centred in a 50-column dashed box, surrounded by blank lines.

    Each side is padded with ``round((50 - len(title)) / 2)`` spaces.
    """
    margin = ' ' * 5
    side = ' ' * round((50 - len(title)) / 2)
    border = "\n " + margin + '-' * 50
    # Same text as before, assembled from named pieces.
    banner = border + "\n" + margin + "|" + side + title + side + "|" + border + "\n\n"
    print(banner)
40 |
41 |
def print_help():
    """Print the usage line followed by the list of available actions."""
    help_lines = [
        "Usage: intron_scanner action [options]\n\nPossible actions:",
        "\t- test: \t use a trained model to predict the class of a given set of images",
        "\n",
    ]
    # One print call; joining with newlines gives the same output as printing
    # each entry separately.
    print("\n".join(help_lines))
47 |
48 |
def main(argv=None):
    '''Command line entry point of the CNN filter.

    ``argv`` is the list of command line options; when it is None the options
    are taken from ``sys.argv[1:]``. Positional arguments are ignored.

    Returns None on success. If an exception is caught it is printed and,
    when ``__debug__`` is true, re-raised; otherwise 2 is returned.
    '''
    program_name = os.path.basename(sys.argv[0])
    title("CNN filter")
    if argv is None:
        # Skip the program name: it is not an option.
        argv = sys.argv[1:]
    try:
        program_name = program_name + " " + 'test process'
        parser = OptionParser(usage="Usage: %prog test ",
                              description="Test a neural network model on a given set of images")
        parser.add_option("-d", "--img-dir", dest="dir",
                          help="The directory containing the IRFinder results to predict. ",
                          metavar="DIR", type="string")

        parser.add_option("-m", "--model-dir", dest="model", metavar="DIR",
                          help="Folder containing the model. It has to contain the files best_model.tflite and model_info.json [default: %default]",
                          type="string")
        # Kept so existing command lines still parse; the value is not used
        # (it was previously forwarded into the modeller's colorN parameter).
        parser.add_option("-o", "--out", dest="out", help="Output file [default: %default]", metavar="FILE",
                          type="string")
        parser.add_option("-v", "--verbose", dest="verbose", action="count", help="Set tensorflow verbosity level")
        # set defaults
        parser.set_defaults(out="./predictions.tsv", verbose=0, dir=None, model="./model/")
        # process options
        (opts, _) = parser.parse_args(argv)
        if opts.dir is None:
            parser.error(
                "Parameter -d is required\n\nUse --help to get more information\n")
        # Imported here so the model code is only loaded once the options are valid.
        from actions.models import IntronModeller
        modeller = IntronModeller(opts.verbose)
        modeller.test(opts.dir, opts.model)
    except Exception as e:
        print(program_name + ": " + repr(e) + "\n")
        print("\n\nFor help use --help\n\n")
        print(e)
        # __debug__ is true unless Python runs with -O.
        if __debug__:
            raise e
        return 2


# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())
93 |
94 |
95 | #json,gzip,time,tensorflow,matplotlib,numpy,sklearn,re,progress
--------------------------------------------------------------------------------
/src/cnnfilter/testCNN/model/best_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/src/cnnfilter/testCNN/model/best_model.h5
--------------------------------------------------------------------------------
/src/cnnfilter/testCNN/model/model_info.json:
--------------------------------------------------------------------------------
1 | {
2 | "Image directory": "/work/ritchieteam/EMT_irfinder/trainingshortread/",
3 | "Output directory": "depth15_ir0.1_noir0.cov50long_.01cov25allshortnondir_congruant_2_2021-05-20_12_01",
4 | "Validation split": 0.1,
5 | "Epochs": 500,
6 | "Batch size": 50,
7 | "Model json": null,
8 | "Image size": 256,
9 | "Number of colors": 0,
10 | "Seed": 123,
11 | "Threads": 5,
12 | "Dataset": {
13 | "counts": [
14 | [
15 | 1662,
16 | 185
17 | ],
18 | [
19 | 8164,
20 | 907
21 | ]
22 | ],
23 | "class_names": [
24 | "hIR",
25 | "noIR"
26 | ]
27 | }
28 | }
--------------------------------------------------------------------------------
/src/cnnfilter/testCNN/utils/reader.py:
--------------------------------------------------------------------------------
1 | import json
2 | import gzip
3 |
class ImageArray:
    """One record of an array file: ``<chr>:<start>-<end>:<strand>\\t<json>``.

    Attributes set by the constructor:
      name        -- "<chr>:<start>-<end>" exactly as written in the file
      intron_name -- the same interval with start + 15 and end - 15
      strand      -- third ':'-separated part of the first column
      region      -- JSON-decoded second column
      is_valid    -- True when the region has more than one row and at least
                     one row has a positive sum
    """

    def __init__(self, raw_line):
        raw_line = raw_line.split("\t")
        raw_name = raw_line[0].split(":")
        self.name = raw_name[0] + ":" + raw_name[1]
        pos = raw_name[1].split("-")
        self.intron_name = "{}:{}-{}".format(raw_name[0], int(pos[0]) + 15, int(pos[1]) - 15)
        self.strand = raw_name[2]
        self.region = json.loads(raw_line[1])
        self.is_valid = len(self.region) > 1
        if self.is_valid:
            self.is_valid = max([sum(i) for i in self.region]) > 0


class ImageArchive:
    """Iterate over an array file, pairing each record with its bed line.

    Each iteration yields ``(bed_fields, ImageArray)``. When a bed file is
    given, bed lines are consumed until one whose first three columns match
    the record's ``intron_name``; lines in between are skipped. Without a bed
    file ``bed_fields`` is ``["NA"]``. Files ending in ``.gz`` are read
    through gzip.
    """

    def _open_file(self, fname):
        """Open ``fname`` in text mode, transparently handling ``.gz`` files."""
        if fname.endswith(".gz"):
            return gzip.open(fname, "rt")
        else:
            return open(fname, "rt")

    def _count_lines(self, fname):
        """Return the number of lines in ``fname``."""
        with self._open_file(fname) as handle:
            return sum(1 for _ in handle)

    def __init__(self, bed_file, array_file):
        # Set every attribute first so close() / __del__ are safe even if
        # opening one of the files fails.
        self._array = None
        self._bed = None
        self._has_bed = False
        self._index = -1
        self._len = self._count_lines(array_file)
        if bed_file is not None:
            self._bed = self._open_file(bed_file)
            self._has_bed = True
        self._array = self._open_file(array_file)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next ``(bed_fields, ImageArray)`` pair.

        Raises ValueError when the bed file ends before a line matching the
        current record is found (previously an opaque IndexError).
        """
        self._index += 1
        if self._index >= self._len:
            raise StopIteration
        img_array = ImageArray(self._array.readline())
        if not self._has_bed:
            return ["NA"], img_array
        while True:
            raw = self._bed.readline()
            if raw == "":
                # readline() only returns "" at end of file.
                raise ValueError(
                    "No line matching {} found in the bed file".format(img_array.intron_name))
            bed_line = raw.strip().split("\t")
            # Lines with fewer than three columns cannot match; skip them.
            if len(bed_line) >= 3 and bed_line[0] + ":" + bed_line[1] + "-" + bed_line[2] == img_array.intron_name:
                return bed_line, img_array

    def __len__(self):
        return self._len

    def __del__(self):
        self.close()

    def close(self):
        """Close the underlying files; safe to call more than once."""
        for handle in (getattr(self, "_array", None), getattr(self, "_bed", None)):
            if handle is not None:
                handle.close()

    def getIndex(self):
        """Return the zero-based index of the last record read (-1 before the first)."""
        return self._index
76 |
77 |
--------------------------------------------------------------------------------
/src/irfinder/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | irfinder
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.cdt.managedbuilder.core.genmakebuilder
10 | clean,full,incremental,
11 |
12 |
13 |
14 |
15 | org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder
16 | full,incremental,
17 |
18 |
19 |
20 |
21 |
22 | org.eclipse.cdt.core.cnature
23 | org.eclipse.cdt.core.ccnature
24 | org.eclipse.cdt.managedbuilder.core.managedBuildNature
25 | org.eclipse.cdt.managedbuilder.core.ScannerConfigNature
26 |
27 |
28 |
--------------------------------------------------------------------------------
/src/irfinder/.settings/language.settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/src/irfinder/.settings/org.eclipse.cdt.core.prefs:
--------------------------------------------------------------------------------
1 | doxygen/doxygen_new_line_after_brief=true
2 | doxygen/doxygen_use_brief_tag=false
3 | doxygen/doxygen_use_javadoc_tags=true
4 | doxygen/doxygen_use_pre_tag=false
5 | doxygen/doxygen_use_structural_commands=false
6 | eclipse.preferences.version=1
7 |
--------------------------------------------------------------------------------
/src/irfinder/Release/makefile:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Automatically-generated file. Do not edit!
3 | ################################################################################
4 |
5 | -include ../makefile.init
6 |
7 | RM := rm -rf
8 |
9 | # All of the sources participating in the build are defined here
10 | -include sources.mk
11 | -include src/Utils/subdir.mk
12 | -include src/ReadBlock/subdir.mk
13 | -include src/Blocks/subdir.mk
14 | -include src/subdir.mk
15 | -include subdir.mk
16 | -include objects.mk
17 |
18 | ifneq ($(MAKECMDGOALS),clean)
19 | ifneq ($(strip $(CC_DEPS)),)
20 | -include $(CC_DEPS)
21 | endif
22 | ifneq ($(strip $(C++_DEPS)),)
23 | -include $(C++_DEPS)
24 | endif
25 | ifneq ($(strip $(C_UPPER_DEPS)),)
26 | -include $(C_UPPER_DEPS)
27 | endif
28 | ifneq ($(strip $(CXX_DEPS)),)
29 | -include $(CXX_DEPS)
30 | endif
31 | ifneq ($(strip $(CPP_DEPS)),)
32 | -include $(CPP_DEPS)
33 | endif
34 | ifneq ($(strip $(C_DEPS)),)
35 | -include $(C_DEPS)
36 | endif
37 | endif
38 |
39 | -include ../makefile.defs
40 |
41 | # Add inputs and outputs from these tool invocations to the build variables
42 |
43 | # All Target
44 | all: irfinder
45 |
46 | # Tool invocations
47 | irfinder: $(OBJS) $(USER_OBJS)
48 | @echo 'Building target: $@'
49 | @echo 'Invoking: GCC C++ Linker'
50 | g++ -o "irfinder" $(OBJS) $(USER_OBJS) $(LIBS)
51 | @echo 'Finished building target: $@'
52 | @echo ' '
53 |
54 | # Other Targets
55 | clean:
56 | -$(RM) $(CC_DEPS)$(C++_DEPS)$(EXECUTABLES)$(C_UPPER_DEPS)$(CXX_DEPS)$(OBJS)$(CPP_DEPS)$(C_DEPS) irfinder
57 | -@echo ' '
58 |
59 | .PHONY: all clean dependents
60 |
61 | -include ../makefile.targets
62 |
--------------------------------------------------------------------------------
/src/irfinder/Release/objects.mk:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Automatically-generated file. Do not edit!
3 | ################################################################################
4 |
5 | USER_OBJS :=
6 |
7 | LIBS := -lboost_iostreams
8 |
9 |
--------------------------------------------------------------------------------
/src/irfinder/Release/sources.mk:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Automatically-generated file. Do not edit!
3 | ################################################################################
4 |
5 | C_UPPER_SRCS :=
6 | CXX_SRCS :=
7 | C++_SRCS :=
8 | OBJ_SRCS :=
9 | CC_SRCS :=
10 | ASM_SRCS :=
11 | CPP_SRCS :=
12 | C_SRCS :=
13 | O_SRCS :=
14 | S_UPPER_SRCS :=
15 | CC_DEPS :=
16 | C++_DEPS :=
17 | EXECUTABLES :=
18 | C_UPPER_DEPS :=
19 | CXX_DEPS :=
20 | OBJS :=
21 | CPP_DEPS :=
22 | C_DEPS :=
23 |
24 | # Every subdirectory with source files must be described here
25 | SUBDIRS := \
26 | src/Blocks \
27 | src \
28 | src/ReadBlock \
29 | src/Utils \
30 |
31 |
--------------------------------------------------------------------------------
/src/irfinder/Release/src/Blocks/subdir.mk:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Automatically-generated file. Do not edit!
3 | ################################################################################
4 |
5 | # Add inputs and outputs from these tool invocations to the build variables
6 | CPP_SRCS += \
7 | ../src/Blocks/BAM2blocks.cpp \
8 | ../src/Blocks/CoverageBlock.cpp \
9 | ../src/Blocks/FragmentBlocks.cpp
10 |
11 | OBJS += \
12 | ./src/Blocks/BAM2blocks.o \
13 | ./src/Blocks/CoverageBlock.o \
14 | ./src/Blocks/FragmentBlocks.o
15 |
16 | CPP_DEPS += \
17 | ./src/Blocks/BAM2blocks.d \
18 | ./src/Blocks/CoverageBlock.d \
19 | ./src/Blocks/FragmentBlocks.d
20 |
21 |
22 | # Each subdirectory must supply rules for building sources it contributes
23 | src/Blocks/%.o: ../src/Blocks/%.cpp
24 | @echo 'Building file: $<'
25 | @echo 'Invoking: GCC C++ Compiler'
26 | g++ -O3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@)" -o "$@" "$<"
27 | @echo 'Finished building: $<'
28 | @echo ' '
29 |
30 |
31 |
--------------------------------------------------------------------------------
/src/irfinder/Release/src/ReadBlock/subdir.mk:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Automatically-generated file. Do not edit!
3 | ################################################################################
4 |
5 | # Add inputs and outputs from these tool invocations to the build variables
6 | CPP_SRCS += \
7 | ../src/ReadBlock/CoverageBlocks.cpp \
8 | ../src/ReadBlock/ReadBlockProcessor.cpp
9 |
10 | OBJS += \
11 | ./src/ReadBlock/CoverageBlocks.o \
12 | ./src/ReadBlock/ReadBlockProcessor.o
13 |
14 | CPP_DEPS += \
15 | ./src/ReadBlock/CoverageBlocks.d \
16 | ./src/ReadBlock/ReadBlockProcessor.d
17 |
18 |
19 | # Each subdirectory must supply rules for building sources it contributes
20 | src/ReadBlock/%.o: ../src/ReadBlock/%.cpp
21 | @echo 'Building file: $<'
22 | @echo 'Invoking: GCC C++ Compiler'
23 | g++ -O3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@)" -o "$@" "$<"
24 | @echo 'Finished building: $<'
25 | @echo ' '
26 |
27 |
28 |
--------------------------------------------------------------------------------
/src/irfinder/Release/src/Utils/subdir.mk:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Automatically-generated file. Do not edit!
3 | ################################################################################
4 |
5 | # Add inputs and outputs from these tool invocations to the build variables
6 | CPP_SRCS += \
7 | ../src/Utils/crc32.cpp
8 |
9 | OBJS += \
10 | ./src/Utils/crc32.o
11 |
12 | CPP_DEPS += \
13 | ./src/Utils/crc32.d
14 |
15 |
16 | # Each subdirectory must supply rules for building sources it contributes
17 | src/Utils/%.o: ../src/Utils/%.cpp
18 | @echo 'Building file: $<'
19 | @echo 'Invoking: GCC C++ Compiler'
20 | g++ -O3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@)" -o "$@" "$<"
21 | @echo 'Finished building: $<'
22 | @echo ' '
23 |
24 |
25 |
--------------------------------------------------------------------------------
/src/irfinder/Release/src/subdir.mk:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Automatically-generated file. Do not edit!
3 | ################################################################################
4 |
5 | # Add inputs and outputs from these tool invocations to the build variables
6 | CPP_SRCS += \
7 | ../src/IRFinder2.cpp
8 |
9 | OBJS += \
10 | ./src/IRFinder2.o
11 |
12 | CPP_DEPS += \
13 | ./src/IRFinder2.d
14 |
15 |
16 | # Each subdirectory must supply rules for building sources it contributes
17 | src/%.o: ../src/%.cpp
18 | @echo 'Building file: $<'
19 | @echo 'Invoking: GCC C++ Compiler'
20 | g++ -O3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@)" -o "$@" "$<"
21 | @echo 'Finished building: $<'
22 | @echo ' '
23 |
24 |
25 |
--------------------------------------------------------------------------------
/src/irfinder/src/Blocks/BAM2blocks.h:
--------------------------------------------------------------------------------
1 | #ifndef CODE_BAM2BLOCKS
2 | #define CODE_BAM2BLOCKS
3 |
4 | #include "FragmentBlocks.h"
5 | #include
6 | #include
7 | #include
8 |
9 | /* Little Endian .. for big endian each group of 4 bytes needs to be reversed before individual members are accessed. */
10 | // std c11 allows anonymous struct/union. -Wall may give a warning as non-portable to older c++ standards.
11 |
12 |
13 |
// Declares the state used to read BAM input: raw record layouts, run
// statistics, buffered reads and the callback lists. Only declarations are
// visible in this header; the member functions are defined elsewhere.
14 | class BAM2blocks {
15 |
16 | // TODO -- are structs best hidden inside the class? Does doing so push them into namespace of the class only?
// One alignment record: a 36-byte fixed head (raw bytes in c, overlaid with
// the named fields below), then buffers for the read name and CIGAR data.
17 | struct bam_read_core {
18 | union {
19 | char c[36];
20 | struct {
21 | int32_t block_size;
22 | int32_t refID;
23 | int32_t pos;
24 | uint8_t l_read_name;
25 | uint8_t mapq;
26 | uint16_t bin;
27 | uint16_t n_cigar_op;
28 | uint16_t flag;
29 | int32_t l_seq;
30 | int32_t next_refID;
31 | int32_t next_pos;
32 | int32_t tlen;
33 | }; // anonymous struct to allow easy access to members.
34 | };
35 | char read_name[256];
// The same 20000 bytes viewed either as raw chars or as 5000 int32 values.
36 | union {
37 | char cigar_buffer[20000];
38 | int32_t cigar[5000];
39 | };
40 | };
41 |
// File header head: 8 raw bytes overlaid with a 4-char magic and l_text.
42 | union bam_header {
43 | char c[8];
44 | struct {
45 | char magic[4];
46 | int32_t l_text;
47 | };
48 | };
49 |
// A 4-byte value accessible as raw chars or as an int32.
50 | union stream_int32 {
51 | char c[4];
52 | int32_t i;
53 | };
54 |
// Byte sizes matching the union layouts above (c[8], c[36], cigar_buffer[20000]).
55 | static const int BAM_HEADER_BYTES = 8;
56 | static const int BAM_READ_CORE_BYTES = 36;
57 | static const int BAM_READ_CORE_MAX_CIGAR = 20000;
58 |
59 | FragmentBlocks oBlocks;
60 |
// Callbacks added through the registerCallback* methods declared below.
61 | std::vector< std::function &)> > callbacksChrMappingChange;
62 | std::vector< std::function > callbacksProcessBlocks;
63 |
64 | // Statistics.
65 | ulong cShortPairs;
66 | ulong cIntersectPairs;
67 | ulong cLongPairs;
68 | ulong cSingleReads;
69 | ulong cPairedReads;
70 | ulong cErrorReads;
71 | ulong cSkippedReads;
72 | uint64_t totalNucleotides;
73 | std::map skippedReason;
74 |
// NOTE(review): tmp_reads presumably holds reads waiting for their mate (see
// setMate/saveMate) -- confirm against the implementation.
75 | std::map> tmp_reads;
76 | bam_read_core tmp_read;
77 | bam_read_core tmp_mate;
78 | uint64_t current_read=0;
79 |
80 | bool getNextReadHead(bam_read_core &);
81 | void errorMessage();
82 | void getReadBody(bam_read_core &);
83 | void handlePairs(bam_read_core &, bam_read_core &);
84 | std::string getName(bam_read_core &);
85 | void setMate(std::vector & mate);
86 | void saveMate();
87 | std::istream * IN;
88 | std::istream instream;
89 | void cigar2block(int32_t * cigar, uint16_t n_cigar_op, std::vector &starts, std::vector &lens, int &ret_genome_len);
90 |
91 | unsigned int processPair(bam_read_core * read1, bam_read_core * read2);
92 | unsigned int processSingle(bam_read_core * read1);
93 |
94 | std::vector stream_buffer;
95 | void fillBuffer();
96 | std::ifstream file;
97 | boost::iostreams::filtering_streambuf inbuf;
98 | bool coord_sorted=false;
99 | public:
100 | BAM2blocks();
// Input can be supplied as an already-open stream or as a file name.
101 | void openFile(std::istream * _IN);
102 | void openFile(std::string in_file);
103 | void readBamHeader(); // implied by openFile. So perhaps should be private.
104 | int processAll();
105 |
106 | void registerCallbackChrMappingChange( std::function &)> callback );
107 | void registerCallbackProcessBlocks( std::function callback );
108 |
109 | std::string samHeader;
110 | std::vector chr_names; //tab terminated chromosome names.
111 | std::vector chr_lens; //length of each chromosome (not used when reading, used if optionally outputting an altered BAM file)
112 | };
113 |
114 |
115 | #endif
116 |
--------------------------------------------------------------------------------
/src/irfinder/src/Blocks/CoverageBlock.cpp:
--------------------------------------------------------------------------------
1 | #include "CoverageBlock.h"
2 | // using namespace std;
3 |
4 | CoverageBlock::CoverageBlock(uint start, uint end) {
5 | blockStart = start;
6 | blockEnd = end;
7 | firstDepth[0] = 0;
8 | firstDepth[1] = 0;
9 | blockExtents = NULL;
10 | blockExtentsL = NULL;
11 | }
12 |
13 | //direction -- 0=False/Neg, 1=True/Pos.
14 | void CoverageBlock::RecordCover(uint readStart, uint readEnd, bool dir) {
15 |
16 | if (readStart <= blockStart && readEnd > blockStart) {
17 | firstDepth[dir]++;
18 | } else if (readStart < blockEnd) {
19 | // Need to increment the starts vector.
20 | uint inc_index = readStart - blockStart - 1;
21 | if (blockExtentsL) { //already an int vector
22 | blockExtentsL->at(inc_index).start[dir]++;
23 | } else if (!blockExtents) { //don't have a char vector either - create first.
24 | blockExtents = new std::vector(vectorLen());
25 | blockExtents->at(inc_index).start[dir]++;
26 | } else {
27 | if (blockExtents->at(inc_index).start[dir] == 254) {
28 | blockExtentsL = new std::vector(
29 | blockExtents->begin(), blockExtents->end());
30 | delete blockExtents;
31 | blockExtents = NULL;
32 | blockExtentsL->at(inc_index).start[dir]++;
33 | } else {
34 | blockExtents->at(inc_index).start[dir]++;
35 | }
36 | }
37 | } else {
38 | return;
39 | }
40 |
41 | if (readEnd >= blockEnd) {
42 | return;
43 | } else {
44 | // Need to increment the ends vector.
45 | uint inc_index = readEnd - blockStart - 1;
46 |
47 | if (blockExtentsL) { //already an int vector
48 | blockExtentsL->at(inc_index).end[dir]++;
49 | } else if (!blockExtents) { //don't have a char vector either - create first.
50 | blockExtents = new std::vector(vectorLen());
51 | blockExtents->at(inc_index).end[dir]++;
52 | } else {
53 | if (blockExtents->at(inc_index).end[dir] == 254) {
54 | blockExtentsL = new std::vector(
55 | blockExtents->begin(), blockExtents->end());
56 | delete blockExtents;
57 | blockExtents = NULL;
58 | blockExtentsL->at(inc_index).end[dir]++;
59 | } else {
60 | blockExtents->at(inc_index).end[dir]++;
61 | }
62 | }
63 | }
64 | // Can Throw: Out of range exception.
65 | }
66 |
67 | void CoverageBlock::updateCoverageHist(std::map &hist, uint start,
68 | uint end) const {
69 |
70 | if (!blockExtentsL && !blockExtents) {
71 | // how many bases in this block?
72 | hist[firstDepth[0] + firstDepth[1]] += std::min(blockEnd, end)
73 | - std::max(blockStart, start);
74 | } else {
75 | // There are read starts and ends -- need to walk the positions from the start of this block
76 | // even if not in the region of interest.
77 |
78 | //special handling for the first base -- the one before the vector starts.
79 | uint depth = firstDepth[0] + firstDepth[1];
80 | if (start <= blockStart) {
81 | // use the first depth, before commencing in the vector.
82 | hist[depth]++;
83 | }
84 |
85 | uint startindex = std::max(blockStart + 1, start) - blockStart - 1;
86 | uint endindex = std::min(blockEnd, end) - blockStart - 1;
87 | if (blockExtents) {
88 | for (uint i = 0; i < endindex; i++) {
89 | depth += -(*blockExtents)[i].end[0] - (*blockExtents)[i].end[1]
90 | + (*blockExtents)[i].start[0]
91 | + (*blockExtents)[i].start[1];
92 | if (i >= startindex) {
93 | hist[depth]++;
94 | }
95 | }
96 | } else {
97 | for (uint i = 0; i < endindex; i++) {
98 | depth += -(*blockExtentsL)[i].end[0]
99 | - (*blockExtentsL)[i].end[1]
100 | + (*blockExtentsL)[i].start[0]
101 | + (*blockExtentsL)[i].start[1];
102 | if (i >= startindex) {
103 | hist[depth]++;
104 | }
105 | }
106 | }
107 | // When in the region of interest, update the hist each step.
108 | }
109 | }
110 |
111 | void CoverageBlock::updateCoverageHist(std::map &hist, uint start,
112 | uint end, bool dir) const {
113 | if (!blockExtentsL && !blockExtents) {
114 | // how many bases in this block?
115 | hist[firstDepth[dir]] += std::min(blockEnd, end)
116 | - std::max(blockStart, start);
117 | } else {
118 | //special handling for the first base -- the one before the vector starts.
119 | uint depth = firstDepth[dir];
120 | if (start <= blockStart) {
121 | // use the first depth, before commencing in the vector.
122 | hist[depth]++;
123 | }
124 |
125 | uint startindex = std::max(blockStart + 1, start) - blockStart - 1;
126 | uint endindex = std::min(blockEnd, end) - blockStart - 1;
127 | if (blockExtents) {
128 | for (uint i = 0; i < endindex; i++) {
129 | depth += -(*blockExtents)[i].end[dir]
130 | + (*blockExtents)[i].start[dir];
131 | if (i >= startindex) {
132 | hist[depth]++;
133 | }
134 | }
135 | } else {
136 | for (uint i = 0; i < endindex; i++) {
137 | depth += -(*blockExtentsL)[i].end[dir]
138 | + (*blockExtentsL)[i].start[dir];
139 | if (i >= startindex) {
140 | hist[depth]++;
141 | }
142 | }
143 | }
144 | }
145 | }
146 |
147 | void CoverageBlock::updateCoverageArray(std::vector &arr,
148 | std::vector &covered, uint start, uint end) const {
149 | uint depth = firstDepth[0] + firstDepth[1],
150 | startindex = std::max( blockStart, start-1) - blockStart,
151 | endindex = std::min(blockEnd, end) - blockStart,
152 | startarray = std::max(blockStart+1, start ) - start ,
153 | endarray = std::min(blockEnd, end) - start ;
154 |
155 | if (!blockExtentsL && !blockExtents) {
156 | for (uint i = startindex; i < endindex && startarray < endarray;
157 | i++, startarray++) {
158 | arr[startarray] += depth;
159 | covered[startarray] = true;
160 | }
161 | } else {
162 | // There are read starts and ends -- need to walk the positions from the start of this block
163 | // even if not in the region of interest.
164 | if (blockExtents) {
165 | for (uint i = 0; i < endindex && startarray < endarray; i++) {
166 | depth += -(*blockExtents)[i].end[0] - (*blockExtents)[i].end[1]
167 | + (*blockExtents)[i].start[0]
168 | + (*blockExtents)[i].start[1];
169 | if (i >= startindex) {
170 | arr[startarray] += depth;
171 | covered[startarray] = true;
172 | startarray++;
173 | }
174 | }
175 | } else {
176 | for (uint i = 0; i < endindex && startarray < endarray; i++) {
177 | depth += -(*blockExtentsL)[i].end[0]
178 | - (*blockExtentsL)[i].end[1]
179 | + (*blockExtentsL)[i].start[0]
180 | + (*blockExtentsL)[i].start[1];
181 | if (i >= startindex) {
182 | arr[startarray] += depth;
183 | covered[startarray] = true;
184 | startarray++;
185 | }
186 | }
187 | }
188 | // When in the region of interest, update the hist each step.
189 | }
190 | }
191 |
192 | void CoverageBlock::updateCoverageArray(std::vector &arr,
193 | std::vector &covered, uint start, uint end, bool dir) const {
194 |
195 | uint depth = firstDepth[0] + firstDepth[1],
196 | startindex = std::max( blockStart, start-1) - blockStart,
197 | endindex = std::min(blockEnd, end) - blockStart,
198 | startarray = std::max(blockStart+1, start ) - start ,
199 | endarray = std::min(blockEnd, end) - start ;
200 | if (!blockExtentsL && !blockExtents) {
201 | for (uint i = startindex; i < endindex && startarray < endarray;
202 | i++, startarray++) {
203 | arr[startarray] += depth;
204 | covered[startarray] = true;
205 | }
206 | } else {
207 | // There are read starts and ends -- need to walk the positions from the start of this block
208 | // even if not in the region of interest.
209 | if (blockExtents) {
210 | for (uint i = 0; i < endindex && startarray < endarray; i++) {
211 | depth += -(*blockExtents)[i].end[dir]
212 | + (*blockExtents)[i].start[dir];
213 | if (i >= startindex) {
214 | arr[startarray] += depth;
215 | covered[startarray] = true;
216 | startarray++;
217 | }
218 | }
219 | } else {
220 | for (uint i = 0; i < endindex && startarray < endarray; i++) {
221 | depth += -(*blockExtentsL)[i].end[dir]
222 | + (*blockExtentsL)[i].start[dir];
223 | if (i >= startindex) {
224 | arr[startarray] += depth;
225 | covered[startarray] = true;
226 | startarray++;
227 | }
228 | }
229 | }
230 | // When in the region of interest, update the hist each step.
231 | }
232 | }
233 |
234 | void CoverageBlock::print(std::ostream &os) const {
235 | os << "Coverage block " << blockStart << " - " << blockEnd << "\n";
236 | os << "First depth 0 : " << firstDepth[0] << "\n";
237 | os << "First depth 1 : " << firstDepth[0] << "\n";
238 | uint i=0;
239 | if (blockExtents) {
240 | os << "BlockExtents: \n";
241 | for (auto &a : (*blockExtents)) {
242 | os << i+blockStart << " " << (uint) a.start[0] << ":" << (uint) a.start[1] << " - " << (uint) a.end[0] << ":"<< (uint) a.end[1] << "\n";
243 | i++;
244 | }
245 | }
246 | if (blockExtentsL) {
247 | os << "BlockExtentsL: \n";
248 | for (auto &a : (*blockExtentsL)) {
249 | os << i+blockStart << " " << (uint) a.start[0] << ":" << (uint) a.start[1] << " - " << (uint) a.end[0] << ":"<< (uint) a.end[1] << "\n";
250 | i++;
251 | }
252 | }
253 | }
254 |
// Stream insertion for CoverageBlock: forwards to cb.print(os) and returns os
// so that insertions can be chained.
255 | std::ostream& operator<<(std::ostream &os, const CoverageBlock &cb) {
256 | cb.print(os);
257 | return os;
258 | }
259 |
--------------------------------------------------------------------------------
/src/irfinder/src/Blocks/CoverageBlock.h:
--------------------------------------------------------------------------------
1 | #ifndef CODE_COVERAGEBLOCK
2 | #define CODE_COVERAGEBLOCK
3 |
4 | #include "../Utils/includedefine.h"
5 |
// Per-position tallies of read starts and ends, one slot per strand
// (index 0 and 1), stored in single bytes.
class start_stops {
	public:
		unsigned char start[2];
		unsigned char end[2];

		// All counters begin at zero.
		start_stops() {
			start[0]=0;
			start[1]=0;
			end[0]=0;
			end[1]=0;
		};
};

// Same tallies as start_stops, held in unsigned ints.
class start_stopsL {
	public:
		unsigned int start[2];
		unsigned int end[2];

		// All counters begin at zero.
		start_stopsL() {
			start[0]=0;
			start[1]=0;
			end[0]=0;
			end[1]=0;
		};
		// Widening conversion from the byte-sized form. Left implicit on
		// purpose so a container of start_stops can be copied element-wise
		// into a container of start_stopsL.
		start_stopsL(const start_stops &copy) {
			start[0]=copy.start[0];
			start[1]=copy.start[1];
			end[0]=copy.end[0];
			end[1]=copy.end[1];
		};

};
38 |
39 |
// Coverage bookkeeping for one region blockStart..blockEnd: an initial depth
// per strand slot plus, once needed, a vector of per-position start/end
// tallies (see RecordCover in CoverageBlock.cpp).
40 | class CoverageBlock {
41 | private:
42 | uint blockStart;
43 | uint blockEnd;
// Depth at the first base of the block, indexed by direction (0 or 1).
44 | uint firstDepth[2];
// At most one of these two is non-NULL at a time; RecordCover switches from
// blockExtents to blockExtentsL when a counter reaches 254.
// NOTE(review): no destructor is declared here although RecordCover allocates
// these with new -- confirm who releases them.
45 | std::vector* blockExtents;
46 | std::vector* blockExtentsL;
47 |
// Length used when allocating blockExtents: one less than the block length.
48 | inline uint vectorLen() {
49 | return (blockEnd - blockStart - 1);
50 | };
51 | public:
52 | uint getLength() { return blockEnd - blockStart;}
53 | uint getStart(){return blockStart;}
54 | uint getEnd(){return blockEnd;}
55 | CoverageBlock(uint start, uint end);
56 | void RecordCover(uint start, uint end, bool dir);
57 | //RetrieveCover(..);
58 | void print(std::ostream& os) const;
59 |
60 | //First form, non-directional. Second form, directional with "dir" specifiying whether sense or anti-sense.
61 | void updateCoverageHist(std::map &hist, uint start, uint end) const;
62 | void updateCoverageHist(std::map &hist, uint start, uint end, bool dir) const;
63 | void updateCoverageArray(std::vector &arr,std::vector & covered, uint start, uint end) const;
64 | void updateCoverageArray(std::vector &arr,std::vector & covered, uint start, uint end, bool dir) const;
65 | inline bool posIsAfterStart(const uint &compareval) const {
66 | return (compareval > blockStart);
67 | };
68 |
// All three orderings below compare on blockEnd only.
69 | // http://www.learncpp.com/cpp-tutorial/94-overloading-the-comparison-operators/
70 | // http://en.cppreference.com/w/cpp/language/operator_comparison
71 | inline bool operator<(const CoverageBlock &b) const {
72 | return (blockEnd < b.blockEnd);
73 | };
74 | inline bool operator<(const uint &b) const {
75 | return (blockEnd < b); //a is the object.
76 | };
77 | friend inline bool operator<(const uint &a, const CoverageBlock &b) {
78 | return (a < b.blockEnd); //a is a uint.
79 | };
80 | };
81 |
82 | std::ostream& operator<<( std::ostream& os, const CoverageBlock& cb);
83 |
84 | #endif
85 |
--------------------------------------------------------------------------------
/src/irfinder/src/Blocks/FragmentBlocks.cpp:
--------------------------------------------------------------------------------
1 | #include "FragmentBlocks.h"
2 |
3 |
4 | // This class is an information storage container only -- pretty much a struct.
5 | // It allows all the relevant information relating to an interpreted fragment to be passed
6 | // to the variety of callback watchers that require fragment blocks to update their stats.
7 |
8 | FragmentBlocks::FragmentBlocks() {
9 | rStarts[0].reserve(initial_alloc);
10 | rLens[0].reserve(initial_alloc);
11 | rStarts[1].reserve(initial_alloc);
12 | rLens[1].reserve(initial_alloc);
13 | readName.reserve(max_read_name);
14 | readCount = 0;
15 | }
16 |
17 | // Return a string representation of the Chromosome name.
// Looks up chr_id in the mapping last supplied through ChrMapUpdate. The
// lookup uses at(), so an out-of-range chr_id raises std::out_of_range.
18 | const std::string FragmentBlocks::chrName() const {
19 | return chr_names.at(chr_id);
20 | }
21 |
22 | // Update the internal data structure with a new mapping between Chromosome ID# and Chromosome name (string).
// Replaces the stored mapping with a copy of chrmap; chrName() indexes this
// copy by chr_id.
23 | void FragmentBlocks::ChrMapUpdate(const std::vector &chrmap) {
24 | chr_names = chrmap;
25 | }
26 |
--------------------------------------------------------------------------------
/src/irfinder/src/Blocks/FragmentBlocks.h:
--------------------------------------------------------------------------------
1 | #ifndef CODE_FRAGMENTBLOCKS
2 | #define CODE_FRAGMENTBLOCKS
3 |
4 | #include "../Utils/includedefine.h"
5 |
6 | /* A class to store up to 2 reads belonging to a single fragment.
7 | * It is a storage class, almost a struct, it does not perform processing itself.
8 | * Read1 is always valid.
9 | * Read2 is only valid if readCount == 2.
10 | *
11 | * There may only be a single read if:
12 | * - the sequencing is single end rather than paired end..
13 | * - the sequencing is paired end, but the two reads overlapped and have been combined
14 | * into a single synthetic read / block of coverage.
15 | */
16 | class FragmentBlocks {
17 | private:
// Capacities reserved by the constructor for the block vectors and readName.
18 | static const int initial_alloc = 100;
19 | static const int max_read_name = 300;
// Set by ChrMapUpdate() and read by chrName().
20 | std::vector chr_names; //TODO - this is currently unused??
21 | public:
22 | FragmentBlocks();
23 | const std::string chrName() const;
24 | void ChrMapUpdate(const std::vector& chrmap);
25 |
26 | std::string readName;
// Index 0 is read 1, index 1 is read 2 (only meaningful when readCount == 2).
27 | std::vector rStarts[2];
28 | std::vector rLens[2];
29 | uint readStart[2];
30 | uint readEnd[2];
31 | int readCount;
32 | uint chr_id; // Assumption that both r1 & r2 are on the same chromosome?
33 | // if they aren't we shouldn't process them as a single fragment.
34 | // perhaps a sanity check in pairing, only treat them as a pair
35 | // if the name of the reads matches and the Chr matches.
// NOTE(review): presumably the strand of the fragment, using the same
// 0 = negative / 1 = positive convention as CoverageBlock -- confirm.
36 | bool direction;
37 | };
38 |
39 | #endif
40 |
--------------------------------------------------------------------------------
/src/irfinder/src/ReadBlock/CoverageBlocks.h:
--------------------------------------------------------------------------------
1 | #ifndef CODE_READBLOCKPROCESSOR_COVERAGEBLOCKS
2 | #define CODE_READBLOCKPROCESSOR_COVERAGEBLOCKS
3 |
4 | #include "../Blocks/CoverageBlock.h"
5 | #include "ReadBlockProcessor.h"
6 | #include "../Blocks/FragmentBlocks.h"
7 |
// One region of interest as loaded from the reference: chromosome, name,
// bounds, direction, and the list of blocks that make it up.
// NOTE(review): the element type of blocks is not visible in this listing.
8 | struct BEDrecord {
9 | std::string chrName;
10 | std::string name;
11 | uint start;
12 | uint end;
13 | bool direction;
14 | std::vector> blocks;
15 | };
16 |
17 |
// ReadBlockProcessor that keeps coverage-depth structures per chromosome and
// derives histogram/array summaries from them. Only declarations are visible
// here; the method bodies are defined elsewhere.
18 | class CoverageBlocks : public ReadBlockProcessor {
19 | //Store the Blocked BED record for each ROI/intron. This won't be referred to again until the end.
20 | //XX Create the temporary vectors (per Chr) which simply list the blocks sequentially as read.
21 | //XX Sort the temporary vectors
22 | //XX Build the final vectors of "blocks of interest"
23 | //xx Delete the temporary vectors
24 | //xx Create the parallel vectors with counter objects. (do these as a batch at the end, once vector size is known - for best memory layout)
25 | //xx Process fragments against the counter structure. (have I already written a class/object for this?)
26 |
27 | //Produce summary statistical output for each Blocked BED record, using the counter structure.
28 |
29 | protected:
30 |
31 | // Coverage depth data-structures.
32 | std::map> chrName_CoverageBlocks;
33 | std::map> chrName_FlankCoverageBlocks;
34 | // Shortcut pointers to depth data-structures.
35 | std::vector*> chrID_CoverageBlocks;
36 | std::vector*> chrID_FlankCoverageBlocks;
37 |
38 | // TODO: what is optimal for speed & memory usage?
39 | // static const uint coverage_block_max_length = 5000;
40 | static const uint coverage_block_max_length = 500;
41 |
42 |
43 | std::vector BEDrecords;
// True only when the constructor was given the read type "LR".
44 | bool long_read=false;
// Default 3; changed through setJitter(). Its use is not visible in this header.
45 | int jitter = 3;
46 |
47 | public:
48 | CoverageBlocks(std::string read_type) {
49 | long_read = read_type == "LR";
50 | }
51 | void setJitter(int j){jitter=j;};
52 | void ProcessBlocks(const FragmentBlocks &fragblock);
53 | void ChrMapUpdate(const std::vector &chrmap);
54 | void loadRef(std::istream &IN);
55 | int WriteOutput(std::ostream *os) const;
56 |
// Each helper below comes in two forms: non-directional, and directional with
// a trailing bool direction argument.
57 | void fillHist(std::map &hist, const std::string &chrName, const std::vector> &blocks) const;
58 | void fillHist(std::map &hist, const std::string &chrName, const std::vector> &blocks, bool direction) const;
59 | void getCoverageArray(std::vector &coverages,
60 | std::vector & covered,
61 | const std::string &chrName,
62 | const uint arr_start, const uint arr_end) const;
63 | void getCoverageArray(std::vector &coverages,
64 | std::vector & covered,
65 | const std::string &chrName,
66 | const uint arr_start, const uint arr_end,
67 | bool direction) const;
// Summary statistics computed from a histogram filled by fillHist.
68 | double meanFromHist(const std::map &hist) const;
69 | double coverageFromHist(const std::map &hist) const;
70 | double percentileFromHist(const std::map &hist, uint percentile) const;
71 | double trimmedMeanFromHist(const std::map &hist, uint centerPercent) const;
72 | };
73 |
// CoverageBlocks variant whose WriteOutput also takes a second stream plus
// the JunctionCount and SpansPoint results.
74 | class CoverageBlocksIRFinder : public CoverageBlocks {
75 | private:
// Defaults for the three values replaced together by setAI().
76 | uint AI_warn=0;
77 | uint AI_intron=1;
78 | double AI_ratio=0.05;
79 | public:
80 |
81 | CoverageBlocksIRFinder(std::string read_type) : CoverageBlocks(read_type){
82 | }
83 | void setAI(uint AI_warning_level, uint AI_min_intron_coverage, double AI_IRratio){
84 | AI_warn=AI_warning_level;
85 | AI_intron=AI_min_intron_coverage;
86 | AI_ratio=AI_IRratio;
87 | }
// NOTE(review): declaring WriteOutput here hides the one-argument
// CoverageBlocks::WriteOutput for calls made through this type.
88 | int WriteOutput(std::ostream *os, std::ostream *osAI, const JunctionCount &JC, const SpansPoint &SP, int directionality = 0) const;
89 |
90 | };
91 |
92 |
93 | #endif
94 |
--------------------------------------------------------------------------------
/src/irfinder/src/ReadBlock/ReadBlockProcessor.h:
--------------------------------------------------------------------------------
1 | #ifndef CODE_READBLOCKPROCESSOR
2 | #define CODE_READBLOCKPROCESSOR
3 |
4 | #include "../Blocks/FragmentBlocks.h"
5 |
6 | /*
7 | The code can be finished faster if we force a requirement that all input files are coordinate sorted by the start of each block.
8 | ie: sort -k2,2n (for BED files).
9 | Chromosome sorted or not won't matter, as these get split into different vectors in all cases.
10 | */
11 |
12 |
13 |
14 | class ReadBlockProcessor {
15 | public:
16 | virtual void ProcessBlocks(const FragmentBlocks &fragblock) = 0;
17 | virtual void ChrMapUpdate(const std::vector &chrmap) = 0; //Maybe some of these funcs shouldn't be pure virtual - overloadable if needed, but default often ok.
18 | };
19 |
20 |
// ReadBlockProcessor holding a chromosome-name list and a pointer to an
// output stream supplied through SetOutputStream().
// NOTE(review): presumably writes each fragment as a BED12 line -- the
// implementation is not visible here.
21 | class BED12Output : public ReadBlockProcessor {
22 | private:
23 | std::vector chr_names;
// Stored as a raw pointer; no ownership handling is visible in this header.
24 | std::ostream* out;
25 | public:
26 | void ProcessBlocks(const FragmentBlocks &fragblock);
27 | void ChrMapUpdate(const std::vector &chrmap);
28 | void SetOutputStream(std::ostream *os);
29 | };
30 |
31 |
// ReadBlockProcessor that keeps per-chromosome junction tallies. Each
// chrName_* map has a matching chrID_* vector of pointers into it for access
// by chromosome ID (see the member comments below).
32 | class JunctionCount : public ReadBlockProcessor {
33 | private:
34 | std::map,uint[3]>> chrName_junc_count;
35 | std::vector,uint[3]>*> chrID_junc_count;
36 | //uint[3] - 0, neg strand count; 1, pos strand count; 2 = expected direction from ref: 0=unknown, 1=neg, 2=pos.
37 |
38 | std::map> chrName_juncLeft_count;
39 | std::vector*> chrID_juncLeft_count;
40 |
41 | std::map> chrName_juncRight_count;
42 | std::vector*> chrID_juncRight_count;
43 | //chrID_... stores a fast access pointer to the appropriate structure in chrName_...
44 | public:
45 | void ProcessBlocks(const FragmentBlocks &fragblock);
46 | void ChrMapUpdate(const std::vector &chrmap);
47 | int WriteOutput(std::ostream *os) const;
48 | void loadRef(std::istream &IN); //loadRef is optional, it allows directional detection to determine not just non-dir vs dir, but also which direction.
49 |
50 | int Directional() const;
51 |
// Each lookup exists with and without a trailing direction argument.
52 | uint lookup(std::string ChrName, uint left, uint right, bool direction) const;
53 | uint lookup(std::string ChrName, uint left, uint right) const;
54 | uint lookupLeft(std::string ChrName, uint left, bool direction) const;
55 | uint lookupLeft(std::string ChrName, uint left) const;
56 | uint lookupRight(std::string ChrName, uint right, bool direction) const;
57 | uint lookupRight(std::string ChrName, uint right) const;
58 |
59 | // Ideally we would read the XS junction strand attribute from the BAM if we want to count junctions from non-directional sequencing.
60 | // that will require BAM2blocks to be informed it should read the optional attributes looking for that attrib in that case.
61 | // -- or we can just ignore direction -- the splice start/end information effectively determines the XS info (by ref to the reference)
62 | };
63 |
64 |
65 | class SpansPoint : public ReadBlockProcessor { // ReadBlockProcessor implementation holding per-chromosome positions and two parallel count tables; presumably counts fragments spanning each loaded position -- confirm in the .cpp.
66 | private:
67 | std::map> chrName_pos; // Per-chromosome positions; presumably filled by loadRef -- confirm.
68 | std::map> chrName_count[2]; // Two count maps; presumably one per strand, matching lookup's bool direction -- confirm.
69 | std::vector*> chrID_pos;
70 | std::vector*> chrID_count[2];
71 | char overhangLeft; // NOTE(review): stored as char while setSpanLength takes uint; if the argument is stored here, large values would not fit -- confirm.
72 | char overhangRight; // Same char-vs-uint note as overhangLeft.
73 | char overhangTotal; // Presumably overhangLeft + overhangRight -- confirm in the .cpp.
74 | //chrID_... stores a fast access pointer to the appropriate structure in chrName_...
75 | public:
76 | void setSpanLength(uint overhang_left, uint overhang_right); // Presumably sets the overhang fields above -- confirm in the .cpp.
77 | void loadRef(std::istream &IN); // Reads reference data from IN; the expected format is not shown in this header.
78 | void ProcessBlocks(const FragmentBlocks &fragblock); // Implements ReadBlockProcessor::ProcessBlocks.
79 | void ChrMapUpdate(const std::vector &chrmap); // Implements ReadBlockProcessor::ChrMapUpdate.
80 | //void SetOutputStream(std::ostream *os);
81 | int WriteOutput(std::ostream *os) const; // NOTE(review): meaning of the int return is defined in the implementation -- confirm there.
82 | uint lookup(std::string ChrName, uint pos, bool direction) const; // Lookup for position pos on ChrName, restricted by direction.
83 | uint lookup(std::string ChrName, uint pos) const; // Same lookup without a direction argument; presumably combines both count tables -- confirm.
84 | };
85 |
86 | class FragmentsInChr : public ReadBlockProcessor { // ReadBlockProcessor implementation; purpose is given by the comment on the next line.
87 | // Counts the number of fragments in each Chromosome. (for both + & - strands).
88 | private:
89 | std::map> chrName_count; //only expecting 2 items in our vector.
90 | std::vector*> chrID_count; // Pointer-per-chromosome view of chrName_count, following the chrID_/chrName_ pairing used by the other processors in this header.
91 | public:
92 | void ProcessBlocks(const FragmentBlocks &blocks); // Implements ReadBlockProcessor::ProcessBlocks.
93 | void ChrMapUpdate(const std::vector &chrmap); // Implements ReadBlockProcessor::ChrMapUpdate.
94 | int WriteOutput(std::ostream *os) const; // NOTE(review): meaning of the int return is defined in the implementation -- confirm there.
95 | };
96 |
97 |
98 | class FragmentsInROI : public ReadBlockProcessor { // ReadBlockProcessor implementation; purpose is given by the two comment lines that follow.
99 | // Counts the number of fragments fully contained within a ROI.
100 | // the ROIs may not overlap. Direction ignored for overlap detect.
101 | private:
102 | std::map RegionID_counter[2]; // Two maps of per-region counters; presumably one per strand -- confirm in the .cpp.
103 |
104 | std::map>> chrName_ROI; // Per-chromosome ROI storage; presumably filled by loadRef -- confirm.
105 | std::map> chrName_count[2]; // Two per-chromosome count maps, parallel to RegionID_counter[2].
106 |
107 | std::vector>*> chrID_ROI; // Pointer-per-chromosome view of chrName_ROI, following the chrID_/chrName_ pairing used elsewhere in this header.
108 | std::vector*> chrID_count[2]; // Pointer-per-chromosome views of chrName_count[0] and [1].
109 |
110 | // Perhaps we want to store some text relating to each record too? Easy to do if the input is pre-sorted (at least within each Chr).
111 | // if pre-sorted, it may be easier to check for no overlapping blocks on read .. or can do this immediately after read with a single nested-walk.
112 | std::map> chrName_ROI_text; // Per-chromosome text storage; whether it is populated is decided in the implementation (see the "Perhaps" note above).
113 | public:
114 | void ProcessBlocks(const FragmentBlocks &blocks); // Implements ReadBlockProcessor::ProcessBlocks.
115 | void ChrMapUpdate(const std::vector &chrmap); // Implements ReadBlockProcessor::ChrMapUpdate.
116 | void loadRef(std::istream &IN); // Reads the ROI definitions from IN; the expected format is not shown in this header.
117 | int WriteOutput(std::ostream *os) const; // NOTE(review): meaning of the int return is defined in the implementation -- confirm there.
118 | };
119 |
120 |
121 | /*
122 | class CoverageBlocks : public ReadBlockProcessor { ... }
123 | // In its own file -- bigger code.
124 | */
125 |
126 | #endif
127 |
--------------------------------------------------------------------------------
/src/irfinder/src/Utils/crc32.h:
--------------------------------------------------------------------------------
1 | // //////////////////////////////////////////////////////////
2 | // crc32.h
3 | // Copyright (c) 2014 Stephan Brumme. All rights reserved.
4 | // see http://create.stephan-brumme.com/disclaimer.html
5 | //
6 |
7 | #pragma once
8 |
9 | //#include "hash.h"
10 | #include
11 |
12 | // define fixed size integer types
13 | #ifdef _MSC_VER
14 | // Windows
15 | typedef unsigned __int8 uint8_t;
16 | typedef unsigned __int32 uint32_t;
17 | #else
18 | // GCC
19 | #include
20 | #endif
21 |
22 |
23 | /// compute CRC32 hash, based on Intel's Slicing-by-8 algorithm
24 | /** Usage:
25 | CRC32 crc32;
26 | std::string myHash = crc32("Hello World"); // std::string
27 | std::string myHash2 = crc32("How are you", 11); // arbitrary data, 11 bytes
28 |
29 | // or in a streaming fashion:
30 |
31 | CRC32 crc32;
32 | while (more data available)
33 | crc32.add(pointer to fresh data, number of new bytes);
34 | std::string myHash3 = crc32.getHash();
35 | */
36 | class CRC32 //: public Hash
37 | { // Holds a single 32-bit running hash (m_hash); usable one-shot via operator() or incrementally via add() followed by getHash()/getRawHash().
38 | public:
39 | /// same as reset()
40 | CRC32();
41 |
42 | /// compute CRC32 of a memory block
43 | std::string operator()(const void* data, size_t numBytes);
44 | /// compute CRC32 of a string, excluding final zero
45 | std::string operator()(const std::string& text);
46 |
47 | /// add arbitrary number of bytes
48 | void add(const void* data, size_t numBytes);
49 |
50 | /// return latest hash as a hex string (NOTE(review): this comment previously said 16 hex characters; a 32-bit value is 8 hex digits -- confirm against the implementation)
51 | std::string getHash();
52 |
53 | /// return latest hash as a raw 32 bit integer
54 | uint32_t getRawHash();
55 |
56 | /// restart
57 | void reset();
58 |
59 | private:
60 | /// hash
61 | uint32_t m_hash;
62 | };
63 |
--------------------------------------------------------------------------------
/src/irfinder/src/Utils/includedefine.h:
--------------------------------------------------------------------------------
1 | #ifndef INCLUDEDEFINE_DEF
2 | #define INCLUDEDEFINE_DEF
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include