├── .gitignore ├── .project ├── .pydevproject ├── .settings └── org.eclipse.core.resources.prefs ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── README.md ├── REF ├── Mapabilities │ └── hg38 │ │ ├── MapabilityExclusion.100bp.bed.gz │ │ ├── MapabilityExclusion.150.bed.gz │ │ └── MapabilityExclusion.70bp.bed.gz └── extra-input-files │ ├── Human_hg19_nonPolyA_ROI.bed │ ├── Human_hg19_wgEncodeDacMapabilityConsensusExcludable.bed.gz │ ├── Human_hg38_nonPolyA_ROI.bed │ ├── Mouse_mm10_nonPolyA_ROI.bed │ ├── Mouse_mm9_nonPolyA_ROI.bed │ ├── RNA.SpikeIn.ERCC.fasta.gz │ └── URLs ├── bin ├── AdaptorDetect.pl ├── DESeq2Constructor.R ├── IRFinder ├── IRFinderBAM ├── IRFinderBuildRef ├── IRFinderBuildRefDownload ├── IRFinderBuildRefFromSTARRef ├── IRFinderBuildRefProcess ├── IRFinderDiff ├── IRFinderFastQ ├── IRFinderLong ├── TrimBAM4IGV ├── analysisWithLowReplicates.pl ├── analysisWithNoReplicates.pl └── util │ ├── Build-BED-refs.sh │ ├── IRFinder-BuildRefFromEnsembl │ ├── IntronExclusion.pl │ ├── Mapability │ ├── adjust.R │ ├── bash_utils.sh │ ├── bed-to-intron+exon.pl │ ├── deseq2.R │ ├── generateReadsError.pl │ ├── gtf2bed-custom.pl │ ├── irfinder │ ├── irfinder_cnn │ ├── model │ ├── best_model.h5 │ ├── best_model.tflite │ └── model_info.json │ ├── trim │ ├── warnings │ └── winflat ├── install.sh └── src ├── cnnfilter ├── cnnfilter │ ├── actions │ │ ├── extract.py │ │ ├── models.py │ │ ├── resultgraph.py │ │ └── selectclass.py │ ├── main.py │ ├── model │ │ ├── best_model.h5 │ │ └── model_info.json │ └── utils │ │ └── reader.py └── testCNN │ ├── actions │ ├── extract.py │ └── models.py │ ├── irfinder_cnn.py │ ├── model │ ├── best_model.h5 │ └── model_info.json │ └── utils │ └── reader.py ├── irfinder ├── .cproject ├── .project ├── .settings │ ├── language.settings.xml │ └── org.eclipse.cdt.core.prefs ├── Release │ ├── makefile │ ├── objects.mk │ ├── sources.mk │ └── src │ │ ├── Blocks │ │ └── subdir.mk │ │ ├── ReadBlock │ │ └── subdir.mk │ │ ├── Utils │ │ └── subdir.mk │ │ 
└── subdir.mk └── src │ ├── Blocks │ ├── BAM2blocks.cpp │ ├── BAM2blocks.h │ ├── CoverageBlock.cpp │ ├── CoverageBlock.h │ ├── FragmentBlocks.cpp │ └── FragmentBlocks.h │ ├── IRFinder2.cpp │ ├── ReadBlock │ ├── CoverageBlocks.cpp │ ├── CoverageBlocks.h │ ├── ReadBlockProcessor.cpp │ └── ReadBlockProcessor.h │ └── Utils │ ├── crc32.cpp │ ├── crc32.h │ └── includedefine.h ├── trim ├── Makefile ├── TrimReads.cpp ├── TrimReads.h ├── includedefine.h ├── sequenceTools.cpp ├── sequenceTools.h └── trim.cpp └── winflat ├── Makefile ├── README ├── runtest.sh ├── winflat └── winflat_with_beta.c /.gitignore: -------------------------------------------------------------------------------- 1 | img/ 2 | docker_routine.sh 3 | src/irfinder/Release/irfinder 4 | src/irfinder/Release/**/*.o 5 | src/irfinder/Release/**/*.d 6 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | IRFinder 4 | 5 | 6 | 7 | 8 | 9 | org.python.pydev.PyDevBuilder 10 | 11 | 12 | 13 | 14 | 15 | org.python.pydev.pythonNature 16 | 17 | 18 | -------------------------------------------------------------------------------- /.pydevproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | python3 5 | 6 | python interpreter 7 | 8 | 9 | -------------------------------------------------------------------------------- /.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/cnnfilter/cnnfilter/main.py=utf-8 3 | encoding//src/cnnfilter/testCNN/irfinder_cnn.py=utf-8 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 2 | # IRFinder Changelogs 3 | 4 | **2.0.0** 5 | 1. **Novelties** 6 | 1. 
New **Long** RunMode to process fast[q|a] files from long reads 7 | using Minimap2 as aligner. 8 | 2. New **-l** argument in **BAM** RunMode, to process long reads using an alternative algorithm. More information in the paper. 9 | 3. New **AI** process that uses a CNN model to detect false IR events on introns without warning in the last column of the result `IRFinder-IR-[non]dir.txt` file. It will generate a file containing only validated introns ( `IRFinder-IR-[non]dir-val.txt` ) 10 | 4. New **Diff** RunMode that uses SUPPA2 ( https://github.com/comprna/SUPPA ) or DESeq2 algorithm to identify differential IR events. 11 | 5. New **CLI** with dedicated help for each RunMode and a verbose mode. 12 | 6. New **installation script**, to check the dependencies and install or uninstall IRFinder globally and locally. 13 | 14 | 7. **Docker** and **Singularity** images available, based on Ubuntu 18 LTS ( bionic ) and containing IRFinder and all its dependencies ( latest versions of STAR, Minimap2 and SUPPA2). 15 | 16 | 2. **Major changes** ( can impact the results between different versions ) 17 | 1. **NonUniformIntronCover** warning threshold simplified: now it uses the 25th/50th/75th percentile of intronic depth. Changed from: 18 | `` 19 | (max(Column13, Column14) > 2 + Column9 && max(Column13, Column14) > Column9 * 1.5 ) || (min(Column13, Column14) + 2 < Column9 && min(Column13, Column14)*1.5 < Column9 ) 20 | `` 21 | to 22 | `` Column12-Column10 > Column11 `` 23 | 2. **Default Mapability read length** is now 100 instead of 70. It is no longer hard-coded and can be changed with the argument **-n** in the RunModes *BuildRef[Process|FromSTARRef]* 24 | 3. **Paired reads with one mate unmapped** are now processed as single reads instead of being removed. 25 | 26 | 3. **Minor changes** ( no impact on the results but improves the usability) 27 | 1. The mapability file can be given as argument **-M** in the RunModes *BuildRef[Process|FromSTARRef]*.
Precomputed mapabilities for hg38 are available under the git subdirectory */REF/Mapabilities/hg38/* for different read lengths ( 70, 100 and 150). This will drastically reduce the time to build the IRFinder reference. 28 | 2. New argument **-l** in *BuildRefFromSTARRef* to create a **l**ink to the existing reference STAR folder, the genome file and the annotation file, instead of copying them. This will save disk space in case of multiple IRFinder reference directories using the same STAR reference. 29 | 30 | **1.3.1** 31 | 1. IRFinder now exits immediately after error, instead of trying to complete the remaining processes. 32 | 2. Improved Perl version detection during Phase 3 of reference preparation. 33 | 34 | **1.3.0** 35 | New features: 36 | 1. New `BuildRefFromSTARRef` mode. This allows users to use an existing STAR reference to build IRFinder reference, which significantly reduces the total preparation time. This new mode also tries to automatically figure out the original FASTA and GTF files used to generate the existing STAR reference. Call `IRFinder -h` for more details. 37 | 2. `BuildRef` and `BuildRefProcess` mode now support `-j` option to parse an integer that changes the default value of `--sjdbOverhang` argument in STAR. 38 | 3. `FASTQ` mode now supports `-y` option to feed extra STAR arguments to control alignment behaviors. 39 | 40 | Improvements: 41 | 1. `FASTQ` mode now outputs a full BAM file in "Unsorted.bam", instead of a BAM file with a trimmed QS column. 42 | 2. IRFinder does not automatically generate "unsorted.frag.bam" to save disk space and to avoid redundancy to "Unsorted.bam". Instead, IRFinder now provides a tool at `bin/TrimBAM4IGV` to generate this kind of trimmed BAM file to facilitate visualization in IGV. 43 | 3. Re-design of standard output information during IRFinder reference preparation. It is now easier to recognize errors that occurred. 44 | 4. Usage information can now be viewed with the `-h` option.
45 | 46 | Bug fixes: 47 | 1. The mapability calculation during the IRFinder reference preparation stage has been re-designed. The previous algorithm encountered buffer size issues when dealing with genomes with a huge number of chromosomes/scaffolds. This has been fixed. Please note, the new algorithm requires `samtools` (>=1.4) executable binary ready in $PATH. 48 | 2. Since Perl 5.28.0, `sort '_mergesort'` is no longer supported. IRFinder now checks the Perl version and uses `sort` functions correspondingly. 49 | 50 | **1.2.6** 51 | 1. IRFinder now keeps introns with the same effective regions as separate entries in the reference. 52 | 2. IRFinder now automatically checks if the reference preparation stage generates empty reference files, which indicates process failure. 53 | 3. The R object generated by the Differential IR Analysis script now includes an additional slot named "MaxSplice", which represents the maximum splice reads at either end of introns. Each value is the maximum value between Column 17 and 18 in the IR quantification output. 54 | 4. During differential IR analysis, values in "MaxSplice" are now used as the denominators in the GLM, instead of using the values of Column 19 in the IR quantification output. This makes the IR ratio in the differential IR analysis more consistent with the values of Column 20 in the IR quantification output. 55 | 5. User manual has been updated. 56 | 57 | **1.2.5** 58 | 1. Headers are now correctly added to output files `IRFinder-IR-dir.txt` and `IRFinder-IR-nondir.txt`. 59 | 60 | **1.2.4** 61 | 1. In the GLM-based method for differential IR comparison, the original matrix for DESeq2 is now made up of IR depth and correct splicing depth. In the previous versions, the latter was the sum of splicing depth and IR depth. This change is supposed to give a smoother dispersion estimation across all introns. 62 | 63 | **1.2.3:** 64 | 1.
IRFinder now supports GTF attribute tags `gene_type` and `transcript_type` in addition to the originally required Ensembl tags `gene_biotype` and `transcript_biotype`. Either of these two pairs is required to correctly build the IRFinder reference. 65 | 66 | **1.2.2:** 67 | 1. In GLM-based differential IR comparison, fixed an error caused by duplicated row names when creating DESeq2 object with a version of DESeq2 later than 1.10. 68 | 69 | **1.2.1:** 70 | 1. Improved the performance of DESeq2-based GLM analysis for differential IR. This new approach should improve the estimation of dispersion. Normal splicing from IRFinder result is now used as a variable in the GLM, instead of using the value of normal splicing as an offset. This approach is adapted from [detection of allele-specific expression](http://rpubs.com/mikelove/ase) from Michael Love. See Wiki page for details. 71 | 2. Updated some out-of-date usage information 72 | 73 | **1.2.0:** 74 | 1. IRFinder is now compatible with GLM-based analysis. This is achieved by passing IRFinder result to DESeq2 using the function in bin/DESeq2Constructor.R. See Wiki page for details 75 | 2. Fixed the conflict with the latest version of "bedtools complement" that used to cause failure in preparing IRFinder reference 76 | 3. Improved memory usage when passing lines to bedtools genomecov. This is also supposed to benefit reference preparation of those genomes with a lot of chromosomes/contigs. Thanks for the smart solution from Andreas @andpet0101. 77 | 4. Specified the gtf file to be downloaded during reference preparation via automatic downloading. Ensembl currently holds several versions of gtf files for the same genome release. This confused IRFinder BuildRefDownload function in the previous version. 78 | 5. Added -v option to print out version number.
79 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG R_VERSION=4.1.2 2 | 3 | FROM rocker/r-ver:${R_VERSION} 4 | 5 | LABEL version=v2.0.1 6 | 7 | 8 | ENV LD_LIBRARY_PATH="/usr/local/lib/:$LD_LIBRARY_PATH" 9 | ENV PYTHONNOUSERSITE="true" 10 | ENV PATH="/Utils/bin/:${PATH}" 11 | 12 | ARG DEBIAN_FRONTEND=noninteractive 13 | 14 | ### All the dependencies 15 | RUN apt-get update && \ 16 | apt-get -y upgrade && \ 17 | export DEBIAN_FRONTEND=noninteractive && \ 18 | apt-get install -qy make build-essential libxml2-dev libcurl4-openssl-dev gcc bedtools samtools git gzip \ 19 | zlib1g gawk libz-dev wget libboost-iostreams-dev python3.6 apt-transport-https software-properties-common \ 20 | python3-pip && \ 21 | apt-get clean && apt-get purge && \ 22 | rm -rf /var/lib/apt/lists/* 23 | 24 | RUN pip3 install -U --no-cache-dir numpy pandas \ 25 | scikit-learn scipy \ 26 | statsmodels 27 | 28 | RUN Rscript -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) { install.packages("BiocManager", force=TRUE) } ; BiocManager::install(c("BiocManager", "tximport", "readr", "RCurl", "DESeq2"), force=TRUE,ask=F, quiet=F)' && \ 29 | Rscript -e 'options(warn=2); installed.packages()' | awk 'BEGIN {v=0} $1=="Version" {v=1; } v==1 && $1 == "DESeq2" { gsub("\"", ""); print $2;v=0 } ' 30 | 31 | 32 | 33 | RUN mkdir -p /Utils/bin/ && \ 34 | cd /Utils/ && \ 35 | git clone https://github.com/alexdobin/STAR.git && \ 36 | cd ./STAR && git checkout tags/2.7.9a && \ 37 | cd ./source && \ 38 | make STAR && \ 39 | ln -s /Utils/STAR/source/STAR /Utils/bin/STAR && \ 40 | cd /Utils && \ 41 | git clone https://github.com/comprna/SUPPA.git && \ 42 | cd ./SUPPA && \ 43 | echo '#!/usr/bin/env python3' > /Utils/SUPPA/suppa.py.tmp && \ 44 | cat /Utils/SUPPA/suppa.py >> /Utils/SUPPA/suppa.py.tmp && \ 45 | mv /Utils/SUPPA/suppa.py.tmp /Utils/SUPPA/suppa.py && \ 46 | chmod +x 
/Utils/SUPPA/suppa.py && \ 47 | ln -s /Utils/SUPPA/suppa.py /Utils/bin/suppa.py 48 | 49 | RUN cd /Utils/ && git clone https://github.com/lh3/minimap2 && \ 50 | cd minimap2 && git checkout tags/v2.3 && make && \ 51 | ln -s /Utils/minimap2/minimap2 /Utils/bin/minimap2 52 | 53 | 54 | ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache 55 | 56 | COPY ./bin /IRFinder/bin 57 | COPY ./REF /IRFinder/REF 58 | COPY ./src /IRFinder/src 59 | COPY ./install.sh /IRFinder/ 60 | RUN cd /IRFinder/ && \ 61 | ./install.sh 62 | 63 | 64 | 65 | 66 | ENTRYPOINT ["IRFinder"] 67 | 68 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 william ritchie 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # IRFinder-S 3 | IRFinder-S is a suite of tools to analyse and explore intron retention events in multiple samples. It comprises: 4 | 5 | - IRFinder : detect intron retention from RNA-Seq experiments. Includes an automatic CNN filter that emulates a visual inspection to validate the events. 6 | - IRBase : visualize and share IRFinder's results. 7 | 8 | To start using IRFinder, read our [wiki user manual.](https://github.com/RitchieLabIGH/IRFinder/wiki) 9 | 10 | [CHANGELOG](CHANGELOG.md) 11 | 12 | IRFinder Version 1 is still available at https://github.com/williamritchie/IRFinder but is no longer maintained. 13 | ## About IRFinder 14 | 15 | IRFinder, developed at the [Center for Genomic Medicine of Massachusetts General Hospital](https://cgm.massgeneral.org/), the [CNRS](http://www.cnrs.com) and the [Centenary Institute](https://www.centenary.org.au), implements an end-to-end analysis of intron retention (IR) from mRNA sequencing data in multiple species. 16 | IRFinder includes alignment via the STAR (for short reads) and minimap2 (for long reads) algorithms, quality controls on the sample analyzed, IR detection, quantification, convolutional neural network based validation and statistical comparison between multiple samples. 17 | IRFinder is capable of estimating IR events with low coverage or low mappability, as confirmed by RT-qPCR. 18 | 19 | 20 | 21 | ## Before Start: Intron Retention Database - [IRBase](http://irbase.igh.cnrs.fr/) 22 | Before diving into the IRFinder package, users might also consider [IRBase](http://irbase.igh.cnrs.fr/). It is a database for human IR inquiry and visualization, based upon pre-calculated IRFinder results from **over 935** publicly available human cell line RNA-Seq samples.
23 | [IRBase](http://irbase.igh.cnrs.fr/) allows users to enquire, visualize and download single-gene IR results in a tissue/cell-type of interest, download transcriptome-wide IR results of a sample of interest, upload their results to compare with the public ones and share them with the community. 24 | 25 | 26 | ## Cite IRFinder 27 | 28 | Lorenzi, C., Barriere, S., Arnold, K. et al. IRFinder-S: a comprehensive suite to discover and explore intron retention. [Genome Biol 22, 307 (2021)](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02515-8). doi: [10.1186/s13059-021-02515-8](https://doi.org/10.1186/s13059-021-02515-8) 29 | 30 | Middleton R*, Gao D*, Thomas A, Singh B, Au A, Wong JJ, Bomane A, Cosson B, Eyras E, Rasko JE, Ritchie W. **IRFinder: assessing the impact of intron retention on mammalian gene expression**. [Genome Biol. 2017 Mar 15;18(1):51](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-017-1184-4). doi: 10.1186/s13059-017-1184-4. [PubMed PMID: 28298237](https://www.ncbi.nlm.nih.gov/pubmed/28298237).
31 | 32 | -------------------------------------------------------------------------------- /REF/Mapabilities/hg38/MapabilityExclusion.100bp.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/REF/Mapabilities/hg38/MapabilityExclusion.100bp.bed.gz -------------------------------------------------------------------------------- /REF/Mapabilities/hg38/MapabilityExclusion.150.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/REF/Mapabilities/hg38/MapabilityExclusion.150.bed.gz -------------------------------------------------------------------------------- /REF/Mapabilities/hg38/MapabilityExclusion.70bp.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/REF/Mapabilities/hg38/MapabilityExclusion.70bp.bed.gz -------------------------------------------------------------------------------- /REF/extra-input-files/Human_hg19_nonPolyA_ROI.bed: -------------------------------------------------------------------------------- 1 | 1 28160912 28161077 2 | 1 28975112 28975245 3 | 1 32674695 32681797 4 | 1 45241536 45241615 5 | 1 45242162 45242265 6 | 1 109642815 109643241 7 | 1 149754245 149783928 8 | 1 149784826 149785236 9 | 1 149858525 149858961 10 | 1 149859019 149859466 11 | 1 155895749 155895877 12 | 1 228645065 228645560 13 | 1 228645808 228646259 14 | 1 235291118 235291252 15 | 10 101996913 101997059 16 | 11 811681 811814 17 | 11 2985001 2985123 18 | 11 8705774 8705903 19 | 11 9450320 9450501 20 | 11 10823014 10823155 21 | 11 62432894 62433042 22 | 11 75111435 75111582 23 | 11 75115465 75115610 24 | 11 93464145 93464265 25 | 11 93465527 93465665 26 | 11 93466632 93466763 27 
| 11 93468277 93468402 28 | 12 6619388 6619717 29 | 12 7076500 7076769 30 | 12 14920933 14924065 31 | 12 49048165 49048301 32 | 12 62995531 62997214 33 | 12 98993413 98993661 34 | 12 132515769 132515904 35 | 13 27829538 27829663 36 | 13 45911615 45911744 37 | 14 20811207 20811844 38 | 14 21860309 21860412 39 | 14 21865451 21865560 40 | 14 95999692 95999966 41 | 14 103804186 103804311 42 | 15 66795581 66795652 43 | 16 2205024 2205106 44 | 16 58582403 58582537 45 | 17 7809440 7809578 46 | 17 37009116 37009247 47 | 17 58308877 58309007 48 | 18 51748654 51748782 49 | 19 17973397 17973529 50 | 19 57791419 57804937 51 | 2 86362993 86363129 52 | 2 234184373 234184648 53 | 2 234197322 234197586 54 | 20 17943353 17943589 55 | 20 37053843 37053979 56 | 20 37058313 37058446 57 | 20 37062508 37062641 58 | 20 47895477 47895565 59 | 20 47896856 47896946 60 | 21 33749496 33749631 61 | 3 39449880 39450030 62 | 3 39452545 39452697 63 | 3 160232695 160233024 64 | 3 169482308 169482848 65 | 3 186502585 186502653 66 | 3 186504464 186504641 67 | 3 186505089 186505220 68 | 4 53579416 53579537 69 | 5 82360023 82360156 70 | 5 111497182 111497314 71 | 5 138614470 138614667 72 | 5 140090860 140090958 73 | 5 172447731 172447931 74 | 6 26020718 26021186 75 | 6 26021907 26022278 76 | 6 26027124 26027480 77 | 6 26031817 26032288 78 | 6 26033320 26033796 79 | 6 26043455 26043885 80 | 6 26045639 26046097 81 | 6 26055968 26056699 82 | 6 26115101 26124154 83 | 6 26124373 26139344 84 | 6 26156559 26157343 85 | 6 26158349 26171577 86 | 6 26188938 26189304 87 | 6 26197068 26199521 88 | 6 26199748 26200942 89 | 6 26216428 26216872 90 | 6 26217165 26217711 91 | 6 26225383 26225844 92 | 6 26250370 26250835 93 | 6 26251879 26252303 94 | 6 26271146 26271612 95 | 6 26273144 26273622 96 | 6 27093676 27100541 97 | 6 27100832 27103070 98 | 6 27114861 27115317 99 | 6 27775257 27775709 100 | 6 27777842 27778314 101 | 6 27782112 27782607 102 | 6 27782822 27783267 103 | 6 27805658 27806117 104 | 6 27806323 
27823487 105 | 6 27834570 27835359 106 | 6 27858093 27860963 107 | 6 27861203 27861669 108 | 6 116440086 116479910 109 | 6 160201282 160201413 110 | 7 45143948 45144081 111 | 7 45144505 45144641 112 | 8 99054314 99054445 113 | 8 128959126 128960591 114 | 9 35657748 35658015 115 | 9 95054743 95054875 116 | 9 125796806 125797975 117 | 9 130210780 130210909 118 | -------------------------------------------------------------------------------- /REF/extra-input-files/Human_hg19_wgEncodeDacMapabilityConsensusExcludable.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/REF/extra-input-files/Human_hg19_wgEncodeDacMapabilityConsensusExcludable.bed.gz -------------------------------------------------------------------------------- /REF/extra-input-files/Human_hg38_nonPolyA_ROI.bed: -------------------------------------------------------------------------------- 1 | 1 27834401 27834566 2 | 1 28648600 28648733 3 | 1 32209094 32216196 4 | 1 44775864 44775943 5 | 1 44776490 44776593 6 | 1 109100193 109100619 7 | 1 149782689 149812373 8 | 1 149813271 149813681 9 | 1 149886975 149887411 10 | 1 149887469 149887916 11 | 1 155925958 155926086 12 | 1 228457364 228457859 13 | 1 228458107 228458558 14 | 1 235127803 235127937 15 | 10 100237156 100237302 16 | 11 811681 811814 17 | 11 2963771 2963893 18 | 11 8684227 8684356 19 | 11 9428773 9428954 20 | 11 10801467 10801608 21 | 11 62665422 62665570 22 | 11 75400391 75400538 23 | 11 75404421 75404566 24 | 11 93730979 93731099 25 | 11 93732361 93732499 26 | 11 93733466 93733597 27 | 11 93735111 93735236 28 | 12 6510222 6510551 29 | 12 6967337 6967606 30 | 12 14767999 14771131 31 | 12 48654382 48654518 32 | 12 62601751 62603434 33 | 12 98599635 98599883 34 | 12 132031224 132031359 35 | 13 27255401 27255526 36 | 13 45337480 45337609 37 | 14 20343048 20343685 38 | 14 21392150 21392253 39 | 14 21397292 21397401 
40 | 14 95533355 95533629 41 | 14 103337849 103337974 42 | 15 66503243 66503314 43 | 16 2155023 2155105 44 | 16 58548499 58548633 45 | 17 7906122 7906260 46 | 17 38852863 38852994 47 | 17 60231516 60231646 48 | 18 54222284 54222412 49 | 19 17862588 17862720 50 | 19 57280051 57293569 51 | 2 86135870 86136006 52 | 2 233275727 233276002 53 | 2 233288676 233288940 54 | 20 17962710 17962946 55 | 20 38425195 38425331 56 | 20 38429670 38429803 57 | 20 38433865 38433998 58 | 20 49278940 49279028 59 | 20 49280319 49280409 60 | 21 32377187 32377322 61 | 3 39408389 39408539 62 | 3 39411054 39411206 63 | 3 160514907 160515236 64 | 3 169764520 169765060 65 | 3 186784796 186784864 66 | 3 186786675 186786852 67 | 3 186787300 186787431 68 | 4 52713249 52713370 69 | 5 83064204 83064337 70 | 5 112161485 112161617 71 | 5 139278781 139278978 72 | 5 140711275 140711373 73 | 5 173020728 173020928 74 | 6 26020490 26020958 75 | 6 26021679 26022050 76 | 6 26026896 26027252 77 | 6 26031589 26032060 78 | 6 26033092 26033568 79 | 6 26043227 26043657 80 | 6 26045411 26045869 81 | 6 26055740 26056471 82 | 6 26114873 26123926 83 | 6 26124145 26139116 84 | 6 26156331 26157115 85 | 6 26158121 26171349 86 | 6 26188710 26189076 87 | 6 26196840 26199293 88 | 6 26199520 26200714 89 | 6 26216200 26216644 90 | 6 26216937 26217483 91 | 6 26225155 26225616 92 | 6 26250142 26250607 93 | 6 26251651 26252075 94 | 6 26270918 26271384 95 | 6 26272916 26273394 96 | 6 27125897 27132762 97 | 6 27133053 27135291 98 | 6 27147082 27147538 99 | 6 27807479 27807931 100 | 6 27810064 27810536 101 | 6 27814334 27814829 102 | 6 27815044 27815489 103 | 6 27837880 27838339 104 | 6 27838545 27855709 105 | 6 27866792 27867581 106 | 6 27890315 27893185 107 | 6 27893425 27893891 108 | 6 116118923 116158747 109 | 6 159780250 159780381 110 | 7 45104349 45104482 111 | 7 45104906 45105042 112 | 8 98042086 98042217 113 | 8 127946880 127948345 114 | 9 35657751 35658018 115 | 9 92292461 92292593 116 | 9 123034527 123035696 117 | 9 
127448501 127448630 118 | -------------------------------------------------------------------------------- /REF/extra-input-files/Mouse_mm10_nonPolyA_ROI.bed: -------------------------------------------------------------------------------- 1 | 1 86099026 86111970 2 | 1 87776938 87777209 3 | 1 87784556 87784820 4 | 1 127375131 127375239 5 | 1 133601223 133601355 6 | 10 11959880 11960005 7 | 10 34389981 34397085 8 | 10 91118291 91118536 9 | 10 117183335 117183411 10 | 10 125226464 125328963 11 | 11 6619808 6619940 12 | 11 6620319 6620454 13 | 11 11913621 11913748 14 | 11 20847753 20847881 15 | 11 58948911 58949532 16 | 11 58954685 58956674 17 | 11 69350518 69350661 18 | 11 69666936 69672423 19 | 11 97777527 97782437 20 | 11 99017031 99017152 21 | 11 106501245 106501377 22 | 11 116163910 116164036 23 | 12 16939994 16940115 24 | 12 31258933 31259062 25 | 12 105031075 105031349 26 | 12 111540941 111541067 27 | 13 21715763 21716143 28 | 13 21717628 21718115 29 | 13 21722098 21722478 30 | 13 21735062 21735837 31 | 13 21750194 21750505 32 | 13 21753395 21753907 33 | 13 21754123 21754503 34 | 13 21779883 21780625 35 | 13 21782915 21783397 36 | 13 21786826 21787218 37 | 13 21787461 21789213 38 | 13 21806412 21810199 39 | 13 21810465 21810944 40 | 13 21811746 21812150 41 | 13 21831767 21832196 42 | 13 21833057 21833575 43 | 13 21833743 21837530 44 | 13 22035113 22035643 45 | 13 22035821 22036299 46 | 13 22040816 22041352 47 | 13 22042479 22042949 48 | 13 22043214 22043676 49 | 13 23531044 23531519 50 | 13 23533906 23534304 51 | 13 23535422 23535951 52 | 13 23542924 23543359 53 | 13 23544052 23545055 54 | 13 23551258 23551648 55 | 13 23570517 23571220 56 | 13 23571396 23572013 57 | 13 23573736 23574196 58 | 13 23574381 23574952 59 | 13 23581598 23581990 60 | 13 23583742 23621124 61 | 13 23621755 23622502 62 | 13 23684199 23692488 63 | 13 23738807 23740366 64 | 13 23744973 23745602 65 | 13 23746734 23747202 66 | 13 23751088 23751593 67 | 13 23756937 23757427 68 | 13 23760802 
23761249 69 | 13 23761853 23762386 70 | 13 24811447 24811568 71 | 13 49684301 49684433 72 | 13 51202688 51203065 73 | 13 62136762 62136831 74 | 13 62543434 62543503 75 | 13 74371426 74376566 76 | 13 75905147 75905278 77 | 13 95332618 95332749 78 | 14 11227552 11227992 79 | 14 26497655 26497785 80 | 14 32191854 32192050 81 | 14 52209785 52209894 82 | 14 57333366 57333485 83 | 14 57697809 57697937 84 | 15 71794188 71794365 85 | 15 98519716 98519853 86 | 16 23107444 23114136 87 | 16 43886682 43886814 88 | 16 52105015 52105147 89 | 16 53404086 53404193 90 | 16 54354462 54354593 91 | 16 71663788 71663918 92 | 16 90602496 90602617 93 | 17 12922790 12922917 94 | 17 24528476 24528553 95 | 17 55915403 55915870 96 | 18 33795164 33795295 97 | 18 35557032 35557227 98 | 18 36801866 36802008 99 | 19 8888538 8888685 100 | 19 20033504 20033635 101 | 19 44113979 44114124 102 | 19 46359036 46359144 103 | 19 47170469 47171134 104 | 2 19934863 19934953 105 | 2 32675109 32675245 106 | 2 32963291 32963420 107 | 2 37516332 37520603 108 | 2 38997476 39006168 109 | 2 85420137 85420280 110 | 2 144265979 144266216 111 | 2 158356424 158356554 112 | 2 158358360 158358472 113 | 2 158359798 158359929 114 | 2 158378222 158378354 115 | 2 167063473 167063565 116 | 2 167064998 167065086 117 | 2 171306168 171306431 118 | 3 24333046 24333552 119 | 3 30595346 30595757 120 | 3 69085105 69085447 121 | 3 86100534 86100655 122 | 3 88693930 88694057 123 | 3 96219865 96220308 124 | 3 96220361 96220880 125 | 3 96221121 96223738 126 | 3 96238110 96239127 127 | 3 96261704 96263311 128 | 3 96269721 96279001 129 | 3 96414437 96414859 130 | 3 108554338 108554751 131 | 3 128540372 128540480 132 | 3 150072590 150073620 133 | 4 43492788 43493058 134 | 4 117153827 117156243 135 | 4 129608331 129614257 136 | 4 132270080 132270213 137 | 4 132838383 132838547 138 | 4 134167808 134167895 139 | 5 92429785 92429928 140 | 5 100831281 100831414 141 | 5 110692049 110692181 142 | 5 146832890 146837032 143 | 5 149145724 
149145821 144 | 6 8501236 8501356 145 | 6 39422289 39422419 146 | 6 52639234 52639355 147 | 6 71882557 71882693 148 | 6 124715232 124715502 149 | 6 132656957 132657844 150 | 6 132777179 132778162 151 | 6 136801553 136804431 152 | 7 97521808 97521916 153 | 7 99479563 99479707 154 | 7 99482785 99482932 155 | 7 109519147 109522367 156 | 7 110023210 110023342 157 | 7 110046364 110046547 158 | 7 111076060 111076227 159 | 7 118153480 118153610 160 | 7 127527874 127528003 161 | 7 141447370 141451585 162 | 7 143531394 143531520 163 | 8 13876097 13876226 164 | 8 57549775 57549888 165 | 8 69742862 69774886 166 | 8 70894722 70897443 167 | 8 95746060 95746195 168 | 8 110923116 110923263 169 | 8 121666628 121666706 170 | 9 3352657 3352786 171 | 9 15306214 15312104 172 | 9 15313802 15313932 173 | 9 15314845 15314981 174 | 9 15316489 15316588 175 | 9 64173387 64178562 176 | 9 120128780 120128935 177 | X 35838127 35838401 178 | X 93164902 93164984 179 | X 121308217 121308340 180 | X 156455999 156456095 181 | -------------------------------------------------------------------------------- /REF/extra-input-files/Mouse_mm9_nonPolyA_ROI.bed: -------------------------------------------------------------------------------- 1 | 1 87995601 88008545 2 | 1 89673513 89673784 3 | 1 89681131 89681395 4 | 1 129271708 129271816 5 | 1 135497800 135497932 6 | 10 11679678 11679803 7 | 10 34109787 34116891 8 | 10 90581036 90581281 9 | 10 116620391 116620467 10 | 10 124663520 124766019 11 | 11 6519811 6519943 12 | 11 6520322 6520457 13 | 11 11813624 11813751 14 | 11 20747756 20747884 15 | 11 58762413 58763034 16 | 11 58768187 58770176 17 | 11 69164020 69164163 18 | 11 69480438 69485925 19 | 11 97638841 97643751 20 | 11 98878345 98878466 21 | 11 106362559 106362691 22 | 11 116025224 116025350 23 | 12 16946800 16946921 24 | 12 31943798 31943927 25 | 12 106269285 106269559 26 | 12 112779152 112779278 27 | 13 21807632 21808012 28 | 13 21809497 21809984 29 | 13 21813967 21814347 30 | 13 21826931 21827706 
31 | 13 21842063 21842374 32 | 13 21845264 21845776 33 | 13 21845992 21846372 34 | 13 21871752 21872494 35 | 13 21874784 21875266 36 | 13 21878695 21879087 37 | 13 21879330 21881082 38 | 13 21898281 21902068 39 | 13 21902334 21902813 40 | 13 21903615 21904019 41 | 13 21923636 21924065 42 | 13 21924926 21925444 43 | 13 21925612 21929399 44 | 13 22126982 22127512 45 | 13 22127690 22128168 46 | 13 22132685 22133221 47 | 13 22134348 22134818 48 | 13 22135083 22135545 49 | 13 23622913 23623388 50 | 13 23625775 23626173 51 | 13 23627291 23627820 52 | 13 23634793 23635228 53 | 13 23635921 23636924 54 | 13 23643127 23643517 55 | 13 23662386 23663089 56 | 13 23663265 23663882 57 | 13 23665605 23666065 58 | 13 23666250 23666821 59 | 13 23673467 23673859 60 | 13 23675611 23712993 61 | 13 23713624 23714371 62 | 13 23776068 23784357 63 | 13 23830676 23832235 64 | 13 23836842 23837471 65 | 13 23838603 23839071 66 | 13 23842957 23843462 67 | 13 23848806 23849296 68 | 13 23852671 23853118 69 | 13 23853722 23854255 70 | 13 24903316 24903437 71 | 13 49779670 49779802 72 | 13 51298057 51298434 73 | 13 62238122 62238191 74 | 13 62644794 62644863 75 | 13 74508874 74514014 76 | 13 76042595 76042726 77 | 13 96102573 96102704 78 | 14 12060066 12060506 79 | 14 27317141 27317271 80 | 14 33005040 33005236 81 | 14 52829460 52829569 82 | 14 57952203 57952322 83 | 14 58316646 58316774 84 | 15 71624618 71624795 85 | 15 98350147 98350284 86 | 16 23107517 23114209 87 | 16 43886795 43886927 88 | 16 52105128 52105260 89 | 16 53404199 53404306 90 | 16 54354575 54354706 91 | 16 71664033 71664163 92 | 16 90602741 90602862 93 | 17 13115656 13115783 94 | 17 24665421 24665498 95 | 17 56054826 56055293 96 | 18 33954818 33954949 97 | 18 35716686 35716881 98 | 18 36961520 36961662 99 | 19 8963028 8963175 100 | 19 20107994 20108125 101 | 19 44188469 44188614 102 | 19 46433526 46433634 103 | 19 47244959 47245624 104 | 2 19856490 19856580 105 | 2 32530629 32530765 106 | 2 32818811 32818940 107 | 2 37371852 
37376123 108 | 2 38852996 38861688 109 | 2 85260294 85260437 110 | 2 144091715 144091952 111 | 2 158182160 158182290 112 | 2 158184096 158184208 113 | 2 158185534 158185665 114 | 2 158203958 158204090 115 | 2 166888973 166889065 116 | 2 166890498 166890586 117 | 2 171131668 171131931 118 | 3 24231968 24232474 119 | 3 30494268 30494679 120 | 3 68889027 68889369 121 | 3 85904456 85904577 122 | 3 88497852 88497979 123 | 3 96023788 96024231 124 | 3 96024284 96024803 125 | 3 96025044 96027661 126 | 3 96042033 96043050 127 | 3 96065627 96067234 128 | 3 96073644 96082924 129 | 3 96218360 96218782 130 | 3 108357256 108357669 131 | 3 128243290 128243398 132 | 3 149735554 149736584 133 | 4 43505660 43505930 134 | 4 116826432 116828848 135 | 4 129285575 129291501 136 | 4 131825995 131826128 137 | 4 132394298 132394462 138 | 4 133723723 133723810 139 | 5 92858811 92858954 140 | 5 101260300 101260433 141 | 5 111121068 111121200 142 | 5 147644466 147648608 143 | 5 149957300 149957397 144 | 6 8451236 8451356 145 | 6 39372288 39372418 146 | 6 52589228 52589349 147 | 6 71832551 71832687 148 | 6 124665250 124665520 149 | 6 132606975 132607862 150 | 6 132727197 132728180 151 | 6 136750074 136752952 152 | 7 104670318 104670426 153 | 7 106628073 106628217 154 | 7 106631295 106631442 155 | 7 116662661 116665881 156 | 7 117166724 117166856 157 | 7 117189878 117190061 158 | 7 118219574 118219741 159 | 7 125296994 125297124 160 | 7 134671388 134671517 161 | 7 148633269 148637484 162 | 7 150717299 150717425 163 | 8 13876097 13876226 164 | 8 60028572 60028685 165 | 8 72266761 72298785 166 | 8 73418621 73421342 167 | 8 98269960 98270095 168 | 8 113447016 113447163 169 | 8 124190528 124190606 170 | 9 3352657 3352786 171 | 9 15110658 15116548 172 | 9 15118246 15118376 173 | 9 15119289 15119425 174 | 9 15120933 15121032 175 | 9 64021194 64026369 176 | 9 120037898 120038053 177 | X 33378122 33378396 178 | X 90410241 90410323 179 | X 118421826 118421949 180 | X 152890542 152890638 181 | 
-------------------------------------------------------------------------------- /REF/extra-input-files/RNA.SpikeIn.ERCC.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/REF/extra-input-files/RNA.SpikeIn.ERCC.fasta.gz -------------------------------------------------------------------------------- /REF/extra-input-files/URLs: -------------------------------------------------------------------------------- 1 | Human: 2 | http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/wgEncodeDacMapabilityConsensusExcludable.txt.gz 3 | 4 | ERCC: 5 | ftp://ftp.ncbi.nlm.nih.gov/repository/acedb/SEQC_Reference_Targets/RNA.SpikeIn.ERCC.fasta.gz 6 | 7 | === Ensembl Base FTP === 8 | 9 | mm10: (mouse) 10 | ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/dna/ 11 | 12 | hg19: (human, popular) 13 | ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/ 14 | 15 | hg38: (human, most recent) 16 | ftp://ftp.ensembl.org/pub/release-81/fasta/homo_sapiens/dna/ 17 | -------------------------------------------------------------------------------- /bin/DESeq2Constructor.R: -------------------------------------------------------------------------------- 1 | DESeqDataSetFromIRFinder = function(filePaths,designMatrix,designFormula,irratio_thr=0, warning_filter="^$" ){ 2 | res=c() 3 | libsz=c() 4 | spl=c() 5 | irtest=read.table(filePaths[1]) 6 | if (irtest[1,1]=="Chr"){irtest=irtest[-1,]} 7 | irnames=unname(apply(as.matrix(irtest),1,FUN=function(x){return(paste0(x[4],"/",x[1],":",x[2],"-",x[3],":",x[6]))})) 8 | n=1 9 | warns=c() 10 | ratio_mask=c() 11 | for (i in filePaths){ 12 | print(paste0("processing file ",n," at ",i)) 13 | irtab=read.table(i) 14 | if (irtab[1,1]=="Chr"){irtab=irtab[-1,]} 15 | #rn=unname(apply(irtab,1,FUN=function(x){return(paste0(x[4],"/",x[1],":",x[2],"-",x[3],":",x[6]))})) 16 | #row.names(irtab)=rn 17 | 
#tmp1=round(as.numeric(as.vector(irtab[irnames,9]))) 18 | #tmp2=as.numeric(as.vector(irtab[irnames,19])) 19 | tmp1=as.numeric(as.vector(irtab[,9])) 20 | tmp2=as.numeric(as.vector(irtab[,19])) 21 | tmp3=tmp1+tmp2 22 | tmp4=as.numeric(as.vector(irtab[,17])) 23 | tmp5=as.numeric(as.vector(irtab[,18])) 24 | tmp6=pmax(tmp4,tmp5, na.rm=T) 25 | res=cbind(res,tmp1) 26 | libsz=cbind(libsz,tmp2) 27 | spl=cbind(spl,tmp6) 28 | if (length(warns) == 0){ 29 | warns= ! grepl(as.character(irtab[,21]), pattern = warning_filter ) 30 | } else { 31 | warns=warns & ! grepl(as.character(irtab[,21]), pattern = warning_filter ) 32 | } 33 | ratios=(tmp1 / (tmp6+tmp1)) 34 | rmsk=(! is.nan(ratios)) & ratios >= irratio_thr 35 | if (length(ratio_mask) == 0 ){ 36 | ratio_mask = rmsk 37 | } else { 38 | ratio_mask = ratio_mask | rmsk 39 | } 40 | n=n+1 41 | } 42 | print(warning_filter) 43 | print(irratio_thr) 44 | print(paste0("Warning removed: ", sum(! warns))) 45 | print(paste0("Ratio removed: ", sum(! ratio_mask))) 46 | warns=warns & ratio_mask 47 | print(paste0("Combined removed: ", sum(! 
warns))) 48 | res.rd=round(res)[warns,] 49 | libsz.rd=round(libsz)[warns,] 50 | spl.rd=round(spl)[warns,] 51 | colnames(res.rd)=paste("intronDepth",as.vector(designMatrix[,1]),sep=".") 52 | rownames(res.rd)=irnames[warns] 53 | colnames(libsz.rd)=paste("totalSplice",as.vector(designMatrix[,1]),sep=".") 54 | rownames(libsz.rd)=irnames[warns] 55 | colnames(spl.rd)=paste("maxSplice",as.vector(designMatrix[,1]),sep=".") 56 | rownames(spl.rd)=irnames[warns] 57 | 58 | ir=c(rep("IR",dim(designMatrix)[1]),rep("Splice",dim(designMatrix)[1])) 59 | group=rbind(designMatrix,designMatrix) 60 | group$IRFinder=ir 61 | group$IRFinder=factor(group$IRFinder,levels=c("Splice","IR")) 62 | 63 | #counts.IRFinder=cbind(res.rd,libsz.rd) 64 | counts.IRFinder=cbind(res.rd,spl.rd) 65 | 66 | dd = DESeqDataSetFromMatrix(countData = counts.IRFinder, colData = group, design = designFormula) 67 | sizeFactors(dd)=rep(1,dim(group)[1]) 68 | rownames(dd)=irnames[warns] 69 | final=list(dd,res,libsz,spl) 70 | names(final)=c("DESeq2Object","IntronDepth","SpliceDepth","MaxSplice") 71 | return(final) 72 | } 73 | -------------------------------------------------------------------------------- /bin/IRFinder: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util 4 | source ${LIBEXEC}/bash_utils.sh 5 | 6 | RUNMODES="FastQ|Long|BAM|BuildRef|BuildRefDownload|BuildRefProcess|BuildRefFromSTARRef|Diff" 7 | 8 | function usage() { 9 | echo "" >&2 10 | echo "IRFinder version: $VERSION" >&2 11 | echo "Usage: IRFinder [${RUNMODES}]" >&2 12 | echo "" >&2 13 | echo "Possible RunModes:" >&2 14 | echo "" >&2 15 | echo " BuildRef: Builds IRFinder reference from Ensembl FTP site. Requires Internet" >&2 16 | echo " BuildRefDownload: Only downloads FASTA and GTF files from Ensembl FTP site, ">&2 17 | echo " without building IRFinder reference. 
Requires Internet" >&2 18 | echo " BuildRefProcess: Builds IRFinder reference from local FASTA and GTF files" >&2 19 | echo " BuildRefFromSTARRef: Builds IRFinder reference from a local STAR reference" >&2 20 | echo " FastQ: Quantifies intron retention from FASTQ file (Default)" >&2 21 | echo " Long: Quantifies intron retention from FASTQ file of long reads" >&2 22 | echo " BAM: Quantifies intron retention from a BAM file" >&2 23 | echo " Diff: Compare IRrates from two conditions using SUPPA2 algorithm" >&2 24 | 25 | echo "" >&2 26 | echo " -v|--version Show version number of current IRFinder ( when no RunMode is given )." >&2 27 | echo " -h|--help Show this usage information. Dedicated usage informations are given if a RunMode is selected." >&2 28 | echo "" >&2 29 | exit 1 30 | } 31 | 32 | function isRunMode() { 33 | if [[ $1 =~ ${RUNMODES} ]]; then 34 | return 0 35 | else 36 | return 1 37 | fi 38 | } 39 | 40 | 41 | 42 | # === Defaults === 43 | 44 | RUNMODE="" 45 | export START_MESSAGE=0 46 | EXECDIR=$(dirname "$(readlink -nf "$BASH_SOURCE")") 47 | 48 | if [[ $# -eq 0 || $1 == "-h" || $1 == "--help" ]]; then 49 | usage 50 | fi 51 | 52 | if [[ ( $# -eq 1 && $1 == "-v" ) || $( echo "$@" | grep -c "\-\-version" ) == "1" ]]; then 53 | echo "IRFinder version: $VERSION" 54 | exit 55 | fi 56 | 57 | if [[ $1 =~ ^[^-] ]] ; then 58 | RUNMODE=$(echo $1 | awk -v runm="${RUNMODES}" '{IGNORECASE=1; split(runm, arr, "|"); out=$1; for ( k in arr) { if ( arr[k] == $1 ) { out=arr[k] } }; print out }') 59 | shift; 60 | args=$@ 61 | elif [[ $( echo $@ | grep -c "\-m" ) == 1 ]] ; then 62 | RUNMODE=$( echo $@ | awk -v runm="${RUNMODES}" ' {out="" ; IGNORECASE=1; split(runm, arr, "|"); for ( i=1; i<= NF; i++ ) { if ($i == "-m" ) { i=i+1; for ( k in arr ) { if ( arr[k] == $i ) { out=arr[k] } } } }; print out } ' ) 63 | if [[ "${RUNMODE}" != "" ]] ; then 64 | args=$( echo $@ | awk '{out=""; for ( i=1; i<=NF; i++) { if ( $i == "-m" ) { i=i+1 } else { out = out " " $i } }; print out} ' ) 65 
| fi 66 | fi 67 | 68 | if [[ "${RUNMODE}" == "" ]]; then 69 | echo "Possible runmodes: $RUNMODES" 70 | exit 1 71 | fi 72 | 73 | 74 | 75 | if isRunMode $RUNMODE; then 76 | $EXECDIR/IRFinder${RUNMODE} ${args} 77 | else 78 | echo "RunMode $RUNMODE not recognized." >&2 79 | echo "Valid options for Mode are: BuildRef, BuildRefDownload, BuildRefProcess, BuildRefFromSTARRef, BAM, FastQ, Long, Diff. Default: FastQ" >&2 80 | fi 81 | 82 | 83 | -------------------------------------------------------------------------------- /bin/IRFinderBAM: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util 4 | source $LIBEXEC/bash_utils.sh 5 | 6 | 7 | function usage() { 8 | echo "" >&2 9 | echo "IRFinder version: $VERSION" >&2 10 | echo "Usage: IRFinder BAM -r ReferenceDir [Un]sorted.bam " >&2 11 | echo "" >&2 12 | echo " required:" >&2 13 | echo " [Un]sorted.bam: the target bam file. If paired end and sorted by coordinates, " 14 | echo " the process will be slightly slower and more memory consuming." >&2 15 | # TODO - we cannot currently accept fasta input to the trimmer (only fastq), probably should, believe STAR ignores quality anyway, and we strip it on output. 16 | echo " -r ReferenceDir: As built by the 'BuildRef' option." >&2 17 | echo "" >&2 18 | echo " optional:" >&2 19 | echo " -d string : Output Directory. Default is the current directory." >&2 20 | echo " -l Long reads flag." >&2 21 | echo " -j Jitter, consider the position around the splice sites to compensate sequencing errors ( Long reads only, integer number )." >&2 22 | echo " -v Verbose." >&2 23 | echo " additional :" >&2 24 | echo " -R double : Minimum IRratio accepted to consider the intron for the CNN validation. Default: 0.05 " >&2 25 | echo " -w int : Warning level accepted to consider the intron for the CNN validation. 
Default: 1" >&2 26 | echo " 0: Disabled " >&2 27 | echo " 1: Only without warning " >&2 28 | echo " 2: Include NonUniformIntronCover " >&2 29 | echo " 3: Include also MinorIsoform" >&2 30 | echo " 4: Include also LowSplicing" >&2 31 | echo " 5: Include also LowCover ( consider all )" >&2 32 | echo "" >&2 33 | exit 1 34 | } 35 | 36 | 37 | # === Defaults === 38 | OUTPUTDIR=. 39 | THREADS=0 40 | REF= 41 | VERBOSE=0 42 | RETRO=0 43 | READ_TYPE="SR" 44 | AI_WARN=1 45 | AI_INTRON=1 46 | AI_RATIO="0.05" 47 | JITTER="3" 48 | 49 | if [ $# -eq 0 ] || [[ $1 == "--help" ]] ; then 50 | usage 51 | fi 52 | 53 | while getopts ":r:t:d:i:w:R:j:vhl" opt; do 54 | case $opt in 55 | r) 56 | # Reference directory. 57 | REF=$OPTARG 58 | ;; 59 | t) 60 | ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.) IRFinder is single core for now. 61 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 62 | echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2 63 | exit 1 64 | fi 65 | THREADS=$OPTARG 66 | ;; 67 | i) 68 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 69 | echo "Argument error: -i $OPTARG, number of intron depth must be an integer." >&2 70 | exit 1 71 | fi 72 | AI_INTRON=$OPTARG 73 | ;; 74 | R) 75 | if [[ ! $OPTARG =~ ^0\.[0-9]+$ ]]; then 76 | echo "Argument error: -R $OPTARG, ratio must be a float number between 0 and 1 not included." >&2 77 | exit 1 78 | fi 79 | AI_RATIO=$OPTARG 80 | ;; 81 | w) 82 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 83 | echo "Argument error: -w $OPTARG, number of warning must be an integer." >&2 84 | exit 1 85 | fi 86 | AI_WARN=$OPTARG 87 | ;; 88 | j) 89 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 90 | echo "Argument error: -j $OPTARG, jitter must be an integer." >&2 91 | exit 1 92 | fi 93 | JITTER=$OPTARG 94 | ;; 95 | d) 96 | OUTPUTDIR=$OPTARG 97 | ;; 98 | v) 99 | VERBOSE=1 100 | ;; 101 | l) 102 | READ_TYPE="LR" 103 | ;; 104 | h) 105 | usage 106 | ;; 107 | \?) 
108 | echo "Invalid option: -$OPTARG" >&2 109 | exit 1 110 | ;; 111 | :) 112 | echo "Option -$OPTARG requires an argument." >&2 113 | exit 1 114 | ;; 115 | esac 116 | done 117 | shift $(($OPTIND - 1)) 118 | 119 | 120 | checkRef $REF 121 | checkOutDir $OUTPUTDIR 122 | checkSamtools 123 | setThreads 124 | 125 | if [ ! $# -eq 1 ]; then 126 | echo "Argument error: in run mode BAM, provide a single BAM as input. $# arguments found." >&2 127 | exit 1 128 | fi 129 | 130 | if [[ "${IRF_RUNMODE}" == "" ]]; then 131 | logger init 132 | fi 133 | 134 | RUNMODE="BAM" startMessage $@ 135 | 136 | logger "[ " $(date) " ] Processing the BAM file with IRFinder" 137 | logger "---" 138 | 139 | # BAM check 140 | samtools view -H $1 > /dev/null || exit 1 141 | # Sort check 142 | if [ $(samtools view -H $1 | grep -c "SO:coordinate" ) -eq 1 ]; then 143 | # PE check 144 | if [ $( { samtools view -H $1 ; samtools view $1 | head -n 1000 ; } | samtools view -c -f 1 ) -gt 0 ]; then 145 | logger "The given bam file is sorted by coordinate and is paired." 
146 | fi 147 | fi 148 | 149 | 150 | if [ $VERBOSE -eq 1 ];then 151 | ${LIBEXEC}/irfinder ${OUTPUTDIR} \ 152 | ${REF}/IRFinder/ref-cover.bed \ 153 | ${REF}/IRFinder/ref-sj.ref \ 154 | ${REF}/IRFinder/ref-read-continues.ref \ 155 | ${REF}/IRFinder/ref-ROI.bed ${READ_TYPE} "${AI_WARN}:${AI_INTRON}:${AI_RATIO}" "${JITTER}" $1 2>> $OUTPUTDIR/logs/irfinder.stderr | tee -a $OUTPUTDIR/logs/irfinder.stdout 156 | cat $OUTPUTDIR/logs/irfinder.stderr 157 | else 158 | ${LIBEXEC}/irfinder ${OUTPUTDIR} \ 159 | ${REF}/IRFinder/ref-cover.bed \ 160 | ${REF}/IRFinder/ref-sj.ref \ 161 | ${REF}/IRFinder/ref-read-continues.ref \ 162 | ${REF}/IRFinder/ref-ROI.bed ${READ_TYPE} "${AI_WARN}:${AI_INTRON}:${AI_RATIO}" "${JITTER}" $1 >> $OUTPUTDIR/logs/irfinder.stdout 2>> $OUTPUTDIR/logs/irfinder.stderr 163 | fi 164 | 165 | 166 | 167 | logger "---" 168 | logger "[ " $(date) " ] IRFinder BAM analysis completed " 169 | logger "---" 170 | 171 | "$LIBEXEC/warnings" "$OUTPUTDIR" 172 | 173 | N_WARNINGS=$(wc -l $OUTPUTDIR/WARNINGS | awk '{print $1}' ) 174 | if [ $N_WARNINGS -gt 0 ]; then 175 | logger "Process completed with warnings. 
Check $OUTPUTDIR/WARNINGS " >&2 176 | fi 177 | 178 | if [[ -f $OUTPUTDIR/IRFinder-IR-nondir-AI.txt ]]; then 179 | logger "---" 180 | logger "[ " $(date) " ] Running CNN validator " 181 | logger "---" 182 | ${LIBEXEC}/irfinder_cnn -d ${OUTPUTDIR} -m ${LIBEXEC}/model/ && rm ${OUTPUTDIR}/*-AI.txt 183 | logger "---" 184 | logger "[ " $(date) " ] CNN validator completed" 185 | logger "---" 186 | fi 187 | 188 | 189 | 190 | -------------------------------------------------------------------------------- /bin/IRFinderBuildRef: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util 4 | source $LIBEXEC/bash_utils.sh 5 | 6 | function usage() { 7 | echo "" >&2 8 | echo "IRFinder version: $VERSION" >&2 9 | echo "Usage: IRFinder BuildRef [-v][-h][-t INT][-j INT][-e ExtraGenomeRef.fa][-b Blacklist.bed][-R ROI.bed][-M Mapability.bed][-n INT][-L INT] -r ReferenceDir URL" >&2 10 | echo "" >&2 11 | echo " required:" >&2 12 | echo " URL A base Ensembl URL to a (gzipped) gtf file. For example: ftp://ftp.ensembl.org/pub/release-100/gtf/homo_sapiens/Homo_sapiens.GRCh38.100.gtf.gz" >&2 13 | echo " -r ReferenceDir: Directory should not yet exist, will be created." >&2 14 | echo "" >&2 15 | echo " optional:" >&2 16 | echo " -t Threads: The number of physical CPUs to use by IRFinder. When omitted (default), IRFinder will use all physical CPUs." >&2 17 | echo " -j INTEGER: An integer that is passed to '--sjdbOverhang' under STAR 'genomeGenerate' mode. Default: 150." >&2 18 | echo " -M Mapability: A precomputed bed file containing the low mapability areas. Can also be an empty file." >&2 19 | echo " -n MapabilityReadLength: The length of the reads used to compute the mapability. Default: 100" >&2 20 | echo " -e ExtraGenomeRef.fasta.gz: Typically an ERCC reference." >&2 21 | echo " -b Blacklist.bed.gz: BED of regions to be excluded from analysis."
>&2 22 | echo " -R ROI.bed.gz: A non-overlapping BED file of additional Regions of Interest for read counts." >&2 23 | echo " -v Show version number of current IRFinder." >&2 24 | echo " -L STAR limitGenomeGenerateRAM argument. Default: 31000000000" >&2 25 | echo " -h Show this usage information." >&2 26 | echo "" >&2 27 | exit 1 28 | } 29 | 30 | 31 | # === Defaults === 32 | THREADS=0 33 | REF= 34 | SJOH=150 35 | BUILDERCCFILE= 36 | BUILDROI= 37 | BUILDBLACK= 38 | STAREXEC=STAR 39 | MAPABILITY_FILE= 40 | MAPABILITY_LEN=100 41 | GENOMERAM=31000000000 42 | 43 | if [ $# -eq 0 ] || [[ $1 == "--help" ]] ; then 44 | usage 45 | fi 46 | 47 | 48 | while getopts ":r:j:t:S:e:b:R:n:M:L:hv" opt; do 49 | case $opt in 50 | r) 51 | # Reference directory. 52 | REF=$OPTARG 53 | if [ -d "$REF" ]; then 54 | echo "Argument error: -r $REF. Reference directory must not exist, BuildRef will create it." >&2 55 | exit 1 56 | fi 57 | ;; 58 | j) # STAR's --sjdbOverhang 59 | if [[ $OPTARG =~ ^[0-9]+$ ]] ; then 60 | SJOH=$OPTARG 61 | else 62 | echo "Argument error: -j $OPTARG. '$OPTARG' is not an integer." >&2 63 | exit 1 64 | fi 65 | ;; 66 | t) 67 | ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.) 68 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 69 | echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2 70 | exit 1 71 | fi 72 | THREADS=$OPTARG 73 | ;; 74 | L) 75 | ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.) 76 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 77 | echo "Argument error: -L $OPTARG, limitGenomeGenerateRAM must be an integer." >&2 78 | exit 1 79 | fi 80 | GENOMERAM=$OPTARG 81 | ;; 82 | S) 83 | ## STAR executable. (must be executable!) 84 | if [ -x "$OPTARG" -a ! -d "$OPTARG" ]; then 85 | STAREXEC=$OPTARG 86 | else 87 | echo "Argument error: -S $OPTARG. STAR executable must be an executable program." >&2 88 | exit 1 89 | fi 90 | ;; 91 | e) 92 | #ERCC file. (must be a file) 93 | if [ ! 
-f "$OPTARG" ]; then 94 | echo "Argument error: -e $OPTARG. Specified ERCC/extra-reference file does not exist." >&2 95 | exit 1 96 | fi 97 | BUILDERCCFILE=$OPTARG 98 | ;; 99 | b) 100 | #Blacklist local file (must be a file) 101 | if [ ! -f "$OPTARG" ]; then 102 | echo "Argument error: -b $OPTARG. Specified blacklist file does not exist." >&2 103 | exit 1 104 | fi 105 | BUILDBLACK=$OPTARG 106 | ;; 107 | R) 108 | #ROI local file. (must be a file) 109 | if [ ! -f "$OPTARG" ]; then 110 | echo "Argument error: -R $OPTARG. Specified ROI file does not exist." >&2 111 | exit 1 112 | fi 113 | BUILDROI=$OPTARG 114 | ;; 115 | M) 116 | if [ ! -f "$OPTARG" ]; then 117 | echo "Argument error: -M $OPTARG. Specified Mapability file does not exist." >&2 118 | exit 1 119 | fi 120 | MAPABILITY_FILE=$OPTARG 121 | ;; 122 | n) 123 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 124 | echo "Argument error: -n $OPTARG, must be an integer." >&2 125 | exit 1 126 | fi 127 | MAPABILITY_LEN=$OPTARG 128 | ;; 129 | h) 130 | usage 131 | ;; 132 | v) 133 | versionAlert 134 | ;; 135 | \?) 136 | echo "Invalid option: -$OPTARG" >&2 137 | exit 1 138 | ;; 139 | :) 140 | echo "Option -$OPTARG requires an argument." >&2 141 | exit 1 142 | ;; 143 | esac 144 | done 145 | shift $(($OPTIND - 1)) 146 | 147 | #echo $@ #The remaining arguments. 148 | #echo $# #The number of remaining arguments. 149 | 150 | if [ ! "$REF" ]; then 151 | echo "Argument error: -r is required." >&2 152 | exit 1 153 | fi 154 | 155 | if [[ "${MAPABILITY_FILE}" == "" ]]; then 156 | checkStar $STAREXEC 157 | fi 158 | setThreads 159 | 160 | if [ ! $# -eq 1 ]; then 161 | echo "Argument error: in run mode BuildRef, provide a single ftp URL. $# arguments found." >&2 162 | exit 1 163 | fi 164 | 165 | BUILDHINT=$1 166 | if [[ "$BUILDHINT" != ftp* ]]; then 167 | echo "Argument error: A single ftp url is required to find and download genome fasta and gtf files. eg: ftp://ftp.ensembl.org/pub/release-78/fasta/mus_musculus/dna/."
>&2 168 | exit 1 169 | fi 170 | 171 | 172 | echo "Launching reference build process. The full build might take hours." 173 | 174 | "$LIBEXEC/IRFinder-BuildRefFromEnsembl" BuildRef "$THREADS" "$STAREXEC" "$BUILDHINT" "$REF" "$BUILDERCCFILE" "$BUILDROI" "$BUILDBLACK" "$SJOH" "$MAPABILITY_FILE" "$MAPABILITY_LEN" "$GENOMERAM" 175 | 176 | 177 | -------------------------------------------------------------------------------- /bin/IRFinderBuildRefDownload: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util 5 | source $LIBEXEC/bash_utils.sh 6 | 7 | 8 | function usage() { 9 | echo "" >&2 10 | echo "IRFinder version: $VERSION" >&2 11 | echo "Usage: IRFinder BuildRefDownload [-v][-h] -r ReferenceDir URL" >&2 12 | echo "" >&2 13 | echo " required:" >&2 14 | echo " URL A base Ensembl URL to a (gzipped) gtf file. For example: ftp://ftp.ensembl.org/pub/release-100/gtf/homo_sapiens/Homo_sapiens.GRCh38.100.gtf.gz" >&2 15 | echo " -r ReferenceDir: Directory should not yet exist, will be created." >&2 16 | echo "" >&2 17 | echo " optional:" >&2 18 | echo " -v Show version number of current IRFinder." >&2 19 | echo " -h Show this usage information." >&2 20 | echo "" >&2 21 | exit 1 22 | } 23 | 24 | 25 | # === Defaults === 26 | REF= 27 | STAREXEC=STAR 28 | 29 | 30 | if [ $# -eq 0 ] || [[ $1 == "--help" ]] ; then 31 | usage 32 | fi 33 | 34 | 35 | while getopts ":m:r:hv" opt; do 36 | case $opt in 37 | r) 38 | # Reference directory. 39 | REF=$OPTARG 40 | if [ -d "$REF" ]; then 41 | echo "Argument error: -r $REF. Reference directory must not exist, BuildRef will create it." >&2 42 | exit 1 43 | fi 44 | ;; 45 | h) 46 | usage 47 | ;; 48 | v) 49 | versionAlert 50 | ;; 51 | \?) 52 | echo "Invalid option: -$OPTARG" >&2 53 | exit 1 54 | ;; 55 | :) 56 | echo "Option -$OPTARG requires an argument." 
>&2 57 | exit 1 58 | ;; 59 | esac 60 | done 61 | shift $(($OPTIND - 1)) 62 | 63 | #echo $@ #The remaining arguments. 64 | #echo $# #The number of remaining arguments. 65 | 66 | if [ ! "$REF" ]; then 67 | echo "Argument error: -r is required." >&2 68 | exit 1 69 | fi 70 | 71 | 72 | if [ ! $# -eq 1 ]; then 73 | echo "Argument error: in run mode BuildRefDownload, provide a single ftp URL. $# arguments found." >&2 74 | exit 1 75 | fi 76 | 77 | BUILDHINT=$1 78 | if [[ "$BUILDHINT" != ftp* ]]; then 79 | echo "Argument error: A single ftp url is required to find and download genome fasta and gtf files. eg: ftp://ftp.ensembl.org/pub/release-78/fasta/mus_musculus/dna/." >&2 80 | exit 1 81 | fi 82 | 83 | 84 | echo "Launching reference build process. The full build might take hours." 85 | 86 | "$LIBEXEC/IRFinder-BuildRefFromEnsembl" BuildRefDownload "1" "" "$BUILDHINT" "$REF" "" "" "" "" 87 | 88 | 89 | -------------------------------------------------------------------------------- /bin/IRFinderBuildRefFromSTARRef: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util 5 | source $LIBEXEC/bash_utils.sh 6 | 7 | 8 | function usage() { 9 | echo "" >&2 10 | echo "IRFinder version: $VERSION" >&2 11 | echo "Usage: IRFinder BuildRefFromSTARRef [-v][-h][-t INT][-j INT][-e ExtraGenomeRef.fa][-b Blacklist.bed][-R ROI.bed][-m Mapability.bed] -r ReferenceDir -x STARRefDir" >&2 12 | echo "" >&2 13 | echo " required:" >&2 14 | echo " -r ReferenceDir: Directory should not yet exist, will be created." >&2 15 | echo " -x STARRefDir. An existing STAR reference folder." >&2 16 | echo " Please note: By default, BuildRefFromSTARRef mode automatically looks for the original FASTA and GTF files used to generate STARRefDir." >&2 17 | echo " Specifically, IRFinder investigates 'genomeParameters.txt' in STARRefDir." 
>&2 18 | echo " If both files can be located, IRFinder will continue to generate reference, ignoring '-f' and '-g' options." >&2 19 | echo " If either file is missing, IRFinder will quit and you have to re-run it by giving both '-f' and '-g' options." >&2 20 | echo "" >&2 21 | echo " optional:" >&2 22 | echo " -t Threads: The number of physical CPUs to use by IRFinder. When omitted (default), IRFinder will use all physical CPUs." >&2 23 | echo " -M Mapability: A precomputed bed file containing the low mapability areas. Can also be an empty file." >&2 24 | echo " -n MapabilityReadLength: The length of the reads used to compute the mapability. Default: 100" >&2 25 | echo " -f GENOME.fa: This MUST be the same FASTA file used to generate STARRefDir. Ignored when IRFinder can automatically locate the original file." >&2 26 | echo " -g TRANSCRIPTS.gtf: This MUST be the same GTF file used to generate STARRefDir. Ignored when IRFinder can automatically locate the original file." >&2 27 | echo " -e ExtraGenomeRef.fasta.gz: Typically an ERCC reference." >&2 28 | echo " -b Blacklist.bed.gz: BED of regions to be excluded from analysis." >&2 29 | echo " -R ROI.bed.gz: A non-overlapping BED file of additional Regions of Interest for read counts." >&2 30 | echo " -l Don't copy the STAR reference and the other files but create symbolic links." >&2 31 | echo " -v Show version number of current IRFinder." >&2 32 | echo " -h Show this usage information." >&2 33 | echo "" >&2 34 | exit 1 35 | } 36 | 37 | 38 | # === Defaults === 39 | THREADS=0 40 | REF= 41 | SJOH=150 42 | BUILDERCCFILE= 43 | BUILDROI= 44 | BUILDBLACK= 45 | STAREXEC=STAR 46 | MYFASTA="NULL" 47 | MYGTF="NULL" 48 | LINK=0 49 | MAPABILITY_FILE= 50 | MAPABILITY_LEN=100 51 | 52 | if [ $# -eq 0 ] || [[ $1 == "--help" ]] ; then 53 | usage 54 | fi 55 | 56 | 57 | while getopts ":m:r:t:S:e:b:R:x:f:g:m:M:n:hvl" opt; do 58 | case $opt in 59 | r) 60 | # Reference directory. 
61 | REF=$OPTARG 62 | if [ -d "$REF" ]; then 63 | echo "Argument error: -r $REF. Reference directory must not exist, BuildRef will create it." >&2 64 | exit 1 65 | fi 66 | ;; 67 | t) 68 | ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.) 69 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 70 | echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2 71 | exit 1 72 | fi 73 | THREADS=$OPTARG 74 | ;; 75 | S) 76 | ## STAR executable. (must be executable!) 77 | if [ -x "$OPTARG" -a ! -d "$OPTARG" ]; then 78 | STAREXEC=$OPTARG 79 | else 80 | echo "Argument error: -S $OPTARG. STAR executable must be an executable program." >&2 81 | exit 1 82 | fi 83 | ;; 84 | e) 85 | #ERCC file. (must be a file) 86 | if [ ! -f "$OPTARG" ]; then 87 | echo "Argument error: -e $OPTARG. Specified ERCC/extra-reference file does not exist." >&2 88 | exit 1 89 | fi 90 | BUILDERCCFILE=$OPTARG 91 | ;; 92 | b) 93 | #Blacklist local file (must be a file) 94 | if [ ! -f "$OPTARG" ]; then 95 | echo "Argument error: -b $OPTARG. Specified blacklist file does not exist." >&2 96 | exit 1 97 | fi 98 | BUILDBLACK=$OPTARG 99 | ;; 100 | R) 101 | #ROI local file. (must be a file) 102 | if [ ! -f "$OPTARG" ]; then 103 | echo "Argument error: -R $OPTARG. Specified ROI file does not exist." >&2 104 | exit 1 105 | fi 106 | BUILDROI=$OPTARG 107 | ;; 108 | M) 109 | if [ ! -f "$OPTARG" ]; then 110 | echo "Argument error: -M $OPTARG. Specified Mapability file does not exist." >&2 111 | exit 1 112 | fi 113 | MAPABILITY_FILE=$OPTARG 114 | ;; 115 | n) 116 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 117 | echo "Argument error: -n $OPTARG, must be an integer." >&2 118 | exit 1 119 | fi 120 | MAPABILITY_LEN=$OPTARG 121 | ;; 122 | x) 123 | # 124 | STARREF=$(realpath $OPTARG) 125 | ## Must be a directory 126 | ;; 127 | f) 128 | # The original FASTA file to generate STAR reference. 
129 | MYFASTA=$(realpath $OPTARG) 130 | checkFile $MYFASTA 131 | ;; 132 | g) 133 | # The original GTF file to generate STAR reference. 134 | MYGTF=$(realpath $OPTARG) 135 | checkFile $MYGTF 136 | ;; 137 | h) 138 | usage 139 | ;; 140 | v) 141 | versionAlert 142 | ;; 143 | l) 144 | LINK=1 145 | ;; 146 | \?) 147 | echo "Invalid option: -$OPTARG" >&2 148 | exit 1 149 | ;; 150 | :) 151 | echo "Option -$OPTARG requires an argument." >&2 152 | exit 1 153 | ;; 154 | esac 155 | done 156 | shift $(($OPTIND - 1)) 157 | 158 | #echo $@ #The remaining arguments. 159 | #echo $# #The number of remaining arguments. 160 | 161 | if [ ! "$REF" ]; then 162 | echo "Argument error: -r is required." >&2 163 | exit 1 164 | fi 165 | 166 | if [ -d "$REF" ]; then 167 | echo "Argument error: -r $REF. Reference directory must not exist, BuildRef will create it." >&2 168 | exit 1 169 | fi 170 | 171 | if [[ "${MAPABILITY_FILE}" == "" ]]; then 172 | checkStar $STAREXEC 173 | fi 174 | 175 | setThreads 176 | 177 | if [ ! "$STARREF" ]; then 178 | echo "Argument error: -x is required. Must provide an existing STAR reference folder for BuildRefFromSTARRef mode." >&2 179 | exit 1 180 | fi 181 | if [ ! -d "$STARREF" ]; then 182 | echo "Error: STAR reference at $STARREF does not exist." >&2 183 | exit 1 184 | fi 185 | 186 | 187 | if [[ "${MYFASTA}" == "NULL" ]] || [[ "${MYGTF}" == "NULL" ]] ; then 188 | if [ ! -f "$STARREF/genomeParameters.txt" ] ; then 189 | echo "Error: Cannot locate the original FASTA and GTF files used to generate STAR reference at $STARREF." >&2 190 | echo " Please provide these two files through '-f' and '-g' options respectively." >&2 191 | echo " Or retry to build IRFinder reference in other modes." >&2 192 | echo " Run 'IRFinder -h' for more details." 
>&2 193 | exit 1 194 | fi 195 | STARLINE=$(head -n 1 $STARREF/genomeParameters.txt) 196 | STARTMP1=(${STARLINE#*--genomeFastaFiles }) 197 | STARTMP2=(${STARLINE#*--sjdbGTFfile }) 198 | ORIFASTA=${STARTMP1[0]} 199 | ORIGTF=${STARTMP2[0]} 200 | if [[ "${MYFASTA}" == "NULL" ]]; then 201 | MYFASTA="${ORIFASTA}" 202 | fi 203 | if [[ "${MYGTF}" == "NULL" ]]; then 204 | MYGTF="${ORIGTF}" 205 | fi 206 | fi 207 | 208 | 209 | #get the original fasta and gtf file used to generate STAR reference using the parameters saved in 'genomeParameters.txt' 210 | if [ ! -f "$MYFASTA" ] || [ ! -f "$MYGTF" ]; then 211 | echo "Error: Cannot locate the original FASTA and GTF files used to generate the STAR reference $STARREF" >&2 212 | echo " at the following locations:" >&2 213 | echo " FASTA: $ORIFASTA" >&2 214 | echo " GTF: $ORIGTF" >&2 215 | echo " Please locate these two files through '-f' and '-g' options respectively." >&2 216 | echo " Or retry to build IRFinder reference in other modes." >&2 217 | echo " Run 'IRFinder -h' for more details." >&2 218 | exit 1 219 | fi 220 | 221 | if [ $LINK -eq 1 ]; then 222 | CP_CMD="ln -s " 223 | else 224 | CP_CMD="cp " 225 | fi 226 | 227 | MYFASTA=$(realpath $MYFASTA ) 228 | MYGTF=$(realpath $MYGTF ) 229 | REF=$(realpath $REF ) 230 | 231 | echo "Launching reference build process. The full build might take hours." 232 | echo "" 233 | mkdir "$REF" 234 | date +"%b %d %T ... copying the genome FASTA file..." 235 | $CP_CMD "$MYFASTA" "$REF/genome.fa" 236 | date +"%b %d %T ... copying the transcriptome GTF file..." 237 | $CP_CMD "$MYGTF" "$REF/transcripts.gtf" 238 | date +"%b %d %T ... copying the STAR reference folder..." 
239 | $CP_CMD -r "$STARREF" "$REF/STAR" 240 | 241 | "$LIBEXEC/IRFinder-BuildRefFromEnsembl" "BuildRefFromSTARRef" "$THREADS" "$STAREXEC" "$BUILDHINT" "$REF" "$BUILDERCCFILE" "$BUILDROI" "$BUILDBLACK" "$SJOH" "$MAPABILITY_FILE" "$MAPABILITY_LEN" 242 | 243 | 244 | 245 | 246 | -------------------------------------------------------------------------------- /bin/IRFinderBuildRefProcess: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util 5 | source $LIBEXEC/bash_utils.sh 6 | 7 | 8 | function usage() { 9 | echo "" >&2 10 | echo "IRFinder version: $VERSION" >&2 11 | echo "Usage: IRFinder BuildRefProcess [-v][-h][-t INT][-j INT][-e ExtraGenomeRef.fa][-b Blacklist.bed][-R ROI.bed][-m Mapability.bed] -r ReferenceDir " >&2 12 | echo "" >&2 13 | echo "Parameters for BuildRefProcess mode:" >&2 14 | echo " required:" >&2 15 | echo " -r ReferenceDir. Directory should already contain EXACT files named 'genome.fa' and 'transcripts.gtf' (case-sensitive) for genome and transcriptome annotations respectively." >&2 16 | echo " optional:" >&2 17 | echo " -t Threads: The number of physical CPUs to use by IRFinder. When ommited (default), IRFinder will use all physical CPUs." >&2 18 | echo " -j INTEGER: an integer that is parsed to '--sjdbOverhang' under STAR 'genomeGenerate' mode. Default: 150." >&2 19 | echo " -e ExtraGenomeRef.fasta.gz: Typically an ERCC reference." >&2 20 | echo " -b Blacklist.bed.gz: BED of regions to be excluded from analysis." >&2 21 | echo " -R ROI.bed.gz: A non-overlapping BED file of additional Regions of Interest for read counts." >&2 22 | echo " -m Mapability: A precomputed bed file containing the low mapability areas. Can also be an empty file." >&2 23 | echo " -n MapabilityReadLength: The length of the reads used to compute the mapability. Default: 70" >&2 24 | echo " -L STAR limitGenomeGenerateRAM argument. 
Default: 31000000000" >&2 25 | echo " -h Show this usage information." >&2 26 | echo "" >&2 27 | exit 1 28 | } 29 | 30 | 31 | # === Defaults === 32 | THREADS=0 33 | REF= 34 | SJOH=150 35 | BUILDERCCFILE= 36 | BUILDROI= 37 | BUILDBLACK= 38 | STAREXEC=STAR 39 | MAPABILITY_FILE= 40 | MAPABILITY_LEN=100 41 | GENOMERAM=31000000000 42 | 43 | if [ $# -eq 0 ] || [[ $1 == "--help" ]] ; then 44 | usage 45 | fi 46 | 47 | 48 | while getopts ":m:r:j:t:S:e:b:R:M:n:L:l:hv" opt; do 49 | case $opt in 50 | r) 51 | # Reference directory. 52 | REF=$OPTARG 53 | ;; 54 | j) # STAR's --sjdbOverhang 55 | if [[ $OPTARG =~ ^[0-9]+$ ]] ; then 56 | SJOH=$OPTARG 57 | else 58 | echo "Argument error: -j $OPTARG. '$OPTARG' is not an integer." >&2 59 | exit 1 60 | fi 61 | ;; 62 | t) 63 | ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.) 64 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 65 | echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2 66 | exit 1 67 | fi 68 | THREADS=$OPTARG 69 | ;; 70 | L) 71 | ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.) 72 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 73 | echo "Argument error: -L $OPTARG, limitGenomeGenerateRAM must be an integer." >&2 74 | exit 1 75 | fi 76 | GENOMERAM=$OPTARG 77 | ;; 78 | S) 79 | ## STAR executable. (must be executable!) 80 | if [ -x "$OPTARG" -a ! -d "$OPTARG" ]; then 81 | STAREXEC=$OPTARG 82 | else 83 | echo "Argument error: -S $OPTARG. STAR executable must be an executable program." >&2 84 | exit 1 85 | fi 86 | ;; 87 | e) 88 | #ERCC file. (must be a file) 89 | if [ ! -f "$OPTARG" ]; then 90 | echo "Argument error: -e $OPTARG. Specified ERCC/extra-reference file does not exist." >&2 91 | exit 1 92 | fi 93 | BUILDERCCFILE=$OPTARG 94 | ;; 95 | b) 96 | #Blacklist local file (must be a file) 97 | if [ ! -f "$OPTARG" ]; then 98 | echo "Argument error: -b $OPTARG. Specified blacklist file does not exist." 
>&2 99 | exit 1 100 | fi 101 | BUILDBLACK=$OPTARG 102 | ;; 103 | R) 104 | #ROI local file. (must be a file) 105 | if [ ! -f "$OPTARG" ]; then 106 | echo "Argument error: -R $OPTARG. Specified ROI file does not exist." >&2 107 | exit 1 108 | fi 109 | BUILDROI=$OPTARG 110 | ;; 111 | M) 112 | if [ ! -f "$OPTARG" ]; then 113 | echo "Argument error: -m $OPTARG. Specified Mapability file does not exist." >&2 114 | exit 1 115 | fi 116 | MAPABILITY_FILE=$OPTARG 117 | ;; 118 | n) 119 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 120 | echo "Argument error: -n $OPTARG, must be an integer." >&2 121 | exit 1 122 | fi 123 | MAPABILITY_LEN=$OPTARG 124 | ;; 125 | h) 126 | usage 127 | ;; 128 | v) 129 | versionAlert 130 | ;; 131 | \?) 132 | echo "Invalid option: -$OPTARG" >&2 133 | exit 1 134 | ;; 135 | :) 136 | echo "Option -$OPTARG requires an argument." >&2 137 | exit 1 138 | ;; 139 | esac 140 | done 141 | shift $(($OPTIND - 1)) 142 | 143 | #echo $@ #The remaining arguments. 144 | #echo $# #The number of remaining arguments. 145 | 146 | if [ ! "$REF" ]; then 147 | echo "Argument error: -r is required." >&2 148 | exit 1 149 | fi 150 | 151 | checkStar $STAREXEC 152 | setThreads 153 | 154 | if [ ! -f "$REF/genome.fa" ] || [ ! -f "$REF/transcripts.gtf" ]; then 155 | echo "Argument error: -r $REF. Reference directory must exist and contain genome.fa and transcripts.gtf files. Use the BuildRefDownload run mode to create these." >&2 156 | exit 1 157 | fi 158 | 159 | if [ -d "$REF/STAR" ] || [ -d "$REF/Mapability" ] || [ -d "$REF/IRFinder" ]; then 160 | echo "Argument error: -r $REF. Will not overwrite. It appears BuildRefProcess has already been run for this reference. Reference directory must not contain STAR, Mapability or IRFinder directories." >&2 161 | exit 1 162 | fi 163 | 164 | 165 | echo "Launching reference build process. The full build might take hours." 
166 | "$LIBEXEC/IRFinder-BuildRefFromEnsembl" "BuildRefProcess" "$THREADS" "$STAREXEC" "" "$REF" "$BUILDERCCFILE" "$BUILDROI" "$BUILDBLACK" "$SJOH" "$MAPABILITY_FILE" "$MAPABILITY_LEN" "$GENOMERAM" 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /bin/IRFinderLong: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | LIBEXEC=$(dirname "$(readlink -nf "$BASH_SOURCE")")/util 4 | 5 | source $LIBEXEC/bash_utils.sh 6 | 7 | 8 | function usage() { 9 | echo "" >&2 10 | echo "IRFinder version: $VERSION" >&2 11 | echo "Usage: IRFinder Long -r ReferenceDir raw_reads_1.fast[q|a][.gz] [raw_reads_2.fast[q|a][.gz]...]" >&2 12 | echo "" >&2 13 | echo " required:" >&2 14 | echo " raw_reads_1.fast[q|a][.gz]: one or more long reads fastq or fasta files, compressed or not." >&2 15 | echo " -r ReferenceDir. As built by the 'BuildRef' option." >&2 16 | echo "" >&2 17 | echo " optional:" >&2 18 | echo " -d Output Directory: Default is the current directory." >&2 19 | echo " -M Sort memory. Maximum memory to use for sort for each thread, in MB. Default: 768." >&2 20 | echo " -x Minimap2 preset: splice (default), map-[pb|ont], ava-[pb|ont], asm[5|10|20], sr. see minimap2.1 for details." >&2 21 | echo " -E Minimap2 executable: Default is 'minimap2'." >&2 22 | echo " -t Threads: The number of physical CPUs to use by IRFinder. When ommited (default), IRFinder will use all physical CPUs." >&2 23 | echo " -u Unsorted output: Do not sort the read fragment BAM file." >&2 24 | echo " -v Verbose ( Default: print the log only in the output/logs/irfinder.std[out|err] )" >&2 25 | echo " -y STRING: an extra string that is parsed to Minimap2 for reads alignment. Default: '-uf -k14' " >&2 26 | echo " -j Jitter, consider the position around the splice sites to compensate sequencing errors ( integer number )." 
>&2 27 | echo "" >&2 28 | exit 1 29 | } 30 | 31 | # === Defaults === 32 | OUTPUTDIR=. 33 | THREADS=0 34 | REF= 35 | MINIMAP_PRESET="splice" 36 | DOSORT=1 37 | MINIMAP_EXTRA="-uf -k14" 38 | MINIMAP_EXEC=minimap2 39 | VERBOSE=0 40 | SORTMEM=768 41 | AI_WARN=1 42 | AI_INTRON=1 43 | JITTER=3 44 | 45 | if [ $# -eq 0 ] || [[ $1 == "--help" ]] ; then 46 | usage 47 | fi 48 | 49 | while getopts ":m:j:r:t:d:E:x:uM:y:i:w:vh" opt; do 50 | case $opt in 51 | r) 52 | # Reference directory. 53 | REF=$OPTARG 54 | ;; 55 | t) 56 | ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.) 57 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 58 | echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2 59 | exit 1 60 | fi 61 | THREADS=$OPTARG 62 | ;; 63 | d) 64 | OUTPUTDIR=$OPTARG 65 | ;; 66 | x) 67 | ## Minimap preset. 68 | if [[ $OPTARG =~ ^splice$|^map-(pb|ont)$|^ava-(pb|ont)$|^asm[5|10|20]$|^sr$ ]]; then 69 | MINIMAP_PRESET=$OPTARG 70 | else 71 | echo "Argument error: -x $OPTARG. Valid options for Minimap presets are: " >&2 72 | echo " splice (default), map-pb, map-ont, ava-pb, ava-ont, asm5, asm10, asm20 or sr" >&2 73 | exit 1 74 | fi 75 | ;; 76 | j) 77 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 78 | echo "Argument error: -j $OPTARG, jitter must be an integer." >&2 79 | exit 1 80 | fi 81 | JITTER=$OPTARG 82 | ;; 83 | E) 84 | ## Minimap executable. (must be executable!) 85 | if [ -x "$OPTARG" -a ! -d "$OPTARG" ]; then 86 | MINIMAP_EXEC=$OPTARG 87 | else 88 | echo "Argument error: -S $OPTARG. Minimap2 executable must be an executable program." >&2 89 | exit 1 90 | fi 91 | ;; 92 | u) 93 | DOSORT=0 94 | ;; 95 | M) 96 | #Max sort memory in MB. 97 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 98 | echo "Argument error: -M $OPTARG, maximum sort RAM in MB must be an integer." >&2 99 | exit 1 100 | fi 101 | SORTMEM=$OPTARG 102 | ;; 103 | y) 104 | MINIMAP_EXTRA=$OPTARG 105 | ;; 106 | v) 107 | VERBOSE=1 108 | ;; 109 | h) 110 | usage 111 | ;; 112 | \?) 
113 | echo "Invalid option: -$OPTARG" >&2 114 | exit 1 115 | ;; 116 | :) 117 | echo "Option -$OPTARG requires an argument." >&2 118 | exit 1 119 | ;; 120 | esac 121 | done 122 | shift $(($OPTIND - 1)) 123 | 124 | if [ $# -eq 0 ];then 125 | echo "ERROR! No fasta or fastq file provided." >&2 126 | exit 1 127 | fi 128 | 129 | INPUT_FILES="" 130 | for f in $@; do 131 | if [ -f $f ]; then 132 | INPUT_FILES="${INPUT_FILES} ${f}" 133 | else 134 | echo "ERROR! File ${f} doesn't exists." >&2 135 | exit 1 136 | fi 137 | done 138 | 139 | checkRef $REF 140 | checkOutDir $OUTPUTDIR 141 | checkMinimap $MINIMAP_EXEC 142 | checkSamtools 143 | setThreads 144 | 145 | logger init 146 | 147 | RUNMODE="Long" startMessage $@ 148 | 149 | 150 | 151 | logger "[ " $(date) " ] Minimap2 is starting with $THREADS threads" 152 | 153 | $MINIMAP_EXEC -a -t $THREADS -x $MINIMAP_PRESET $MINIMAP_EXTRA $REF/genome.fa $@ 2> $OUTPUTDIR/logs/minimap2.log | samtools view -b > $OUTPUTDIR/Unsorted.bam || exit 1 154 | 155 | logger "---" 156 | logger "[ " $(date) " ] Minimap2 mapping completed" 157 | logger "---" 158 | 159 | VERBOSE_FLAG="" 160 | if [[ "${VERBOSE}" == "1" ]]; then 161 | VERBOSE_FLAG=" -v " 162 | fi 163 | 164 | IRF_RUNMODE="Long" $(dirname "$(readlink -nf "$BASH_SOURCE")")/IRFinderBAM $VERBOSE_FLAG -l -r $REF -t $THREADS -j $JITTER -d $OUTPUTDIR $OUTPUTDIR/Unsorted.bam || exit 1 165 | 166 | 167 | if [ $DOSORT -eq 1 ]; then 168 | logger "---" 169 | logger "[ " $(date) " ] Sorting the bam file" 170 | echo "---- samtools sort -@ $THREADS -m ${SORTMEM}M -o $OUTPUTDIR/Sorted.bam $OUTPUTDIR/Unsorted.bam ---" >> $OUTPUTDIR/logs/samtools.log && \ 171 | samtools sort -@ $THREADS -m ${SORTMEM}M -o $OUTPUTDIR/Sorted.bam $OUTPUTDIR/Unsorted.bam &>> $OUTPUTDIR/logs/samtools.log && \ 172 | logger "---" 173 | logger "[ " $(date) " ] Indexing the sorted bam file" 174 | echo "---- samtools index -@ $THREADS $OUTPUTDIR/Sorted.bam ---" >> $OUTPUTDIR/logs/samtools.log && \ 175 | samtools index -@ $THREADS 
$OUTPUTDIR/Sorted.bam &>> $OUTPUTDIR/logs/samtools.log && \ 176 | rm $OUTPUTDIR/Unsorted.bam 177 | fi 178 | logger "---" 179 | logger "[ " $(date) " ] IRFinder Long completed." 180 | logger "---" 181 | 182 | 183 | -------------------------------------------------------------------------------- /bin/TrimBAM4IGV: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export LC_ALL=C 4 | export LANG=C 5 | 6 | set -e 7 | 8 | function usage() { 9 | echo "Usage: TrimBAM4IGV -o OUTPUTDIR INPUT.bam" >&2 10 | echo "Output: OUTPUTDIR/INPUT.trimmed.bam; OUTPUTDIR/INPUT.trimmed.bam.bai" >&2 11 | echo "" >&2 12 | echo " -o OUTPUTDIR : required. Directory to save trimmed BAM." >&2 13 | echo " -r region : optional. A string to guide Samtools extracting reads in the corresponding region." >&2 14 | echo " -t NUM_THREADS : optional. Number of threads to use. Default: the number of physical CPUs." >&2 15 | echo " -h Show this usage information." >&2 16 | exit 1 17 | } 18 | 19 | 20 | # === Defaults === 21 | THREADS=0 22 | 23 | if [ $# -eq 0 ]; then 24 | usage 25 | fi 26 | 27 | while getopts ":o:r:t:h" opt; do 28 | case $opt in 29 | o) 30 | # Reference directory. 31 | OUTPUTDIR=$OPTARG 32 | ;; 33 | t) 34 | ## Number of threads to use. (must be a positive integer. Zero ok, means auto-detect.) 35 | if [[ ! $OPTARG =~ ^[0-9]+$ ]]; then 36 | echo "Argument error: -t $OPTARG, number of threads must be an integer." >&2 37 | exit 1 38 | fi 39 | THREADS=$OPTARG 40 | ;; 41 | r) 42 | REGIONS=$OPTARG 43 | ;; 44 | h) 45 | usage 46 | ;; 47 | \?) 48 | echo "Invalid option: -$OPTARG" >&2 49 | exit 1 50 | ;; 51 | :) 52 | echo "Option -$OPTARG requires an argument." >&2 53 | exit 1 54 | ;; 55 | esac 56 | done 57 | shift $(($OPTIND - 1)) 58 | 59 | #echo $@ #The remaining arguments. 60 | #echo $# #The number of remaining arguments. 61 | 62 | STVERSTR=`samtools --version` 63 | STVER=$(echo $STVERSTR|cut -d" " -f2) 64 | STVERMAIN=$(echo $STVER|cut -d"." 
-f1) 65 | STVERMINOR=$(echo $STVER|cut -d"." -f2) 66 | if [[ ! "$STVERMAIN" -ge 1 ]]; then 67 | echo "Error: Samtools $STVER: version too old (>=1.4 required)." >&2 68 | exit 1 69 | elif [[ ! "$STVERMINOR" -ge 4 ]]; then 70 | echo "Error: Samtools $STVER: version too old (>=1.4 required)." >&2 71 | exit 1 72 | fi 73 | 74 | if [ ! "$OUTPUTDIR" ]; then 75 | echo "Argument error: -o is required." >&2 76 | usage 77 | fi 78 | 79 | if [ ! -d "$OUTPUTDIR" ]; then 80 | mkdir "$OUTPUTDIR" 81 | fi 82 | 83 | # Auto detect CPUs. 84 | if [[ $THREADS == 0 ]]; then 85 | THREADS=`awk 'BEGIN {FS=":"} ($0 ~ /^physical id/ ) { printf $2 " --"} ($0 ~ /^core id/) {print $2}' < /proc/cpuinfo | sort -u | wc -l` 86 | if [ ! -n $THREADS -o $THREADS -eq 0 ]; then 87 | # If physical CPU detection doesn't work for some reason, detect virtual CPUs (includes hyperthreading instances). 88 | THREADS=`grep -c ^processor /proc/cpuinfo` 89 | fi 90 | fi 91 | 92 | SAMPLE=$(echo $1|awk 'BEGIN{FS=".bam"}{print $1}') 93 | 94 | if [ -f "$1"".bai" ]; then 95 | if [ ! "$REGIONS" ]; then 96 | samtools view -h "$1"|awk 'BEGIN{FS=OFS="\t"}(substr($0,1,1)=="@"){print $0}(substr($0,1,1)!="@"){print $1,$2,$3,$4,$5,$6,$7,$8,$9,"*","*"}' > "$OUTPUTDIR/tmp_sorted.trimmed.sam" 97 | else 98 | samtools view -h "$1" $REGIONS|awk 'BEGIN{FS=OFS="\t"}(substr($0,1,1)=="@"){print $0}(substr($0,1,1)!="@"){print $1,$2,$3,$4,$5,$6,$7,$8,$9,"*","*"}' > "$OUTPUTDIR/tmp_sorted.trimmed.sam" 99 | fi 100 | else 101 | echo "Warning: BAM index not found: the input BAM is treated as name-sorted and will be sorted by coordinate first. This might take a while." >&2 102 | echo " If the input BAM has already been sorted by coordinate, please index it and re-run this command." >&2 103 | samtools sort -@ "$THREADS" "$1" > "$OUTPUTDIR/tmp_sorted.bam" 104 | samtools index -@ "$THREADS" "$OUTPUTDIR/tmp_sorted.bam" 105 | if [ ! 
"$REGIONS" ]; then 106 | samtools view -h "$OUTPUTDIR/tmp_sorted.bam"|awk 'BEGIN{FS=OFS="\t"}(substr($0,1,1)=="@"){print $0}(substr($0,1,1)!="@"){print $1,$2,$3,$4,$5,$6,$7,$8,$9,"*","*"}' > "$OUTPUTDIR/tmp_sorted.trimmed.sam" 107 | else 108 | samtools view -h "$OUTPUTDIR/tmp_sorted.bam" $REGIONS|awk 'BEGIN{FS=OFS="\t"}(substr($0,1,1)=="@"){print $0}(substr($0,1,1)!="@"){print $1,$2,$3,$4,$5,$6,$7,$8,$9,"*","*"}' > "$OUTPUTDIR/tmp_sorted.trimmed.sam" 109 | fi 110 | fi 111 | 112 | samtools view -S -b "$OUTPUTDIR/tmp_sorted.trimmed.sam" > "$OUTPUTDIR/""$SAMPLE"".trimmed.bam" 113 | samtools index -@ "$THREADS" "$OUTPUTDIR/""$SAMPLE"".trimmed.bam" 114 | 115 | 116 | rm "$OUTPUTDIR"/tmp_sorted* -------------------------------------------------------------------------------- /bin/analysisWithLowReplicates.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use Data::Dumper; 4 | use List::Util qw(max min); 5 | use FindBin qw($RealBin); 6 | 7 | my $winflatExec = "winflat"; 8 | if ( -x "$RealBin/util/winflat" ) { 9 | $winflatExec = "$RealBin/util/winflat"; 10 | }else{ 11 | #system('which','winflat', '>/dev/null'); 12 | system('which winflat >/dev/null'); 13 | if ($? != 0) { 14 | print STDERR "FATAL: winflat utiltiy not found.\nSearched at:\n $RealBin/util/winflat\n and on the PATH\n"; 15 | exit 1; 16 | } 17 | } 18 | 19 | 20 | sub arrayEqual { 21 | my ($xref, $yref, $maxCompare) = @_; 22 | return unless @$xref == @$yref; 23 | 24 | for (my $i = 0; $i < $maxCompare && $i < scalar @$xref; $i++) { 25 | return unless $xref->[$i] eq $yref->[$i]; 26 | } 27 | return 1; 28 | } 29 | 30 | sub separatedAB { 31 | my ($arrayref, $aCount, $bCount) = @_; 32 | ## An array with aCount elements followed by bCount elements. 33 | ## All of the A elements need to be < or > all of the B elements. 34 | 35 | if ($arrayref->[0] == $arrayref->[$aCount]) { 36 | return 0; #neither > or <. 
37 | }elsif ($arrayref->[0] > $arrayref->[$aCount]) { 38 | for (my $a = 0; $a < $aCount; $a++) { 39 | for (my $b = $aCount; $b < $aCount+$bCount; $b++) { 40 | return 0 if (!($arrayref->[$a] > $arrayref->[$b])); 41 | } 42 | } 43 | }else{ 44 | for (my $a = 0; $a < $aCount; $a++) { 45 | for (my $b = $aCount; $b < $aCount+$bCount; $b++) { 46 | return 0 if (!($arrayref->[$a] < $arrayref->[$b])); 47 | } 48 | } 49 | } 50 | return 1; 51 | } 52 | 53 | 54 | 55 | my $current = ""; 56 | #Filehandles. 57 | my $poolA; 58 | my $poolAname; 59 | my $poolB; 60 | my $poolBname; 61 | my @reps; 62 | my @repsFileNames; 63 | my $repsA = 0; 64 | my $repsB = 0; 65 | my @output; 66 | 67 | while (scalar @ARGV) { 68 | my $param = shift @ARGV; 69 | if ($param =~ m/^\-/) { 70 | if ($param eq '-A') { 71 | $current = 'A'; 72 | }elsif ($param eq '-B') { 73 | $current = 'B'; 74 | }else{ 75 | print STDERR "Invalid parameter: $param\n"; 76 | exit 1; 77 | } 78 | }else{ 79 | if ($current eq "") { 80 | print STDERR "Invalid parameters. eg: -A pooledA/IR-nondir.txt repA1/IR-nondir.txt repA2/IR-nondir.txt -B pooledB/IR-nondir.txt repB1/IR-nondir.txt repB2/IR-nondir.txt\n"; 81 | exit 1; 82 | }elsif ($current eq "A") { 83 | if ($poolA) { 84 | #open $repsA[scalar @repsA], '<', $param or die "Can't open file $param for reading"; 85 | ## Insert an element into the array @reps, after the last A element. 
86 | splice(@repsFileNames, $repsA, 0, $param); 87 | splice(@reps, $repsA, 0, undef); 88 | open $reps[$repsA], '<', $param or die "Can't open file $param for reading"; 89 | 90 | $repsA++; 91 | }else{ 92 | open $poolA, '<', $param or die "Can't open file $param for reading"; 93 | $poolAname = $param; 94 | } 95 | }elsif ($current eq "B") { 96 | if ($poolB) { 97 | ## Add an element to the very end of the array @reps (ie: after all the A elements, and all the B elements) 98 | @repsFileNames[scalar @repsFileNames]=$param; 99 | open $reps[scalar @reps], '<', $param or die "Can't open file $param for reading"; 100 | $repsB++ 101 | }else{ 102 | open $poolB, '<', $param or die "Can't open file $param for reading"; 103 | $poolBname = $param; 104 | } 105 | }else{ 106 | print STDERR "error in code\n"; 107 | exit 2; 108 | } 109 | } 110 | } 111 | 112 | ( $repsA >= 2 ) or die "For condition A, must provide a pooled data file and at least 2 replicate files."; 113 | ( $repsB >= 2 ) or die "For condition B, must provide a pooled data file and at least 2 replicate files."; 114 | 115 | #print Dumper(\@repsFileNames); 116 | #print Dumper(\@reps); 117 | 118 | my @repsHeader; 119 | my $counter = 1; 120 | foreach(@repsFileNames[0 .. $repsA-1]) { 121 | $_ = '#Condition A replicate ' . $counter . ': ' . $_; 122 | push @repsHeader, "A$counter-IRratio"; 123 | $counter++; 124 | } 125 | $counter = 1; 126 | foreach(@repsFileNames[$repsA .. scalar @repsFileNames - 1]) { 127 | $_ = '#Condition B replicate ' . $counter . ': ' . $_; 128 | push @repsHeader, "B$counter-IRratio"; 129 | $counter++; 130 | } 131 | 132 | print "#Condition A combined: $poolAname\n"; 133 | print join("\n",@repsFileNames[0 .. $repsA-1]), "\n"; 134 | print "#Condition B combined: $poolBname\n"; 135 | print join("\n",@repsFileNames[$repsA .. 
scalar @repsFileNames - 1]), "\n"; 136 | 137 | print join("\t", 138 | "Chr", "Start", "End", "Intron-GeneName/GeneID","-","Direction","ExcludedBases", 139 | "p-diff","p-increased","p-decreased", 140 | "A-IRratio","A-IRok","A-IntronCover","A-IntronDepth","A-SplicesMax","A-SplicesExact", 141 | "B-IRratio","B-IRok","B-IntronCover","B-IntronDepth","B-SplicesMax","B-SplicesExact", 142 | "replicates", @repsHeader 143 | ),"\n"; 144 | 145 | 146 | my $lineNumber = 0; 147 | while(<$poolA>) { 148 | my $pA = $_; 149 | chomp $pA; 150 | my $pB = <$poolB>; 151 | chomp $pB; 152 | $lineNumber++; 153 | 154 | my @pA = split /\t/, $pA; 155 | my @pB = split /\t/, $pB; 156 | 157 | if (!( arrayEqual( \@pA, \@pB, 7) )) { 158 | print STDERR "FATAL: Files do not list records in the same order with identical number of lines.\n"; 159 | print join("\t", @pA[0 .. 6]), "\n"; 160 | print join("\t", @pB[0 .. 6]), "\n"; 161 | 162 | exit 1; 163 | } 164 | 165 | ## Loop through replicates, fill an array. (check the ~~ 0..6) 166 | my @repsIR; 167 | foreach(@reps) { 168 | my @fields = split /\t/, <$_>; 169 | if (!( arrayEqual( \@pA, \@fields, 7) )) { 170 | print STDERR "FATAL: Files do not list records in the same order with identical number of lines.\n"; 171 | print join("\t", @fields[0 .. 6]), "\n"; 172 | print join("\t", @pA[0 .. 6]), "\n"; 173 | exit 1; 174 | } 175 | push @repsIR, @fields[19]; 176 | } 177 | 178 | ## Do the maths, are the replicates OK? 179 | my $ok = ($pA[20] eq "-" || $pB[20] eq "-") && ($pA[8] >= 1 || $pB[8] >= 1) && ($pA[19] >= 0.01 || $pB[19] >= 0.01) && separatedAB(\@repsIR, $repsA, $repsB); 180 | 181 | my $pValUp = 99; 182 | my $pValDown = 99; 183 | 184 | if ($ok) { 185 | ## Check if both are sufficiently expressed (either the intron, or the splices) 186 | if (( $pA[8] >= 1 || max($pA[16],$pA[17]) >= 10 ) && ( $pB[8] >= 1 || max($pB[16],$pB[17]) >= 10 )) { 187 | ## calculate the winflat p-value of the difference (from the pooled IRdepth & junctionDepth). 
188 | #print $lineNumber, "\n"; 189 | open my $winflat, '-|', $winflatExec, '-xvalue', int($pA[8]+0.5), '-yvalue', int($pB[8]+0.5), '-diff', max($pA[16],$pA[17])+int($pA[8]+0.5), max($pB[16],$pB[17])+int($pB[8]+0.5) or die "Can't run $winflatExec: $!"; 190 | my @winflat = <$winflat>; 191 | close $winflat; 192 | foreach (@winflat) {chomp; s/^.*\).*= *//; s/\W*$//}; 193 | $pValDown = $winflat[0]; 194 | $pValUp = $winflat[1]; 195 | }else{ 196 | ## Properly expressed in only one of the samples. Flag as interesting, but not differential IR. 197 | $pValUp = 33; 198 | $pValDown = 33; 199 | } 200 | } 201 | my $pValDiff = min($pValUp, $pValDown); 202 | ## SplicesMax is the larger of fields 16 and 17, the same pair passed to winflat above. 203 | if ($ok) { 204 | push @output, [@pA[0 .. 6], 205 | $pValDiff, $pValUp, $pValDown, 206 | $pA[19], $pA[20], $pA[7], $pA[8], max($pA[16],$pA[17]), $pA[18], 207 | $pB[19], $pB[20], $pB[7], $pB[8], max($pB[16],$pB[17]), $pB[18], 208 | "reps", @repsIR]; 209 | } 210 | 211 | ## Max SJ - 17/18 212 | ## Exact SJ - 29 213 | ## IRRatio = 20 214 | ## ok 21 215 | ## coverage 8 216 | ## trimmedMean 9 217 | ## 218 | } 219 | 220 | foreach ( sort { $a->[7] <=> $b->[7] } @output ) { 221 | print join("\t", @$_), "\n"; 222 | } 223 | -------------------------------------------------------------------------------- /bin/analysisWithNoReplicates.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use Data::Dumper; 4 | use List::Util qw(max min); 5 | use FindBin qw($RealBin); 6 | use sort 'stable'; 7 | 8 | 9 | my $winflatExec = "winflat"; 10 | if ( -x "$RealBin/util/winflat" ) { 11 | $winflatExec = "$RealBin/util/winflat"; 12 | }else{ 13 | #system('which','winflat', '>/dev/null'); 14 | system('which winflat >/dev/null'); 15 | if ($? 
!= 0) { 16 | print STDERR "FATAL: winflat utiltiy not found.\nSearched at:\n $RealBin/util/winflat\n and on the PATH\n"; 17 | exit 1; 18 | } 19 | } 20 | 21 | 22 | sub arrayEqual { 23 | my ($xref, $yref, $maxCompare) = @_; 24 | return unless @$xref == @$yref; 25 | 26 | for (my $i = 0; $i < $maxCompare && $i < scalar @$xref; $i++) { 27 | return unless $xref->[$i] eq $yref->[$i]; 28 | } 29 | return 1; 30 | } 31 | 32 | sub separatedAB { 33 | my ($arrayref, $aCount, $bCount) = @_; 34 | ## An array with aCount elements followed by bCount elements. 35 | ## All of the A elements need to be < or > all of the B elements. 36 | 37 | if ($arrayref->[0] == $arrayref->[$aCount]) { 38 | return 0; #neither > or <. 39 | }elsif ($arrayref->[0] > $arrayref->[$aCount]) { 40 | for (my $a = 0; $a < $aCount; $a++) { 41 | for (my $b = $aCount; $b < $aCount+$bCount; $b++) { 42 | return 0 if (!($arrayref->[$a] > $arrayref->[$b])); 43 | } 44 | } 45 | }else{ 46 | for (my $a = 0; $a < $aCount; $a++) { 47 | for (my $b = $aCount; $b < $aCount+$bCount; $b++) { 48 | return 0 if (!($arrayref->[$a] < $arrayref->[$b])); 49 | } 50 | } 51 | } 52 | return 1; 53 | } 54 | 55 | 56 | 57 | my $current = ""; 58 | #Filehandles. 59 | my $poolA; 60 | my $poolAname; 61 | my $poolB; 62 | my $poolBname; 63 | my @reps; 64 | my @repsFileNames; 65 | my $repsA = 0; 66 | my $repsB = 0; 67 | my @output; 68 | 69 | while (scalar @ARGV) { 70 | my $param = shift @ARGV; 71 | if ($param =~ m/^\-/) { 72 | if ($param eq '-A') { 73 | $current = 'A'; 74 | }elsif ($param eq '-B') { 75 | $current = 'B'; 76 | }else{ 77 | print STDERR "Invalid parameter: $param\n"; 78 | exit 1; 79 | } 80 | }else{ 81 | if ($current eq "") { 82 | print STDERR "Invalid parameters. 
eg: -A pooledA/IR-nondir.txt repA1/IR-nondir.txt repA2/IR-nondir.txt -B pooledB/IR-nondir.txt repB1/IR-nondir.txt repB2/IR-nondir.txt\n"; 83 | exit 1; 84 | }elsif ($current eq "A") { 85 | if ($poolA) { 86 | #open $repsA[scalar @repsA], '<', $param or die "Can't open file $param for reading"; 87 | ## Insert an element into the array @reps, after the last A element. 88 | splice(@repsFileNames, $repsA, 0, $param); 89 | splice(@reps, $repsA, 0, undef); 90 | open $reps[$repsA], '<', $param or die "Can't open file $param for reading"; 91 | 92 | $repsA++; 93 | }else{ 94 | open $poolA, '<', $param or die "Can't open file $param for reading"; 95 | $poolAname = $param; 96 | } 97 | }elsif ($current eq "B") { 98 | if ($poolB) { 99 | ## Add an element to the very end of the array @reps (ie: after all the A elements, and all the B elements) 100 | @repsFileNames[scalar @repsFileNames]=$param; 101 | open $reps[scalar @reps], '<', $param or die "Can't open file $param for reading"; 102 | $repsB++ 103 | }else{ 104 | open $poolB, '<', $param or die "Can't open file $param for reading"; 105 | $poolBname = $param; 106 | } 107 | }else{ 108 | print STDERR "error in code\n"; 109 | exit 2; 110 | } 111 | } 112 | } 113 | 114 | ( $poolA ) or die "For condition A, must provide a file."; 115 | ( $poolB ) or die "For condition B, must provide a file."; 116 | ( $repsA == 0 ) or die "For condition A, must provide a single file only."; 117 | ( $repsB == 0 ) or die "For condition B, must provide a single file only."; 118 | 119 | 120 | #print Dumper(\@repsFileNames); 121 | #print Dumper(\@reps); 122 | 123 | print "#Condition A: $poolAname\n"; 124 | print "#Condition B: $poolBname\n"; 125 | 126 | 127 | print join("\t", 128 | "Chr", "Start", "End", "Intron-GeneName/GeneID","-","Direction","ExcludedBases", 129 | "p-diff","p-increased","p-decreased", 130 | "A-IRratio","A-IRok","A-IntronCover","A-IntronDepth","A-SplicesMax","A-SplicesExact", 131 | 
"B-IRratio","B-IRok","B-IntronCover","B-IntronDepth","B-SplicesMax","B-SplicesExact", 132 | ),"\n"; 133 | 134 | 135 | my $lineNumber = 0; 136 | while(<$poolA>) { 137 | my $pA = $_; 138 | chomp $pA; 139 | my $pB = <$poolB>; 140 | chomp $pB; 141 | $lineNumber++; 142 | 143 | my @pA = split /\t/, $pA; 144 | my @pB = split /\t/, $pB; 145 | 146 | if (!( arrayEqual( \@pA, \@pB, 7) )) { 147 | print STDERR "FATAL: Files do not list records in the same order with identical number of lines.\n"; 148 | print join("\t", @pA[0 .. 6]), "\n"; 149 | print join("\t", @pB[0 .. 6]), "\n"; 150 | 151 | exit 1; 152 | } 153 | 154 | # ## Loop through replicates, fill an array. (check the ~~ 0..6) 155 | # my @repsIR; 156 | # foreach(@reps) { 157 | # my @fields = split /\t/, <$_>; 158 | # if (!( arrayEqual( \@pA, \@fields, 7) )) { 159 | # print STDERR "FATAL: Files do not list records in the same order with identical number of lines.\n"; 160 | # print join("\t", @fields[0 .. 6]), "\n"; 161 | # print join("\t", @pA[0 .. 6]), "\n"; 162 | # exit 1; 163 | # } 164 | # push @repsIR, @fields[19]; 165 | # } 166 | 167 | ## Do the maths, are the replicates OK? 168 | # my $ok = ($pA[20] eq "ok" || $pB[20] eq "ok") && ($pA[8] >= 1 || $pB[8] >= 1) && ($pA[19] >= 0.01 || $pB[19] >= 0.01) && separatedAB(\@repsIR, $repsA, $repsB); 169 | 170 | # No replicates. Still do a form of check -- is this intron interesting? 171 | my $ok = ($pA[20] eq "-" || $pB[20] eq "-") && ($pA[8] >= 1 || $pB[8] >= 1) && (max($pA[16],$pA[17]) >= 10 || max($pB[16],$pB[17]) >= 10) && ($pA[19] >= 0.01 || $pB[19] >= 0.01); 172 | 173 | 174 | my $pValUp = 99; 175 | my $pValDown = 99; 176 | 177 | if ($ok) { 178 | ## Check if both are sufficiently expressed (either the intron, or the splices) 179 | if (( $pA[8] >= 1 || max($pA[16],$pA[17]) >= 10 ) && ( $pB[8] >= 1 || max($pB[16],$pB[17]) >= 10 )) { 180 | ## calculate the winflat p-value of the difference (from the pooled IRdepth & junctionDepth). 
181 | #print $lineNumber, "\n"; 182 | open my $winflat, '-|', $winflatExec, '-xvalue', int($pA[8]+0.5), '-yvalue', int($pB[8]+0.5), '-diff', max($pA[16],$pA[17])+int($pA[8]+0.5), max($pB[16],$pB[17])+int($pB[8]+0.5) or die "Can't run $winflatExec: $!"; 183 | my @winflat = <$winflat>; 184 | close $winflat; 185 | foreach (@winflat) {chomp; s/^.*\).*= *//; s/\W*$//}; 186 | $pValDown = $winflat[0]; 187 | $pValUp = $winflat[1]; 188 | }else{ 189 | ## Properly expressed in only one of the samples. Flag as interesting, but not differential IR. 190 | $pValUp = 33; 191 | $pValDown = 33; 192 | } 193 | } 194 | my $pValDiff = min($pValUp, $pValDown); 195 | ## SplicesMax is the larger of fields 16 and 17, the same pair passed to winflat above. 196 | if ($ok) { 197 | # [ ] pushes an array ref onto the array. 198 | push @output, [@pA[0 .. 6], 199 | $pValDiff, $pValUp, $pValDown, 200 | $pA[19], $pA[20], $pA[7], $pA[8], max($pA[16],$pA[17]), $pA[18], 201 | $pB[19], $pB[20], $pB[7], $pB[8], max($pB[16],$pB[17]), $pB[18] 202 | ]; 203 | } 204 | 205 | ## Max SJ - 17/18 206 | ## Exact SJ - 29 207 | ## IRRatio = 20 208 | ## ok 21 209 | ## coverage 8 210 | ## trimmedMean 9 211 | ## 212 | } 213 | 214 | foreach ( sort { $a->[7] <=> $b->[7] } @output ) { 215 | print join("\t", @$_), "\n"; 216 | } 217 | #print Dumper (\@output); 218 | -------------------------------------------------------------------------------- /bin/util/IntronExclusion.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | my %genes; 5 | 6 | sub intronNumber 7 | { 8 | my ($gene) = shift; 9 | $genes{$gene} ++; 10 | return $genes{$gene}; 11 | } 12 | 13 | sub processIntron { 14 | my ($intron, $overlaps) = @_; 15 | my ($chr, $start, $end, $gene, $score, $dir) = split /\t/, $intron; 16 | 17 | my $len = $end-$start; 18 | my $excl = 0; 19 | my $antisense_dirty = 0; 20 | my $excluded_by_exon = 0; 21 | 22 | my @intron_seg=(); 23 | push @intron_seg, {'start' => $start, 'end' => $end}; 24 | 25 | foreach my $overlap (@$overlaps) { 26 | # $overlap->{start, end, type} 27 | if 
($overlap->{'type'} eq 'A') { 28 | $antisense_dirty = 2 if ($antisense_dirty < 2); 29 | #ignore anti-sense, but mark dirty 30 | }elsif ($overlap->{'type'} eq 'AE') { 31 | $antisense_dirty = 1 if ($antisense_dirty < 1); 32 | #ignore anti-sense, but mark dirty 33 | }elsif ($overlap->{'type'} eq 'E' && $overlap->{'start'} < $start && $overlap->{'end'} > $end) { 34 | #print STDERR "Found an exon/feature entirely covering this intron, skpping\n"; 35 | $excluded_by_exon = 1; 36 | }else{ 37 | # We want to exclude this segment from our intron. 38 | 39 | foreach my $seg (@intron_seg) { 40 | if ($seg->{'end'}==0) { 41 | # do nothing, this segment has already been deleted. 42 | }elsif ($overlap->{'end'} <= $seg->{'start'}) { 43 | # end is before the start, skip it 44 | }elsif ($overlap->{'start'} >= $seg->{'end'}) { 45 | # start is after the end, skip it 46 | }elsif ($overlap->{'start'} <= $seg->{'start'} && $overlap->{'end'} >= $seg->{'end'}) { 47 | # exclude entirely covers a segment (equality or beyond) then remove it 48 | $seg->{'start'}=0; 49 | $seg->{'end'}=0; 50 | }elsif ($overlap->{'start'} <= $seg->{'start'}) { 51 | # start is before the start, trim the start 52 | $seg->{'start'} = $overlap->{'end'}; 53 | }elsif ($overlap->{'end'} >= $seg->{'end'}) { 54 | # end is after the end, trim the end 55 | $seg->{'end'} = $overlap->{'start'}; 56 | }else{ 57 | # start inside, end inside - split it 58 | push @intron_seg, {'start'=>$overlap->{'end'},'end'=>$seg->{'end'}}; 59 | $seg->{'end'}=$overlap->{'start'}; 60 | } 61 | } 62 | } 63 | } 64 | # Procesed all overlaps. 65 | # result fragments, in no specific order, in @intron_seg. 
66 | #print $intron, "\n"; 67 | my $newlen = 0; 68 | my $newstart; 69 | my $newend = 0; 70 | my @sizes; 71 | my @starts; 72 | foreach my $seg (sort {$a->{'start'} <=> $b->{'start'}} @intron_seg) { 73 | if ($seg->{'end'} != 0) { 74 | $newstart = $seg->{'start'} if (!$newstart); 75 | $newend = $seg->{'end'} if ($seg->{'end'} > $newend); 76 | push @starts, $seg->{'start'} - $newstart; 77 | push @sizes, $seg->{'end'}-$seg->{'start'}; 78 | } 79 | #print join("\t", "", $seg->{'start'}, $seg->{'end'}), "\n"; 80 | $newlen += $seg->{'end'}-$seg->{'start'}; 81 | } 82 | if ($newlen > 40 && ($newlen/$len) >= 0.7) { 83 | my $antisense_text = 'clean'; 84 | if ($excluded_by_exon >= 1) { 85 | $antisense_text = 'known-exon'; 86 | $antisense_text .= '+anti-near' if ($antisense_dirty >= 1); 87 | $antisense_text .= '+anti-over' if ($antisense_dirty >= 2); 88 | }else{ 89 | $antisense_text = 'anti-near' if ($antisense_dirty >= 1); 90 | $antisense_text = 'anti-over' if ($antisense_dirty >= 2); 91 | } 92 | print join("\t", $chr, $newstart, $newend, join("/",$gene,intronNumber($gene),$start,$end,$len,$len-$newlen,$antisense_text), $score, $dir, $newstart, $newend, "255,0,0", scalar @sizes, join(",",@sizes), join(",",@starts)), "\n"; 93 | 94 | if ($len >= 110) { 95 | print OF50 join("\t", $chr, $start+5, $start+55, "S", 0, $dir, $start+5, $start+55, "255,0,0", 1, 50, 0), "\n"; 96 | print OF50 join("\t", $chr, $end-55, $end-5, "S", 0, $dir, $end-55, $end-5, "255,0,0", 1, 50, 0), "\n"; 97 | } 98 | # if ($len >= 210) { 99 | # print OF50 join("\t", $chr, $start+55, $start+105, "E", 0, $dir, $start+55, $start+105, "255,0,0", 1, 50, 0), "\n"; 100 | # print OF50 join("\t", $chr, $end-105, $end-55, "E", 0, $dir, $end-105, $end-55, "255,0,0", 1, 50, 0), "\n"; 101 | # } 102 | print OF1 join("\t", $chr, $start, $dir), "\n"; 103 | print OF1 join("\t", $chr, $end, $dir), "\n"; 104 | } 105 | } 106 | 107 | 108 | 109 | #### MAIN #### 110 | 111 | if (! 
(scalar @ARGV == 2) ) { 112 | print STDERR "Usage: cat inputBedIntersection | ./thisTool.pl out2 out3 > out1\n"; 113 | exit(1); 114 | } 115 | 116 | open OF50, '>', $ARGV[0]; 117 | open OF1, '>', $ARGV[1]; 118 | 119 | my $lastintron = ''; 120 | my @overlaps; 121 | while() { 122 | chomp; 123 | 124 | ## Directional 125 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 134895 135807 E 0 - 5 126 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 135135 135900 E 0 - 98 127 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 135230 136040 X 0 - 238 128 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 136070 136410 X 0 - 340 129 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 136440 136710 X 0 - 270 130 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 136750 137100 X 0 - 350 131 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 137140 137790 X 0 - 480 132 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 137615 139384 E 0 - 5 133 | #1 736543 741178 RP11-206L10.8/ENSG00000230092/- 0 - 1 736253 736548 E 0 - 5 134 | #1 736543 741178 RP11-206L10.8/ENSG00000230092/- 0 - 1 736550 736680 X 0 - 130 135 | #1 736543 741178 RP11-206L10.8/ENSG00000230092/- 0 - 1 736710 736840 X 0 - 130 136 | 137 | ## Non-Directional 138 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 134895 135807 E 5 139 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 135135 135900 E 98 140 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 135230 136040 X 238 141 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 136070 136410 X 340 142 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 136440 136710 X 270 143 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 136750 137100 X 350 144 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 137140 137790 X 480 145 | #1 135802 137620 AL627309.1/ENSG00000237683/- 0 - 1 137615 139384 E 5 146 | 147 | my ($intron, $overlapstart, $overlapend, $overlaptype) = $_ =~ 
/^([^\t]+\t[^\t]+\t[^\t]+\t[^\t]+\t[^\t]+\t[^\t]+)\t[^\t]+\t([^\t]+)\t([^\t]+)\t([^\t]+)/; 148 | push @overlaps, {'start'=>$overlapstart, 'end'=>$overlapend, 'type'=>$overlaptype}; 149 | 150 | if ($lastintron ne $intron) { 151 | if ($lastintron ne '') { 152 | processIntron($lastintron, \@overlaps); 153 | undef @overlaps; 154 | } 155 | } 156 | push @overlaps, {'start'=>$overlapstart, 'end'=>$overlapend, 'type'=>$overlaptype}; 157 | 158 | $lastintron = $intron; 159 | 160 | } 161 | processIntron($lastintron, \@overlaps); 162 | -------------------------------------------------------------------------------- /bin/util/Mapability: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ulimit -Su 4000 4 | export LANG=C 5 | export LC_ALL=C 6 | 7 | set -e 8 | 9 | STAREXEC=$1 10 | STARGENOME=$2 11 | FA=$3 12 | LIBEXEC=$4 13 | THREADS=$5 14 | READ_LENGTH=$6 15 | 16 | TMPBED=tmp_$$ 17 | 18 | mkdir "$TMPBED" 19 | 20 | TMPCMP=gzip 21 | TMPEXT=gz 22 | if [ -x /usr/bin/lzop ]; then 23 | TMPCMP=/usr/bin/lzop 24 | TMPEXT=lzo 25 | fi 26 | 27 | 28 | #echo "" 29 | echo "" 30 | date +"%b %d %T ... mapping genome fragments of length $READ_LENGTH back to genome..." 31 | 32 | if [ $THREADS -eq 1 ]; then 33 | STAR_THREADS=1 34 | else 35 | STAR_THREADS=$(( THREADS - 1 )) 36 | fi 37 | 38 | "$STAREXEC" \ 39 | --genomeDir "$STARGENOME" \ 40 | --genomeLoad NoSharedMemory \ 41 | --runThreadN $THREADS --outStd SAM --outSAMmode NoQS \ 42 | --outSAMattributes None \ 43 | --outFilterMultimapNmax 1 \ 44 | --readFilesIn <("$LIBEXEC/generateReadsError.pl" $READ_LENGTH 10 < "$FA") \ 45 | > genome_fragments.sam 46 | 47 | date +"%b %d %T ... sorting aligned genome fragments..." 48 | 49 | samtools sort -@ "$THREADS" genome_fragments.sam > genome_fragments.bam 50 | 51 | date +"%b %d %T ... indexing aligned genome fragments..." 52 | 53 | samtools index -@ "$THREADS" genome_fragments.bam 54 | 55 | date +"%b %d %T ... 
filtering aligned genome fragments by chromosome/scaffold..." 56 | 57 | ## prevent histexpand for the character '!' 58 | set +o histexpand 59 | 60 | cat "$STARGENOME/chrName.txt" | \ 61 | xargs --max-args 1 --max-procs "${THREADS}" -I{} bash -c "samtools view genome_fragments.bam {}|awk -v read_length=\"${READ_LENGTH}M\" -v tmpdir=\"${TMPBED}\" -v tmpcmp=\"${TMPCMP}\" -v tmpext=\"${TMPEXT}\" 'BEGIN{FS=\"[\\t!]\"; OFS=\"\\t\"}{if ((\$8 == read_length ) && (\$3 == \$6) && (\$2 == \$5)) {print \$5, \$6-1, \$6+69 | (tmpcmp \" -c1 > \" tmpdir \"/\" \$5 \".bed.\" tmpext ) }}END{close( (tmpcmp \" -c1 > \" tmpdir \"/\" \$5 \".bed.\" tmpext ))}'" 62 | 63 | date +"%b %d %T ... merging filtered genome fragments..." 64 | 65 | if [ "$TMPEXT" == "gz" ]; then 66 | find "$TMPBED" -type f -name "*.bed.""$TMPEXT"|xargs --max-args 1 zcat >> genome_fragments.unsorted.bed 67 | elif [ "$TMPEXT" == "lzo" ]; then 68 | find "$TMPBED" -type f -name "*.bed.""$TMPEXT"|xargs --max-args 1 lzop -cdf >> genome_fragments.unsorted.bed 69 | fi 70 | 71 | date +"%b %d %T ... calculating regions for exclusion..." 
72 | 73 | 74 | bedtools genomecov -i genome_fragments.unsorted.bed -bga -g "$STARGENOME/chrNameLength.txt" | \ 75 | awk 'BEGIN{FS=OFS="\t";chr="random"}($1!=chr){chr=$1}($1==chr){print}' | \ 76 | awk 'BEGIN {FS=OFS="\t"} ($4 < 5) {print $1, $2, $3}' | \ 77 | bedtools merge -i stdin | \ 78 | sort -S5G -k1,1 -k2,2n -k3,3n| \ 79 | gzip > MapabilityExclusion.bed.gz 80 | 81 | #ls "$TMPBED"/*.bed."$TMPEXT" | xargs --max-args 1 --max-procs "$THREADS" -I{} bash -c "\"$TMPCMP\" -cd < {} | bedtools genomecov -i stdin -bga -g \"$CHRLEN\"| awk 'NR==1{chr=\$1;print}\$1==chr{print}' | awk 'BEGIN {FS=\"\\t\"; OFS=\"\\t\"} (\$4 < 5) {print \$1, \$2, \$3}' | bedtools merge -i stdin > \"$TMPEXCL/\"{}.exclusion" 82 | 83 | #find "$TMPBED" -type f -name "*.bed.""$TMPEXT"|cut -d"/" -f3|xargs --max-args 1 --max-procs "$THREADS" -I{} bash -c "\"$TMPCMP\" -cd < \"$TMPBED\"/{} | bedtools genomecov -i stdin -bga -g \"$CHRLEN\"| awk 'NR==1{chr=\$1;print}\$1==chr{print}' | awk 'BEGIN {FS=OFS=\"\\t\"} (\$4 < 5) {print \$1, \$2, \$3}' | bedtools merge -i stdin > \"$TMPEXCL\"/{}.exclusion" 84 | 85 | #cat "$TMPEXCL"/*.exclusion | sort -S5G -k1,1 -k2,2n -k3,3n | gzip > MapabilityExclusion.bed.gz 86 | 87 | date +"%b %d %T ... cleaning temporary files..." 
#!/bin/bash
## Useful functions for IRFinder's utils

export IRFINDER_BASH_UTILS_IMPORTED=1
export VERSION=2.0.1
export LC_ALL=C
export LANG=C


## Prints the IRFinder version and terminates the calling script.
function versionAlert(){
    echo "IRFinder version: $VERSION"
    exit
}

## Exits with status 1 unless $1 is an existing regular file.
function checkFile() {
    if [ ! -f "${1}" ]; then
        echo "Error: file $1 doesn't exists" >&2
        exit 1
    fi
}

## Exits with status 1 unless `samtools --version` reports at least 1.4.
function checkSamtools() {
    STVERSTR=`samtools --version`
    # unquoted on purpose: collapses the multi-line output so that
    # field 2 is the version number
    STVER=$(echo $STVERSTR|cut -d" " -f2)
    STVERMAIN=$(echo "$STVER"|cut -d"." -f1)
    STVERMINOR=$(echo "$STVER"|cut -d"." -f2)
    # keep only the leading digits (drops suffixes such as "-dirty")
    STVERMAIN=${STVERMAIN%%[!0-9]*}
    STVERMINOR=${STVERMINOR%%[!0-9]*}
    # The minor number only matters for major version 1. Any later major
    # version (e.g. 2.0) is recent enough.
    if [[ "${STVERMAIN:-0}" -lt 1 ]] || [[ "${STVERMAIN:-0}" -eq 1 && "${STVERMINOR:-0}" -lt 4 ]]; then
        echo "Error: Samtools $STVER: version too old (>=1.4 required)." >&2
        exit 1
    fi
}

## Prints MemTotal from /proc/meminfo (kB) divided by 1000.
function getMem(){
    local MEMK=`awk '($1 ~ /^MemTotal:/) {print $2}' < /proc/meminfo`
    echo $(( ${MEMK:-0}/1000 ))
}


## Checks the available RAM and that the STAR executable answers --version.
## $1 (optional) overrides STAREXEC. The default executable is "STAR".
function checkStar(){
    MEMM=$(getMem)
    if [ "${MEMM}" -lt 32000 ]; then
        echo "System limitation: Minimum required RAM is 32GB. This software uses STAR for RNA mapping. RAM requirement is approximately 30GB for the human genome." >&2
        echo " RunModes: BAM and BuildRefDownload, may be completed on servers with more RAM." >&2
        exit 2
    fi
    if [[ "$1" != "" ]]; then
        STAREXEC="$1"
    fi
    if [[ "${STAREXEC}" == "" ]]; then
        STAREXEC="STAR"
    fi
    if ! "$STAREXEC" --version &>/dev/null; then
        echo "Error: STAR version is too old. --version parameter returns an error. Minimum version of 2.4.0 required." >&2
        exit 2
    fi
}


## Checks that minimap2 is available and at least version 2.0.
## $1 (optional) overrides MINIMAP_EXEC. The default executable is "minimap2".
function checkMinimap(){
    if [[ "$1" != "" ]]; then
        MINIMAP_EXEC="$1"
    fi
    if [[ "${MINIMAP_EXEC}" == "" ]]; then
        MINIMAP_EXEC="minimap2"
    fi
    if ! command -v "$MINIMAP_EXEC" > /dev/null 2> /dev/null ; then
        echo "minimap2 not found ( executable: $MINIMAP_EXEC ). To use the RunMode Long, install it: https://github.com/lh3/minimap2 " >&2
        exit 1
    fi
    MINIMAP_VERSION=$("$MINIMAP_EXEC" --version)
    # ">=" so that 2.0 itself, the documented minimum, is accepted
    if [[ $(echo ${MINIMAP_VERSION/-*/} | awk '{if ( $1 >= 2.0 ) {print "ok" } else { print "no" }}') != "ok" ]]; then
        echo "Error: Minimap version is too old. Minimum version of 2.0.0 required. ${MINIMAP_VERSION} detected" >&2
        exit 2
    fi
}


## Exits with status 1 unless suppa.py is on the PATH.
function checkSuppa(){
    if ! command -v suppa.py >/dev/null 2>/dev/null ; then
        echo "SUPPA2 not found ( executable: suppa.py ). To use the RunMode Diff, install it: https://github.com/comprna/SUPPA " >&2
        exit 1
    fi
}

## Exits with status 1 unless Rscript exists and DESeq2 is installed.
## Logs the detected DESeq2 version.
function checkDeseq(){
    if ! command -v Rscript > /dev/null 2>/dev/null; then
        echo "Rscript not found." >&2
        exit 1
    fi
    DESeqVersion=$(Rscript -e 'installed.packages()' | awk 'BEGIN {v=0} $1=="Version" {v=1; } v==1 && $1 == "DESeq2" { gsub("\"", ""); print $2;v=0 } ' )

    if [[ "${DESeqVersion}" == "" ]]; then
        # second attempt: take the last column instead of the second one
        DESeqVersion=$(Rscript -e 'installed.packages()' | awk 'BEGIN {v=0} $NF=="Version" {v=1; } v==1 && $1 == "DESeq2" { gsub("\"", ""); print $NF;v=0 } ' )
        if [[ "${DESeqVersion}" == "" ]]; then
            echo "DESeq2 not installed. " >&2
            exit 1
        fi
    fi
    logger "DESeq2 version $DESeqVersion"
}

## Sets THREADS when it is empty or 0.
## First choice is the processor count from /proc/cpuinfo, then the number of
## distinct (physical id, core id) pairs, then 1.
function setThreads(){
    if [[ "${THREADS}" == "" || $THREADS == 0 ]]; then
        THREADS=`grep -c ^processor /proc/cpuinfo`
        if [[ -z "$THREADS" || "$THREADS" -eq 0 ]]; then
            THREADS=`awk 'BEGIN {FS=":"} ($0 ~ /^physical id/ ) { printf $2 " --"} ($0 ~ /^core id/) {print $2}' < /proc/cpuinfo | sort -u | wc -l`
            if [[ -z "$THREADS" || "$THREADS" -eq 0 ]]; then
                THREADS=1
            fi
        fi
    fi
}


## Exits with status 1 unless $1 is a directory holding an IRFinder reference.
function checkRef(){
    if [ ! "$1" ]; then
        echo "Argument error: -r is required." >&2
        exit 1
    fi
    if [ ! -f "$1/IRFinder/ref-cover.bed" ]; then
        echo "Argument error: -r $1, Does not appear to be a valid IRFinder reference. Could not find $1/IRFinder/ref-cover.bed" >&2
        exit 1
    fi
}

## Creates $1/logs/. Refuses a directory that already holds IRFinder results.
function checkOutDir(){
    local OUTPUTDIR=$1
    if [ -d "$OUTPUTDIR" ]; then
        if [ -e "$OUTPUTDIR/IRFinder-IR-nondir.txt" ]; then
            echo "Argument error: -d $OUTPUTDIR, output directory contains files from a previous IRFinder run. Will not overwrite." >&2
            exit 1
        else
            mkdir -p "$OUTPUTDIR/logs/"
        fi
    else
        mkdir -p "$OUTPUTDIR/logs/"
        if [ ! -d "$OUTPUTDIR" ]; then
            echo "Argument error: Output directory $OUTPUTDIR does not exist, and could not be created." >&2
            exit 1
        fi
    fi
}

## Appends a message to the log file.
## The log file is $OUTPUTDIR/logs/irfinder.stdout when OUTPUTDIR is set,
## otherwise ./irfinder.stdout. `logger init` truncates the log and writes
## the banner. When VERBOSE is 1 the message is also echoed to stdout.
function logger() {
    LOGOUT="./irfinder.stdout"
    if [[ "$OUTPUTDIR" != "" ]]; then
        if [ ! -d "${OUTPUTDIR}/logs/" ]; then
            mkdir -p "${OUTPUTDIR}/logs/"
        fi
        LOGOUT="$OUTPUTDIR/logs/irfinder.stdout"
    fi
    if [[ "$1" == "init" ]] && [[ $# == 1 ]]; then
        > "$LOGOUT"
        LOG_MESSAGE="\n --------------------\n| IRFinder v. $VERSION | \n --------------------\n"
    else
        LOG_MESSAGE="${@}"
    fi
    if [[ "${VERBOSE}" == "1" ]] ; then
        echo -e "${LOG_MESSAGE}" | tee -ai "$LOGOUT"
    else
        echo -e "${LOG_MESSAGE}" >> "$LOGOUT"
    fi
}

## Logs the run header: version, date, run mode, user, working dir,
## reference and one line per input file given as argument.
function startMessage(){
    ## Check if the startMessage was called by the BAM mode after the FastQ or Long analysis
    if [[ "${IRF_RUNMODE}" == "" ]]; then
        logger "---"
        logger "IRFinder version: $VERSION "
        logger "IRFinder start: " "$(date)"
        logger "IRFinder runmode: $RUNMODE"
        logger "IRFinder user@host: $USER @ $HOSTNAME"
        logger "IRFinder working dir: " "$(pwd)"
        logger "IRFinder reference: $REF"
        n=1
        # quoted so that file names containing spaces stay one entry
        for f in "$@"; do
            logger "IRFinder file ${n}: $f"
            n=$((n+1))
        done
        logger "---"
        START_MESSAGE=1
    fi
}
#!/usr/bin/env Rscript
# Differential intron-retention analysis with DESeq2.
#
# Arguments (positional):
#   1  path to the groups table (header line; columns Files, SampleName, Condition)
#   2  IRratio threshold, passed to DESeqDataSetFromIRFinder
#   3  warning filter, passed to DESeqDataSetFromIRFinder
#   4  "1" to enable cooksCutoff in results()
#   5  "1" to enable independentFiltering in results()
#
# Outputs, written next to the groups table:
#   <A>_<B>_DESeq2.tsv   one per pair of conditions
#   <A>_<B>_plot.pdf     only when at least one intron has padj < 0.05
#   all_results_DESeq2.tsv
library(DESeq2)
library(ggplot2)
### Load DESeq2Constructor, located one directory above this script
initial.options <- commandArgs(trailingOnly = FALSE)
file.arg.name <- "--file="
script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)])
script.basename <- dirname(script.name)
other.name <- file.path(script.basename, "/../DESeq2Constructor.R")
source(other.name)

### Read args
args <- commandArgs(trailingOnly = T)
groups=read.table(args[1], stringsAsFactors = F, header = T)
out_folder=dirname(args[1])
IRratio_thr=as.numeric(args[2])
warning_filter=args[3]
cooks_cutoff=args[4]=="1"
if (cooks_cutoff ){
  print("cooks_cutoff enabled")
} else {
  print("cooks_cutoff disabled")
}

independentFiltering= args[5]=="1"
if (independentFiltering ){
  print("independentFiltering enabled")
} else {
  print("independentFiltering disabled")
}

paths = as.vector(groups$Files)
experiment = groups[,c("SampleName", "Condition")]

experiment$Condition=factor(experiment$Condition)
rownames(experiment)=NULL

metaList=DESeqDataSetFromIRFinder(filePaths=paths, designMatrix=experiment, designFormula=~1, irratio_thr=IRratio_thr, warning_filter=warning_filter )

dds = metaList$DESeq2Object
design(dds) = ~Condition + Condition:IRFinder
conditions=levels(experiment$Condition)
dds = DESeq(dds)
print(resultsNames(dds))

# Raw counts are needed for the per-sample ratios and for the plots.
# Compute them once.
nn_counts = counts(dds, normalized=F)

### Per-sample IR ratio = intronDepth / (intronDepth + maxSplice); 0/0 becomes 0
global_dat = data.frame( intron = rownames(nn_counts) )
for ( s_name in experiment$SampleName ) {
  intron_depth = nn_counts[, paste0("intronDepth.", s_name)]
  max_splice = nn_counts[, paste0("maxSplice.", s_name)]
  ir_ratio = intron_depth / (intron_depth + max_splice)
  ir_ratio[is.na(ir_ratio)] = 0
  global_dat[,paste0("IRratio.",s_name)] = ir_ratio
}

### Mean IR ratio per condition
for ( cond in conditions ){
  ratio_cols = paste0("IRratio.", experiment$SampleName[experiment$Condition == cond])
  # drop = FALSE keeps a data frame even when the condition has one sample.
  # Without it rowMeans() receives a plain vector and stops with an error.
  global_dat[,paste0(cond, ".Mean.IRratio")] = rowMeans(global_dat[, ratio_cols, drop = FALSE])
}

### One contrast per unordered pair of conditions.
# seq_len() gives an empty sequence when there is a single condition,
# where 1:(n-1) would count down from 1 to 0.
for ( i in seq_len(max(length(conditions) - 1, 0)) ){
  for (j in (i+1):length(conditions)){
    contrast_name=paste0(conditions[i], "_", conditions[j])
    res = results(dds, contrast=list(paste0("Condition", conditions[i] ,".IRFinderIR"),paste0("Condition", conditions[j] ,".IRFinderIR")), cooksCutoff=cooks_cutoff, independentFiltering=independentFiltering)
    # introns without an adjusted p-value are treated as not significant
    res$padj[is.na(res$padj)]=1
    global_dat[,paste0("DESeq2.padj.", contrast_name)]=res$padj
    global_dat[,paste0("DESeq2.baseMean.", contrast_name)]=res$baseMean
    global_dat[,paste0("DESeq2.log2FoldChange.", contrast_name)]=res$log2FoldChange

    significant = rownames(res)[res$padj < 0.05]
    if ( length(significant) > 0 ){
      pdf(paste0(out_folder, "/", contrast_name, "_plot.pdf"))
      # finally: close the device even if one of the plots fails
      tryCatch({
        for ( name in significant ){
          dat = data.frame( name = experiment$SampleName, grp= experiment$Condition,
                            intron_depth = nn_counts[name, paste0("intronDepth.", experiment$SampleName)] ,
                            max_splice= nn_counts[name, paste0("maxSplice.", experiment$SampleName)])
          dat$IRratio = dat$intron_depth / ( dat$intron_depth+dat$max_splice)
          print(ggplot(dat)+geom_boxplot(aes(x=grp, fill=grp, y=IRratio )) + ggtitle(paste0(name, "\n", res[name, "padj"])))
        }
      }, finally = dev.off())
    }
    write.table(res, file = paste0(out_folder, "/", contrast_name, "_DESeq2.tsv") ,sep="\t", quote = F)
  }
}

### Global table: introns as row names, the helper column removed
rownames(global_dat)=global_dat$intron
global_dat=global_dat[,-1]
write.table(global_dat, file = paste0(out_folder, "/all_results_DESeq2.tsv") ,sep="\t", quote = F)

quit(save = "no")
shift; 37 | my $pos = shift; 38 | my $chr = shift; 39 | 40 | $readCount++; 41 | 42 | my $numN = $read =~ tr/N/N/; 43 | if ($numN * 2 < $readLen) { 44 | #only output reads where less than half of the read will be NNNNN 45 | 46 | # generate a single base error in a deterministic manner. 47 | substr($read,35,1) = $error[$readCount % 3]{substr($read,35,1)}; 48 | 49 | if ($lastDirection == 0) { 50 | print ">RF!$chr!$pos\n"; 51 | print "$read\n"; 52 | $lastDirection = 1; 53 | }else{ 54 | print ">RR!$chr!$pos\n"; 55 | print reverse_complement($read) . "\n"; 56 | $lastDirection = 0; 57 | } 58 | 59 | } 60 | } 61 | 62 | sub processBuffer( $ $ $ ) { 63 | my $b = shift; 64 | my $pos = shift; 65 | my $chr = shift; 66 | 67 | #while (length($$b) >= $readLen + $stride) { 68 | while (length($$b) >= $readLen) { 69 | processRead(substr($$b,0,$readLen), $pos, $chr); 70 | #my $thisread = substr($$b,0,$readLen); 71 | $$b = substr($$b,$stride); 72 | $pos = $pos + $stride; 73 | } 74 | return $pos; 75 | } 76 | 77 | my $count = 0; 78 | my $chr = ''; 79 | my $pos = 1; 80 | my $buffer = ''; 81 | 82 | while() { 83 | chomp; 84 | $count ++; 85 | if (m/^>/) { 86 | s/ .*$//; 87 | s/^>//; 88 | $chr = $_; 89 | $pos = 1; 90 | $buffer = ''; 91 | } 92 | else{ 93 | # Should allow into the buffer only valid letters. 
#!/usr/bin/perl

# Copyright (c) 2011 Erik Aronesty (erik@q32.com)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# ALSO, IT WOULD BE NICE IF YOU LET ME KNOW YOU USED IT.
#
# https://code.google.com/p/ea-utils/source/browse/trunk/clipper/gtf2bed

# Converts a GTF/GFF file into 12-column BED, one line per transcript.
# Usage: gtf2bed-custom.pl <file.gtf | file.gtf.gz | file.zip | ->
# The name column is "transcript_id/transcript_type/gene_id/gene_name".

use Data::Dumper;
use sort 'stable';
use if $]<5.028, "sort", '_mergesort'; # Note the hash function is not stable on later versions of PERL. Must sort a hash on relevant values if stability is desired.

my $in = shift @ARGV;
die "Usage: $0 <annotation.gtf[.gz|.zip] | ->\n" if !defined($in) || $in eq '';

# Open the input. The list form of the pipe open runs the decompressor
# without a shell, so file names with spaces or metacharacters are safe.
my $fh;
if ($in eq '-') {
    $fh = \*STDIN;
} elsif ($in =~ /\.gz$/) {
    open($fh, '-|', 'gunzip', '-c', $in) or die "Can't run gunzip on $in: $!\n";
} elsif ($in =~ /\.zip$/) {
    open($fh, '-|', 'unzip', '-p', $in) or die "Can't run unzip on $in: $!\n";
} else {
    open($fh, '<', $in) or die "Can't open $in: $!\n";
}

my $gff;     # 2 or 3 once a "##gff-version" header has been seen
my %exons;   # transcript id => list of exon records (array refs of columns)
my %trans;   # transcript id => first exon record seen for that transcript
my %sc;      # transcript id => [cds start, cds end]; never filled, because
             # the start_codon/stop_codon handling is disabled

while (<$fh>) {
    $gff = 2 if /^##gff-version 2/;
    $gff = 3 if /^##gff-version 3/;
    next if /^#/ && $gff;

    s/\s+$//;
    # 0-chr 1-src 2-feat 3-beg 4-end 5-scor 6-dir 7-fram 8-attr
    my @f = split /\t/;
    my $id;
    if ($gff) {
        # most ver 2's stick gene names in the id field
        ($id) = $f[8]=~ /\bID="([^"]+)"/;
        # most ver 3's stick unquoted names in the name field
        ($id) = $f[8]=~ /\bName=([^";]+)/ if !$id && $gff == 3;
    } else {
        ($id) = $f[8]=~ /transcript_id "([^"]+)"/;
    }

    next unless $id && $f[0];

    if ($f[2] eq 'exon') {
        die "no position at exon on line $." if ! $f[3];
        # gff3 puts :\d in exons sometimes
        $id =~ s/:\d+$// if $gff == 3;
        push @{$exons{$id}}, \@f;
        # keep the first exon record seen; it supplies chr, strand,
        # attributes and the position used for sorting the output
        $trans{$id} = \@f if !$trans{$id};
    }
}

close $fh unless $in eq '-';

# Sort by chr, then by position of the recorded exon, then by transcript id.
# The id tie-break makes the output order independent of hash ordering.
my @ids = sort {
    $trans{$a}->[0] cmp $trans{$b}->[0]
        or $trans{$a}->[3] <=> $trans{$b}->[3]
        or $a cmp $b
} keys %trans;

for my $id (@ids) {
    my ($chr, $dir, $attr) = @{$trans{$id}}[0, 6, 8];
    my ($cds, $cde);
    ($cds, $cde) = @{$sc{$id}} if $sc{$id};
    my ($gene_name) = $attr=~ /gene_name "([^"]+)"/;
    my ($gene_id) = $attr=~ /gene_id "([^"]+)"/;

    # transcript type: first attribute, in this order, with a non-empty value
    my $trans_type;
    for my $key (qw(transcript_biotype gene_biotype transcript_type gene_type)) {
        ($trans_type) = $attr=~ /$key "([^"]+)"/;
        last if $trans_type && length($trans_type)>0;
    }

    # sort exons by start position
    my @ex = sort {
        $a->[3] <=> $b->[3]
    } @{$exons{$id}};

    my $beg = $ex[0][3];
    my $end = $ex[-1][4];

    if ($dir eq '-') {
        # swap
        ($cds, $cde) = ($cde, $cds);
        $cds -= 2 if $cds;
        $cde += 2 if $cde;
    }

    # not specified, just use exons
    $cds = $beg if !$cds;
    $cde = $end if !$cde;

    # adjust start for bed (0-based)
    --$beg; --$cds;

    my $exn = @ex;                                      # exon count
    my $exst = join ",", map {$_->[3]-$beg-1} @ex;      # exon starts, relative to $beg
    my $exsz = join ",", map {$_->[4]-$_->[3]+1} @ex;   # exon sizes

    # added an extra comma to make it look exactly like ucsc's beds
    print "$chr\t$beg\t$end\t$id/$trans_type/$gene_id/$gene_name\t0\t$dir\t$cds\t$cde\t0\t$exn\t$exsz,\t$exst,\n";
}
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/irfinder -------------------------------------------------------------------------------- /bin/util/irfinder_cnn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/irfinder_cnn -------------------------------------------------------------------------------- /bin/util/model/best_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/model/best_model.h5 -------------------------------------------------------------------------------- /bin/util/model/best_model.tflite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/bin/util/model/best_model.tflite -------------------------------------------------------------------------------- /bin/util/model/model_info.json: -------------------------------------------------------------------------------- 1 | { 2 | "Image directory": "/work/ritchieteam/EMT_irfinder/trainingshortread/", 3 | "Output directory": "depth15_ir0.1_noir0.cov50long_.01cov25allshortnondir_congruant_2_2021-05-20_12_01", 4 | "Validation split": 0.1, 5 | "Epochs": 500, 6 | "Batch size": 50, 7 | "Model json": null, 8 | "Image size": 256, 9 | "Number of colors": 0, 10 | "Seed": 123, 11 | "Threads": 5, 12 | "Dataset": { 13 | "counts": [ 14 | [ 15 | 1662, 16 | 185 17 | ], 18 | [ 19 | 8164, 20 | 907 21 | ] 22 | ], 23 | "class_names": [ 24 | "hIR", 25 | "noIR" 26 | ] 27 | } 28 | } -------------------------------------------------------------------------------- /bin/util/trim: 
#!/bin/bash
#
# IRFinder installer.
#
#   sudo ./install.sh                 build and install system-wide (/usr/local/IRFinder)
#   ./install.sh local                build and install for the current user (~/.local/IRFinder)
#   sudo ./install.sh remove          remove both the global and the local installation
#   ./install.sh remove local         remove only the local installation
#   sudo ./install.sh remove global   remove only the global installation

if [[ "$1" != "local" ]] && [[ "$1" != "" ]] && [[ "$1" != "remove" ]]; then
    echo -e "Usage: \nGlobal installation:\tsudo install.sh\nLocal installation:\tinstall.sh local" >&2
    echo -e "Uninstall all:\tsudo install.sh remove\nUninstall local:\tinstall.sh remove local\nUninstall global:\tsudo install.sh remove global" >&2
    exit 1
fi

# Verify that every package given as argument is installed; exit 1 otherwise.
# dpkg is used on Debian/Ubuntu, rpm elsewhere when it is available.
# NOTE(review): the package names passed below are Debian names; on rpm based
# systems some of them (libboost-iostreams-dev, zlib1g) are probably named
# differently - confirm before relying on this check there.
function checkDependencies(){
    local distro
    distro=$(cat /proc/version 2>/dev/null)
    local deps=0
    local pkg
    echo "Checking dependencies..."
    if [[ ! "${distro}" =~ Ubuntu|Debian ]] && ! command -v rpm >/dev/null 2>&1 ; then
        # Neither package manager can be queried: do not fail on a false negative.
        echo "Warning: neither dpkg nor rpm found, skipping the dependency check." >&2
        return 0
    fi
    for pkg in "$@" ; do
        if [[ "${distro}" =~ Ubuntu|Debian ]] ; then
            if ! dpkg -s "$pkg" >/dev/null 2>/dev/null ; then
                echo "Dependency $pkg not found." >&2
                deps=1
            fi
        else
            if ! rpm -q "$pkg" >/dev/null 2>/dev/null ; then
                echo "Dependency $pkg not found." >&2
                deps=1
            fi
        fi
    done
    if [ $deps -eq 1 ]; then
        exit 1
    fi
}

# Build one component and copy its binary into bin/util.
#   $1: directory holding the Makefile   $2: name of the produced binary
# Stops the installation when the build or the copy fails, so that a stale or
# missing binary is never installed silently.
function buildComponent(){
    local src_dir="$1"
    local binary="$2"
    cd "$src_dir" || { echo "Error: cannot enter $src_dir" >&2; exit 1; }
    make clean
    if ! make ; then
        echo "Error: the build of $binary failed." >&2
        exit 1
    fi
    if ! cp "./$binary" "$BASE_FOLDER/bin/util/$binary" ; then
        echo "Error: cannot copy $binary to $BASE_FOLDER/bin/util/" >&2
        exit 1
    fi
    make clean
}


if [[ $1 == "remove" ]]; then
    if [[ "$2" != "global" ]] && [[ "$2" != "local" ]] && [[ "$2" != "" ]] ; then
        echo "Error: $2 not recognized. Use 'local' or 'global' or leave empty" >&2
        exit 1
    fi
    if [[ "$2" == "" || "$2" == "global" ]]; then
        if [ "$EUID" -ne 0 ]; then
            echo "Please run as root"
            # Failing precondition: report it through the exit status as well.
            exit 1
        fi
        if [ -d /usr/local/IRFinder ]; then
            rm -fr /usr/local/IRFinder /usr/bin/IRFinder
            echo "Removed system installation"
        else
            echo "Global installation of IRFinder not found"
        fi
    fi
    if [[ "$2" == "" || "$2" == "local" ]] ;then
        if [ -d ~/.local/IRFinder ] ; then
            rm -fr ~/.local/IRFinder ~/.local/bin/IRFinder
            echo "Removed local installation"
        else
            echo "Local installation of IRFinder not found."
        fi
    fi
    exit
fi


if [[ "${1}" != "local" ]]; then
    if [ "$EUID" -ne 0 ]; then
        echo "Please run as root or to install IRFinder locally call"
        echo "./install.sh local"
        echo ""
        exit 1
    fi
fi

checkDependencies make bedtools samtools gzip gawk libboost-iostreams-dev zlib1g


ORIGINAL_FOLDER=$(realpath "$PWD")
BASE_FOLDER=$(dirname "$(readlink -nf "$BASH_SOURCE")")

buildComponent "$BASE_FOLDER/src/trim" trim
buildComponent "$BASE_FOLDER/src/winflat" winflat
buildComponent "$BASE_FOLDER/src/irfinder/Release" irfinder

cd "$BASE_FOLDER" || exit 1
chmod -R a+x ./bin
if [[ "${1}" == "local" ]];then
    if [ -d ~/.local/IRFinder ]; then
        rm -fr ~/.local/IRFinder ~/.local/bin/IRFinder
    fi
    # ~/.local/bin does not exist on every system.
    mkdir -p ~/.local/bin
    cp -r "$BASE_FOLDER" ~/.local/IRFinder
    # -f/-n: replace a stale link left behind by a previous installation.
    ln -sfn "$(realpath ~/.local/IRFinder/bin/IRFinder)" ~/.local/bin/IRFinder
else
    if [ -d /usr/local/IRFinder ]; then
        rm -fr /usr/local/IRFinder /usr/bin/IRFinder
    fi
    cp -r "$BASE_FOLDER" /usr/local/IRFinder
    ln -sfn /usr/local/IRFinder/bin/IRFinder /usr/bin/IRFinder
fi

cd "$ORIGINAL_FOLDER" || exit 1


# Optional tools: only warn, the corresponding run modes are simply unavailable.
if ! command -v suppa.py >/dev/null 2>/dev/null ; then
    echo "SUPPA2 not found. To use the RunMode Diff, install it: https://github.com/comprna/SUPPA " >&2
fi

if ! command -v STAR > /dev/null 2> /dev/null ; then
    echo "STAR not found. To use the RunMode FastQ and to produce your own mapability files during the reference build, install it: https://github.com/alexdobin/STAR " >&2
fi

if ! command -v minimap2 > /dev/null 2> /dev/null ; then
    echo "minimap2 not found. To use the RunMode Long, install it: https://github.com/lh3/minimap2 " >&2
fi
import sys,os
import pandas as pd
import numpy as np

# Replicate names of each experiment selectable through the first CLI argument.
SHORTNAMES = {
    "EMT5m": ["T5moins_rep1", "T5moins_rep2", "T5moins_rep3"],
    "EMT5p": ["T5plus_rep1", "T5plus_rep2", "T5plus_rep3"],
    "EMT1p": ["T1plus_rep1", "T1plus_rep2", "T1plus_rep3"],
}


def label_predictions(pred, data, sname):
    """Copy the ``Warnings`` value of matching introns into ``pred["truelab"]``.

    A row of ``pred`` matches when its sample column (``pred[0]``) equals
    ``sname`` and its ``id`` ("chr:start-end") is the id built from the
    ``Chr``/``Start``/``End`` columns of a row of ``data``. ``pred`` is
    modified in place and returned; rows without a match are left untouched.

    NOTE(review): the original statement was a chained assignment
    (``pred.loc[...]["truelab"] = ...``) that never modified ``pred``; matching
    by intron id is the presumed intent - confirm with the author.
    """
    data_ids = data["Chr"].apply(str) + ":" + data["Start"].apply(str) + "-" + data["End"].apply(str)
    warnings_by_id = pd.Series(data["Warnings"].values, index=data_ids.values)
    # Series.map needs a unique index: keep the first occurrence of an id.
    warnings_by_id = warnings_by_id[~warnings_by_id.index.duplicated(keep="first")]
    rows = (pred[0] == sname) & pred["id"].isin(warnings_by_id.index)
    pred.loc[rows, "truelab"] = pred.loc[rows, "id"].map(warnings_by_id)
    return pred


def main(argv):
    """Label the predictions of the experiment named by ``argv[1]``.

    Reads ``prediction_for_EMT_test.tsv`` and one ``<replicate>.tsv`` per
    replicate from the current directory. Returns 0 on success and 1 when the
    experiment key is missing or unknown.
    """
    if len(argv) < 2 or argv[1] not in SHORTNAMES:
        sys.stderr.write("Usage: resultgraph.py {}\n".format("|".join(sorted(SHORTNAMES))))
        return 1

    # The first column mixes "sample<TAB>chr:start-end"; a regular expression
    # separator is only supported by the python parsing engine.
    pred = pd.read_csv("prediction_for_EMT_test.tsv", sep="\t|-|:", skiprows=1, header=None, engine="python")
    # Undo the 15 bp of flanking sequence added around each intron.
    pred[2] = pred[2] + 15
    pred[3] = pred[3] - 15
    pred["id"] = pred[1].apply(str) + ":" + pred[2].apply(str) + "-" + pred[3].apply(str)
    pred["truelab"] = "no"
    for sname in SHORTNAMES[argv[1]]:
        data = pd.read_csv(sname + ".tsv", delimiter="\t")
        label_predictions(pred, data, sname)
    # NOTE(review): as in the original script, the labelled table is neither
    # written nor plotted; the remaining steps only existed as commented-out code.
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
#!/usr/bin/env python3
# encoding: utf-8
'''
cnnfilter.main -- command line entry point of the intron scanner.

The first command line argument selects the action ("train", "test" or
"help"); the remaining arguments are the options of that action. Both actions
are carried out by cnnfilter.actions.models.IntronModeller.
'''

import sys
import os

LIBRARY_LOCATION = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
sys.path.append(LIBRARY_LOCATION)

from optparse import OptionParser


def title(title):
    '''Print ``title`` centred in a 50 column wide box.'''
    bar = '-' * 50
    # Split the padding explicitly: rounding half of it for both sides made the
    # box 49 or 51 columns wide for titles of odd length.
    padding = 50 - len(title)
    left = ' ' * (padding // 2)
    right = ' ' * (padding - padding // 2)
    white_b = ' ' * 5
    title = "\n " + white_b + bar + "\n" + white_b + "|" + left + title + right + "|\n " + white_b + bar + "\n\n"
    print(title)


def print_help():
    '''Print the list of actions.'''
    # NOTE(review): "extract" and "a2i" are advertised here but main() only
    # dispatches "train", "test" and "help".
    print("Usage: intron_scanner action [options]\n\nPossible actions:")
    print("\t- extract:\t exctract genomic regions (bed file) from a bam file as images.")
    print(
        "\t- a2i:    \t exctract images from an array produced by the extract command and organize in according to given labels")
    print("\t- train:  \t train a tensorflow model on a given set of images")
    print("\t- test:   \t use a trained model to predict the class of a given set of images")
    print("\n")


def _train_parser():
    '''Build the option parser of the "train" action.'''
    parser = OptionParser(usage="Usage: %prog train ",
                          description="Train a neural network model on a given set of images")
    parser.add_option("-d", "--img-dir", dest="dir",
                      help="The directory containing the images, divided in subdirectories in according to the classes. Example: ./training/ -> ./training/labelA/ ./training/labelB/ ",
                      metavar="DIR", type="string")
    parser.add_option("-o", "--out", dest="outdir", help="Output directory [default: %default]", metavar="DIR",
                      type="string")
    parser.add_option("-b", "--batch", dest="batch", help="Number of images to load. [default: %default]",
                      metavar="INT", type="int")
    parser.add_option("-s", "--image-size", dest="size", help="Images size [default: %default]", metavar="INT",
                      type="int")
    parser.add_option("-S", "--seed", dest="seed", help="Seed for the validation split [default: %default]",
                      metavar="INT", type="int")
    parser.add_option("-t", "--threads", dest="threads", help="Number of threads to use. [default: %default]",
                      metavar="INT", type="int")
    parser.add_option("-V", "--validation-split", dest="vsplit", metavar="FLOAT",
                      help="Fraction of the dataset to use for the validation [default: %default]",
                      type="float")
    parser.add_option("-e", "--epochs", dest="epochs", metavar="INT",
                      help="Number of training epoch [default: %default]", type="int")
    parser.add_option("-E", "--earlystop", dest="earlystop", metavar="INT",
                      help="Number of patience epoch for earlystop , -1 for no earlystop [default: 0.1*epochs (10 percent of the total number of epochs]",
                      type="int")
    parser.add_option("-m", "--json-model", dest="model", metavar="FILE",
                      help="Load the tensorflow model from a json file [default: %default]", type="string")
    parser.add_option("-v", "--verbose", dest="verbose", action="count", help="Set tensorflow verbosity level")
    # colorN has no command line option (it is disabled) but is still handed to
    # the modeller. The misspelled duplicate default "epoch" was dropped: only
    # "epochs" is ever read.
    parser.set_defaults(outdir="./model/", size=256, colorN=0, verbose=0, earlystop=-200, ext="png",
                        model=None, vsplit=0.20, batch=50, seed=123, threads=None, epochs=10)
    return parser


def _test_parser():
    '''Build the option parser of the "test" action.'''
    parser = OptionParser(usage="Usage: %prog test ",
                          description="Test a neural network model on a given set of images")
    parser.add_option("-d", "--img-dir", dest="dir",
                      help="The directory containing the images to predict. If they are in subdirectories, the subfolder name is used as true label.",
                      metavar="DIR", type="string")
    parser.add_option("-a", "--array-file", dest="array", metavar="FILE",
                      help="Use a file conaining the image information, produced by the extract process",
                      type="string")
    parser.add_option("-b", "--bed-file", dest="bed", metavar="FILE",
                      help="bed file associated to the array (-a). Can be a general tsv file. The last column is used as true label",
                      type="string")
    parser.add_option("-m", "--model-dir", dest="model", metavar="DIR",
                      help="Folder containing the model. It has to contain the files best_model.h5 and model_info.json [default: %default]",
                      type="string")
    parser.add_option("-o", "--out", dest="out", help="Output file [default: %default]", metavar="FILE",
                      type="string")
    parser.add_option("-v", "--verbose", dest="verbose", action="count", help="Set tensorflow verbosity level")
    parser.set_defaults(out="./predictions.tsv", verbose=0, dir=None, array=None, bed=None,
                        model="./model/")
    return parser


def main(argv=None):
    '''Run the action named by ``sys.argv[1]`` with the options in ``argv``.

    ``argv`` defaults to ``sys.argv[2:]``. Option errors terminate through
    ``OptionParser.error`` (SystemExit). Any other exception is reported on
    stderr and re-raised, unless python runs with -O, in which case 2 is
    returned.
    '''
    program_name = os.path.basename(sys.argv[0])
    title("Intron scanner")
    action = "help"
    if len(sys.argv) > 1:
        action = sys.argv[1]

    if argv is None:
        argv = sys.argv[2:]
    try:
        if action == "train":
            program_name = program_name + " " + 'training process'
            parser = _train_parser()
            (opts, _) = parser.parse_args(argv)
            for r in ["dir"]:
                if getattr(opts, r) is None:
                    parser.error("Parameter %s required\n\nUse --help to get more information\n" % r)
            # Imported lazily: loading the model code is slow and useless for "help".
            from cnnfilter.actions.models import IntronModeller
            modeller = IntronModeller(opts.verbose)
            modeller.train_from_array(opts.dir, opts.outdir, opts.size, opts.batch, opts.vsplit, opts.seed,
                                      opts.epochs, opts.threads, opts.model, opts.colorN, opts.earlystop)

        elif action == "test":
            program_name = program_name + " " + 'test process'
            parser = _test_parser()
            (opts, _) = parser.parse_args(argv)
            # Exactly one of -d and -a has to be given.
            if (opts.dir is not None) == (opts.array is not None):
                parser.error(
                    "Parameters -a and -d are mutual exclusive and at least one is required\n\nUse --help to get more information\n")
            from cnnfilter.actions.models import IntronModeller
            modeller = IntronModeller(opts.verbose)
            modeller.test(opts.dir, opts.array, opts.bed, opts.model, opts.out)
        elif action == "help" or action == "-h" or action == "--help":
            print_help()
        else:
            raise ValueError("Action %s not recognized." % action)

    except Exception as e:
        # Diagnostics belong on stderr so that they never mix with piped output.
        print(program_name + ": " + repr(e) + "\n", file=sys.stderr)
        print("\n\nFor help use --help\n\n", file=sys.stderr)
        print(e, file=sys.stderr)
        if __debug__:
            raise
        return 2


if __name__ == "__main__":
    sys.exit(main())


#json,gzip,time,tensorflow,matplotlib,numpy,sklearn,re,progress
import numpy as np
from scipy.interpolate import interp1d


def getImageArrayFromRegion(region, img_size=None):
    '''
    Return the numpy array representing the image from a given region.

    Thin wrapper around generateImagesArrayGreyFromRegion that keeps only the
    read image; see that function for the arguments and the errors raised.
    '''
    read_img, _ = generateImagesArrayGreyFromRegion(region, img_size)
    return read_img


def generateImagesArrayGreyFromRegion(region, img_size=None):
    '''
    Return the arrays composing an image from a given region.

    region   -- sequence of rows; columns 0 and 1 of every row are used.
    img_size -- number of positions of the produced image, larger than 30.

    The first and last 15 rows of the region are copied unchanged, the rows in
    between are resampled to img_size - 30 positions. Values are scaled so that
    the largest row sum of the region becomes 255.

    Returns (image, None) where image is a float32 array of shape
    (img_size, 2, 1): [:, 0, 0] holds column 0 + column 1 of the region and
    [:, 1, 0] holds column 0.

    Raises ArithmeticError when every row of the region sums to zero and
    ValueError when img_size is missing or not larger than 30.
    '''
    region_size = len(region)

    depth = max([sum(i) for i in region])
    if depth == 0:
        raise ArithmeticError("Error! trying to generate an image with zero depth.")
    # The default (None) used to fail with an opaque TypeError on the
    # comparison below and img_size == 30 with a ZeroDivisionError.
    if img_size is None or img_size <= 30:
        raise ValueError("img_size has to be an integer larger than 30, got {!r}".format(img_size))
    reads_img = (np.array(region)[:, :] / depth) * 255

    if region_size < img_size:
        kindinterp = "nearest"
    else:
        kindinterp = "zero"  # "linear"

    # Interpolators over the inner part of the region (flanks excluded).
    inner_x = np.arange(0, region_size - 30)
    f0 = interp1d(inner_x, reads_img[15:-15, 0], kind=kindinterp)
    f1 = interp1d(inner_x, reads_img[15:-15, 1], kind=kindinterp)

    # Positions of the inner part sampled for the img_size - 30 inner pixels;
    # the scaling keeps them inside the interpolation range.
    sample_at = np.arange(0, (img_size - 30)) * ((region_size - 31) / (img_size - 30))

    inner0 = f0(sample_at)
    reads_imgd1 = np.concatenate([reads_img[0:15, 0], inner0, reads_img[-15:, 0]])
    reads_imgd2 = np.concatenate([reads_img[0:15, 1] + reads_img[0:15, 0],
                                  f1(sample_at) + inner0,
                                  reads_img[-15:, 1] + reads_img[-15:, 0]])

    reads_img2 = np.array([reads_imgd1, reads_imgd2])
    # Rotate to (position, channel) and append the trailing singleton axis.
    reads_img2 = np.expand_dims(np.rot90(np.round(reads_img2).astype("float32"), k=3), axis=2)

    return reads_img2, None
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tflite_runtime.interpreter as tflite
from scipy.special import softmax

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import json
import numpy as np

from utils.reader import ImageArchive
from actions.extract import getImageArrayFromRegion


class IntronModeller():
    '''Apply a TensorFlow Lite model to the coverage arrays written by IRFinder.'''

    def __init__(self, verbosity=0):
        '''Store the verbosity (capped at 3) and derive TF_CPP_MIN_LOG_LEVEL from it.'''
        if verbosity > 3:
            verbosity = 3
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = "{}".format(3 - verbosity)
        self.verbosity = verbosity

    def test(self, img_dir=None, model_dir="./model/", colorN=3, imagemode=0):
        '''Filter the IRFinder results found in ``img_dir``.

        For each of "IRFinder-IR-dir" and "IRFinder-IR-nondir" whose
        "<name>-AI.txt" array file exists, "<name>.txt" is filtered into
        "<name>-val.txt" in the same directory. ``colorN`` and ``imagemode``
        are accepted for compatibility and not used.

        Raises ValueError when ``img_dir`` is None and FileNotFoundError when
        the model files are missing.
        '''
        if img_dir is None:
            raise ValueError("img_dir is required.")
        self._load_model(model_dir)
        self._model_dir = model_dir

        for filesdir in ["IRFinder-IR-dir", "IRFinder-IR-nondir"]:
            arr_file = os.path.join(img_dir, filesdir + "-AI.txt")
            if not os.path.isfile(arr_file):
                continue
            bed_file = os.path.join(img_dir, filesdir + ".txt")
            output_file = os.path.join(img_dir, filesdir + "-val.txt")
            # The with statement closes the output even when a prediction fails.
            with open(output_file, "wt") as out_f:
                self._out_f = out_f
                print("Processing " + filesdir + ".txt")
                self._test_irfinder_result(arr_file, bed_file)
                print("Done.")

    def _predict(self, arr):
        '''Run the interpreter on one image array and return its first output row.'''
        self.model["Model"].reset_all_variables()
        self.model["Model"].set_tensor(self.model["InputDetails"][0]['index'], [arr])
        self.model["Model"].invoke()
        return self.model["Model"].get_tensor(self.model["OutputDetails"][0]['index'])[0]

    def _test_irfinder_result(self, arr_file, bed_file):
        '''Write to ``self._out_f`` the lines of ``bed_file`` predicted as "hIR".

        The header is copied with its fifth column renamed "CNN_IRscore"; in
        every kept line that column is replaced by the softmax score of the
        "hIR" class. Introns whose array is not valid are dropped.
        '''
        with open(bed_file, "rt") as ori_res:
            header = ori_res.readline().rstrip("\n").split("\t")
        header[4] = "CNN_IRscore"
        # The newline is added back explicitly so that it survives even when
        # the renamed column is the last one of the header.
        self._out_f.write(("\t").join(header) + "\n")
        arch = ImageArchive(bed_file, arr_file)
        try:
            for bed, arr in arch:
                if not arr.is_valid:
                    continue
                pred = self._predict(getImageArrayFromRegion(arr.region, self.model["Image size"]))
                score = softmax(pred)
                idx_max = np.argmax(score)
                pred_lab = self.model["Dataset"]["class_names"][idx_max]
                if pred_lab == "hIR":
                    line = bed
                    # idx_max is the position of "hIR" here; the fixed index 0
                    # was only right while "hIR" was the first class.
                    line[4] = str(score[idx_max])
                    self._out_f.write(("\t").join(line) + "\n")
        finally:
            arch.close()
        return

    def _load_model(self, model_dir):
        '''Load model_info.json and best_model.tflite from ``model_dir`` into ``self.model``.

        Raises FileNotFoundError when one of the two files is missing.
        '''
        print("Loading the best_model in {}".format(model_dir))
        model_info_file = "{}/model_info.json".format(model_dir)
        model_file = "{}/best_model.tflite".format(model_dir)
        if not os.path.exists(model_info_file) or not os.path.exists(model_file):
            raise FileNotFoundError("Error! files model_info.json and best_model.tflite have to be in the model folder! ")
        with open(model_info_file, "rt") as fp:
            self.model = json.load(fp)
        self.model["Model"] = tflite.Interpreter(model_path=model_file)
        self.model["Model"].allocate_tensors()
        self.model["InputDetails"] = self.model["Model"].get_input_details()
        self.model["OutputDetails"] = self.model["Model"].get_output_details()
        print("Done.")
        return
#!/usr/bin/env python3
# encoding: utf-8
'''
irfinder_cnn -- command line entry point of the IRFinder CNN filter.

Applies the trained model to the IRFinder results stored in a directory
(see actions.models.IntronModeller.test).
'''

import sys
import os
import json
import gzip

LIBRARY_LOCATION = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
sys.path.append(LIBRARY_LOCATION)

from optparse import OptionParser


def title(title):
    '''Print ``title`` centred in a 50 column wide box.'''
    bar = '-' * 50
    # Split the padding explicitly: rounding half of it for both sides made the
    # box 49 or 51 columns wide for titles of odd length.
    padding = 50 - len(title)
    left = ' ' * (padding // 2)
    right = ' ' * (padding - padding // 2)
    white_b = ' ' * 5
    title = "\n " + white_b + bar + "\n" + white_b + "|" + left + title + right + "|\n " + white_b + bar + "\n\n"
    print(title)


def print_help():
    '''Print the list of actions.'''
    print("Usage: intron_scanner action [options]\n\nPossible actions:")

    print("\t- test:   \t use a trained model to predict the class of a given set of images")
    print("\n")


def main(argv=None):
    '''Parse the options and filter the IRFinder results of the -d directory.

    ``argv`` defaults to ``sys.argv[1:]``. Option errors terminate through
    ``OptionParser.error`` (SystemExit). Any other exception is reported on
    stderr and re-raised, unless python runs with -O, in which case 2 is
    returned.
    '''
    program_name = os.path.basename(sys.argv[0])
    title("CNN filter")
    if argv is None:
        # Skip the program name: it is not an argument of the filter.
        argv = sys.argv[1:]
    try:
        program_name = program_name + " " + 'test process'
        parser = OptionParser(usage="Usage: %prog test ",
                              description="Test a neural network model on a given set of images")
        parser.add_option("-d", "--img-dir", dest="dir",
                          help="The directory containing the IRFinder results to predict. ",
                          metavar="DIR", type="string")

        parser.add_option("-m", "--model-dir", dest="model", metavar="DIR",
                          help="Folder containing the model. It has to contain the files best_model.tflite and model_info.json [default: %default]",
                          type="string")
        # NOTE(review): -o is accepted for compatibility, but the modeller
        # writes "<name>-val.txt" files inside the -d directory and never
        # reads this value.
        parser.add_option("-o", "--out", dest="out", help="Output file [default: %default]", metavar="FILE",
                          type="string")
        parser.add_option("-v", "--verbose", dest="verbose", action="count", help="Set tensorflow verbosity level")
        # set defaults
        parser.set_defaults(out="./predictions.tsv", verbose=0, colorN=3, dir=None,
                            model="./model/")
        # process options
        (opts, _) = parser.parse_args(argv)
        if opts.dir is None:
            parser.error(
                "Parameter -d is required\n\nUse --help to get more information\n")
        from actions.models import IntronModeller
        modeller = IntronModeller(opts.verbose)
        # The third positional parameter of test() is colorN, not an output
        # path, so opts.out must not be passed there.
        modeller.test(opts.dir, opts.model)
    except Exception as e:
        # Diagnostics belong on stderr so that they never mix with piped output.
        print(program_name + ": " + repr(e) + "\n", file=sys.stderr)
        print("\n\nFor help use --help\n\n", file=sys.stderr)
        print(e, file=sys.stderr)
        if __debug__:
            raise
        return 2


if __name__ == "__main__":
    sys.exit(main())


#json,gzip,time,tensorflow,matplotlib,numpy,sklearn,re,progress
import json
import gzip


class ImageArray:
    '''One line of an array file: "<chr>:<start>-<end>:<strand><TAB><json>".'''

    def __init__(self, raw_line):
        raw_line = raw_line.split("\t")
        raw_name = raw_line[0].split(":")
        # Region as written in the array file.
        self.name = raw_name[0] + ":" + raw_name[1]
        pos = raw_name[1].split("-")
        # Same region shrunk by 15 on both sides; this is the key compared
        # with the first three columns of the bed lines.
        self.intron_name = "{}:{}-{}".format(raw_name[0], int(pos[0]) + 15, int(pos[1]) - 15)
        self.strand = raw_name[2]
        self.region = json.loads(raw_line[1])
        # Valid: more than one row and at least one row with a non-zero sum.
        self.is_valid = len(self.region) > 1
        if self.is_valid:
            self.is_valid = max([sum(i) for i in self.region]) > 0


class ImageArchive:
    '''Iterate over an array file, optionally paired with the lines of a bed file.

    Every iteration yields ``(bed_columns, ImageArray)``. With a bed file,
    bed lines are consumed in order until one whose first three columns form
    the ``intron_name`` of the array is found; without one, ``["NA"]`` is
    yielded in place of the columns. Files ending in ".gz" are read through
    gzip.
    '''

    def _open_file(self, fname):
        '''Open ``fname`` for reading as text, transparently handling ".gz".'''
        if fname.endswith(".gz"):
            return gzip.open(fname, "rt")
        else:
            return open(fname, "rt")

    def _count_lines(self, fname):
        '''Return the number of lines of ``fname``.'''
        with self._open_file(fname) as tmp:
            return sum(1 for _ in tmp)

    def __init__(self, bed_file, array_file):
        # Set first so that close() and __del__ are safe when opening fails.
        self._array = None
        self._bed = None
        self._has_bed = False
        self._index = -1
        self._len = self._count_lines(array_file)
        if bed_file is not None:
            self._bed = self._open_file(bed_file)
            self._has_bed = True
        self._array = self._open_file(array_file)

    def __iter__(self):
        return self

    def __next__(self):
        '''Return the next ``(bed_columns, ImageArray)`` pair.

        Raises StopIteration after the last array line and ValueError when the
        bed file ends before a line matching the current array is found.
        '''
        self._index += 1
        if self._index >= self._len:
            raise StopIteration
        img_array = ImageArray(self._array.readline())
        if not self._has_bed:
            return ["NA"], img_array
        while True:
            raw = self._bed.readline()
            if raw == "":
                # End of the bed file: splitting the empty string used to end
                # in an IndexError that hid the real problem.
                raise ValueError(
                    "No line of the bed file matches the intron {}".format(img_array.intron_name))
            bed_line = raw.strip().split("\t")
            # Lines with fewer than three columns (e.g. blank ones) cannot match.
            if len(bed_line) >= 3 and \
                    bed_line[0] + ":" + bed_line[1] + "-" + bed_line[2] == img_array.intron_name:
                return bed_line, img_array

    def __len__(self):
        return self._len

    def __del__(self):
        self.close()

    def close(self):
        '''Close the underlying files; safe to call more than once.'''
        # getattr: __del__ also runs on instances whose __init__ never ran.
        for handle in (getattr(self, "_array", None), getattr(self, "_bed", None)):
            if handle is not None:
                handle.close()

    def getIndex(self):
        '''Return the zero-based index of the last array line read (-1 before the first).'''
        return self._index
/src/irfinder/Release/makefile: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | -include ../makefile.init 6 | 7 | RM := rm -rf 8 | 9 | # All of the sources participating in the build are defined here 10 | -include sources.mk 11 | -include src/Utils/subdir.mk 12 | -include src/ReadBlock/subdir.mk 13 | -include src/Blocks/subdir.mk 14 | -include src/subdir.mk 15 | -include subdir.mk 16 | -include objects.mk 17 | 18 | ifneq ($(MAKECMDGOALS),clean) 19 | ifneq ($(strip $(CC_DEPS)),) 20 | -include $(CC_DEPS) 21 | endif 22 | ifneq ($(strip $(C++_DEPS)),) 23 | -include $(C++_DEPS) 24 | endif 25 | ifneq ($(strip $(C_UPPER_DEPS)),) 26 | -include $(C_UPPER_DEPS) 27 | endif 28 | ifneq ($(strip $(CXX_DEPS)),) 29 | -include $(CXX_DEPS) 30 | endif 31 | ifneq ($(strip $(CPP_DEPS)),) 32 | -include $(CPP_DEPS) 33 | endif 34 | ifneq ($(strip $(C_DEPS)),) 35 | -include $(C_DEPS) 36 | endif 37 | endif 38 | 39 | -include ../makefile.defs 40 | 41 | # Add inputs and outputs from these tool invocations to the build variables 42 | 43 | # All Target 44 | all: irfinder 45 | 46 | # Tool invocations 47 | irfinder: $(OBJS) $(USER_OBJS) 48 | @echo 'Building target: $@' 49 | @echo 'Invoking: GCC C++ Linker' 50 | g++ -o "irfinder" $(OBJS) $(USER_OBJS) $(LIBS) 51 | @echo 'Finished building target: $@' 52 | @echo ' ' 53 | 54 | # Other Targets 55 | clean: 56 | -$(RM) $(CC_DEPS)$(C++_DEPS)$(EXECUTABLES)$(C_UPPER_DEPS)$(CXX_DEPS)$(OBJS)$(CPP_DEPS)$(C_DEPS) irfinder 57 | -@echo ' ' 58 | 59 | .PHONY: all clean dependents 60 | 61 | -include ../makefile.targets 62 | -------------------------------------------------------------------------------- /src/irfinder/Release/objects.mk: 
-------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | USER_OBJS := 6 | 7 | LIBS := -lboost_iostreams 8 | 9 | -------------------------------------------------------------------------------- /src/irfinder/Release/sources.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | C_UPPER_SRCS := 6 | CXX_SRCS := 7 | C++_SRCS := 8 | OBJ_SRCS := 9 | CC_SRCS := 10 | ASM_SRCS := 11 | CPP_SRCS := 12 | C_SRCS := 13 | O_SRCS := 14 | S_UPPER_SRCS := 15 | CC_DEPS := 16 | C++_DEPS := 17 | EXECUTABLES := 18 | C_UPPER_DEPS := 19 | CXX_DEPS := 20 | OBJS := 21 | CPP_DEPS := 22 | C_DEPS := 23 | 24 | # Every subdirectory with source files must be described here 25 | SUBDIRS := \ 26 | src/Blocks \ 27 | src \ 28 | src/ReadBlock \ 29 | src/Utils \ 30 | 31 | -------------------------------------------------------------------------------- /src/irfinder/Release/src/Blocks/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 
3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../src/Blocks/BAM2blocks.cpp \ 8 | ../src/Blocks/CoverageBlock.cpp \ 9 | ../src/Blocks/FragmentBlocks.cpp 10 | 11 | OBJS += \ 12 | ./src/Blocks/BAM2blocks.o \ 13 | ./src/Blocks/CoverageBlock.o \ 14 | ./src/Blocks/FragmentBlocks.o 15 | 16 | CPP_DEPS += \ 17 | ./src/Blocks/BAM2blocks.d \ 18 | ./src/Blocks/CoverageBlock.d \ 19 | ./src/Blocks/FragmentBlocks.d 20 | 21 | 22 | # Each subdirectory must supply rules for building sources it contributes 23 | src/Blocks/%.o: ../src/Blocks/%.cpp 24 | @echo 'Building file: $<' 25 | @echo 'Invoking: GCC C++ Compiler' 26 | g++ -O3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@)" -o "$@" "$<" 27 | @echo 'Finished building: $<' 28 | @echo ' ' 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/irfinder/Release/src/ReadBlock/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 
3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../src/ReadBlock/CoverageBlocks.cpp \ 8 | ../src/ReadBlock/ReadBlockProcessor.cpp 9 | 10 | OBJS += \ 11 | ./src/ReadBlock/CoverageBlocks.o \ 12 | ./src/ReadBlock/ReadBlockProcessor.o 13 | 14 | CPP_DEPS += \ 15 | ./src/ReadBlock/CoverageBlocks.d \ 16 | ./src/ReadBlock/ReadBlockProcessor.d 17 | 18 | 19 | # Each subdirectory must supply rules for building sources it contributes 20 | src/ReadBlock/%.o: ../src/ReadBlock/%.cpp 21 | @echo 'Building file: $<' 22 | @echo 'Invoking: GCC C++ Compiler' 23 | g++ -O3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@)" -o "$@" "$<" 24 | @echo 'Finished building: $<' 25 | @echo ' ' 26 | 27 | 28 | -------------------------------------------------------------------------------- /src/irfinder/Release/src/Utils/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 
3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../src/Utils/crc32.cpp 8 | 9 | OBJS += \ 10 | ./src/Utils/crc32.o 11 | 12 | CPP_DEPS += \ 13 | ./src/Utils/crc32.d 14 | 15 | 16 | # Each subdirectory must supply rules for building sources it contributes 17 | src/Utils/%.o: ../src/Utils/%.cpp 18 | @echo 'Building file: $<' 19 | @echo 'Invoking: GCC C++ Compiler' 20 | g++ -O3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@)" -o "$@" "$<" 21 | @echo 'Finished building: $<' 22 | @echo ' ' 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/irfinder/Release/src/subdir.mk: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Automatically-generated file. Do not edit! 3 | ################################################################################ 4 | 5 | # Add inputs and outputs from these tool invocations to the build variables 6 | CPP_SRCS += \ 7 | ../src/IRFinder2.cpp 8 | 9 | OBJS += \ 10 | ./src/IRFinder2.o 11 | 12 | CPP_DEPS += \ 13 | ./src/IRFinder2.d 14 | 15 | 16 | # Each subdirectory must supply rules for building sources it contributes 17 | src/%.o: ../src/%.cpp 18 | @echo 'Building file: $<' 19 | @echo 'Invoking: GCC C++ Compiler' 20 | g++ -O3 -Wall -c -fmessage-length=0 -MMD -MP -MF"$(@:%.o=%.d)" -MT"$(@)" -o "$@" "$<" 21 | @echo 'Finished building: $<' 22 | @echo ' ' 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/irfinder/src/Blocks/BAM2blocks.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_BAM2BLOCKS 2 | #define CODE_BAM2BLOCKS 3 | 4 | #include "FragmentBlocks.h" 5 | #include 6 | #include 7 | #include 8 | 9 | /* Little Endian .. 
for big endian each group of 4 bytes needs to be reversed before individual members are accessed. */
// std c11 allows anonymous struct/union. -Wall may give a warning as non-portable to older c++ standards.



/*
 * BAM2blocks: declares the state and entry points for reading BAM records from an input
 * stream and passing them on, as FragmentBlocks, to registered callbacks.
 * NOTE(review): the member functions are defined in BAM2blocks.cpp, which is not part of
 * this view; the descriptions below are derived from the declarations only -- confirm
 * against the implementation.
 */
class BAM2blocks {

	// TODO -- are structs best hidden inside the class? Does doing so push them into namespace of the class only?
	// One alignment record: a fixed-size core, then the read name, then the CIGAR operations.
	struct bam_read_core {
		union {
			char c[36];	// raw byte view of the core; same size as BAM_READ_CORE_BYTES.
			struct {
				// The fields below add up to 36 bytes (4+4+4+1+1+2+2+2+4+4+4+4).
				int32_t block_size;
				int32_t refID;
				int32_t pos;
				uint8_t l_read_name;
				uint8_t mapq;
				uint16_t bin;
				uint16_t n_cigar_op;
				uint16_t flag;
				int32_t l_seq;
				int32_t next_refID;
				int32_t next_pos;
				int32_t tlen;
			}; // anonymous struct to allow easy access to members.
		};
		char read_name[256];	// l_read_name is a uint8_t, so a name length never exceeds 255.
		union {
			// Two views of the same 20000 bytes: raw chars, or 5000 32-bit CIGAR entries.
			char cigar_buffer[20000];
			int32_t cigar[5000];
		};
	};

	// 8-byte header prefix: 4 magic bytes followed by a 32-bit text length.
	union bam_header {
		char c[8];
		struct {
			char magic[4];
			int32_t l_text;
		};
	};

	// Byte view / integer view of a single 32-bit value.
	union stream_int32 {
		char c[4];
		int32_t i;
	};

	// Sizes, in bytes, matching the unions/buffers declared above.
	static const int BAM_HEADER_BYTES = 8;
	static const int BAM_READ_CORE_BYTES = 36;
	static const int BAM_READ_CORE_MAX_CIGAR = 20000;

	FragmentBlocks oBlocks;

	// Callbacks registered through registerCallbackChrMappingChange() / registerCallbackProcessBlocks().
	// NOTE(review): the template argument lists on these declarations are incomplete in this copy.
	std::vector< std::function &)> > callbacksChrMappingChange;
	std::vector< std::function > callbacksProcessBlocks;

	// Statistics.
	ulong cShortPairs;
	ulong cIntersectPairs;
	ulong cLongPairs;
	ulong cSingleReads;
	ulong cPairedReads;
	ulong cErrorReads;
	ulong cSkippedReads;
	uint64_t totalNucleotides;
	std::map skippedReason;

	// Working storage for reads and their mates.
	// NOTE(review): presumably used to hold reads until the mate is seen -- confirm in BAM2blocks.cpp.
	std::map> tmp_reads;
	bam_read_core tmp_read;
	bam_read_core tmp_mate;
	uint64_t current_read=0;

	bool getNextReadHead(bam_read_core &);
	void errorMessage();
	void getReadBody(bam_read_core &);
	void handlePairs(bam_read_core &, bam_read_core &);
	std::string getName(bam_read_core &);
	void setMate(std::vector & mate);
	void saveMate();
	std::istream * IN;
	std::istream instream;
	// Outputs are returned through starts, lens and ret_genome_len (all passed by non-const reference).
	void cigar2block(int32_t * cigar, uint16_t n_cigar_op, std::vector &starts, std::vector &lens, int &ret_genome_len);

	unsigned int processPair(bam_read_core * read1, bam_read_core * read2);
	unsigned int processSingle(bam_read_core * read1);

	std::vector stream_buffer;
	void fillBuffer();
	std::ifstream file;
	boost::iostreams::filtering_streambuf inbuf;
	bool coord_sorted=false;
public:
	BAM2blocks();
	// Two ways of supplying the input: an already-open stream, or a file name.
	void openFile(std::istream * _IN);
	void openFile(std::string in_file);
	void readBamHeader(); // implied by openFile. So perhaps should be private.
	int processAll();

	void registerCallbackChrMappingChange( std::function &)> callback );
	void registerCallbackProcessBlocks( std::function callback );

	std::string samHeader;
	std::vector chr_names; //tab terminated chromosome names.
111 | std::vector chr_lens; //length of each chromosome (not used when reading, used if optionally outputting an altered BAM file) 112 | }; 113 | 114 | 115 | #endif 116 | -------------------------------------------------------------------------------- /src/irfinder/src/Blocks/CoverageBlock.cpp: -------------------------------------------------------------------------------- 1 | #include "CoverageBlock.h" 2 | // using namespace std; 3 | 4 | CoverageBlock::CoverageBlock(uint start, uint end) { 5 | blockStart = start; 6 | blockEnd = end; 7 | firstDepth[0] = 0; 8 | firstDepth[1] = 0; 9 | blockExtents = NULL; 10 | blockExtentsL = NULL; 11 | } 12 | 13 | //direction -- 0=False/Neg, 1=True/Pos. 14 | void CoverageBlock::RecordCover(uint readStart, uint readEnd, bool dir) { 15 | 16 | if (readStart <= blockStart && readEnd > blockStart) { 17 | firstDepth[dir]++; 18 | } else if (readStart < blockEnd) { 19 | // Need to increment the starts vector. 20 | uint inc_index = readStart - blockStart - 1; 21 | if (blockExtentsL) { //already an int vector 22 | blockExtentsL->at(inc_index).start[dir]++; 23 | } else if (!blockExtents) { //don't have a char vector either - create first. 24 | blockExtents = new std::vector(vectorLen()); 25 | blockExtents->at(inc_index).start[dir]++; 26 | } else { 27 | if (blockExtents->at(inc_index).start[dir] == 254) { 28 | blockExtentsL = new std::vector( 29 | blockExtents->begin(), blockExtents->end()); 30 | delete blockExtents; 31 | blockExtents = NULL; 32 | blockExtentsL->at(inc_index).start[dir]++; 33 | } else { 34 | blockExtents->at(inc_index).start[dir]++; 35 | } 36 | } 37 | } else { 38 | return; 39 | } 40 | 41 | if (readEnd >= blockEnd) { 42 | return; 43 | } else { 44 | // Need to increment the ends vector. 45 | uint inc_index = readEnd - blockStart - 1; 46 | 47 | if (blockExtentsL) { //already an int vector 48 | blockExtentsL->at(inc_index).end[dir]++; 49 | } else if (!blockExtents) { //don't have a char vector either - create first. 
50 | blockExtents = new std::vector(vectorLen()); 51 | blockExtents->at(inc_index).end[dir]++; 52 | } else { 53 | if (blockExtents->at(inc_index).end[dir] == 254) { 54 | blockExtentsL = new std::vector( 55 | blockExtents->begin(), blockExtents->end()); 56 | delete blockExtents; 57 | blockExtents = NULL; 58 | blockExtentsL->at(inc_index).end[dir]++; 59 | } else { 60 | blockExtents->at(inc_index).end[dir]++; 61 | } 62 | } 63 | } 64 | // Can Throw: Out of range exception. 65 | } 66 | 67 | void CoverageBlock::updateCoverageHist(std::map &hist, uint start, 68 | uint end) const { 69 | 70 | if (!blockExtentsL && !blockExtents) { 71 | // how many bases in this block? 72 | hist[firstDepth[0] + firstDepth[1]] += std::min(blockEnd, end) 73 | - std::max(blockStart, start); 74 | } else { 75 | // There are read starts and ends -- need to walk the positions from the start of this block 76 | // even if not in the region of interest. 77 | 78 | //special handling for the first base -- the one before the vector starts. 79 | uint depth = firstDepth[0] + firstDepth[1]; 80 | if (start <= blockStart) { 81 | // use the first depth, before commencing in the vector. 82 | hist[depth]++; 83 | } 84 | 85 | uint startindex = std::max(blockStart + 1, start) - blockStart - 1; 86 | uint endindex = std::min(blockEnd, end) - blockStart - 1; 87 | if (blockExtents) { 88 | for (uint i = 0; i < endindex; i++) { 89 | depth += -(*blockExtents)[i].end[0] - (*blockExtents)[i].end[1] 90 | + (*blockExtents)[i].start[0] 91 | + (*blockExtents)[i].start[1]; 92 | if (i >= startindex) { 93 | hist[depth]++; 94 | } 95 | } 96 | } else { 97 | for (uint i = 0; i < endindex; i++) { 98 | depth += -(*blockExtentsL)[i].end[0] 99 | - (*blockExtentsL)[i].end[1] 100 | + (*blockExtentsL)[i].start[0] 101 | + (*blockExtentsL)[i].start[1]; 102 | if (i >= startindex) { 103 | hist[depth]++; 104 | } 105 | } 106 | } 107 | // When in the region of interest, update the hist each step. 
108 | } 109 | } 110 | 111 | void CoverageBlock::updateCoverageHist(std::map &hist, uint start, 112 | uint end, bool dir) const { 113 | if (!blockExtentsL && !blockExtents) { 114 | // how many bases in this block? 115 | hist[firstDepth[dir]] += std::min(blockEnd, end) 116 | - std::max(blockStart, start); 117 | } else { 118 | //special handling for the first base -- the one before the vector starts. 119 | uint depth = firstDepth[dir]; 120 | if (start <= blockStart) { 121 | // use the first depth, before commencing in the vector. 122 | hist[depth]++; 123 | } 124 | 125 | uint startindex = std::max(blockStart + 1, start) - blockStart - 1; 126 | uint endindex = std::min(blockEnd, end) - blockStart - 1; 127 | if (blockExtents) { 128 | for (uint i = 0; i < endindex; i++) { 129 | depth += -(*blockExtents)[i].end[dir] 130 | + (*blockExtents)[i].start[dir]; 131 | if (i >= startindex) { 132 | hist[depth]++; 133 | } 134 | } 135 | } else { 136 | for (uint i = 0; i < endindex; i++) { 137 | depth += -(*blockExtentsL)[i].end[dir] 138 | + (*blockExtentsL)[i].start[dir]; 139 | if (i >= startindex) { 140 | hist[depth]++; 141 | } 142 | } 143 | } 144 | } 145 | } 146 | 147 | void CoverageBlock::updateCoverageArray(std::vector &arr, 148 | std::vector &covered, uint start, uint end) const { 149 | uint depth = firstDepth[0] + firstDepth[1], 150 | startindex = std::max( blockStart, start-1) - blockStart, 151 | endindex = std::min(blockEnd, end) - blockStart, 152 | startarray = std::max(blockStart+1, start ) - start , 153 | endarray = std::min(blockEnd, end) - start ; 154 | 155 | if (!blockExtentsL && !blockExtents) { 156 | for (uint i = startindex; i < endindex && startarray < endarray; 157 | i++, startarray++) { 158 | arr[startarray] += depth; 159 | covered[startarray] = true; 160 | } 161 | } else { 162 | // There are read starts and ends -- need to walk the positions from the start of this block 163 | // even if not in the region of interest. 
164 | if (blockExtents) { 165 | for (uint i = 0; i < endindex && startarray < endarray; i++) { 166 | depth += -(*blockExtents)[i].end[0] - (*blockExtents)[i].end[1] 167 | + (*blockExtents)[i].start[0] 168 | + (*blockExtents)[i].start[1]; 169 | if (i >= startindex) { 170 | arr[startarray] += depth; 171 | covered[startarray] = true; 172 | startarray++; 173 | } 174 | } 175 | } else { 176 | for (uint i = 0; i < endindex && startarray < endarray; i++) { 177 | depth += -(*blockExtentsL)[i].end[0] 178 | - (*blockExtentsL)[i].end[1] 179 | + (*blockExtentsL)[i].start[0] 180 | + (*blockExtentsL)[i].start[1]; 181 | if (i >= startindex) { 182 | arr[startarray] += depth; 183 | covered[startarray] = true; 184 | startarray++; 185 | } 186 | } 187 | } 188 | // When in the region of interest, update the hist each step. 189 | } 190 | } 191 | 192 | void CoverageBlock::updateCoverageArray(std::vector &arr, 193 | std::vector &covered, uint start, uint end, bool dir) const { 194 | 195 | uint depth = firstDepth[0] + firstDepth[1], 196 | startindex = std::max( blockStart, start-1) - blockStart, 197 | endindex = std::min(blockEnd, end) - blockStart, 198 | startarray = std::max(blockStart+1, start ) - start , 199 | endarray = std::min(blockEnd, end) - start ; 200 | if (!blockExtentsL && !blockExtents) { 201 | for (uint i = startindex; i < endindex && startarray < endarray; 202 | i++, startarray++) { 203 | arr[startarray] += depth; 204 | covered[startarray] = true; 205 | } 206 | } else { 207 | // There are read starts and ends -- need to walk the positions from the start of this block 208 | // even if not in the region of interest. 
209 | if (blockExtents) { 210 | for (uint i = 0; i < endindex && startarray < endarray; i++) { 211 | depth += -(*blockExtents)[i].end[dir] 212 | + (*blockExtents)[i].start[dir]; 213 | if (i >= startindex) { 214 | arr[startarray] += depth; 215 | covered[startarray] = true; 216 | startarray++; 217 | } 218 | } 219 | } else { 220 | for (uint i = 0; i < endindex && startarray < endarray; i++) { 221 | depth += -(*blockExtentsL)[i].end[dir] 222 | + (*blockExtentsL)[i].start[dir]; 223 | if (i >= startindex) { 224 | arr[startarray] += depth; 225 | covered[startarray] = true; 226 | startarray++; 227 | } 228 | } 229 | } 230 | // When in the region of interest, update the hist each step. 231 | } 232 | } 233 | 234 | void CoverageBlock::print(std::ostream &os) const { 235 | os << "Coverage block " << blockStart << " - " << blockEnd << "\n"; 236 | os << "First depth 0 : " << firstDepth[0] << "\n"; 237 | os << "First depth 1 : " << firstDepth[0] << "\n"; 238 | uint i=0; 239 | if (blockExtents) { 240 | os << "BlockExtents: \n"; 241 | for (auto &a : (*blockExtents)) { 242 | os << i+blockStart << " " << (uint) a.start[0] << ":" << (uint) a.start[1] << " - " << (uint) a.end[0] << ":"<< (uint) a.end[1] << "\n"; 243 | i++; 244 | } 245 | } 246 | if (blockExtentsL) { 247 | os << "BlockExtentsL: \n"; 248 | for (auto &a : (*blockExtentsL)) { 249 | os << i+blockStart << " " << (uint) a.start[0] << ":" << (uint) a.start[1] << " - " << (uint) a.end[0] << ":"<< (uint) a.end[1] << "\n"; 250 | i++; 251 | } 252 | } 253 | } 254 | 255 | std::ostream& operator<<(std::ostream &os, const CoverageBlock &cb) { 256 | cb.print(os); 257 | return os; 258 | } 259 | -------------------------------------------------------------------------------- /src/irfinder/src/Blocks/CoverageBlock.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_COVERAGEBLOCK 2 | #define CODE_COVERAGEBLOCK 3 | 4 | #include "../Utils/includedefine.h" 5 | 6 | class start_stops { 7 | public: 
8 | unsigned char start[2]; 9 | unsigned char end[2]; 10 | 11 | start_stops() { 12 | start[0]=0; 13 | start[1]=0; 14 | end[0]=0; 15 | end[1]=0; 16 | }; 17 | }; 18 | 19 | class start_stopsL { 20 | public: 21 | unsigned int start[2]; 22 | unsigned int end[2]; 23 | 24 | start_stopsL() { 25 | start[0]=0; 26 | start[1]=0; 27 | end[0]=0; 28 | end[1]=0; 29 | }; 30 | start_stopsL(const start_stops ©) { 31 | start[0]=copy.start[0]; 32 | start[1]=copy.start[1]; 33 | end[0]=copy.end[0]; 34 | end[1]=copy.end[1]; 35 | }; 36 | 37 | }; 38 | 39 | 40 | class CoverageBlock { 41 | private: 42 | uint blockStart; 43 | uint blockEnd; 44 | uint firstDepth[2]; 45 | std::vector* blockExtents; 46 | std::vector* blockExtentsL; 47 | 48 | inline uint vectorLen() { 49 | return (blockEnd - blockStart - 1); 50 | }; 51 | public: 52 | uint getLength() { return blockEnd - blockStart;} 53 | uint getStart(){return blockStart;} 54 | uint getEnd(){return blockEnd;} 55 | CoverageBlock(uint start, uint end); 56 | void RecordCover(uint start, uint end, bool dir); 57 | //RetrieveCover(..); 58 | void print(std::ostream& os) const; 59 | 60 | //First form, non-directional. Second form, directional with "dir" specifiying whether sense or anti-sense. 
61 | void updateCoverageHist(std::map &hist, uint start, uint end) const; 62 | void updateCoverageHist(std::map &hist, uint start, uint end, bool dir) const; 63 | void updateCoverageArray(std::vector &arr,std::vector & covered, uint start, uint end) const; 64 | void updateCoverageArray(std::vector &arr,std::vector & covered, uint start, uint end, bool dir) const; 65 | inline bool posIsAfterStart(const uint &compareval) const { 66 | return (compareval > blockStart); 67 | }; 68 | 69 | // http://www.learncpp.com/cpp-tutorial/94-overloading-the-comparison-operators/ 70 | // http://en.cppreference.com/w/cpp/language/operator_comparison 71 | inline bool operator<(const CoverageBlock &b) const { 72 | return (blockEnd < b.blockEnd); 73 | }; 74 | inline bool operator<(const uint &b) const { 75 | return (blockEnd < b); //a is the object. 76 | }; 77 | friend inline bool operator<(const uint &a, const CoverageBlock &b) { 78 | return (a < b.blockEnd); //a is a uint. 79 | }; 80 | }; 81 | 82 | std::ostream& operator<<( std::ostream& os, const CoverageBlock& cb); 83 | 84 | #endif 85 | -------------------------------------------------------------------------------- /src/irfinder/src/Blocks/FragmentBlocks.cpp: -------------------------------------------------------------------------------- 1 | #include "FragmentBlocks.h" 2 | 3 | 4 | // This class is an information storage container only -- pretty much a struct. 5 | // It allows all the relevant information relating to an interpreted fragment to be passed 6 | // to the variety of callback watchers that require fragment blocks to update their stats. 7 | 8 | FragmentBlocks::FragmentBlocks() { 9 | rStarts[0].reserve(initial_alloc); 10 | rLens[0].reserve(initial_alloc); 11 | rStarts[1].reserve(initial_alloc); 12 | rLens[1].reserve(initial_alloc); 13 | readName.reserve(max_read_name); 14 | readCount = 0; 15 | } 16 | 17 | // Return a string representation of the Chromosome name. 
const std::string FragmentBlocks::chrName() const {
	// vector::at is bounds-checked: throws std::out_of_range when chr_id is not a valid index into chr_names.
	return chr_names.at(chr_id);
}

// Update the internal data structure with a new mapping between Chromosome ID# and Chromosome name (string).
void FragmentBlocks::ChrMapUpdate(const std::vector &chrmap) {
	// Replaces the stored names with a copy of chrmap.
	chr_names = chrmap;
}

--------------------------------------------------------------------------------
/src/irfinder/src/Blocks/FragmentBlocks.h:
--------------------------------------------------------------------------------
#ifndef CODE_FRAGMENTBLOCKS
#define CODE_FRAGMENTBLOCKS

#include "../Utils/includedefine.h"

/* A class to store up to 2 reads belonging to a single fragment.
 * It is a storage class, almost a struct, it does not perform processing itself.
 * Read1 is always valid.
 * Read2 is only valid if readCount == 2.
 *
 * There may only be a single read if:
 *  - the sequencing is single end rather than paired end..
 *  - the sequencing is paired end, but the two reads overlapped and have been combined
 *    into a single synthetic read / block of coverage.
 */
class FragmentBlocks {
	private:
		// Capacities reserved by the constructor for the block vectors and for readName.
		static const int initial_alloc = 100;
		static const int max_read_name = 300;
		// Indexed by chr_id in chrName(); replaced wholesale by ChrMapUpdate().
		std::vector chr_names; //TODO - this is currently unused??
	public:
		FragmentBlocks();
		const std::string chrName() const;
		void ChrMapUpdate(const std::vector& chrmap);

		// Index 0 / 1 of the [2] arrays below selects read 1 / read 2 of the fragment.
		std::string readName;
		std::vector rStarts[2];
		std::vector rLens[2];
		uint readStart[2];
		uint readEnd[2];
		int readCount;	// set to 0 by the constructor; read 2 is only valid when this is 2.
		uint chr_id; // Assumption that both r1 & r2 are on the same chromosome?
					//   if they aren't we shouldn't process them as a single fragment.
					//   perhaps a sanity check in pairing, only treat them as a pair
					//   if the name of the reads matches and the Chr matches.
		bool direction;
};

#endif

--------------------------------------------------------------------------------
/src/irfinder/src/ReadBlock/CoverageBlocks.h:
--------------------------------------------------------------------------------
#ifndef CODE_READBLOCKPROCESSOR_COVERAGEBLOCKS
#define CODE_READBLOCKPROCESSOR_COVERAGEBLOCKS

#include "../Blocks/CoverageBlock.h"
#include "ReadBlockProcessor.h"
#include "../Blocks/FragmentBlocks.h"

// One region of interest: chromosome, name, extent, strand flag and its list of blocks.
// NOTE(review): presumably filled from a blocked BED line by CoverageBlocks::loadRef() -- confirm in CoverageBlocks.cpp.
struct BEDrecord {
	std::string chrName;
	std::string name;
	uint start;
	uint end;
	bool direction;
	std::vector> blocks;
};


/*
 * ReadBlockProcessor that keeps per-chromosome coverage blocks and exposes histogram and
 * per-position coverage queries over them. Only the declarations are visible here; the
 * member functions are defined elsewhere.
 */
class CoverageBlocks : public ReadBlockProcessor {
	//Store the Blocked BED record for each ROI/intron. This won't be referred to again until the end.
	//XX Create the temporary vectors (per Chr) which simply list the blocks sequentially as read.
	//XX Sort the temporary vectors
	//XX Build the final vectors of "blocks of interest"
	//xx Delete the temporary vectors
	//xx Create the parallel vectors with counter objects. (do these as a batch at the end, once vector size is known - for best memory layout)
	//xx Process fragments against the counter structure. (have I already written a class/object for this?)

	//Produce summary statistical output for each Blocked BED record, using the counter structure.

	protected:

		// Coverage depth data-structures.
		std::map> chrName_CoverageBlocks;
		std::map> chrName_FlankCoverageBlocks;
		// Shortcut pointers to depth data-structures.
		std::vector*> chrID_CoverageBlocks;
		std::vector*> chrID_FlankCoverageBlocks;

		// TODO: what is optimal for speed & memory usage?
		// static const uint coverage_block_max_length = 5000;
		static const uint coverage_block_max_length = 500;


		std::vector BEDrecords;
		bool long_read=false;	// true when the constructor argument equals "LR".
		int jitter = 3;			// default; changed through setJitter().

	public:
		// read_type: the string "LR" switches long_read on; any other value leaves it off.
		CoverageBlocks(std::string read_type) {
			long_read = read_type == "LR";
		}
		void setJitter(int j){jitter=j;};
		void ProcessBlocks(const FragmentBlocks &fragblock);
		void ChrMapUpdate(const std::vector &chrmap);
		void loadRef(std::istream &IN);
		int WriteOutput(std::ostream *os) const;

		// Each query below comes in two forms: non-directional, and directional with a trailing `direction` flag.
		void fillHist(std::map &hist, const std::string &chrName, const std::vector> &blocks) const;
		void fillHist(std::map &hist, const std::string &chrName, const std::vector> &blocks, bool direction) const;
		void getCoverageArray(std::vector &coverages,
				std::vector & covered,
				const std::string &chrName,
				const uint arr_start, const uint arr_end) const;
		void getCoverageArray(std::vector &coverages,
				std::vector & covered,
				const std::string &chrName,
				const uint arr_start, const uint arr_end,
				bool direction) const;
		// Summary statistics computed from a histogram produced by fillHist().
		double meanFromHist(const std::map &hist) const;
		double coverageFromHist(const std::map &hist) const;
		double percentileFromHist(const std::map &hist, uint percentile) const;
		double trimmedMeanFromHist(const std::map &hist, uint centerPercent) const;
};

/*
 * CoverageBlocks variant with three extra "AI_" settings and a WriteOutput() overload
 * that also takes junction and spanning-read counters.
 * NOTE(review): this WriteOutput overload hides the one-argument CoverageBlocks::WriteOutput
 * for calls made through this class (ordinary C++ name hiding) -- confirm that is intended.
 */
class CoverageBlocksIRFinder : public CoverageBlocks {
	private:
		// Defaults for the values stored by setAI().
		uint AI_warn=0;
		uint AI_intron=1;
		double AI_ratio=0.05;
	public:

		CoverageBlocksIRFinder(std::string read_type) : CoverageBlocks(read_type){
		}
		// Stores the three arguments into AI_warn, AI_intron and AI_ratio respectively.
		void setAI(uint AI_warning_level, uint AI_min_intron_coverage, double AI_IRratio){
			AI_warn=AI_warning_level;
			AI_intron=AI_min_intron_coverage;
			AI_ratio=AI_IRratio;
		}
		int WriteOutput(std::ostream *os, std::ostream *osAI, const JunctionCount &JC, const SpansPoint &SP, int directionality = 0) const;

};


#endif 94 | -------------------------------------------------------------------------------- /src/irfinder/src/ReadBlock/ReadBlockProcessor.h: -------------------------------------------------------------------------------- 1 | #ifndef CODE_READBLOCKPROCESSOR 2 | #define CODE_READBLOCKPROCESSOR 3 | 4 | #include "../Blocks/FragmentBlocks.h" 5 | 6 | /* 7 | The code can be finished faster if we force a requirement that all input files are coordinate sorted by the start of each block. 8 | ie: sort -k2,2n (for BED files). 9 | Chromosome sorted or not won't matter, as these get split into different vectors in all cases. 10 | */ 11 | 12 | 13 | 14 | class ReadBlockProcessor { 15 | public: 16 | virtual void ProcessBlocks(const FragmentBlocks &fragblock) = 0; 17 | virtual void ChrMapUpdate(const std::vector &chrmap) = 0; //Maybe some of these funcs shouldn't be pure virtual - overloadable if needed, but default often ok. 18 | }; 19 | 20 | 21 | class BED12Output : public ReadBlockProcessor { 22 | private: 23 | std::vector chr_names; 24 | std::ostream* out; 25 | public: 26 | void ProcessBlocks(const FragmentBlocks &fragblock); 27 | void ChrMapUpdate(const std::vector &chrmap); 28 | void SetOutputStream(std::ostream *os); 29 | }; 30 | 31 | 32 | class JunctionCount : public ReadBlockProcessor { 33 | private: 34 | std::map,uint[3]>> chrName_junc_count; 35 | std::vector,uint[3]>*> chrID_junc_count; 36 | //uint[3] - 0, neg strand count; 1, pos strand count; 2 = expected direction from ref: 0=unknown, 1=neg, 2=pos. 37 | 38 | std::map> chrName_juncLeft_count; 39 | std::vector*> chrID_juncLeft_count; 40 | 41 | std::map> chrName_juncRight_count; 42 | std::vector*> chrID_juncRight_count; 43 | //chrID_... stores a fast access pointer to the appropriate structure in chrName_... 
44 | public: 45 | void ProcessBlocks(const FragmentBlocks &fragblock); 46 | void ChrMapUpdate(const std::vector &chrmap); 47 | int WriteOutput(std::ostream *os) const; 48 | void loadRef(std::istream &IN); //loadRef is optional, it allows directional detection to determine not just non-dir vs dir, but also which direction. 49 | 50 | int Directional() const; 51 | 52 | uint lookup(std::string ChrName, uint left, uint right, bool direction) const; 53 | uint lookup(std::string ChrName, uint left, uint right) const; 54 | uint lookupLeft(std::string ChrName, uint left, bool direction) const; 55 | uint lookupLeft(std::string ChrName, uint left) const; 56 | uint lookupRight(std::string ChrName, uint right, bool direction) const; 57 | uint lookupRight(std::string ChrName, uint right) const; 58 | 59 | // Ideally we would read the XS junction strand attribute from the BAM if we want to count junctions from non-directional sequencing. 60 | // that will require BAM2blocks to be informed it should read the optional attributes looking for that attrib in that case. 61 | // -- or we can just ignore direction -- the splice start/end information effectively determines the XS info (by ref to the reference) 62 | }; 63 | 64 | 65 | class SpansPoint : public ReadBlockProcessor { 66 | private: 67 | std::map> chrName_pos; 68 | std::map> chrName_count[2]; 69 | std::vector*> chrID_pos; 70 | std::vector*> chrID_count[2]; 71 | char overhangLeft; 72 | char overhangRight; 73 | char overhangTotal; 74 | //chrID_... stores a fast access pointer to the appropriate structure in chrName_... 
75 | public: 76 | void setSpanLength(uint overhang_left, uint overhang_right); 77 | void loadRef(std::istream &IN); 78 | void ProcessBlocks(const FragmentBlocks &fragblock); 79 | void ChrMapUpdate(const std::vector &chrmap); 80 | //void SetOutputStream(std::ostream *os); 81 | int WriteOutput(std::ostream *os) const; 82 | uint lookup(std::string ChrName, uint pos, bool direction) const; 83 | uint lookup(std::string ChrName, uint pos) const; 84 | }; 85 | 86 | class FragmentsInChr : public ReadBlockProcessor { 87 | // Counts the number of fragments in each Chromosome. (for both + & - strands). 88 | private: 89 | std::map> chrName_count; //only expecting 2 items in our vector. 90 | std::vector*> chrID_count; 91 | public: 92 | void ProcessBlocks(const FragmentBlocks &blocks); 93 | void ChrMapUpdate(const std::vector &chrmap); 94 | int WriteOutput(std::ostream *os) const; 95 | }; 96 | 97 | 98 | class FragmentsInROI : public ReadBlockProcessor { 99 | // Counts the number of fragments fully contained within a ROI. 100 | // the ROIs may not overlap. Direction ignored for overlap detect. 101 | private: 102 | std::map RegionID_counter[2]; 103 | 104 | std::map>> chrName_ROI; 105 | std::map> chrName_count[2]; 106 | 107 | std::vector>*> chrID_ROI; 108 | std::vector*> chrID_count[2]; 109 | 110 | // Perhaps we want to store some text relating to each record too? Easy to do if the input is pre-sorted (at least within each Chr). 111 | // if pre-sorted, it may be easier to check for no overlapping blocks on read .. or can do this immediately after read with a single nested-walk. 112 | std::map> chrName_ROI_text; 113 | public: 114 | void ProcessBlocks(const FragmentBlocks &blocks); 115 | void ChrMapUpdate(const std::vector &chrmap); 116 | void loadRef(std::istream &IN); 117 | int WriteOutput(std::ostream *os) const; 118 | }; 119 | 120 | 121 | /* 122 | class CoverageBlocks : public ReadBlockProcessor { ... } 123 | // In it's own file -- bigger code. 
124 | */ 125 | 126 | #endif 127 | -------------------------------------------------------------------------------- /src/irfinder/src/Utils/crc32.h: -------------------------------------------------------------------------------- 1 | // ////////////////////////////////////////////////////////// 2 | // crc32.h 3 | // Copyright (c) 2014 Stephan Brumme. All rights reserved. 4 | // see http://create.stephan-brumme.com/disclaimer.html 5 | // 6 | 7 | #pragma once 8 | 9 | //#include "hash.h" 10 | #include 11 | 12 | // define fixed size integer types 13 | #ifdef _MSC_VER 14 | // Windows 15 | typedef unsigned __int8 uint8_t; 16 | typedef unsigned __int32 uint32_t; 17 | #else 18 | // GCC 19 | #include 20 | #endif 21 | 22 | 23 | /// compute CRC32 hash, based on Intel's Slicing-by-8 algorithm 24 | /** Usage: 25 | CRC32 crc32; 26 | std::string myHash = crc32("Hello World"); // std::string 27 | std::string myHash2 = crc32("How are you", 11); // arbitrary data, 11 bytes 28 | 29 | // or in a streaming fashion: 30 | 31 | CRC32 crc32; 32 | while (more data available) 33 | crc32.add(pointer to fresh data, number of new bytes); 34 | std::string myHash3 = crc32.getHash(); 35 | */ 36 | class CRC32 //: public Hash 37 | { 38 | public: 39 | /// same as reset() 40 | CRC32(); 41 | 42 | /// compute CRC32 of a memory block 43 | std::string operator()(const void* data, size_t numBytes); 44 | /// compute CRC32 of a string, excluding final zero 45 | std::string operator()(const std::string& text); 46 | 47 | /// add arbitrary number of bytes 48 | void add(const void* data, size_t numBytes); 49 | 50 | /// return latest hash as 16 hex characters 51 | std::string getHash(); 52 | 53 | /// return latest hash as a raw 32 bit integer 54 | uint32_t getRawHash(); 55 | 56 | /// restart 57 | void reset(); 58 | 59 | private: 60 | /// hash 61 | uint32_t m_hash; 62 | }; 63 | -------------------------------------------------------------------------------- /src/irfinder/src/Utils/includedefine.h: 
-------------------------------------------------------------------------------- 1 | #ifndef INCLUDEDEFINE_DEF 2 | #define INCLUDEDEFINE_DEF 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include // std::sort 15 | #include // std::function 16 | #include 17 | 18 | 19 | //__asm__(".symver memcpy,memcpy@GLIBC_2.2.5"); 20 | 21 | 22 | 23 | 24 | #define DEF_lineLengthMax 10000 25 | #define DEF_adaptLengthMax 500 26 | 27 | 28 | // infix_iterator.h 29 | // 30 | // Lifted from Jerry Coffin's 's prefix_ostream_iterator 31 | template > 34 | class infix_ostream_iterator : 35 | public std::iterator 36 | { 37 | std::basic_ostream *os; 38 | charT const* delimiter; 39 | bool first_elem; 40 | public: 41 | typedef charT char_type; 42 | typedef traits traits_type; 43 | typedef std::basic_ostream ostream_type; 44 | infix_ostream_iterator(ostream_type& s) 45 | : os(&s),delimiter(0), first_elem(true) 46 | {} 47 | infix_ostream_iterator(ostream_type& s, charT const *d) 48 | : os(&s),delimiter(d), first_elem(true) 49 | {} 50 | infix_ostream_iterator& operator=(T const &item) 51 | { 52 | // Here's the only real change from ostream_iterator: 53 | // Normally, the '*os << item;' would come before the 'if'. 
54 | if (!first_elem && delimiter != 0) 55 | *os << delimiter; 56 | *os << item; 57 | first_elem = false; 58 | return *this; 59 | } 60 | infix_ostream_iterator &operator*() { 61 | return *this; 62 | } 63 | infix_ostream_iterator &operator++() { 64 | return *this; 65 | } 66 | infix_ostream_iterator &operator++(int) { 67 | return *this; 68 | } 69 | }; 70 | 71 | 72 | 73 | #endif 74 | -------------------------------------------------------------------------------- /src/trim/Makefile: -------------------------------------------------------------------------------- 1 | OBJECTS := TrimReads.o sequenceTools.o trim.o 2 | SOURCES=$(wildcard *.cpp) 3 | LDFLAGS := 4 | LDFLAGS_static := -static -static-libgcc 5 | LDFLAGS_GDB := 6 | OPTIMFLAGS := 7 | OPTIMFLAGS1 := 8 | # below flags make little difference. 9 | #OPTIMFLAGS=-ffast-math 10 | #OPTIMFLAGS=-fforce-addr -funsafe-loop-optimizations -ftree-loop-linear -ftree-vectorize 11 | #OPTIMFLAGS=-fforce-addr -funsafe-loop-optimizations -ftree-vectorize 12 | # unroll-loops slows the program ~6%. 
13 | #OPTIMFLAGS1=-funroll-loops -fprefetch-loop-arrays 14 | SVNDEF := -D'SVN_VERSION_COMPILED="magictrim0.1"' 15 | COMPTIMEPLACE := -D'COMPILATION_TIME_PLACE="$(shell echo `date` `hostname --fqdn`:`pwd`)"' 16 | CCFLAGS_MAIN := -pipe -std=c++0x -O3 -Wall -Wextra -fopenmp $(SVNDEF) $(COMPTIMEPLACE) $(OPTIMFLAGS) $(OPTIMFLAGS1) 17 | CCFLAGS_GDB := -pipe -std=c++0x -g -O0 -Wall -Wextra -fopenmp $(SVNDEF) $(COMPTIMEPLACE) 18 | CC :=g++ 19 | GCC:=gcc 20 | 21 | %.o : %.cpp 22 | $(CC) -c $(CCFLAGS) $< 23 | 24 | all: trimstatic 25 | 26 | clean : 27 | rm -f *.o trim Depend.list 28 | 29 | ifneq ($(MAKECMDGOALS),clean) 30 | Depend.list: $(SOURCES) 31 | /bin/rm -f ./Depend.list 32 | $(CC) $(CCFLAGS_MAIN) -MM $^ >> Depend.list 33 | include Depend.list 34 | endif 35 | 36 | trim : CCFLAGS=$(CCFLAGS_MAIN) 37 | trim : $(OBJECTS) 38 | $(CC) -o trim $(CCFLAGS) $(OBJECTS) $(LDFLAGS) 39 | 40 | trimstatic : CCFLAGS=$(CCFLAGS_MAIN) 41 | trimstatic : $(OBJECTS) 42 | $(CC) -o trim $(CCFLAGS) $(OBJECTS) $(LDFLAGS_static) 43 | 44 | gdb : CCFLAGS= $(CCFLAGS_GDB) 45 | gdb : $(OBJECTS) 46 | $(CC) -o trim $(CCFLAGS_GDB) $(OBJECTS) $(LDFLAGS_GDB) 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/trim/TrimReads.cpp: -------------------------------------------------------------------------------- 1 | #include "TrimReads.h" 2 | 3 | 4 | TrimReads::TrimReads (istream * _IF1, istream * _IF2, ostream * _OF1, ostream * _OF2, ostream * _DebugLog, char * Adapt1, char * Adapt2, char _debug) 5 | { 6 | IF1 = _IF1; 7 | IF2 = _IF2; 8 | OF1 = _OF1; 9 | OF2 = _OF2; 10 | OFdebug = _DebugLog; 11 | debug = _debug; 12 | 13 | time(&startTime); 14 | cout << "Started trimming at: " << timeMonthDayTime(startTime) << endl; 15 | 16 | 17 | Compare1 = new char[DEF_lineLengthMax + DEF_adaptLengthMax]; 18 | Compare2 = new char[DEF_lineLengthMax + DEF_adaptLengthMax]; 19 | 20 | lAdapt1 = strlen(Adapt1); 21 | lAdapt2 = strlen(Adapt2); 22 | 23 | seqToNumRevComp(Adapt2, Compare1, 
lAdapt2); 24 | seqToNumRev(Adapt1, Compare2, lAdapt1); 25 | 26 | Compare1Seq = Compare1 + lAdapt2; //Set pointer inside Compare1 just after the adapter - copy Sequence into this location each time. 27 | Compare2Seq = Compare2 + lAdapt1; 28 | 29 | lAdapt1prefix = 0; 30 | while (lAdapt1prefix <= lAdapt1) { 31 | if (Compare2[lAdapt1-1-lAdapt1prefix] != 0) break; 32 | lAdapt1prefix++; 33 | } 34 | lAdapt2prefix = 0; 35 | while (lAdapt2prefix <= lAdapt2) { 36 | if (Compare1[lAdapt2-1-lAdapt2prefix] != 0) break; 37 | lAdapt2prefix++; 38 | } 39 | 40 | minSpan = lAdapt1 + lAdapt2 - lAdapt1prefix - lAdapt2prefix; 41 | 42 | chunk1 = new char[DEF_lineLengthMax*4]; 43 | chunk2 = new char[DEF_lineLengthMax*4]; 44 | 45 | countInputReads = 0; 46 | countOutputTrimmed = 0; 47 | countOutputUntrimmed = 0; 48 | countOutputTooShort = 0; 49 | 50 | }; 51 | 52 | int TrimReads::trimAll() 53 | { 54 | // *OFdebug << "trimAll" << endl; 55 | while(IF1->peek() == '@') { 56 | 57 | countInputReads++; 58 | 59 | uint chunkLen1 = 0; 60 | uint chunkLen2 = 0; 61 | 62 | IF1->getline(chunk1, DEF_lineLengthMax); 63 | chunkLen1 += IF1->gcount(); 64 | chunk1[chunkLen1-1] = '\n'; 65 | Seq1 = chunk1 + chunkLen1; 66 | uint lName1 = IF1->gcount() - 1; 67 | 68 | IF1->getline(chunk1 + chunkLen1, DEF_lineLengthMax); 69 | chunkLen1 += IF1->gcount(); 70 | chunk1[chunkLen1-1] = '\n'; 71 | uint lR1 = IF1->gcount() - 1; 72 | 73 | IF1->getline(chunk1 + chunkLen1, DEF_lineLengthMax); 74 | chunkLen1 += IF1->gcount(); 75 | chunk1[chunkLen1-1] = '\n'; 76 | uint lQName1 = IF1->gcount() - 1; 77 | 78 | IF1->getline(chunk1 + chunkLen1, DEF_lineLengthMax); 79 | chunkLen1 += IF1->gcount(); 80 | chunk1[chunkLen1-1] = '\n'; 81 | uint lQual1 = IF1->gcount() - 1; 82 | 83 | 84 | IF2->getline(chunk2, DEF_lineLengthMax); 85 | chunkLen2 += IF2->gcount(); 86 | chunk2[chunkLen2-1] = '\n'; 87 | Seq2 = chunk2 + chunkLen2; 88 | uint lName2 = IF2->gcount() - 1; 89 | 90 | IF2->getline(chunk2 + chunkLen2, DEF_lineLengthMax); 91 | chunkLen2 += 
IF2->gcount(); 92 | chunk2[chunkLen2-1] = '\n'; 93 | uint lR2 = IF2->gcount() - 1; 94 | 95 | IF2->getline(chunk2 + chunkLen2, DEF_lineLengthMax); 96 | chunkLen2 += IF2->gcount(); 97 | chunk2[chunkLen2-1] = '\n'; 98 | uint lQName2 = IF2->gcount() - 1; 99 | 100 | IF2->getline(chunk2 + chunkLen2, DEF_lineLengthMax); 101 | chunkLen2 += IF2->gcount(); 102 | chunk2[chunkLen2-1] = '\n'; 103 | uint lQual2 = IF2->gcount() - 1; 104 | 105 | 106 | if (lR1 != lQual1 || lR2 != lQual2) { 107 | //*OFdebug << "FATAL: a quality line has different length to Sequence line - corrupt input file." << endl; 108 | cout << "FATAL: a quality line has different length to Sequence line - corrupt input file." << endl; 109 | cout << "Error at read number: " << countInputReads << endl; 110 | return(1); 111 | } 112 | 113 | seqToNum(Seq1, Compare1Seq, lR1); 114 | seqToNumComp(Seq2, Compare2Seq, lR2); 115 | 116 | uint maxOverlap = minSpan+max(lR1+lAdapt2prefix, lR2+lAdapt1prefix); 117 | // uint overlapPos = localAlign(Compare1, lR1+lAdapt2, Compare2, lR2+lAdapt1, lAdapt1+lAdapt2, maxOverlap); 118 | uint overlapPos = localAlign(Compare1, lR1+lAdapt2, Compare2, lR2+lAdapt1, minSpan, maxOverlap); 119 | 120 | uint InsertLen = overlapPos-lAdapt1-lAdapt2+lAdapt1prefix+lAdapt2prefix; 121 | 122 | if (insertLenDistribution.size() <= InsertLen) { 123 | insertLenDistribution.resize( InsertLen+1 ,0); 124 | } 125 | 126 | insertLenDistribution.at(InsertLen)++; 127 | 128 | if (maxOverlap == overlapPos) { 129 | //No trimming, output exactly what we read - as a single fast block. 130 | OF1->write(chunk1, chunkLen1); 131 | OF2->write(chunk2, chunkLen2); 132 | countOutputUntrimmed++; 133 | }else if (InsertLen < 30) { 134 | // Don't output, too short. 
135 | countOutputTooShort++; 136 | }else{ 137 | if (!debug) { 138 | chunk1[lName1] = '\0'; 139 | uint firstSpace = strcspn(chunk1," "); 140 | chunk1[lName1] = '\n'; 141 | OF1->write(chunk1, firstSpace); 142 | *OF1 << "_INS_" << InsertLen; 143 | OF1->write(chunk1+firstSpace, lName1+1-firstSpace+min(InsertLen-lAdapt2prefix,lR1)); 144 | OF1->write(chunk1+lName1+lR1+1, lQName1+2+min(InsertLen-lAdapt2prefix,lR1)); 145 | OF1->write("\n",1); 146 | 147 | chunk2[lName2] = '\0'; 148 | firstSpace = strcspn(chunk2," "); 149 | chunk2[lName2] = '\n'; 150 | OF2->write(chunk2, firstSpace); 151 | *OF2 << "_INS_" << InsertLen; 152 | OF2->write(chunk2+firstSpace, lName2+1-firstSpace+min(InsertLen-lAdapt1prefix,lR2)); 153 | OF2->write(chunk2+lName2+lR2+1, lQName2+2+min(InsertLen-lAdapt1prefix,lR2)); 154 | OF2->write("\n",1); 155 | }else{ 156 | chunk1[lName1] = '\0'; 157 | uint firstSpace = strcspn(chunk1," "); 158 | chunk1[lName1] = '\n'; 159 | OF1->write(chunk1, firstSpace); 160 | *OF1 << "_INS_" << InsertLen; 161 | 162 | for(int i = lName1+1+min(InsertLen-lAdapt2prefix,lR1); i<(lName1+1+lR1); i++){ 163 | chunk1[i] = tolower(chunk1[i]); 164 | } 165 | OF1->write(chunk1+firstSpace, lName1+1-firstSpace+lR1); 166 | OF1->write(chunk1+lName1+lR1+1, lQName1+2+lR1); 167 | OF1->write("\n",1); 168 | 169 | chunk2[lName2] = '\0'; 170 | firstSpace = strcspn(chunk2," "); 171 | chunk2[lName2] = '\n'; 172 | OF2->write(chunk2, firstSpace); 173 | *OF2 << "_INS_" << InsertLen; 174 | 175 | for(int i = lName2+1+min(InsertLen-lAdapt1prefix,lR2); i<(lName2+1+lR2); i++){ 176 | chunk2[i] = tolower(chunk2[i]); 177 | } 178 | OF2->write(chunk2+firstSpace, lName2+1-firstSpace+lR2); 179 | OF2->write(chunk2+lName2+lR2+1, lQName2+2+lR2); 180 | OF2->write("\n",1); 181 | } 182 | 183 | countOutputTrimmed++; 184 | } 185 | 186 | 187 | } 188 | 189 | // Output summary statistics. 
190 | time(&endTime); 191 | 192 | cout << "Completed trimming at: " << timeMonthDayTime(endTime) << endl; 193 | cout << double(countInputReads)/1e6/difftime(endTime,startTime)*3600 << "\tTrimming speed, million reads per hour" << endl; 194 | 195 | ios::fmtflags old_output_settings = cout.flags(); 196 | cout << fixed << setprecision(4); 197 | cout << (countOutputTrimmed+countOutputTooShort)/double(countInputReads)*100 << "\t% with adaptor" << endl; 198 | cout.flags(old_output_settings); 199 | cout << countInputReads << "\tTotal input reads" << endl; 200 | cout << (countOutputUntrimmed+countOutputTrimmed) << "\tTotal output reads" << endl; 201 | cout << countOutputUntrimmed << "\tTotal unmodified reads output" << endl; 202 | cout << countOutputTrimmed << "\tTotal trimmed reads output" << endl; 203 | cout << countOutputTooShort << "\tTotal trimmed reads too short" << endl; 204 | cout << endl; 205 | cout << "------ Insert length distribution ------" << endl; 206 | cout << "Length\tCount" << endl; 207 | 208 | for (uint i=0; i insertLenDistribution; 29 | 30 | time_t startTime, endTime; 31 | 32 | istream* IF1; 33 | istream* IF2; 34 | ostream* OF1; 35 | ostream* OF2; 36 | ostream* OFdebug; 37 | char debug; 38 | 39 | public: 40 | TrimReads(istream * _IF1, istream * _IF2, ostream * _OF1, ostream * _OF2, ostream * _DebugLog, char * Adapt1, char * Adapt2, char _debug = 0); 41 | int trimAll(); 42 | }; 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /src/trim/includedefine.h: -------------------------------------------------------------------------------- 1 | #ifndef INCLUDEDEFINE_DEF 2 | #define INCLUDEDEFINE_DEF 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | 14 | 15 | #define DEF_lineLengthMax 10000 16 | #define DEF_adaptLengthMax 500 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- 
/src/trim/sequenceTools.cpp: -------------------------------------------------------------------------------- 1 | #include "sequenceTools.h" 2 | 3 | uint localAlign(const char *x, uint nx, const char *y, uint ny, uint minspan, uint maxspan) 4 | { 5 | // Expecting x, a numeric Seq string. y, a pre-complemented numeric Seq string. Both are expected in their original direction. 6 | uint nMatch; 7 | int nMMcounter; 8 | double nScore; 9 | double nScoreBest=0; 10 | uint spanBest=maxspan; 11 | uint ixbegin; 12 | uint ixlimit; 13 | // Min score = 0.8 therefore, at worst position (integer comparison): 14 | //uint maxMismatch = (maxspan/10) + 1; 15 | uint maxMismatch; 16 | 17 | for (uint span=minspan; span<=maxspan; span++ ) { 18 | nMatch=0; 19 | ixbegin = max(0,int(span)-int(ny)); 20 | ixlimit = min(span,nx); 21 | maxMismatch = ((ixlimit-ixbegin)/10) + 1; 22 | nMMcounter=maxMismatch; 23 | for (uint ix=ixbegin; ix= 0) { 36 | nScore = ((double)(nMatch - maxMismatch + nMMcounter))/(ixlimit-ixbegin); 37 | if (nScore >= 0.8 && nScore > nScoreBest) { 38 | nScoreBest = nScore; 39 | spanBest=span; 40 | } 41 | } 42 | } 43 | 44 | return spanBest; 45 | } 46 | 47 | void seqToNum(const char* in, char* out, uint nin) // do we really need length, or just run until \0? 
48 | { 49 | for (uint jj=0;jjtrimAll(); 31 | 32 | OUT1.flush(); 33 | OUT2.flush(); 34 | 35 | IN1.close(); 36 | IN2.close(); 37 | 38 | OUT1.close(); 39 | OUT2.close(); 40 | 41 | exit(success); 42 | // f1InStream.getline 43 | }; 44 | -------------------------------------------------------------------------------- /src/winflat/Makefile: -------------------------------------------------------------------------------- 1 | all: winflat test 2 | 3 | winflat: winflat_with_beta.c 4 | $(CC) -o winflat winflat_with_beta.c -lm 5 | 6 | test: winflat 7 | sh runtest.sh 2> /dev/null 8 | 9 | clean: 10 | rm -f *.o winflat -------------------------------------------------------------------------------- /src/winflat/README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/src/winflat/README -------------------------------------------------------------------------------- /src/winflat/runtest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | echo -e " " \\t 2E=0.01 \\t 2E=0.005 3 | echo -e x \\t Ymin--Ymax \\t Ymin--Ymax 4 | for i in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 5 | do 6 | echo -e $i \\t `./winflat -xvalue $i -sig 0.01 | awk '{print $2}'` \\t\\t `./winflat -xvalue $i -sig 0.05 | awk '{print $2}'` 7 | done 8 | exit 0 9 | -------------------------------------------------------------------------------- /src/winflat/winflat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RitchieLabIGH/IRFinder/c183bff83abe85b19cbf488808ef884c566a1b0d/src/winflat/winflat -------------------------------------------------------------------------------- /src/winflat/winflat_with_beta.c: -------------------------------------------------------------------------------- 1 | /* To compile cc -o winflat_with_beta winflat_with_beta.c -lm */ 2 | /* 
Copyright Stephane Audic 2003 */ 3 | /* With code included from Numerical Recipes in C */ 4 | 5 | 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | 12 | 13 | #define MAXIT 500 14 | #define EPS 3.0e-30 15 | #define FPMIN 1.0e-30 16 | 17 | double betacf(double a, double b, double x); 18 | double gammln(double xx); 19 | double betai(double a, double b, double x); 20 | 21 | double betacf(double a, double b, double x) 22 | { 23 | /* void nrerror(char error_text[]); */ 24 | int m,m2; 25 | double aa,c,d,del,h,qab,qam,qap; 26 | 27 | qab=a+b; 28 | qap=a+1.0; 29 | qam=a-1.0; 30 | c=1.0; 31 | d=1.0-qab*x/qap; 32 | if (fabs(d) < FPMIN) d=FPMIN; 33 | d=1.0/d; 34 | h=d; 35 | for (m=1;m<=MAXIT;m++) { 36 | m2=2*m; 37 | aa=m*(b-m)*x/((qam+m2)*(a+m2)); 38 | d=1.0+aa*d; 39 | if (fabs(d) < FPMIN) d=FPMIN; 40 | c=1.0+aa/c; 41 | if (fabs(c) < FPMIN) c=FPMIN; 42 | d=1.0/d; 43 | h *= d*c; 44 | aa = -(a+m)*(qab+m)*x/((a+m2)*(qap+m2)); 45 | d=1.0+aa*d; 46 | if (fabs(d) < FPMIN) d=FPMIN; 47 | c=1.0+aa/c; 48 | if (fabs(c) < FPMIN) c=FPMIN; 49 | d=1.0/d; 50 | del=d*c; 51 | h *= del; 52 | if (fabs(del-1.0) < EPS) break; 53 | } 54 | if (m > MAXIT){ 55 | fprintf( stderr , "a or b too big, or MAXIT too small in betacf"); 56 | exit(1) ; 57 | } 58 | return h; 59 | } 60 | #undef MAXIT 61 | #undef EPS 62 | #undef FPMIN 63 | 64 | 65 | double gammln(double xx) 66 | { 67 | double x,y,tmp,ser; 68 | static double cof[6]={76.18009172947146,-86.50532032941677, 69 | 24.01409824083091,-1.231739572450155, 70 | 0.1208650973866179e-2,-0.5395239384953e-5}; 71 | int j; 72 | 73 | y=x=xx; 74 | tmp=x+5.5; 75 | tmp -= (x+0.5)*log(tmp); 76 | ser=1.000000000190015; 77 | for (j=0;j<=5;j++) ser += cof[j]/++y; 78 | return -tmp+log(2.5066282746310005*ser/x); 79 | } 80 | 81 | 82 | 83 | double betai(double a, double b, double x) 84 | { 85 | double bt; 86 | 87 | if (x < 0.0 || x > 1.0) { 88 | fprintf( stderr , "Bad x in routine betai") ; 89 | exit(1) ; 90 | } 91 | if (x == 0.0 || x == 1.0) bt=0.0; 92 | else 93 | 
bt=exp(gammln(a+b)-gammln(a)-gammln(b)+a*log(x)+b*log(1.0-x)); 94 | if (x < (a+1.0)/(a+b+2.0)) 95 | return bt*betacf(a,b,x)/a; 96 | else 97 | return 1.0-bt*betacf(b,a,1.0-x)/b; 98 | } 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | void usage(){ 107 | fprintf( stderr , "usage: two behaviours can be invoqued. \n") ; 108 | fprintf( stderr , " winflat -xvalue x -sig significance [ -diff n1 n2 ]\n"); 109 | fprintf( stderr , " will return the lower and upper y value at the given significance level\n"); 110 | fprintf( stderr ,"\n") ; 111 | 112 | fprintf( stderr , " winflat -xvalue x -yvalue y [ -diff n1 n2 ]\n"); 113 | fprintf( stderr , " will return the probability of over or underexpression \n"); 114 | 115 | 116 | /* fprintf( stderr ,"\n") ; 117 | fprintf( stderr , " winflat -xvalue x -show [ -diff n1 n2 ]\n"); 118 | fprintf( stderr , " will return a plot \n"); 119 | */ 120 | 121 | fprintf( stderr ,"If the number of clones in the two libraries is \n") ; 122 | fprintf( stderr ,"different, use -diff n1 n2\n") ; 123 | /* fprintf(stderr ,"To see p(y|x) and C(y|x) , use -show \n") ; */ 124 | 125 | fprintf( stderr ,"\n") ; 126 | } 127 | 128 | int main( int argc , char **argv){ 129 | double temp ; 130 | double thisproba , thisproba2 ; 131 | int x ; 132 | int y ; 133 | int thisy ; 134 | int n1 = 1 , n2 = 1 ; 135 | double ratio ; 136 | double t1 , t2 , t3 , t4 ; 137 | 138 | double sum ; 139 | int ymin , ymax ; 140 | int argcount = 1 ; 141 | int noup ; 142 | double sig ; 143 | int show = 0 ; 144 | 145 | double p ; 146 | #define WNMIN -1 147 | #define WNMAX 10000 148 | 149 | if( argc < 4){ 150 | usage() ; 151 | exit(0) ; 152 | } 153 | 154 | x = -1 ; 155 | y = -1 ; 156 | 157 | while( argcount < argc ){ 158 | 159 | if( !strcmp( argv[argcount] , "-xvalue")){ 160 | x = atoi( argv[argcount+1] ) ; 161 | argcount += 2 ; 162 | } else if( !strcmp( argv[argcount] , "-yvalue")){ 163 | y = atoi( argv[argcount+1] ) ; 164 | argcount += 2 ; 165 | } else if( !strcmp( argv[ argcount ] , "-diff")){ 
166 | /* correction in the case where the number of est's drawn 167 | from the samples is different */ 168 | n1 = atoi( argv[ argcount + 1 ] ) ; 169 | n2 = atoi( argv[ argcount + 2 ] ) ; 170 | argcount += 3 ; 171 | } else if( !strcmp( argv[ argcount ] , "-sig" )){ 172 | sig = (double) atof( argv[ argcount + 1 ] ) ; 173 | argcount += 2 ; 174 | } else { 175 | usage() ; 176 | exit(0) ; 177 | } 178 | } 179 | 180 | 181 | 182 | /* Check arguments and invoque the right procedure */ 183 | 184 | if( x > -1 && y > -1 ){ 185 | /* Both x and y are defined so we compute the significance window */ 186 | ymin = WNMIN ; 187 | ymax = WNMAX ; 188 | 189 | sum = 0 ; 190 | noup = 1 ; 191 | ratio = (double) n1 / (double) n2 ; 192 | 193 | p = (double) ( n1 ) / (double) ( n1 + n2 ) ; 194 | 195 | thisproba = betai( (double) (x + 1 ) , (double)( y + 1 ) , p ) ; 196 | thisproba2 = betai( (double) ( y + 1 ) , (double) (x + 1 ) , 1 - p ) ; 197 | fprintf( stdout , "P( y <= %d | x = %d ) = %g \n" , y , x , thisproba ) ; 198 | fprintf( stdout , "P( y >= %d | x = %d ) = %g \n" , y , x , thisproba2 ) ; 199 | } else if( x > -1 && y == -1 ){ 200 | 201 | y = 0 ; 202 | ymin = WNMIN ; 203 | ymax = WNMAX ; 204 | 205 | sum = 0 ; 206 | noup = 1 ; 207 | ratio = (double) n1 / (double) n2 ; 208 | 209 | p = (double) ( n1 ) / (double) ( n1 + n2 ) ; 210 | 211 | 212 | y = 0 ; 213 | /* fprintf( stderr , "x = %d y= %d sig = %g\n" , x , y , sig ) ; */ 214 | while( 1 ){ 215 | thisproba = betai( (double) (x + 1 ) , (double)( y + 1 ) , p ) ; 216 | thisproba2 = betai( (double) ( y + 1 ) , (double) (x + 1 ) , 1 - p ) ; 217 | 218 | /* fprintf( stdout , "%d %d C(%d | %d ) = %g %g\n" , 219 | x , y , y , x , thisproba , thisproba2 ) ; 220 | */ 221 | 222 | if( thisproba < sig / 2.0 ){ 223 | ymin = y ; 224 | } 225 | 226 | if( ymax == WNMAX && thisproba2 < sig / 2.0 ){ 227 | ymax = y ; 228 | break ; 229 | } 230 | y++ ; 231 | } 232 | thisproba = betai( (double) (x + 1 ) , (double)( ymin + 1 ) , p ) ; 233 | thisproba2 = betai( 
(double) ( ymax + 1 ) , (double) (x + 1 ) , 1 - p ) ; 234 | fprintf( stderr , "P( y <= %d | x = %d ) = %g \n" , ymin , x , thisproba ) ; 235 | fprintf( stderr , "P( y >= %d | x = %d ) = %g \n" , ymax , x , thisproba2 ) ; 236 | 237 | if( ymin == -1 ){ 238 | fprintf( stdout , "%d *--%d\n" , x , ymax) ; 239 | } else { 240 | fprintf( stdout , "%d %d--%d\n" , x , ymin , ymax) ; 241 | } 242 | 243 | } else if( show == 1 ){ 244 | 245 | fprintf( stdout , "%d %d C(%d | %d ) = %g %g\n" , 246 | x , y , y , x , thisproba , thisproba2 ) ; 247 | y++ ; 248 | if( thisproba > 0.9999999999 ){ 249 | 250 | } 251 | } 252 | } 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | --------------------------------------------------------------------------------