├── Manual ├── MANUAL.pdf └── MANUAL.rmd ├── TestDataset ├── Sim_R1.fq.gz ├── Sim_R2.fq.gz ├── Run_detectIS.sh ├── .nextflow.log └── CelTag.fa ├── Workflows ├── bin │ ├── detectIS.png │ ├── CreatePDFandHTML.sh │ ├── detectISlisting.tex │ ├── GeneralTools.pm │ ├── detectISlisting.css │ ├── detectIS.pl │ ├── DetectChmR.pm │ └── DetectSpltR.pm └── detectIS.nf ├── AUTHORS.md ├── utils ├── Generate_Integrations.sh ├── SimulateIntegration.pl └── detectIS.rec ├── detectIS_TestDataset.conf ├── CONTRIBUTING.md ├── README.md └── LICENSE /Manual/MANUAL.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraZeneca/detectIS/HEAD/Manual/MANUAL.pdf -------------------------------------------------------------------------------- /TestDataset/Sim_R1.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraZeneca/detectIS/HEAD/TestDataset/Sim_R1.fq.gz -------------------------------------------------------------------------------- /TestDataset/Sim_R2.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraZeneca/detectIS/HEAD/TestDataset/Sim_R2.fq.gz -------------------------------------------------------------------------------- /Workflows/bin/detectIS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraZeneca/detectIS/HEAD/Workflows/bin/detectIS.png -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | # Authors 2 | 3 | * [Luigi Grassi](mailto:luigi.grassi@astrazeneca.com) 4 | * [Claire Harris](mailto:claire.harris@astrazeneca.com) 5 | * [Jie Zhu](mailto:jie.zhu5@astrazeneca.com) 6 | * [Colin Hardman](mailto:colin.hardman@astrazeneca.com) 7 | * [Diane 
Hatton](mailto:diane.hatton@astrazeneca.com) 8 | 9 | # Maintainer 10 | 11 | * [Luigi Grassi](https://github.com/luigra) 12 | -------------------------------------------------------------------------------- /Workflows/bin/CreatePDFandHTML.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | DIR=$1 4 | 5 | wd="$( cd "$(dirname "$( readlink -f ${BASH_SOURCE[0]} )")" >/dev/null 2>&1 && pwd)" 6 | 7 | rep_head="${wd}/detectISlisting.tex" 8 | rep_css="${wd}/detectISlisting.css" 9 | 10 | singimg=${wd}"/../../utils/detectIS.simg" 11 | 12 | array=($(ls -d $DIR*.md)) 13 | 14 | 15 | for name in "${array[@]}" 16 | do 17 | name=${name%".md"} 18 | singularity exec $singimg bash -c "pandoc ${name}.md --listings -H ${rep_head} -o ${name}.pdf" 19 | singularity exec $singimg bash -c "pandoc --css=${rep_css} --mathjax --to=html5 ${name}.md -o ${name}.html" 20 | done 21 | -------------------------------------------------------------------------------- /utils/Generate_Integrations.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | #Script designed to simulate different plasmid integrations in a genomic reference sequence 4 | 5 | fastadir="../TestDataset/SimIntegration/simFASTA/" 6 | mkdir -p ${fastadir} 7 | 8 | perl SimulateIntegration.pl ../TestDataset/Cricetulus_griseus_chok1gshd.CHOK1GS_HDv1.dna.toplevel_Scaffold0.fa ../TestDataset/CelTag.fa 0.51 > ${fastadir}Scaffold0_0.5C_IS.fa 9 | perl SimulateIntegration.pl ../TestDataset/Cricetulus_griseus_chok1gshd.CHOK1GS_HDv1.dna.toplevel_Scaffold0.fa ../TestDataset/CelTag.fa 1.03 > ${fastadir}Scaffold0_1C_IS.fa 10 | perl SimulateIntegration.pl ../TestDataset/Cricetulus_griseus_chok1gshd.CHOK1GS_HDv1.dna.toplevel_Scaffold0.fa ../TestDataset/CelTag.fa 2.05 > ${fastadir}Scaffold0_2C_IS.fa 11 | -------------------------------------------------------------------------------- /detectIS_TestDataset.conf: 
-------------------------------------------------------------------------------- 1 | params.project_name='Test_dataset-detectIS' 2 | //The variable params.project_name specifies the name of the project 3 | 4 | //CLUSTER SETTINGS 5 | process.executor='sge' 6 | //The variable process.executor specifies the executor (see https://www.nextflow.io/docs/latest/executor.html# for further information) 7 | process.queue = 'infini.q' 8 | //The variable process.queue specifies the queue to use 9 | process.clusterOptions = '-S /bin/bash' 10 | //The variable process.clusterOptions specifies options specific of the used cluster 11 | process.penv = 'smp' 12 | //The variable process.penv specifies the parallel environment to be used when submitting a parallel task to the SGE resource manager (see https://www.nextflow.io/docs/latest/process.html#penv for further info) 13 | params.scratch='/scratch/' 14 | //The variable params.scratch specifies the scratch directory 15 | 16 | 17 | //SINGULARITY SETTINGS 18 | singularity.enabled = true 19 | process.container = "utils/detectIS.simg" 20 | //The variable process.container specifies the singularity image to use 21 | singularity.cacheDir = "/scratch/" 22 | //The variable singularity.cacheDir specifies the singularity.cacheDir 23 | 24 | 25 | //ANALYSIS SPECIFIC PARAMETERS: INPUT AND OUTPUT DIRECTORY 26 | params.reads = "TestDataset/*_{R1,R2}.fq.gz" 27 | //The variable params.reads specifies the sequencing reads 28 | params.outdir = "TestDataset/NextflowRes/" 29 | //The variable params.outdir specifies the output directory 30 | 31 | //ANALYSIS SPECIFIC PARAMETERS: MAPPING PARAMETERS 32 | params.cpu.minimap=32 33 | //The variable params.cpu.minimap specifies the cpu used for the mapping with Minimap2 34 | params.host_seq="TestDataset/Cricetulus_griseus_chok1gshd.CHOK1GS_HDv1.dna.toplevel_Scaffold0.fa" 35 | //The variable params.host_seq specifies the reference fasta file of the host genome 36 | params.vir_seq="TestDataset/CelTag.fa" 37 | 
//The variable params.vir_seq specifies the reference fasta file of the exogenous element (plasmid, viral agent, etc.) 38 | 39 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to detectIS 2 | 3 | This document contains the guidelines to contribute to the detectIS project, hosted in GitHub at [https://github.com/AstraZeneca/detectIS](https://github.com/AstraZeneca/detectIS) 4 | 5 | :pray: First of all, thank you so much for contributing! :pray: 6 | 7 | 8 | ## How can I contribute ? 9 | 10 | Please use the GitHub [issue](https://guides.github.com/features/issues/) tracker to share with maintainers and users questions, bug reports and feature requests. 11 | 12 | ### Related issues 13 | 14 | Before submitting an new issue request please check whether the same or a similar topic has already been treated in the existing closed issues at [https://github.com/AstraZeneca/detectIS/issues](https://github.com/AstraZeneca/detectIS/issues). 15 | 16 | If you are not able to find a solution to your problem in the existing closed issues please open a new one and, if possible, include a link to any related existing issue in the body of your new request. 17 | 18 | If your request is already treated in an open issue please do not raise a new issue request but add your comments to the existing open issue. 19 | 20 | #### Bug Report 21 | 22 | We really appreciate bug reports: 23 | 24 | please check if you can reproduce the problem in the latest version of the program and with default parameters. Please report in details the problem and include all the available information (command used, system information, environment variables, error messages, log files, reproducibility of the error, etc.). 
25 | 26 | 27 | ### Suggesting Enhancements 28 | 29 | Authors and maintainers are very open to requests of new features and minor improvements to existing functionality. Please describe and motivate your request, sharing test data sets if possible. 30 | 31 | 32 | ### Pull Requests 33 | 34 | If you want to submit a pull request please follow the "fork-and-pull" Git workflow and provide detailed code documentation and commit messages. 35 | 36 | 1. **Fork** the repo on GitHub 37 | 2. **Clone** the project to your local machine 38 | 3. **Commit** changes to your own branch 39 | 4. **Push** your work back up to your fork 40 | 5. Submit a **Pull request** so that we can review your changes 41 | 42 | -------------------------------------------------------------------------------- /TestDataset/Run_detectIS.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | #Script designed to run detectIS using 4 | #a host genome and a viral genome as references 5 | 6 | singimg="../utils/detectIS.simg" 7 | 8 | gref="./Cricetulus_griseus_chok1gshd.CHOK1GS_HDv1.dna.toplevel_Scaffold0.fa" 9 | extragenome1="./CelTag.fa" 10 | 11 | reads=( ./Sim_R1.fq.gz ./Sim_R2.fq.gz ) 12 | 13 | mkdir -p Res 14 | name="./Res/SimRead" 15 | paramscpu=4 16 | 17 | 18 | echo $(date) 19 | echo "Starting minimap2 mapping on viral ref" 20 | 21 | #Mapping Reads onto the plasmid and extracting the mapped ones 22 | /usr/local/singularity/bin/singularity exec $singimg bash -c "minimap2 -x sr -c ${extragenome1} ${reads[0]} -t ${paramscpu} > ${name}_mate1_vir.paf" 23 | /usr/local/singularity/bin/singularity exec $singimg bash -c "minimap2 -x sr -c ${extragenome1} ${reads[1]} -t ${paramscpu} > ${name}_mate2_vir.paf" 24 | /usr/local/singularity/bin/singularity exec $singimg bash -c "parallel -k cut -f1 ::: ${name}_mate1_vir.paf ${name}_mate2_vir.paf > ${name}_vir.lst" 25 | /usr/local/singularity/bin/singularity exec $singimg bash -c "sort --parallel ${paramscpu} 
${name}_vir.lst | uniq > ${name}_vir_mapping.lst" 26 | /usr/local/singularity/bin/singularity exec $singimg parallel --link seqtk subseq {1} ${name}_vir_mapping.lst ">" {2} ::: ${reads[0]} ${reads[1]} ::: ${name}_vir_R1.fq ${name}_vir_R2.fq 27 | 28 | echo $(date) 29 | echo "minimap2 mapping on plasmid ref completed, mapping reads on host genome" 30 | #Mapping Reads onto the host genome and run detectIS 31 | /usr/local/singularity/bin/singularity exec $singimg bash -c "minimap2 -x sr -c ${gref} ${name}_vir_R1.fq -t ${paramscpu} > ${name}_mate1_gnm.paf" 32 | /usr/local/singularity/bin/singularity exec $singimg bash -c "minimap2 -x sr -c ${gref} ${name}_vir_R2.fq -t ${paramscpu} > ${name}_mate2_gnm.paf" 33 | 34 | echo $(date) 35 | echo "minimap2 mapping complete, runnig detectIS" 36 | /usr/local/singularity/bin/singularity exec $singimg bash -c "perl ../Workflows/bin/detectIS.pl -h1 ${name}_mate1_gnm.paf -h2 ${name}_mate2_gnm.paf -v1 ${name}_mate1_vir.paf -v2 ${name}_mate2_vir.paf -o ${name}" 37 | 38 | echo $(date) 39 | echo "detectIS complete" 40 | 41 | ln -s ../Workflows/bin/detectIS.png detectIS.png 42 | ../Workflows/bin/CreatePDFandHTML.sh Res/ 43 | rm detectIS.png 44 | 45 | 46 | -------------------------------------------------------------------------------- /utils/SimulateIntegration.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use diagnostics ; 5 | 6 | #perl SimulateIntegration.pl 7 | # 8 | # 9 | # 10 | ################################################ 11 | 12 | (defined $ARGV[2]) || die "ERROR: please specify Chromosome fasta file, plasmid fasta file and number of inserted copies\n" ; 13 | 14 | my $genome = $ARGV[0]; 15 | my $plm = $ARGV[1]; 16 | my $plmcp = $ARGV[2]; 17 | 18 | ($plmcp=~ /^\d+\.?\d*$/ && $plmcp > 0 && $plmcp<=5) || die "ERROR: please number of inserted copies has to be a numeric value [0.1-5]\n" ; 19 | 20 | my $chrseq=''; 21 | my $chrname=''; 22 | 
my $isgname=''; 23 | open(FILE, "<$genome") || die "ERROR: please check the $genome file\n" ; 24 | while (my $line= <FILE>) { 25 | chomp $line; 26 | if (length ($line)>0) { 27 | if ($line=~/^>/) { 28 | $chrname=$line; 29 | my @tmp=split(/\s+/, $line) ; 30 | $isgname=$tmp[0] ; 31 | $isgname=~s/>//; 32 | 33 | } 34 | else { 35 | $chrseq.=$line ; 36 | } 37 | } 38 | } 39 | close FILE ; 40 | 41 | 42 | 43 | my $plmseq=''; 44 | my $plname=''; 45 | open(FILEP, "<$plm") || die "ERROR: please check $plm file\n" ; 46 | while (my $line= <FILEP>) { 47 | chomp $line; 48 | if (length ($line)>0) { 49 | if ($line=~/^>/) { 50 | my @tmp=split(/ /, $line) ; 51 | $plname=$tmp[0] ; 52 | $plname=~s/>//; 53 | } 54 | else { 55 | $plmseq.=$line ; 56 | } 57 | } 58 | } 59 | close FILEP ; 60 | 61 | 62 | my $finalenght=int($plmcp*(length($plmseq))) ; 63 | my $V1=int(rand(length($plmseq))); 64 | 65 | my $mcplm1=substr($plmseq, $V1) ; 66 | my $mcplm2=$mcplm1.$plmseq.$plmseq.$plmseq.$plmseq.$plmseq ; 67 | my $mcplm=substr($mcplm2, 0, $finalenght ) ; 68 | my $V2; 69 | if ($finalenght <= length($mcplm1)) { 70 | $V2=$V1+($finalenght-1) ; 71 | } 72 | else { 73 | $V2=($finalenght-(length($mcplm1))-1) ; 74 | until ($V2<= length($plmseq)) { 75 | $V2=$V2-length($plmseq); 76 | } 77 | } 78 | 79 | my $G1 = length($chrseq); 80 | my $G2 = length($chrseq) ; 81 | until ($G1<length($chrseq) && $G1>0) { 82 | $G1=int( (length($chrseq)/2) -rand(1000)+rand(1000) ) ; 83 | } 84 | until ($G2<length($chrseq) && $G2>$G1) { 85 | $G2=int($G1+(rand(1)*length($mcplm))); 86 | } 87 | print ">$isgname $isgname:$G1--$plname:$V1 $plname:$V2--$isgname:$G2 | " ; 88 | 89 | my $mcgnm1=substr($chrseq,0, $G1) ; 90 | my $mcgnm2=substr($chrseq,$G2) ; 91 | 92 | my $mcgnm=$mcgnm1.$mcplm.$mcgnm2; 93 | my $flen=length($mcgnm); 94 | my $fleni=length($mcplm); 95 | print "Total_len:$flen | Plasmid_len:$fleni\n"; 96 | print "$mcgnm\n"; 97 | 98 | 99 | -------------------------------------------------------------------------------- /Workflows/bin/detectISlisting.tex: 
-------------------------------------------------------------------------------- 1 | %Contents of listings-setup.tex 2 | \usepackage{graphics} 3 | \usepackage{listings} 4 | \usepackage{forloop} 5 | \usepackage{wrapfig} 6 | \usepackage{datetime} 7 | \usepackage{graphicx} 8 | % \usepackage[abspath]{currfile} 9 | 10 | %\graphicspath{{\currfileabsdir}} 11 | 12 | \newcounter{ct} 13 | \makeatletter 14 | % Since we don't want an initial comma in the list, we use \@gobble 15 | \def\optionslist{\@gobble} 16 | % Using the counter ct, range its value from 0 to 191 17 | \forloop{ct}{0}{\value{ct} < 27}{% 18 | % \edef expands the definition before it actually 19 | % binds the two. Thus, \rlist expands like a chameleon's tongue. 20 | \edef\optionslist{\optionslist,-\alph{ct}, -\Alph{ct}} 21 | } 22 | 23 | 24 | \makeatother 25 | \def\listwithrs#1\relax{% 26 | \lstdefinelanguage{ngs}{alsoletter={--,-,.,>,&}, % also: alsoletter 27 | }} 28 | \expandafter\listwithrs\optionslist\relax 29 | 30 | \usepackage{listings} 31 | \usepackage{upquote} %to force Latex not substitute ' by ` 32 | 33 | \lstset{ 34 | language=ngs, 35 | basicstyle=\ttfamily, 36 | keywordstyle=\color[rgb]{0.13,0.29,0.53}\bfseries, 37 | keywordstyle=[2]\color[rgb]{0.56,0.35,0.01}\bfseries, 38 | stringstyle=\color[rgb]{0.31,0.60,0.02}, 39 | commentstyle=\color[rgb]{0.56,0.35,0.01}\itshape, 40 | numberstyle=\footnotesize, 41 | stepnumber=1, 42 | numbersep=5pt, 43 | backgroundcolor=\color[RGB]{248,248,248}, 44 | showspaces=false, 45 | showstringspaces=false, 46 | showtabs=false, 47 | tabsize=2, 48 | captionpos=b, 49 | breaklines=true, 50 | breakatwhitespace=true, 51 | breakautoindent=true, 52 | escapeinside={\%*}{*)}, 53 | linewidth=0.9\textwidth, 54 | basewidth=0.5em, 55 | columns=fullflexible 56 | } 57 | 58 | 59 | 60 | \newdateformat{monthyeardate}{\monthname[\THEMONTH], \THEYEAR} 61 | \newcommand*{\customrule}[1]{\textcolor{#1}{\rule{\textwidth}{1.5pt}}\par} 62 | 63 | \newcommand*{\titleAT}{% 64 | \begingroup 65 | 
\newlength{\drop} 66 | \drop=0.1\textheight % White sapce generated is 10% of the total text height 67 | 68 | \customrule{blue} 69 | \customrule{blue} 70 | 71 | 72 | \vspace{\drop} 73 | \centering 74 | \textcolor[rgb]{0,1,0}{{\Huge detectIS}\\[0.5\baselineskip]} 75 | 76 | \vspace{0.25\drop} 77 | \rule{0.3\textwidth}{0.4pt}\par % Short horizontal line under the title 78 | \vspace{\drop} 79 | 80 | 81 | \begin{figure}[h!] 82 | \centering 83 | \includegraphics[width=0.4\columnwidth]{detectIS.png} 84 | \end{figure} 85 | 86 | 87 | \vfill 88 | {\large \textsc{detectIS data analysis\\ 89 | \monthyeardate\today }}\par % Publisher 90 | 91 | \vspace*{\drop} 92 | \customrule{blue} 93 | \customrule{blue} 94 | \endgroup} 95 | -------------------------------------------------------------------------------- /Workflows/bin/GeneralTools.pm: -------------------------------------------------------------------------------- 1 | package GeneralTools; 2 | 3 | use List::Util qw( min max ); 4 | use POSIX; 5 | 6 | ################################################################################ 7 | #########SUB 8 | ################################################################################ 9 | 10 | sub average{ 11 | my($data) = @_; 12 | if (not @$data) { 13 | die("Empty arrayn"); 14 | } 15 | my $total = 0; 16 | foreach (@$data) { 17 | $total += $_; 18 | } 19 | my $average = $total / @$data; 20 | return $average; 21 | } 22 | 23 | sub stdev{ 24 | my($data) = @_; 25 | if(@$data == 1){ 26 | return 0; 27 | } 28 | my $average = &average($data); 29 | my $sqtotal = 0; 30 | foreach(@$data) { 31 | $sqtotal += ($average-$_) ** 2; 32 | } 33 | my $std = ($sqtotal / (@$data-1)) ** 0.5; 34 | return $std; 35 | } 36 | 37 | sub median 38 | { 39 | my @vals = sort {$a <=> $b} @_; 40 | my $len = @vals; 41 | if($len%2) #odd? 
42 | { 43 | return $vals[int($len/2)]; 44 | } 45 | else #even 46 | { 47 | return ($vals[int($len/2)-1] + $vals[int($len/2)])/2; 48 | } 49 | } 50 | 51 | sub merge_split_pairs 52 | { 53 | my $sp1 = $_[0]; 54 | my $sp2 = $_[1]; 55 | my $ms = $_[2]; 56 | my %ish1=(); # Key1:IS -> Element: number of fragment/read supporting it 57 | my %isr=(); # Key1:IS -> Key2:Read_ID -> Element: Read_occurrence 58 | my %isc=(); # Key IS -> Element: @ 0[Plm_chr] 1[Plm_pos] 2[Host_chr] 3[Host_pos] 59 | foreach my $n(keys %{$sp1}) { 60 | my @tmp=split(/--/, $n) ; 61 | my @tmp1=split(/:/, $tmp[0]) ; 62 | my @tmp2=split(/:/, $tmp[1]) ; 63 | $isc{$n}[0]=$tmp1[0] ; 64 | $isc{$n}[1]=$tmp1[1] ; 65 | $isc{$n}[2]=$tmp2[0] ; 66 | $isc{$n}[3]=$tmp2[1] ; 67 | foreach my $j( keys(%{$sp1->{$n}}) ) { 68 | $isr{$n}{$j}++ ; 69 | } 70 | } 71 | foreach my $n(keys %{$sp2}) { 72 | my @tmp=split(/--/, $n) ; 73 | my @tmp1=split(/:/, $tmp[0]) ; 74 | my @tmp2=split(/:/, $tmp[1]) ; 75 | $isc{$n}[0]=$tmp1[0] ; 76 | $isc{$n}[1]=$tmp1[1] ; 77 | $isc{$n}[2]=$tmp2[0] ; 78 | $isc{$n}[3]=$tmp2[1] ; 79 | foreach my $j( keys(%{$sp2->{$n}}) ) { 80 | $isr{$n}{$j}++; 81 | } 82 | } 83 | foreach my $n(keys %isr) { 84 | $ish1{$n}=scalar (keys %{$isr{$n}}) ; 85 | } 86 | foreach my $n(keys %ish1) { #Keeping IS with at least ms supporting splt reads 87 | if ($ish1{$n} < $ms) { 88 | delete $ish1{$n} ; 89 | delete $isr{$n} ; 90 | delete $isc{$n} ; 91 | } 92 | } 93 | return (\%ish1, \%isr, \%isc); #1K: Read name; El: @ 94 | } 95 | 96 | 1; 97 | -------------------------------------------------------------------------------- /utils/detectIS.rec: -------------------------------------------------------------------------------- 1 | BootStrap:docker 2 | From: ubuntu:18.04 3 | 4 | %help 5 | Singularity image for detectIS 6 | 7 | Create the img: sudo singularity build detectIS.simg detectIS.rec 8 | 9 | %runscript 10 | exec echo "The runscript is the containers default runtime command!" 
11 | 12 | %files 13 | 14 | %environment 15 | VARIABLE=MEATBALLVALUE 16 | export VARIABLE 17 | 18 | %labels 19 | AUTHOR luigi.grassi@astrazeneca.com 20 | 21 | %post 22 | apt-get update && apt-get install -y --no-install-recommends \ 23 | apt-utils \ 24 | sudo \ 25 | vim \ 26 | less \ 27 | build-essential \ 28 | curl \ 29 | git \ 30 | wget \ 31 | unzip \ 32 | locales \ 33 | default-jre \ 34 | g++ \ 35 | make \ 36 | libz-dev \ 37 | samtools \ 38 | bedtools \ 39 | python \ 40 | python-dev \ 41 | parallel \ 42 | python-pip \ 43 | cmake \ 44 | texlive-latex-extra \ 45 | texlive-fonts-recommended \ 46 | lmodern \ 47 | libx11-dev \ 48 | libbz2-dev 49 | apt-get clean 50 | 51 | locale-gen en_US.UTF-8 52 | 53 | export LC_ALL=C.UTF-8 54 | export LANG=C.UTF-8 55 | echo 'export LC_ALL=C.UTF-8' >> $SINGULARITY_ENVIRONMENT 56 | echo 'export LANG=C.UTF-8' >> $SINGULARITY_ENVIRONMENT 57 | echo "export PATH=/usr/local:/usr/local/bin:$PATH" >> $SINGULARITY_ENVIRONMENT 58 | 59 | 60 | #tex fonts 61 | wget --no-check-certificate http://mirrors.ctan.org/macros/generic/iftex/ifluatex.sty 62 | mv ifluatex.sty /usr/share/texmf/tex/generic 63 | wget --no-check-certificate http://mirrors.ctan.org/macros/latex/contrib/framed.zip 64 | unzip framed.zip 65 | mv framed /usr/share/texmf/tex/latex 66 | texhash 67 | 68 | #minimap2 69 | sudo wget --no-check-certificate https://github.com/lh3/minimap2/releases/download/v2.17/minimap2-2.17_x64-linux.tar.bz2 70 | tar -jxf minimap2-2.17_x64-linux.tar.bz2 71 | cd minimap2-2.17_x64-linux 72 | sudo mv k8 paftools.js minimap2 /usr/bin 73 | cd .. 74 | sudo rm -rf minimap2-2.17_x64-linux 75 | sudo chmod a+x /usr/bin/k8 76 | sudo chmod a+x /usr/bin/paftools.js 77 | sudo chmod a+x /usr/bin/minimap2 78 | 79 | #seqtk 80 | wget --no-check-certificate https://github.com/lh3/seqtk/archive/v1.3.tar.gz 81 | tar -xvf v1.3.tar.gz 82 | cd seqtk-1.3 83 | make install BINDIR=/usr/bin 84 | cd . 
85 | 86 | #Pandoc 87 | sudo wget --no-check-certificate https://github.com/jgm/pandoc/releases/download/2.9.1.1/pandoc-2.9.1.1-linux-amd64.tar.gz 88 | tar -xzf pandoc-2.9.1.1-linux-amd64.tar.gz 89 | cd pandoc-2.9.1.1/bin 90 | sudo chmod a+x pandoc 91 | sudo chmod a+x pandoc-citeproc 92 | sudo mv pandoc pandoc-citeproc /usr/bin 93 | cd ../.. 94 | sudo rm -rf pandoc* 95 | 96 | #to avoid warnings on hpc/pbs 97 | mkdir -p /extra 98 | mkdir -p /xdisk 99 | mkdir -p /rsgrps 100 | mkdir -p /cm/shared 101 | 102 | 103 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Maturity level-Prototype](https://img.shields.io/badge/Maturity%20Level-Prototype-red) 2 | 3 | # detectIS 4 | 5 | DetectIS is a pipeline specifically designed to detect exogenous DNA integration sites using DNA or RNA paired-end sequencing data. 6 | The workflow manager [nextflow](https://www.nextflow.io/) is used with a configuration file and a Singularity image 7 | 8 | 9 | ## Getting Started 10 | 11 | In order to run the workflow, the user has to create a configuration file, specifying: 12 | 13 | a)fasta file with the reference host genome; 14 | b)fasta file with the reference exogenous sequence; 15 | c)the directory containing the raw data, in FASTQ format 16 | d)the output directory. 17 | The analysis can be executed locally or in an HPC environment, in the latter scenario the user has also to specify the cluster executor. 18 | 19 | 20 | ### Prerequisites 21 | 22 | The detectIS software requirements are: 23 | - [Singularity](https://www.sylabs.io/docs/) V2.6 or higher. 24 | - [Nextflow](https://www.nextflow.io/), the workflow has been developed and tested with version 0.32.0.4897 25 | 26 | 27 | ### Creating a Singularity container 28 | 29 | A Singularity container with all the necessary software is required to run the pipeline. 
30 | The image can be created by using the recipe (file: "detectIS.rec" contained in the "utils" directory). Superuser privileges are necessary to generate a Singularity container with the command: 31 | 32 | ``` 33 | sudo singularity build detectIS.simg detectIS.rec 34 | ``` 35 | 36 | N.B. superuser privileges are necessary only to create the container but not to use it. This means you can create the container in your local pc/workstation and copy it to the system where you run analyses (e.g. your hpc or cluster). 37 | 38 | Alternatively, if you have problems in generating a Singularity container from the recipe you can download the image from [Singularity Hub](https://singularity-hub.org/) 39 | 40 | 41 | ### Running the workflow 42 | 43 | If you have installed Singularity, Nextflow, and [configured the Singularity](https://www.sylabs.io/guides/2.6/user-guide/faq.html?highlight=disk%20access#how-are-external-file-systems-and-paths-handled-in-a-singularity-container) granting the image access to the disk partitions to read and write you can run any workflow. 44 | 45 | ``` 46 | nextflow run Workflows/detectIS.nf -c detectIS_TestDataset.conf -with-report detectIS_TestDataset_nextflow_report.html 47 | ``` 48 | 49 | In the example Workflows/detectIS.nf is the workflow for the detectIS analysis and detectIS_TestDataset.conf is the configuration file with all the information needed for that given project. In the configuration file are specified input and output file directories, references (fasta) directories, and cluster specific parameters. 50 | 51 | 52 | ### Test data sets 53 | 54 | In the directory "TestDataset" are contained paired-end reads and reference files to run a detectIS analysis. 55 | The dataset simulates the integration of a plasmid in the genome of Chinese hamster ovary cell line (CHOK1) . 
56 | 57 | The analysis can be executed using the bash script "Run_detectIS.sh", also contained in the directory "TestDataset" or using nextflow: 58 | 59 | ``` 60 | nextflow run Workflows/detectIS.nf -c detectIS_TestDataset.conf -with-report detectIS_TestDataset_nextflow_report.html 61 | ``` 62 | 63 | The configuration file and the bash script can either be used as a template for other analyses. 64 | 65 | 66 | ## Deployment 67 | 68 | Please notice that Singularity containers can be [kernel-dependent](https://www.sylabs.io/guides/2.6/user-guide/faq.html?highlight=disk%20access#are-singularity-containers-kernel-dependent), this implies that the image recipes contained in this project will not necessarily produce an image able to run on your HPC system. If none of the available images is compatible with your system you might need to modify the recipe using an OS with a compatible kernel, please raise an issue if this is the case and you need support for it. 69 | 70 | ## Citation 71 | 72 | If you use detectIS in your research, please cite our latest [publication](https://doi.org/10.1093/bioinformatics/btab366). 73 | -------------------------------------------------------------------------------- /Workflows/detectIS.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | =============================================================== 5 | detectIS-nextflow 6 | =============================================================== 7 | Pipeline to identify Integration Sites (IS) in paired-end 8 | RNA-seq/DNA-seq experiments. 9 | 10 | NextFlow main file 11 | Jan 2020. 
12 | 13 | 14 | #### Homepage / Documentation 15 | https://github.com/AstraZeneca/detectIS 16 | #### Author 17 | Luigi Grassi 18 | --------------------------------------------------------------- 19 | */ 20 | 21 | 22 | def helpMessage() { 23 | log.info """ 24 | 25 | ================================================================================= 26 | detectIS pipeline 27 | https://github.com/AstraZeneca/detectIS 28 | ================================================================================= 29 | 30 | Usage: 31 | 32 | The typical command for running the pipeline is as follows: 33 | 34 | nextflow run detectIS-nextflow -c 20190515_downsampleRNAseq_detectISTEST.conf 35 | 36 | Mandatory arguments: -c Configuration file with all the parameters used in the analysis 37 | """.stripIndent() 38 | } 39 | 40 | 41 | // Show help message 42 | params.help = false 43 | if (params.help){ 44 | helpMessage() 45 | exit 0 46 | } 47 | 48 | if (params.project_name==null){ 49 | helpMessage() 50 | exit 0 51 | } 52 | 53 | Channel 54 | .fromFilePairs(params.reads) 55 | .ifEmpty {exit 1, error "Cannot find any reads matching: ${params.reads}" } 56 | .into { reads_minimap1 } 57 | 58 | 59 | // STEP 1 - MAPPING WITH MINIMAP2 READS AS SINGLE END ON PLSM/VIRUS 60 | 61 | vir_mini = Channel.fromPath(params.vir_seq) 62 | .ifEmpty { exit 1, "Viral/plasmid genome reference fasta file not found: please check ${params.vir_seq}" } 63 | 64 | process minimap_vir { 65 | errorStrategy 'retry' 66 | maxRetries 3 67 | scratch params.scratch 68 | 69 | cpus = params.cpu.minimap 70 | memory = 20.GB 71 | tag "$name" 72 | publishDir "${params.outdir}/MinimapPAF", mode: 'copy' , 73 | saveAs: {filename -> 74 | if (filename.indexOf("_vir.paf") > 0) "$filename" 75 | else if (filename.indexOf("_mapping.lst") > 0) "$filename" 76 | else null 77 | } 78 | 79 | input: 80 | set val(name), file(reads) from reads_minimap1 81 | file pref from vir_mini.collect() 82 | 83 | output: 84 | file "*_R1.fq.gz" into fastq_toremap1 85 | 
file "*_R2.fq.gz" into fastq_toremap2 86 | file "*_mate1_vir.paf" into viral1_paf 87 | file "*_mate2_vir.paf" into viral2_paf 88 | 89 | script: 90 | 91 | """ 92 | minimap2 -x sr -c ${pref} ${reads[0]} -t ${params.cpu.minimap} > ${name}_mate1_vir.paf 93 | minimap2 -x sr -c ${pref} ${reads[1]} -t ${params.cpu.minimap} > ${name}_mate2_vir.paf 94 | cut -f1 ${name}_mate1_vir.paf > ${name}_vir.lst 95 | cut -f1 ${name}_mate2_vir.paf >> ${name}_vir.lst 96 | less -S ${name}_vir.lst | sort | uniq > ${name}_vir_mapping.lst 97 | seqtk subseq ${reads[0]} ${name}_vir_mapping.lst | gzip -vc > ${name}_vir_R1.fq.gz 98 | seqtk subseq ${reads[1]} ${name}_vir_mapping.lst | gzip -vc > ${name}_vir_R2.fq.gz 99 | """ 100 | } 101 | 102 | // STEP 2 - MAPPING ALL READS WITH ANY VIRAL OVERLAP ONTO THE HOST GENOME 103 | 104 | host_mini = Channel.fromPath(params.host_seq) 105 | .ifEmpty { exit 1, "Host genome reference fasta file not found: please check ${params.host_seq}" } 106 | 107 | 108 | 109 | process minimap_genome_and_detectIS { 110 | errorStrategy 'retry' 111 | maxRetries 3 112 | scratch params.scratch 113 | 114 | cpus = params.cpu.minimap 115 | memory = 20.GB 116 | tag "$sample" 117 | publishDir "${params.outdir}", mode: 'copy' , 118 | saveAs: {filename -> 119 | if (filename.indexOf("_gnm.paf") > 0) "MinimapPAF/$filename" 120 | else "detectIS/$filename" 121 | } 122 | 123 | input: 124 | file read1 from fastq_toremap1 125 | file read2 from fastq_toremap2 126 | file gref from host_mini.collect() 127 | file vpaf1 from viral1_paf 128 | file vpaf2 from viral2_paf 129 | 130 | output: 131 | file "*_mate1_gnm.paf" into genom1_paf 132 | file "*_mate2_gnm.paf" into genom2_paf 133 | file("*.md") 134 | file("*.txt") 135 | 136 | script: 137 | sample = read1.toString() - ~/(_vir)?(_R1)?(\.fq)?(\.gz)?$/ 138 | 139 | """ 140 | minimap2 -x sr -c ${gref} ${read1} -t ${params.cpu.minimap} > ${sample}_mate1_gnm.paf 141 | minimap2 -x sr -c ${gref} ${read2} -t ${params.cpu.minimap} > ${sample}_mate2_gnm.paf 
142 | perl $baseDir/bin/detectIS.pl -h1 ${sample}_mate1_gnm.paf -h2 ${sample}_mate2_gnm.paf -v1 ${vpaf1} -v2 ${vpaf2} -o ${sample} 143 | """ 144 | } 145 | 146 | -------------------------------------------------------------------------------- /TestDataset/.nextflow.log: -------------------------------------------------------------------------------- 1 | Sep-08 10:32:48.133 [main] DEBUG nextflow.cli.Launcher - $> /home/grassil/bin/nextflow run Workflows/detectIS.nf -c TestDataset/detectIS.conf 2 | Sep-08 10:32:48.212 [main] INFO nextflow.cli.CmdRun - N E X T F L O W ~ version 0.32.0 3 | Sep-08 10:32:48.443 [main] DEBUG nextflow.scm.AssetManager - Listing projects in folder: /home/grassil/.nextflow/assets 4 | Sep-08 10:32:48.491 [main] INFO nextflow.cli.CmdRun - Pulling nextflow-io/Workflows ... 5 | Sep-08 10:32:48.492 [main] DEBUG nextflow.scm.RepositoryProvider - Request [credentials -:-] -> https://api.github.com/repos/nextflow-io/Workflows/contents/detectIS.nf 6 | Sep-08 10:32:49.156 [main] DEBUG nextflow.scm.RepositoryProvider - Request [credentials -:-] -> https://api.github.com/repos/nextflow-io/Workflows 7 | Sep-08 10:32:49.448 [main] DEBUG nextflow.cli.Launcher - Operation aborted 8 | java.io.FileNotFoundException: https://api.github.com/repos/nextflow-io/Workflows 9 | at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) 10 | at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) 11 | at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) 12 | at java.lang.reflect.Constructor.newInstance(Constructor.java:423) 13 | at sun.net.www.protocol.http.HttpURLConnection$10.run(HttpURLConnection.java:1950) 14 | at sun.net.www.protocol.http.HttpURLConnection$10.run(HttpURLConnection.java:1945) 15 | at java.security.AccessController.doPrivileged(Native Method) 16 | at sun.net.www.protocol.http.HttpURLConnection.getChainedException(HttpURLConnection.java:1944) 17 | 
at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1514) 18 | at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1498) 19 | at sun.net.www.protocol.https.HttpsURLConnectionImpl.getInputStream(HttpsURLConnectionImpl.java:268) 20 | at nextflow.scm.RepositoryProvider.invoke(RepositoryProvider.groovy:114) 21 | at nextflow.scm.RepositoryProvider.memoizedMethodPriv$invokeAndParseResponseString(RepositoryProvider.groovy:175) 22 | at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) 23 | at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 24 | at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) 25 | at java.lang.reflect.Method.invoke(Method.java:498) 26 | at org.codehaus.groovy.reflection.CachedMethod.invoke(CachedMethod.java:104) 27 | at groovy.lang.MetaMethod.doMethodInvoke(MetaMethod.java:326) 28 | at groovy.lang.MetaClassImpl.invokeMethod(MetaClassImpl.java:1235) 29 | at groovy.lang.MetaClassImpl.invokeMethod(MetaClassImpl.java:1041) 30 | at org.codehaus.groovy.runtime.InvokerHelper.invokePogoMethod(InvokerHelper.java:1018) 31 | at org.codehaus.groovy.runtime.InvokerHelper.invokeMethod(InvokerHelper.java:1001) 32 | at org.codehaus.groovy.runtime.InvokerHelper.invokeMethodSafe(InvokerHelper.java:97) 33 | at nextflow.scm.RepositoryProvider$_closure1.doCall(RepositoryProvider.groovy) 34 | at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) 35 | at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 36 | at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) 37 | at java.lang.reflect.Method.invoke(Method.java:498) 38 | at org.codehaus.groovy.reflection.CachedMethod.invoke(CachedMethod.java:104) 39 | at groovy.lang.MetaMethod.doMethodInvoke(MetaMethod.java:326) 40 | at org.codehaus.groovy.runtime.metaclass.ClosureMetaClass.invokeMethod(ClosureMetaClass.java:264) 
41 | at groovy.lang.MetaClassImpl.invokeMethod(MetaClassImpl.java:1041) 42 | at groovy.lang.Closure.call(Closure.java:421) 43 | at org.codehaus.groovy.runtime.memoize.Memoize$MemoizeFunction$1.provide(Memoize.java:139) 44 | at org.codehaus.groovy.runtime.memoize.ConcurrentCommonCache.getAndPut(ConcurrentCommonCache.java:147) 45 | at org.codehaus.groovy.runtime.memoize.ConcurrentCommonCache.getAndPut(ConcurrentCommonCache.java:123) 46 | at org.codehaus.groovy.runtime.memoize.Memoize$MemoizeFunction.call(Memoize.java:136) 47 | at groovy.lang.Closure.call(Closure.java:437) 48 | at nextflow.scm.RepositoryProvider.invokeAndParseResponse(RepositoryProvider.groovy) 49 | at nextflow.scm.RepositoryProvider.validateRepo(RepositoryProvider.groovy:213) 50 | at nextflow.scm.RepositoryProvider.validateFor(RepositoryProvider.groovy:206) 51 | at nextflow.scm.AssetManager.checkValidRemoteRepo(AssetManager.groovy:344) 52 | at nextflow.scm.AssetManager.download(AssetManager.groovy:551) 53 | at nextflow.scm.AssetManager.download(AssetManager.groovy) 54 | at nextflow.cli.CmdRun.getScriptFile(CmdRun.groovy:295) 55 | at nextflow.cli.CmdRun.run(CmdRun.groovy:210) 56 | at nextflow.cli.Launcher.run(Launcher.groovy:432) 57 | at nextflow.cli.Launcher.main(Launcher.groovy:590) 58 | Caused by: java.io.FileNotFoundException: https://api.github.com/repos/nextflow-io/Workflows 59 | at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1896) 60 | at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1498) 61 | at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:480) 62 | at sun.net.www.protocol.https.HttpsURLConnectionImpl.getResponseCode(HttpsURLConnectionImpl.java:352) 63 | at nextflow.scm.RepositoryProvider.checkResponse(RepositoryProvider.groovy:143) 64 | at nextflow.scm.RepositoryProvider.invoke(RepositoryProvider.groovy:111) 65 | ... 
37 common frames omitted 66 | -------------------------------------------------------------------------------- /TestDataset/CelTag.fa: -------------------------------------------------------------------------------- 1 | >CelTag Plasmid sequence 5574 bps #https://www.addgene.org/66562/sequences/#depositor-full 2 | GGGCGAATTGGGCCCGACGTCGCATGCTCCCGGCCGCCATGGCGGCCGCGGGAATTCGATTAATCGATAC 3 | ATATGCCCGGGTTAATTAACGGTGAACAAAAGCTAATCTCCGAGGAAGACTTGAACGGTGAACAAAAATT 4 | AATCTCAGAAGAAGACTTGAACGGACTCGACGGTGAACAAAAGTTGATTTCTGAAGAAGATTTGAACGGT 5 | GAACAAAAGCTAATCTCCGAGGAAGACTTGAACGGTGAACAAAAATTAATCTCAGAAGAAGACTTGAACG 6 | GACTCGACGGTGAACAAAAGTTGATTTCTGAAGAAGATTTGAACGGTGAACAAAAGCTAATCTCCGAGGA 7 | AGACTTGAACGGTGAACAAAAATTAATCTCAGAAGAAGACTTGAACGGACTCGACGGTGAACAAAAGTTG 8 | ATTTCTGAAGAAGATTTGAACGGTGAACAAAAGCTAATCTCCGAGGAAGACTTGAACGGTGAACAAAAAT 9 | TAATCTCAGAAGAAGACTTGAACGGACTCGACGGTGAACAAAAGTTGATTTCTGAAGAAGATTTGAACGG 10 | TGAACAAAAGCTAATCTCCGAGGAAGACTTGAACGGTGAACAAAAATTAATCAATCACGAGAACCTCTAC 11 | TTCCAAAGCGTATCAGGCAATTTGAAGGTTGAATTCTACAACAGCAATCCTTCAGATACTACTAACTCAA 12 | TCAATCCTCAGTTCAAGGTTACTAATACCGGAAGCAGTGCAATTGATTTGTCCAAACTCACATTGAGATA 13 | TTATTATACAGTAGACGGACAGAAAGATCAGACCTTCTGGTGTGACCATGCTGCAATAATCGGCAGTAAC 14 | GGCAGCTACAACGGAATTACTTCAAATGTAAAAGGAACATTTGTAAAAATGAGTTCCTCAACAAATAACG 15 | CAGACACCTACCTTGAAATAAGCTTTACAGGCGGAACTCTTGAACCGGGTGCACATGTTCAGATACAAGG 16 | TAGATTTGCAAAGAATGACTGGAGTAACTATACACAGTCAAATGACTACTCATTCAAGTCTGCTTCACAG 17 | TTTGTTGAATGGGATCAGGTAACAGCATACTTGAACGGTGTTCTTGTATGGGGTAAAGAATAGACTTCTA 18 | AATAAGCGAATTTCTTATGATTTATGATTTTTATTATTAAATAAGTTATAAAAAAAATAAGTGTATACAA 19 | ATTTTAAAGTGACTCTTAGGTTTTAAAACGAAAATTCTTATTCTTGAGTAACTCTTTCCTGTAGGTCAGG 20 | TTGCTTTCTCAGGTATAGTATGAGGTCGCTCTTATTGACCACACCTCTACCGGCAGATCCGCTAGGGATA 21 | ACAGGGTAATATAGATCTGTTTAGCTTGCCTCGTCCCCGCCGGGTCACCCGGCCAGCGACATGGAGGCCC 22 | AGAATACCCTCCTTGACAGTCTTGACGTGCGCAGCTCAGGGGCATGATGTGACTGTCGCCCGTACATTTA 23 | GCCCATACATCCCCATGTATAATCATTTGCATCCATACATTTTGATGGCCGCACGGCGCGAAGCAAAAAT 24 | 
TACGGCTCCTCGCTGCAGACCTGCGAGCAGGGAAACGCTCCCCTCACAGACGCGTTGAATTGTCCCCACG 25 | CCGCGCCCCTGTAGAGAAATATAAAAGGTTAGGATTTGCCACTGAGGTTCTTCTTTCATATACTTCCTTT 26 | TTAAATCTTGCTAGGATACAGTTCTCACATCACATCCGAACATAAACAACCATGGGTACCACTCTTGACG 27 | ACACGGCTTACCGGTACCGCACCAGTGTCCCGGGGGACGCCGAGGCCATCGAGGCACTGGATGGGTCCTT 28 | CACCACCGACACCGTCTTACTGGATGGGTCCTTCACCACCGACACCGTCTTCCGCGTCACCGCCACCGGG 29 | GACGGCTTCACCCTGCGGGAGGTGCCGGTGGACCCGCCCCTGACCAAGGTGTTCCCCGACGACGAATCGG 30 | ACGACGAATCGGACGACGGGGAGGACGGCGACCCGGACTCTCGGACGTTCGTCGCGTACGGGGACGACGG 31 | CGACCTGGCGGGCTTCGTGGTCGTCTCGTACTCCGGCTGGAACCGCCGGCTGACCGTCGAGGACATCGAG 32 | GTCGCCCCGGAGCACCGGGGGCACGGGGTCGGGCGCGCGTTGATGGGGCTCGCGACGGAGTTCGCCCGCG 33 | AGCGGGGTGCCGGGCACCTCTGGCTGGAGGTCACCAACGTCAACGCACCGGCGATCCACGCGTACCGGCG 34 | GATGGGGTTCACCCTCTGCGGCCTGGACACCGCCCTGTACGACGGCACCGCCTCGGACGGCGAGCAGGCG 35 | CTCTACATGAGCATGCCCTGCCCCTAATCAGTACTGACAATAAAAAGATTCTTGTTTTCAAGAACTTGTC 36 | ATTTGTATAGTTTTTTTATATTGTAGTTGTTCTATTTTAATCAAATGTTAGCGTGATTTATATTTTTTTT 37 | CGCCTCGACATCATCTGCCCAGATGCGAAGTTAAGTGCGCAGAAAGTAATATCATGCGTCAATCGTATGT 38 | GAATGCTGGTCGCTATACTGCTGTCGATTCGATACTAACGCCGCCATCCAGTTTAAACGAGCTCGAATTC 39 | ATCGATACCGTCGACCTCGAGCGTACGTAATCACTAGTGAATTCGCGGCCGCCTGCAGGTCGACCATATG 40 | GGAGAGCTCCCAACGCGTTGGATGCATAGCTTGAGTATTCTATAGTGTCACCTAAATAGCTTGGCGTAAT 41 | CATGGTCATAGCTGTTTCCTGTGTGAAATTGTTATCCGCTCACAATTCCACACAACATACGAGCCGGAAG 42 | CATAAAGTGTAAAGCCTGGGGTGCCTAATGAGTGAGCTAACTCACATTAATTGCGTTGCGCTCACTGCCC 43 | GCTTTCCAGTCGGGAAACCTGTCGTGCCAGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTT 44 | TGCGTATTGGGCGCTCTTCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGC 45 | GGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATG 46 | TGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCC 47 | GCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAG 48 | ATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATAC 49 | CTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGG 50 | 
TGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATC 51 | CGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAAC 52 | AGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACA 53 | CTAGAAGAACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTC 54 | TTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGA 55 | AAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCAC 56 | GTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAG 57 | TTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCA 58 | CCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGA 59 | TACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGA 60 | TTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCC 61 | ATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTG 62 | TTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCA 63 | ACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATC 64 | GTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTG 65 | TCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTAT 66 | GCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAA 67 | GTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTT 68 | CGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGC 69 | AAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTC 70 | TTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTA 71 | TTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGATGCGGTGTGAAA 72 | TACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGAAATTGTAAGCGTTAATATTTTGTTAAAATTC 73 | GCGTTAAATTTTTGTTAAATCAGCTCATTTTTTAACCAATAGGCCGAAATCGGCAAAATCCCTTATAAAT 74 | CAAAAGAATAGACCGAGATAGGGTTGAGTGTTGTTCCAGTTTGGAACAAGAGTCCACTATTAAAGAACGT 75 | GGACTCCAACGTCAAAGGGCGAAAAACCGTCTATCAGGGCGATGGCCCACTACGTGAACCATCACCCTAA 76 | 
TCAAGTTTTTTGGGGTCGAGGTGCCGTAAAGCACTAAATCGGAACCCTAAAGGGAGCCCCCGATTTAGAG 77 | CTTGACGGGGAAAGCCGGCGAACGTGGCGAGAAAGGAAGGGAAGAAAGCGAAAGGAGCGGGCGCTAGGGC 78 | GCTGGCAAGTGTAGCGGTCACGCTGCGCGTAACCACCACACCCGCCGCGCTTAATGCGCCGCTACAGGGC 79 | GCGTCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACG 80 | CCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTAACGCCAGGGTTTTCCCAGTCACGA 81 | CGTTGTAAAACGACGGCCAGTGAATTGTAATACGACTCACTATA 82 | -------------------------------------------------------------------------------- /Workflows/bin/detectISlisting.css: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * From https://gist.github.com/killercup/5917178 4 | * I add this to html files generated with pandoc. 5 | */ 6 | 7 | 8 | html { 9 | font-size: 100%; 10 | overflow-y: scroll; 11 | -webkit-text-size-adjust: 100%; 12 | -ms-text-size-adjust: 100%; 13 | } 14 | 15 | body { 16 | color: #444; 17 | font-family: Georgia, Palatino, 'Palatino Linotype', Times, 'Times New Roman', serif; 18 | font-size: 12px; 19 | line-height: 1.7; 20 | padding: 1em; 21 | margin: auto; 22 | max-width: 42em; 23 | background: #fefefe; 24 | } 25 | 26 | a { 27 | color: #0645ad; 28 | text-decoration: none; 29 | } 30 | 31 | a:visited { 32 | color: #0b0080; 33 | } 34 | 35 | a:hover { 36 | color: #06e; 37 | } 38 | 39 | a:active { 40 | color: #faa700; 41 | } 42 | 43 | a:focus { 44 | outline: thin dotted; 45 | } 46 | 47 | *::-moz-selection { 48 | background: rgba(255, 255, 0, 0.3); 49 | color: #000; 50 | } 51 | 52 | *::selection { 53 | background: rgba(255, 255, 0, 0.3); 54 | color: #000; 55 | } 56 | 57 | a::-moz-selection { 58 | background: rgba(255, 255, 0, 0.3); 59 | color: #0645ad; 60 | } 61 | 62 | a::selection { 63 | background: rgba(255, 255, 0, 0.3); 64 | color: #0645ad; 65 | } 66 | 67 | p { 68 | margin: 1em 0; 69 | } 70 | 71 | img { 72 | max-width: 100%; 73 | } 74 | 75 | h1, h2, h3, h4, h5, h6 { 76 | color: #111; 77 | line-height: 125%; 78 | margin-top: 2em; 79 | 
font-weight: normal; 80 | } 81 | 82 | h4, h5, h6 { 83 | font-weight: bold; 84 | } 85 | 86 | h1 { 87 | font-size: 2.5em; 88 | } 89 | 90 | h2 { 91 | font-size: 2em; 92 | } 93 | 94 | h3 { 95 | font-size: 1.5em; 96 | } 97 | 98 | h4 { 99 | font-size: 1.2em; 100 | } 101 | 102 | h5 { 103 | font-size: 1em; 104 | } 105 | 106 | h6 { 107 | font-size: 0.9em; 108 | } 109 | 110 | blockquote { 111 | color: #666666; 112 | margin: 0; 113 | padding-left: 3em; 114 | border-left: 0.5em #EEE solid; 115 | } 116 | 117 | hr { 118 | display: block; 119 | height: 2px; 120 | border: 0; 121 | border-top: 1px solid #aaa; 122 | border-bottom: 1px solid #eee; 123 | margin: 1em 0; 124 | padding: 0; 125 | } 126 | 127 | pre, code, kbd, samp { 128 | color: #000; 129 | font-family: monospace, monospace; 130 | _font-family: 'courier new', monospace; 131 | font-size: 0.98em; 132 | } 133 | 134 | pre { 135 | white-space: pre; 136 | white-space: pre-wrap; 137 | word-wrap: break-word; 138 | } 139 | 140 | b, strong { 141 | font-weight: bold; 142 | } 143 | 144 | dfn { 145 | font-style: italic; 146 | } 147 | 148 | ins { 149 | background: #ff9; 150 | color: #000; 151 | text-decoration: none; 152 | } 153 | 154 | mark { 155 | background: #ff0; 156 | color: #000; 157 | font-style: italic; 158 | font-weight: bold; 159 | } 160 | 161 | sub, sup { 162 | font-size: 75%; 163 | line-height: 0; 164 | position: relative; 165 | vertical-align: baseline; 166 | } 167 | 168 | sup { 169 | top: -0.5em; 170 | } 171 | 172 | sub { 173 | bottom: -0.25em; 174 | } 175 | 176 | ul, ol { 177 | margin: 1em 0; 178 | padding: 0 0 0 2em; 179 | } 180 | 181 | li p:last-child { 182 | margin-bottom: 0; 183 | } 184 | 185 | ul ul, ol ol { 186 | margin: .3em 0; 187 | } 188 | 189 | dl { 190 | margin-bottom: 1em; 191 | } 192 | 193 | dt { 194 | font-weight: bold; 195 | margin-bottom: .8em; 196 | } 197 | 198 | dd { 199 | margin: 0 0 .8em 2em; 200 | } 201 | 202 | dd:last-child { 203 | margin-bottom: 0; 204 | } 205 | 206 | img { 207 | border: 0; 208 | 
-ms-interpolation-mode: bicubic; 209 | vertical-align: middle; 210 | } 211 | 212 | figure { 213 | display: block; 214 | text-align: center; 215 | margin: 1em 0; 216 | } 217 | 218 | figure img { 219 | border: none; 220 | margin: 0 auto; 221 | } 222 | 223 | figcaption { 224 | font-size: 0.8em; 225 | font-style: italic; 226 | margin: 0 0 .8em; 227 | } 228 | 229 | table { 230 | margin-bottom: 2em; 231 | border-bottom: 1px solid #ddd; 232 | border-right: 1px solid #ddd; 233 | border-spacing: 0; 234 | border-collapse: collapse; 235 | } 236 | 237 | table th { 238 | padding: .2em 1em; 239 | background-color: #eee; 240 | border-top: 1px solid #ddd; 241 | border-left: 1px solid #ddd; 242 | } 243 | 244 | table td { 245 | padding: .2em 1em; 246 | border-top: 1px solid #ddd; 247 | border-left: 1px solid #ddd; 248 | vertical-align: top; 249 | } 250 | 251 | .author { 252 | font-size: 1.2em; 253 | text-align: center; 254 | } 255 | 256 | @media only screen and (min-width: 480px) { 257 | body { 258 | font-size: 14px; 259 | } 260 | } 261 | @media only screen and (min-width: 768px) { 262 | body { 263 | font-size: 16px; 264 | } 265 | } 266 | @media print { 267 | * { 268 | background: transparent !important; 269 | color: black !important; 270 | filter: none !important; 271 | -ms-filter: none !important; 272 | } 273 | 274 | body { 275 | font-size: 12pt; 276 | max-width: 100%; 277 | } 278 | 279 | a, a:visited { 280 | text-decoration: underline; 281 | } 282 | 283 | hr { 284 | height: 1px; 285 | border: 0; 286 | border-bottom: 1px solid black; 287 | } 288 | 289 | a[href]:after { 290 | content: " (" attr(href) ")"; 291 | } 292 | 293 | abbr[title]:after { 294 | content: " (" attr(title) ")"; 295 | } 296 | 297 | .ir a:after, a[href^="javascript:"]:after, a[href^="#"]:after { 298 | content: ""; 299 | } 300 | 301 | pre, blockquote { 302 | border: 1px solid #999; 303 | padding-right: 1em; 304 | page-break-inside: avoid; 305 | } 306 | 307 | tr, img { 308 | page-break-inside: avoid; 309 | } 310 | 
#!/usr/bin/perl

###### detectIS.pl
##
## Detects viral/plasmid integration sites (IS) in a host genome from PAF
## alignments of a read pair against both references (host genome and
## virus/plasmid).  Split reads are searched first; if none survive
## filtering, the script falls back to chimeric read pairs only.
##
## luigi.grassi@astrazeneca.com
##
## USAGE: perl detectIS.pl -h1 R1_gnm.paf -h2 R2_gnm.paf \
##                         -v1 R1_vir.paf -v2 R2_vir.paf -o prefix
################################################################################

use warnings ;
use strict ;
use diagnostics ;
use FindBin;                  # locate this script to locate pm files
use lib "$FindBin::Bin/" ;
use GeneralTools ;
use DetectSpltR ;
use DetectChmR ;
use Data::Dumper;
use List::Util qw( min max );
use Getopt::Long;

my $message_text = "detectIS usage:\nperl detectIS.pl -h1 name_mate1_gnm.paf -h2 name_mate2_gnm.paf -v1 name_mate1_vir.paf -v2 name_mate2_vir.paf -o name\n-h1: Aln results of R1 on host genome\n-h2: Aln results of R2 on host genome\n-v1: Aln results of R1 on virus/plasmid\n-v2: Aln results of R2 on virus/plasmid\n-o: Output prefix\nExtra options -mqual [1] -ovlwind [0.05]\n-mqual: minimum mapping quality [default 1, min:0 max:60]\n-ovlwind: Ovl tolerability window (fraction of the read length) [default 0.05, min:0 max:1]\n-mspr [default 2]";

# Defaults for the optional parameters.
my $mqual   = 1 ;     # minimum mapping quality (0-60)
my $mspr    = 2 ;     # minimum number of split reads supporting an IS
my $ovlwind = 0.05 ;  # overlap tolerability window, fraction of the read length
my $GNM1    = '' ;    # R1 vs host genome (PAF)
my $GNM2    = '' ;    # R2 vs host genome (PAF)
my $PLSM1   = '' ;    # R1 vs virus/plasmid (PAF)
my $PLSM2   = '' ;    # R2 vs virus/plasmid (PAF)
my $OUTPREF = '' ;    # output prefix

# BUG FIX: 'ovlwind=o' declared an extended-INTEGER option, which makes
# Getopt::Long reject fractional values such as the default 0.05;
# '=f' (real number) is the correct specifier.
GetOptions ('mspr=i'    => \$mspr,
            'mqual=i'   => \$mqual,
            'ovlwind=f' => \$ovlwind,
            'h1=s'      => \$GNM1,
            'h2=s'      => \$GNM2,
            'v1=s'      => \$PLSM1,
            'v2=s'      => \$PLSM2,
            'o=s'       => \$OUTPREF) ;

# All four alignment files and the output prefix are mandatory.
( $GNM1 ne '' && $GNM2 ne '' && $PLSM1 ne '' && $PLSM2 ne '' && $OUTPREF ne '' )
    || die $message_text ;

chomp $OUTPREF;
chomp $GNM1;
chomp $GNM2;
chomp $PLSM1;
chomp $PLSM2;

my $OUTPREFMD        = $OUTPREF.'.md' ;          # markdown report
my $OUTPREFTXT       = $OUTPREF.'.txt' ;         # tab-separated summary
my $OUTPREFSPREADTXT = $OUTPREF.'_SRlist.txt' ;  # per-read split-read list

# Three-argument open with lexical handles (was bareword two-argument open).
open(my $out_md,  '>', $OUTPREFMD)        || die "ERROR: not possible to open output file $OUTPREFMD\n" ;
open(my $out_txt, '>', $OUTPREFTXT)       || die "ERROR: not possible to open output file $OUTPREFTXT\n" ;
open(my $out_sr,  '>', $OUTPREFSPREADTXT) || die "ERROR: not possible to open output file $OUTPREFSPREADTXT\n" ;

# Report header: title macro consumed by the pandoc/LaTeX template, then the
# exact command line for reproducibility.
print $out_md '\\titleAT'."\n\n" ;
print $out_md '\\newpage'."\n\n" ;
print $out_md "# detectIS Results\n" ;
print $out_md "perl detectIS.pl -h1 $GNM1 -h2 $GNM2 -v1 $PLSM1 -v2 $PLSM2 -o $OUTPREF -mqual $mqual -ovlwind $ovlwind -mspr $mspr" ;
print $out_md "\n\n----\n\n\n" ;

print $out_txt "IS\tTotSpReads\tR1R2SpReads\tR1SpReads\tR2SpReads\tChimReads\tSingleSplitRead\tInterval\n" ;

print $out_sr "IS\tReadID\tSRType\n" ;

### Detect potential split reads in the genomic hits.
my $splt1 = DetectSpltR::detect_split_reads($GNM1, $mqual) ;
my $splt2 = DetectSpltR::detect_split_reads($GNM2, $mqual) ;

### Remove false split reads using the plasmidic alignments.
my $ishash1 = DetectSpltR::verify_split_reads($splt1, $PLSM1, $ovlwind) ;
my $ishash2 = DetectSpltR::verify_split_reads($splt2, $PLSM2, $ovlwind) ;

### Merge the results from both mates and filter by frequency.
my ($ca1, $ca2, $ca3) = GeneralTools::merge_split_pairs($ishash1, $ishash2, $mspr) ;
my %ishash  = %$ca1;  # Key1:IS -> number of fragments/reads supporting it
my %isres   = %$ca2;  # Key1:IS -> Key2:Read_ID -> read occurrence
my %iscoord = %$ca3;  # Key:IS  -> [Plm_chr, Plm_pos, Host_chr, Host_pos]

if ( scalar (keys %ishash) > 0 ) {
    my ($isfuss, $spreads) = DetectSpltR::verify_spreads_is($GNM1, $GNM2, $PLSM1, $PLSM2, \%isres, \%iscoord) ;
    my %hashres  = %$isfuss;
    my %hspreads = %$spreads;
    # Report integration sites from the most to the least supported.
    foreach my $n (sort { $ishash{$b} <=> $ishash{$a} } keys %ishash) {
        if (exists $hashres{$n}) {
            print $out_md "## $n\n" ;
            my $totsplit = $hashres{$n}{'R1'} + $hashres{$n}{'R2'} + $hashres{$n}{'R1R2'} ;
            print $out_md "SPLIT READS: $totsplit (R1R2:$hashres{$n}{'R1R2'}; R1:$hashres{$n}{'R1'}; R2:$hashres{$n}{'R2'})\n";
            print $out_md "CHIMERIC READS: $hashres{$n}{'CHIM'}\n";
            print $out_md "SINGLE SPLIT READ: $hashres{$n}{'RU'}\n";
            print $out_md "INTERVAL: $hashres{$n}{'INT'}\n";
            print $out_md "\n\n----\n\n\n" ;
            print $out_txt "$n\t$totsplit\t$hashres{$n}{'R1R2'}\t$hashres{$n}{'R1'}\t$hashres{$n}{'R2'}\t$hashres{$n}{'CHIM'}\t$hashres{$n}{'RU'}\t$hashres{$n}{'INT'}\n" ;
            # BUG FIX: the header of the _SRlist file declares the columns
            # "IS\tReadID\tSRType", but the read ID was printed first —
            # the IS now comes first, matching the header.
            foreach my $spr (keys %{$hspreads{$n}}) {
                print $out_sr "$n\t$spr\t$hspreads{$n}{$spr}\n";
            }
        }
    }
}
else {
    print $out_md "No split read identified! Looking only for chimeric reads\n" ;
    print $out_txt "No split read identified! Looking only for chimeric reads\n" ;

    # Detect genomic hits potentially belonging to a chimeric pair, then
    # confirm them against the plasmidic hits of the opposite mate.
    my ($chm1, $chm2) = DetectChmR::detect_chimeric_reads($GNM1, $GNM2);
    DetectChmR::filter_chimeric_reads($PLSM1, $PLSM2, $chm1, $chm2);

    # Count the confirmed pairs, collapsing coordinates into fixed-size bins.
    my ($ChimReads2, $len2) = DetectChmR::count_and_collapse_chimeric_reads($PLSM1, $GNM2, $chm2) ;
    my ($ChimReads1, $len1) = DetectChmR::count_and_collapse_chimeric_reads($PLSM2, $GNM1, $chm1) ;

    my %chmhash = ();
    my %ch1 = %$ChimReads1;
    my %ch2 = %$ChimReads2;

    foreach my $k (keys %ch1) {
        $chmhash{$k} += $ch1{$k} ;
    }
    foreach my $k (keys %ch2) {
        $chmhash{$k} += $ch2{$k} ;
    }

    # Keep only intervals supported by at least two chimeric pairs.
    foreach my $k (keys %chmhash) {
        if ($chmhash{$k} < 2) {
            delete $chmhash{$k} ;
        }
    }

    if ( scalar (keys %chmhash) > 0 ) {
        # BUG FIX: the bin width was always rebuilt from $len1, which is 0
        # when mate 1 contributed no chimeric reads; fall back to $len2 so
        # the reconstructed coordinates are not collapsed to zero.
        # NOTE(review): bins coming from the R2-side counts were created with
        # $len2*5; reconstructing every key with a single width is an
        # approximation inherited from the original design.
        my $binlen = ($len1 > 0) ? $len1 : $len2 ;
        foreach my $n (sort { $chmhash{$b} <=> $chmhash{$a} } keys %chmhash) {
            my @tmp  = split(/--/, $n) ;
            my @tmp1 = split(/:/, $tmp[0]) ;
            my @tmp2 = split(/:/, $tmp[1]) ;
            my $is1 = $tmp1[1] * $binlen * 5 ;
            my $is2 = $is1 + ($binlen * 5);
            my $is3 = $tmp2[1] * $binlen * 5 ;
            my $is4 = $is3 + ($binlen * 5);
            print $out_md  "## $tmp1[0]:$is1-$is2--$tmp2[0]:$is3-$is4\nCHIM:$chmhash{$n}\n" ;
            print $out_txt "## $tmp1[0]:$is1-$is2--$tmp2[0]:$is3-$is4\t0\t0\t0\t0\t$chmhash{$n}\t0\t0\n" ;
            print $out_md "\n\n----\n\n\n" ;
        }
    }
    else {
        print $out_md  "No chimeric hit identified!\n ## No IS found!\n" ;
        print $out_txt "No chimeric hit identified!\n ## No IS found!\n" ;
    }
}

close $out_md ;
close $out_txt ;
close $out_sr ;
package DetectChmR;

# Helper routines for detecting integration sites from chimeric read pairs
# (one mate mapping to the host genome, the other to the virus/plasmid).
#
# Input files are PAF alignments. Fields used here:
#   col 0 query name, col 1 query length, cols 5-8 target name/len/start/end,
#   col 9 residue matches, col 10 alignment block length, col 11 MAPQ.

use strict;
use warnings;
use List::Util qw( min max );
use POSIX;
use FindBin;                  # locate this script
use lib "$FindBin::Bin/";

################################################################################
######### SUB
################################################################################

################################################################################
##### Given two genomic PAF files (mate 1 and mate 2), identifies reads that
##### are potentially half of a chimeric pair: the read maps confidently with
##### ONE mate only, and the other mate is completely absent from the genomic
##### alignments. Returns two hash refs (candidates from PAF1, from PAF2),
##### mapping read name -> number of qualifying hits.
sub detect_chimeric_reads
{
    my ($paf1, $paf2) = @_;
    my %chimreads1 = ();   # confident genomic hits in PAF1
    my %chimf1     = ();   # non-confident hits in PAF1 (disqualify the read)
    my %chimreads2 = ();
    my %chimf2     = ();

    open(my $fh1, '<', $paf1) || die "ERROR in opening $paf1: please check the file name\n";
    while (my $line = <$fh1>) {
        chomp $line;
        if (length($line) > 0) {
            my @tmp = split("\t", $line);
            # Confident hit: >=90% of the read aligned, unique mapping
            # (MAPQ 60) and >99% matches over the aligned block.
            if ( ($tmp[10] >= ($tmp[1] * 0.90)) && ($tmp[11] == 60) && ($tmp[9] > ($tmp[10] * 0.99)) ) {
                $chimreads1{$tmp[0]}++;
            }
            else {
                $chimf1{$tmp[0]}++;
            }
        }
    }
    close $fh1;

    open(my $fh2, '<', $paf2) || die "ERROR in opening $paf2: please check the file name\n";
    while (my $line = <$fh2>) {
        chomp $line;
        if (length($line) > 0) {
            my @tmp = split("\t", $line);
            if ( ($tmp[10] >= ($tmp[1] * 0.90)) && ($tmp[11] == 60) && ($tmp[9] > ($tmp[10] * 0.99)) ) {
                $chimreads2{$tmp[0]}++;
            }
            else {
                $chimf2{$tmp[0]}++;
            }
        }
    }
    close $fh2;

    # Keep a PAF1 candidate only if its read never appears in PAF2 (either
    # confidently or not) and never had a non-confident hit in PAF1 itself;
    # symmetrically for PAF2.
    my %chimrd1 = ();
    my %chimrd2 = ();
    foreach my $k (keys %chimreads1) {
        if ( (not exists $chimreads2{$k}) && (not exists $chimf2{$k}) && (not exists $chimf1{$k}) ) {
            $chimrd1{$k} = $chimreads1{$k};
        }
    }
    foreach my $k (keys %chimreads2) {
        if ( (not exists $chimreads1{$k}) && (not exists $chimf1{$k}) && (not exists $chimf2{$k}) ) {
            $chimrd2{$k} = $chimreads2{$k};
        }
    }
    return (\%chimrd1, \%chimrd2);
}
################################################################################

##### Confirms the genomic candidates against the plasmidic alignments of the
##### OPPOSITE mate: a pair is chimeric when one mate hits the genome and the
##### other hits the plasmid, each confidently and exclusively. Candidates
##### that fail the check are deleted in place from the passed hash refs.
sub filter_chimeric_reads
{
    my ($paf1, $paf2, $potchimr1, $potchimr2) = @_;
    my %chimreads1 = ();
    my %chimreads2 = ();
    # BUG FIX: %chimf1/%chimf2 were undeclared package globals, leaking
    # state across calls; they are now lexical to this sub.
    my %chimf1 = ();
    my %chimf2 = ();

    open(my $fh1, '<', $paf1) || die "ERROR in opening $paf1: please check the file name\n";
    while (my $line = <$fh1>) {
        chomp $line;
        if (length($line) > 0) {
            my @tmp = split("\t", $line);
            # Plasmidic hit of mate 1 confirming a genomic candidate of mate 2.
            if ( (exists $potchimr2->{ $tmp[0] }) && ($tmp[10] >= ($tmp[1] * 0.90)) && ($tmp[9] > ($tmp[10] * 0.90)) ) {
                $chimreads1{$tmp[0]}++;
            }
            else {
                $chimf1{$tmp[0]}++;
            }
        }
    }
    close $fh1;

    open(my $fh2, '<', $paf2) || die "ERROR in opening $paf2: please check the file name\n";
    while (my $line = <$fh2>) {
        chomp $line;
        if (length($line) > 0) {
            my @tmp = split("\t", $line);
            if ( (exists $potchimr1->{ $tmp[0] }) && ($tmp[10] >= ($tmp[1] * 0.90)) && ($tmp[9] > ($tmp[10] * 0.90)) ) {
                $chimreads2{$tmp[0]}++;
            }
            else {
                $chimf2{$tmp[0]}++;
            }
        }
    }
    close $fh2;

    # Keep a candidate only if the opposite mate hits the plasmid confidently
    # and neither mate has any disqualifying plasmidic hit.
    foreach my $read (keys %$potchimr1) {
        unless ( (exists $chimreads2{$read}) && (not exists $chimreads1{$read}) && (not exists $chimf1{$read}) && (not exists $chimf2{$read}) ) {
            delete $potchimr1->{ $read };
        }
    }
    foreach my $read (keys %$potchimr2) {
        unless ( (exists $chimreads1{$read}) && (not exists $chimreads2{$read}) && (not exists $chimf2{$read}) && (not exists $chimf1{$read}) ) {
            delete $potchimr2->{ $read };
        }
    }
}

##### Counts the confirmed chimeric pairs, collapsing the midpoint of each
##### hit into bins of 5x the average read length. Returns a hash ref
##### "target1:bin1--target2:bin2" -> pair count, plus the average read length
##### used for the binning (0 when no read qualified).
sub count_and_collapse_chimeric_reads
{
    my ($paf1, $paf2, $potchimr) = @_;
    my %rpairs = ();  # read -> [tgt1, tlen1, start1, end1, tgt2, tlen2, start2, end2]
    my @rlen   = ();  # read lengths, used to derive the bin width

    open(my $fh1, '<', $paf1) || die "ERROR in opening $paf1: please check the file name\n";
    while (my $line = <$fh1>) {
        chomp $line;
        if (length($line) > 0) {
            my @tmp = split("\t", $line);
            if (exists $potchimr->{ $tmp[0] }) {
                push(@rlen, $tmp[1]);
                push(@{$rpairs{$tmp[0]}}, @tmp[5 .. 8]);
            }
        }
    }
    close $fh1;

    open(my $fh2, '<', $paf2) || die "ERROR in opening $paf2: please check the file name\n";
    while (my $line = <$fh2>) {
        chomp $line;
        if (length($line) > 0) {
            my @tmp = split("\t", $line);
            if (exists $potchimr->{ $tmp[0] }) {
                push(@rlen, $tmp[1]);
                push(@{$rpairs{$tmp[0]}}, @tmp[5 .. 8]);
            }
        }
    }
    close $fh2;

    my $len = 0;
    if (@rlen > 0) {
        # Deferred to runtime so the module loads without GeneralTools.pm;
        # average() is presumably the arithmetic mean of the array ref.
        require GeneralTools;
        $len  = GeneralTools::average(\@rlen);
        @rlen = ();
    }

    # Bin each alignment by the midpoint of its target interval; %rpairs is
    # non-empty only when @rlen was, so $len > 0 inside this loop.
    my %comb = ();
    foreach my $read (keys %rpairs) {
        my $v1   = floor( ($rpairs{$read}[2] + (($rpairs{$read}[3] - $rpairs{$read}[2]) / 2)) / ($len * 5) );
        my $v2   = floor( ($rpairs{$read}[6] + (($rpairs{$read}[7] - $rpairs{$read}[6]) / 2)) / ($len * 5) );
        my $int1 = $rpairs{$read}[0] . ":" . $v1;
        my $int2 = $rpairs{$read}[4] . ":" . $v2;
        $comb{"$int1--$int2"}++;
    }
    return (\%comb, $len);
}

1;
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Manual/MANUAL.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: DetectIS (v 0.1.1) 3 | author: Manual 4 | output: 5 | pdf_document: 6 | fig_caption: true 7 | number_sections: true 8 | --- 9 | 10 | 11 | \newpage 12 | 13 | \tableofcontents 14 | 15 | \newpage 16 | 17 | ```{r setup, include = FALSE} 18 | knitr::opts_chunk$set( 19 | collapse = TRUE, 20 | comment = "#>", 21 | library(knitr), 22 | library(formatR), 23 | knitr::opts_chunk$set(message=FALSE, warning=FALSE, tidy.opts=list(width.cutoff=60)) 24 | 25 | ) 26 | ``` 27 | 28 | DetectIS is a pipeline designed to detect the integration sites of exogenous DNA using DNA or RNA paired-end sequencing data. A [Singularity](https://sylabs.io/docs/) container has all the software necessary for the analysis. The workflow manager [nextflow](https://www.nextflow.io/) can be used to run the analysis locally or in an HPC environment. 
29 | 30 | # Installation 31 | 32 | You can use git to clone the detecIS repository from GitHub to your computer: 33 | 34 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 35 | git clone https://github.com/AstraZeneca/detectIS.git 36 | ``` 37 | or alternatively you can download the tarball from GitHub: 38 | 39 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 40 | wget https://github.com/AstraZeneca/detectIS/archive/0.1.1.tar.gz 41 | ``` 42 | 43 | and extract the files: 44 | 45 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 46 | tar -xvf 0.1.1.tar.gz 47 | ``` 48 | 49 | The repository is made of three directories: 50 | 51 | * Workflows; 52 | 53 | * utils; 54 | 55 | * TestDataset. 56 | 57 | The *Workflows* directory contains the *detectIS.nf* file, necessary to run the analysis with nextflow. 58 | The directory *TestDataset* contains fastq files and reference data that can be used to test detectIS. It also contains the script *Run_detectIS.sh*, that can be used to run the analysis without using nextflow. 59 | The directory utils contains the recipe *detectIS.rec*, necessary to build the Singularity container, and the bash and Perl script used to generate simulated data sets. 60 | 61 | The configuration file and the bash script have been written to analyse the example dataset present in the TestDataset directory and, can also be used as templates for other analyses. 62 | 63 | ## Prerequisites to run the analysis 64 | 65 | The detectIS software requires [Singularity](https://www.sylabs.io/docs/) version 2.6 or higher. 66 | The installation of [Nextflow](https://www.nextflow.io/), version 0.32.0.4897 or higher, is strongly advised. 67 | 68 | ### Creating a Singularity container 69 | 70 | [Singularity](https://www.sylabs.io/docs/) is a container platform that creates and runs containers with the required software in a way that is portable and reproducible. 
A container is a single file that can be generated using Singularity on a laptop, and executed on HPC clusters, local university or company clusters, servers or cloud. 71 | 72 | A Singularity container with all the necessary software is required to run the detectIS pipeline. 73 | The image can be created by using the recipe contained in the *utils* directory of the detectIS GitHub repository (https://github.com/AstraZeneca/detectIS/raw/master/utils/detectIS.rec). 74 | It is possible to generate a Singularity container from a recipe with the command: 75 | 76 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 77 | sudo singularity build detectIS.simg detectIS.rec 78 | ``` 79 | 80 | Superuser privileges are necessary only to create the container but not to use it. This means you can create the container on your local pc/workstation and copy it to the system where you run the detectIS analyses (e.g. your hpc or cluster). 81 | 82 | Singularity containers are [kernel-dependent](https://www.sylabs.io/guides/2.6/user-guide/faq.html?highlight=disk%20access#are-singularity-containers-kernel-dependent), this implies that the recipes contained in this project will not necessarily produce a container able to run on your HPC system. If none of the available recipes generates a container compatible with your system you might need to modify the recipe using an operating system with compatible kernel, please raise an issue on GitHub if this is the case and you need support for it. 83 | 84 | ### Nextflow 85 | 86 | [Nextflow](https://www.nextflow.io/) is a workflow manager that enables scalable and reproducible scientific workflows using software containers. It requires Bash 3.2 (or later) and Java 8 (or later, up to 15) and it is distributed as a self-contained executable package. See the [nextflow manual](https://www.nextflow.io/docs/latest/) for further information. 
87 | 88 | # Running the workflow using nextflow 89 | 90 | Once installed Singularity and Nextflow, if the container has been created and copied to the *utils* directory, you can run the nextflow workflow to analyse the test data set by using the command: 91 | 92 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 93 | nextflow run Workflows/detectIS.nf -c detectIS_TestDataset.conf --with-report \ 94 | detectIS_TestDataset_nextflow_report.html 95 | ``` 96 | The file *detectIS.nf* is the workflow for the detectIS analysis and *detectIS_TestDataset.conf* is the configuration file with all the information needed for the analysis. The workflow file is made by all the instructions used by nextflow for the analysis, it can be used without any change, unless you need to change the default parameters of the Perl script (see the *detectIS.pl script* section for further information). 97 | 98 | ## Structure of the configuration file 99 | The configuration file is made of two sections: the first one contains information of the HPC/cluster used for the analysis. 100 | 101 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 102 | params.project_name='Test_dataset-detectIS' 103 | ``` 104 | 105 | The variable *params.project_name* specifies the name of the project, in this case Test_dataset-detectIS. 106 | 107 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 108 | process.executor='sge' 109 | ``` 110 | 111 | The variable *process.executor* specifies the executor, there are different options depending by the HPC/cluster where you run the analysis (see https://www.nextflow.io/docs/latest/executor.html# for further information). If you run the process locally you don't need to specify it, or you can set it as 'local'. 
112 | 113 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 114 | process.queue = 'infini.q' 115 | ``` 116 | 117 | The variable *process.queue* specifies the queue to use, this depends by the used HPC/cluster. 118 | 119 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 120 | process.clusterOptions = '-S /bin/bash' 121 | ``` 122 | 123 | The variable *process.clusterOptions* specifies options specific of the used cluster, the one in the example forces the cluster to run the job with bash, in case it is not the default option. 124 | 125 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 126 | process.penv = 'smp' 127 | ``` 128 | 129 | The variable *process.penv* specifies the parallel environment to use submitting a parallel task to the SGE resource manager (see https://www.nextflow.io/docs/latest/process.html#penv for further info). 130 | 131 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 132 | params.scratch='/scratch/' 133 | ``` 134 | 135 | The variable *params.scratch* specifies the scratch directory. 136 | 137 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 138 | singularity.enabled = true 139 | process.container = "utils/detectIS.simg" 140 | ``` 141 | 142 | These variables are necessary to use Singularity and to specify the container file. 143 | 144 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 145 | singularity.cacheDir = "/scratch/" 146 | ``` 147 | The variable *singularity.cacheDir* specifies the singularity cache. 148 | 149 | The second part of the configuration file contains analysis specific information. 
150 | 151 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 152 | params.reads = "TestDataset/*_{R1,R2}.fq.gz" 153 | ``` 154 | 155 | The variable *params.reads* specifies the sequencing reads to process, the pattern used in the example is used to specify read pairs. 156 | 157 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 158 | params.outdir = "TestDataset/NextflowRes/" 159 | ``` 160 | 161 | The variable *params.outdir* specifies the output directory. 162 | 163 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 164 | params.cpu.minimap = 32 165 | ``` 166 | The variable *params.cpu.minimap* specifies the cpu used for the mapping with Minimap2. 167 | 168 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 169 | params.host_seq="TestDataset/Cricetulus_griseus_chok1gshd.CHOK1GS_HDv1.dna.toplevel_Scaffold0.fa" 170 | ``` 171 | The variable *params.host_seq specifies* the reference fasta file of the host genome. 172 | 173 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 174 | params.vir_seq="TestDataset/CelTag.fa" 175 | ``` 176 | 177 | The variable *params.vir_seq* specifies the reference fasta file of the exogenous element (plasmid, viral agent, etc.). 178 | 179 | 180 | # Test data sets and bash script 181 | 182 | In the directory "TestDataset" are contained paired-end reads and reference files to run a detectIS analysis.The dataset simulates the integration of a plasmid in the genome of Chinese hamster ovary cell line (CHOK1). 183 | 184 | ## Structure of the bash script 185 | A less valid alternative to Nextflow is a bash script able to run all the steps required by the pipeline. The directory *TestDataset* contains the bash script *Run_detectIS.sh* written for this aim. 
186 | 187 | The top part of the script specifies variables specific of the analysis: 188 | 189 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 190 | singimg="../utils/detectIS.simg" 191 | ``` 192 | The variable *singimg* specifies the singularity container used for the analysis 193 | 194 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 195 | gref="./Cricetulus_griseus_chok1gshd.CHOK1GS_HDv1.dna.toplevel_Scaffold0.fa" 196 | extragenome1="./CelTag.fa" 197 | ``` 198 | 199 | These variables specify respectively the host genome and the exogenous references 200 | 201 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 202 | reads=( ./Sim_R1.fq.gz ./Sim_R2.fq.gz ) 203 | ``` 204 | 205 | This array specifies the two fastq files to process 206 | 207 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 208 | mkdir -p Res 209 | name="./Res/SimRead" 210 | paramscpu=4 211 | ``` 212 | 213 | The second part of the script run the alignment to the host and exogenous reference and finally the detectIS.pl script to integrate the results. Finally the markdown file produced by the script is converted to pdf and html. 214 | 215 | ## detectIS.pl script 216 | The "Workflows" directory contains the detectIS.nf file, the nextflow workflow file and the "bin" sub directory with the detectIS.pl script and the modules with all the subroutines used by it. The script processes the 4 alignment results in paf format, looking for split and/or chimeric read pairs able to identify integration sites. 
217 | 218 | Script arguments: 219 | 220 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 221 | -h1 name_mate1_gnm.paf 222 | (Aligment results of R1 reads on the host genome) [Mandatory argument] 223 | -h2 name_mate2_gnm.paf 224 | (Aligment results of R2 reads on the host genome) [Mandatory argument] 225 | -v1 name_mate1_vir.paf 226 | (Aligment results of R1 reads on the exogenous reference) [Mandatory argument] 227 | -v2 name_mate2_vir.paf 228 | (Aligment results of R2 reads on the exogenous reference) [Mandatory argument] 229 | -o name_prefix (Output prefix) [Mandatory argument] 230 | -mqual 1 231 | (Minimum mapping quality to consider hits in the host genome) 232 | [Not mandatory, range: 0-60 default value: 1] 233 | -ovlwind 0.05 234 | (Overlap or distance, as fraction of the read length, tollerated to detect split reads) 235 | [Not mandatory, range: 0-1 default value: 0.05] 236 | -mspr 2 237 | (Minimum number of split reads to identify an integration site) 238 | [Not mandatory, range: 1+ default value: 2] 239 | ``` 240 | 241 | The script can be executed specifying the mandatory arguments and leaving as default the other arguments. Alternatively the ovlwind can be increased and the mspr can be reduced to increase the sensitivity of the tool, for experiments executed at low coverage. 242 | 243 | # Result interpretation 244 | 245 | The detectIS.pl script makes 2 final results: one text file (with the .txt extension) and one markdown file (with the .md extension). The text file can be visualized using the UNIX less command or edited by using any UNIX/MAC text editor like vim nano Emacs or in Windows, notepad. It can also be imported as spreadsheet in Excel or Open Office. The integration sites identified in the analysis are reported in rows with the following information: 246 | 247 | 1. IS: The integration site with chromosome and position of either host genome and exogenous element. 248 | 249 | 2. 
TotSpReads: The total number of split read pairs supporting the integration site. 250 | 251 | 3. R1R2SpReads: Number of split read pairs supporting the integration site with both read split. 252 | 253 | 4. R1SpReads: Number of split read pairs supporting the integration site having the R1 read split and the R2 read mapped within 5 read length of the integration site. 254 | 255 | 5. R2SpReads: Number of split read pairs supporting the integration site having the R2 read split and the R1 read mapped within 5 read length of the integration site. 256 | 257 | 6. ChimReads: The total number of chimeric read pairs supporting the integration site. 258 | 259 | 7. SingleSplitRead: Number of split read pairs made only one split read and the other not mapped within 5 read length of the integration site. 260 | 261 | 8. Interval: Extended interval supporting the integration site. It specifies the relative orientation of host genome and exogenous element and this information is fundamental to correctly design primers for PCR verification of the integration site. 262 | 263 | The same information are also contained in the markdown file that can be converted to pdf and/or html. In the directory Workflows/bin is contained a script to convert all the .md file present in the directory with the results to pdf and html files. 
264 | 265 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 266 | cd detectIS/Workflows/bin/ 267 | bash CreatePDFandHTML.sh /MyResultDirectory 268 | ``` 269 | -------------------------------------------------------------------------------- /Workflows/bin/DetectSpltR.pm: -------------------------------------------------------------------------------- 1 | package DetectSpltR; 2 | 3 | use List::Util qw( min max ); 4 | use POSIX; 5 | 6 | ################################################################################ 7 | #########SUB 8 | ################################################################################ 9 | 10 | ################################################################################ 11 | #####Passing a genomic PAF file looks for potential split reads 12 | sub detect_split_reads 13 | { 14 | my $paf = $_[0]; 15 | my $mq = $_[1]; 16 | my %spltreads=() ; 17 | my %blklist=() ; 18 | open(PAF , "<", $paf) || die "ERROR in opening $paf: please check the file name\n" ; 19 | while (my $line=) { 20 | chomp $line; 21 | if (length ($line) >0) { 22 | my @tmp =split("\t" , $line) ; 23 | $blklist{$tmp[0]}++ ; 24 | } 25 | } 26 | close PAF; 27 | open(PAF , "<", $paf) || die "ERROR in opening $paf: please check the file name\n" ; 28 | while (my $line=) { 29 | chomp $line; 30 | if (length ($line) >0) { 31 | my @tmp =split("\t" , $line) ; 32 | if ( ($blklist{$tmp[0]}==1) && ($tmp[10]<($tmp[1])) && ($tmp[11]>=$mq) ) { #Filters to consider potential Split Reads: Univocally mapped; mapping lenght shorter than read length and Mapping Quality >= $mq 33 | $spltreads{$tmp[0]}[0]=$tmp[2]; #First mapped pos 34 | $spltreads{$tmp[0]}[1]=$tmp[3]; #Last mapped pos 35 | $spltreads{$tmp[0]}[2]=$tmp[4]; #STRAND 36 | $spltreads{$tmp[0]}[3]=$tmp[5]; #CHR 37 | $spltreads{$tmp[0]}[4]=$tmp[7]; #START 38 | $spltreads{$tmp[0]}[5]=$tmp[8]; #STOP 39 | } 40 | } 41 | } 42 | %blklist=() ; 43 | return (\%spltreads); #1K: Read name; El: @ 44 | } 45 | 
################################################################################ 46 | 47 | ################################################################################ 48 | ######Passing a Plasmidic PAF file verify the potentially split reads identified in the genome 49 | sub verify_split_reads 50 | { 51 | my $splt1 = $_[0]; 52 | my $paf = $_[1]; 53 | my $ovlwin = $_[2]; 54 | my %isarray = (); 55 | my %vircounts = (); 56 | 57 | open(PAF , "<", $paf) || die "ERROR in opening $paf: please check the file name\n" ; 58 | while (my $line=) { 59 | chomp $line; 60 | if (length ($line) >0) { 61 | my @tmp =split("\t" , $line) ; 62 | if ( (exists $splt1->{ $tmp[0] }) ) { 63 | $vircounts{$tmp[0]}++ ; 64 | } 65 | } 66 | } 67 | close PAF ; 68 | open(PAF , "<", $paf) || die "ERROR in opening $paf: please check the file name\n" ; 69 | while (my $line=) { 70 | chomp $line; 71 | if (length ($line) >0) { 72 | my @tmp =split("\t" , $line) ; 73 | if ( (exists $splt1->{ $tmp[0] }) && $vircounts{$tmp[0]}==1 ) { 74 | my $rliminv=$tmp[1]*$ovlwin; #Overlap window shift allowed 75 | if ( ($tmp[2]> ($splt1->{ $tmp[0] }[1] - $rliminv) && ($tmp[2]< ($splt1->{ $tmp[0] }[1] + $rliminv))) ) { 76 | if ( ($splt1->{ $tmp[0] }[2] eq "+") && ($tmp[4] eq "+") ){ 77 | my $is=$tmp[5].":".$tmp[7]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[5] ; 78 | $isarray{$is}{$tmp[0]}++; 79 | } 80 | elsif ( ($splt1->{ $tmp[0] }[2] eq "+") && ($tmp[4] eq "-") ){ 81 | my $is=$tmp[5].":".$tmp[8]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[5] ; 82 | $isarray{$is}{$tmp[0]}++; 83 | } 84 | elsif ( ($splt1->{ $tmp[0] }[2] eq "-") && ($tmp[4] eq "+") ){ 85 | my $is=$tmp[5].":".$tmp[7]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[4] ; 86 | $isarray{$is}{$tmp[0]}++; 87 | } 88 | elsif ( ($splt1->{ $tmp[0] }[2] eq "-") && ($tmp[4] eq "-") ){ 89 | my $is=$tmp[5].":".$tmp[8]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[4] ; 90 | $isarray{$is}{$tmp[0]}++; 91 | } 92 | } 93 | elsif ( (($tmp[3] - 
$rliminv) < $splt1->{ $tmp[0] }[0] ) && (($tmp[3] + $rliminv) > $splt1->{ $tmp[0] }[0]) ) { 94 | if ( ($splt1->{ $tmp[0] }[2] eq "+") && ($tmp[4] eq "+") ){ 95 | my $is=$tmp[5].":".$tmp[8]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[4] ; 96 | $isarray{$is}{$tmp[0]}++; 97 | } 98 | elsif ( ($splt1->{ $tmp[0] }[2] eq "+") && ($tmp[4] eq "-") ){ 99 | my $is=$tmp[5].":".$tmp[7]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[4] ; 100 | $isarray{$is}{$tmp[0]}++; 101 | } 102 | elsif ( ($splt1->{ $tmp[0] }[2] eq "-") && ($tmp[4] eq "+") ){ 103 | my $is=$tmp[5].":".$tmp[8]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[5] ; 104 | $isarray{$is}{$tmp[0]}++; 105 | } 106 | elsif ( ($splt1->{ $tmp[0] }[2] eq "-") && ($tmp[4] eq "-") ){ 107 | my $is=$tmp[5].":".$tmp[7]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[5] ; 108 | $isarray{$is}{$tmp[0]}++; 109 | } 110 | 111 | } 112 | else { 113 | delete $splt1->{ $tmp[0] } ; 114 | } 115 | } 116 | } 117 | 118 | } 119 | close PAF ; 120 | $splt1=() ; 121 | return (\%isarray); #1K: IS; 2K: Supporting read 122 | } 123 | 124 | 125 | ################################################################################ 126 | ###### 2 genomic PAF file identifies potentially chimeric reads 127 | sub verify_spreads_is 128 | { 129 | my $pafg1 = $_[0]; #Genomic PAF Read1 130 | my $pafg2 = $_[1]; #Genomic PAF Read2 131 | my $pafv1 = $_[2]; #Viral PAF Read1 132 | my $pafv2 = $_[3]; #Viral PAF Read2 133 | my $ishash= $_[4]; #K1 IS; K2:Read_ID 134 | my $iscoord= $_[5]; #K: IS ->@ [VC] [VP] [GC] [GP] 135 | my %readstats = (); #K: Read_ID ->@ [G1 CHR] [G1 CO] [V1 CHR] [V1 CO] [G2 CHR] [G2 CO] [V2 CHR] [V2 CO] 136 | my %readcoord = (); #K1:Genomic_Chr; K2:Genomic_Pos -> IS 137 | my %readtois1 = (); #K:Read_ID -> IS 138 | my %readtois = (); #K:Read_ID -> IS 139 | my %pmreads1 = (); # 140 | my %pmreads2 = (); 141 | my %chmreads = (); 142 | my @t1 = (); 143 | my @t2 = (); 144 | foreach my $k (keys %{$ishash}) { 145 | foreach my $r (keys 
%{$ishash->{$k}}) { 146 | $readtois1{$r}{$k}++ ; 147 | } 148 | } 149 | 150 | foreach my $k (keys %{$ishash}) { 151 | my $vl=0; 152 | $readcoord{$iscoord->{$k}[2]}{$iscoord->{$k}[3]}=$k; 153 | foreach my $r (keys %{$ishash->{$k}}) { 154 | if (scalar (keys %{$readtois1{$r}}) ==1) { 155 | $vl++; 156 | $readtois{$r}=$k ; 157 | $readstats{$r}[0]=0; 158 | $readstats{$r}[1]=0; 159 | $readstats{$r}[2]=0; 160 | $readstats{$r}[3]=0; 161 | $readstats{$r}[4]=0; 162 | $readstats{$r}[5]=0; 163 | $readstats{$r}[6]=0; 164 | $readstats{$r}[7]=0; 165 | 166 | } 167 | else { 168 | delete $ishash->{$k}->{$r}; 169 | } 170 | } 171 | if ($vl==0) { 172 | delete $ishash->{$k}; 173 | } 174 | } 175 | open(PAF1 , "<", $pafg1) || die "ERROR in opening $pafg1: please check the file name\n" ; 176 | while (my $line=) { 177 | chomp $line; 178 | if (length ($line) >0) { 179 | my @tmp =split("\t" , $line) ; 180 | if (exists $readstats{$tmp[0]}) { 181 | if ( ($tmp[5] eq $iscoord->{$readtois{$tmp[0]}}[2]) && (($tmp[7]>=($iscoord->{$readtois{$tmp[0]}}[3]-($tmp[1]*5))) && ($tmp[8]<=($iscoord->{$readtois{$tmp[0]}}[3]+($tmp[1]*5)))) ) { 182 | $readstats{$tmp[0]}[0]=1; 183 | 184 | if ($tmp[7] == $iscoord->{$readtois{$tmp[0]}}[3]) { 185 | $readstats{$tmp[0]}[1]=1; 186 | push (@{$t1{$readtois{$tmp[0]}}}, $tmp[8]) ; 187 | } 188 | elsif ($tmp[8] == $iscoord->{$readtois{$tmp[0]}}[3]) { 189 | $readstats{$tmp[0]}[1]=1; 190 | push (@{$t1{$readtois{$tmp[0]}}}, $tmp[7]) ; 191 | 192 | } 193 | } 194 | } 195 | elsif (exists $readcoord{$tmp[5]}) { 196 | foreach my $k(keys %{$readcoord{$tmp[5]}}) { 197 | if ( ($tmp[7]>=($k-($tmp[1]*5))) && ($tmp[8]<=($k+($tmp[1]*5))) ) { # && ($tmp[10]>=(0.9*$tmp[1]))) { 198 | $pmreads1{$tmp[0]}{$readcoord{$tmp[5]}{$k}}=$tmp[4]; 199 | } 200 | } 201 | } 202 | } 203 | } 204 | close PAF1 ; 205 | open(PAF2 , "<", $pafg2) || die "ERROR in opening $pafg2: please check the file name\n" ; 206 | while (my $line=) { 207 | chomp $line; 208 | if (length ($line) >0) { 209 | my @tmp =split("\t" , 
$line) ; 210 | if (exists $readstats{$tmp[0]}) { 211 | if ( ($tmp[5] eq $iscoord->{$readtois{$tmp[0]}}[2]) && (($tmp[7]>=($iscoord->{$readtois{$tmp[0]}}[3]-($tmp[1]*5))) && ($tmp[8]<=($iscoord->{$readtois{$tmp[0]}}[3]+($tmp[1]*5)))) ) { 212 | $readstats{$tmp[0]}[4]=1; 213 | 214 | if ($tmp[7] == $iscoord->{$readtois{$tmp[0]}}[3]) { 215 | $readstats{$tmp[0]}[5]=1; 216 | push (@{$t1{$readtois{$tmp[0]}}}, $tmp[8]) ; 217 | } 218 | elsif ($tmp[8] == $iscoord->{$readtois{$tmp[0]}}[3]) { 219 | $readstats{$tmp[0]}[5]=1; 220 | push (@{$t1{$readtois{$tmp[0]}}}, $tmp[7]) ; 221 | } 222 | } 223 | } 224 | elsif (exists $readcoord{$tmp[5]}) { 225 | foreach my $k(keys %{$readcoord{$tmp[5]}}) { 226 | if ( ($tmp[7]>=($k-($tmp[1]*5))) && ($tmp[8]<=($k+($tmp[1]*5))) ) { #&& ($tmp[10]>=(0.9*$tmp[1]))) { 227 | $pmreads2{$tmp[0]}{$readcoord{$tmp[5]}{$k}}=$tmp[4]; 228 | } 229 | } 230 | } 231 | } 232 | } 233 | close PAF2 ; 234 | open(PAF3 , "<", $pafv1) || die "ERROR in opening $pafv1: please check the file name\n" ; 235 | while (my $line=) { 236 | chomp $line; 237 | if (length ($line) >0) { 238 | my @tmp =split("\t" , $line) ; 239 | if (exists $readstats{$tmp[0]}) { 240 | if ( ($tmp[5] eq $iscoord->{$readtois{$tmp[0]}}[0]) && (($tmp[7]>=($iscoord->{$readtois{$tmp[0]}}[1]-($tmp[1]*5))) && ($tmp[8]<=($iscoord->{$readtois{$tmp[0]}}[1]+($tmp[1]*5)))) ) { 241 | $readstats{$tmp[0]}[2]=1; 242 | if ($tmp[7] == $iscoord->{$readtois{$tmp[0]}}[1]) { 243 | $readstats{$tmp[0]}[3]=1; 244 | push (@{$t2{$readtois{$tmp[0]}}}, $tmp[8]) ; 245 | } 246 | elsif ($tmp[8] == $iscoord->{$readtois{$tmp[0]}}[1]) { 247 | $readstats{$tmp[0]}[3]=1; 248 | push (@{$t2{$readtois{$tmp[0]}}}, $tmp[7]) ; 249 | } 250 | } 251 | 252 | } 253 | elsif (exists $pmreads2{$tmp[0]}) { 254 | foreach my $is(keys %{$pmreads2{$tmp[0]}} ) { 255 | if ( ($tmp[5] eq $iscoord->{$is}[0]) && ($tmp[7]>=($iscoord->{$is}[1]-($tmp[1]*5))) && ($tmp[8]<=($iscoord->{$is}[1]+($tmp[1]*5))) && ($tmp[4] ne $pmreads2{$tmp[0]}{$is}) ) { 256 | 
$chmreads{$is}{$tmp[0]}++; 257 | 258 | } 259 | } 260 | } 261 | } 262 | } 263 | close PAF3 ; 264 | open(PAF4 , "<", $pafv2) || die "ERROR in opening $pafv2: please check the file name\n" ; 265 | while (my $line=) { 266 | chomp $line; 267 | if (length ($line) >0) { 268 | my @tmp =split("\t" , $line) ; 269 | if (exists $readstats{$tmp[0]}) { 270 | if ( ($tmp[5] eq $iscoord->{$readtois{$tmp[0]}}[0]) && (($tmp[7]>=($iscoord->{$readtois{$tmp[0]}}[1]-($tmp[1]*5))) && ($tmp[8]<=($iscoord->{$readtois{$tmp[0]}}[1]+($tmp[1]*5)))) ) { 271 | $readstats{$tmp[0]}[6]=1; 272 | if ($tmp[7] == $iscoord->{$readtois{$tmp[0]}}[1]) { 273 | $readstats{$tmp[0]}[7]=1; 274 | push(@{$t2{$readtois{$tmp[0]}}}, $tmp[8]) ; 275 | } 276 | elsif ($tmp[8] == $iscoord->{$readtois{$tmp[0]}}[1]) { 277 | $readstats{$tmp[0]}[7]=1; 278 | push(@{$t2{$readtois{$tmp[0]}}}, $tmp[7]) ; 279 | } 280 | 281 | } 282 | 283 | } 284 | elsif (exists $pmreads1{$tmp[0]}) { 285 | foreach my $is(keys %{$pmreads1{$tmp[0]}} ) { 286 | if ( ($tmp[5] eq $iscoord->{$is}[0]) && ($tmp[7]>=($iscoord->{$is}[1]-($tmp[1]*5))) && ($tmp[8]<=($iscoord->{$is}[1]+($tmp[1]*5))) && ($tmp[4] ne $pmreads1{$tmp[0]}{$is}) ) { 287 | $chmreads{$is}{$tmp[0]}++; 288 | 289 | } 290 | } 291 | } 292 | } 293 | } 294 | close PAF4 ; 295 | my $verifis=(); 296 | my $spreads=(); 297 | foreach my $k (keys %{$ishash}) { 298 | my $chim= scalar keys(%{$chmreads{$k}}) ; 299 | my $R1R2=0; 300 | my $R1=0; 301 | my $R2=0; 302 | my $R0=0; 303 | foreach my $r (keys %{$ishash->{$k}}) { 304 | if ($readstats{$r}[0]==1 && $readstats{$r}[1]==1 && $readstats{$r}[2]==1 && $readstats{$r}[3]==1 && $readstats{$r}[4]==1 && $readstats{$r}[5]==1 && $readstats{$r}[6]==1 && $readstats{$r}[7]==1) { 305 | $R1R2++ ; 306 | $spreads{$k}{$r}="R1R2" ; 307 | } 308 | elsif ( ($readstats{$r}[0]==1 && $readstats{$r}[1]==1 && $readstats{$r}[2]==1 && $readstats{$r}[3]==1) && ( ($readstats{$r}[4]==1 && $readstats{$r}[6]==0) || ($readstats{$r}[4]==0 && $readstats{$r}[6]==1) ) ) { 309 | $R1++ ; 310 | 
$spreads{$k}{$r}="R1" ; 311 | } 312 | elsif ( ($readstats{$r}[4]==1 && $readstats{$r}[5]==1 && $readstats{$r}[6]==1 && $readstats{$r}[7]==1) && ( ($readstats{$r}[0]==1 && $readstats{$r}[2]==0) || ($readstats{$r}[0]==0 && $readstats{$r}[2]==1) ) ) { 313 | $R2++ ; 314 | $spreads{$k}{$r}="R2" ; 315 | } 316 | else { 317 | $R0++ ; 318 | $spreads{$k}{$r}="R0" ; 319 | } 320 | } 321 | if (exists $t2{$k} && $t1{$k} ) { 322 | $verifis{$k}{"R1R2"}=$R1R2 ; 323 | $verifis{$k}{"R1"}=$R1 ; 324 | $verifis{$k}{"R2"}=$R2 ; 325 | $verifis{$k}{"CHIM"}=$chim ; 326 | $verifis{$k}{"RU"}=$R0 ; 327 | my @isc=split(/--/, $k) ; 328 | my @iscp=split(/:/, $isc[0]) ; 329 | my @iscg=split(/:/, $isc[1]) ; 330 | my $intp=max(@{$t2{$k}}) ; 331 | if ($intp<=$iscp[1]) { 332 | $intp=min(@{$t2{$k}}) ; 333 | } 334 | my $intg=max(@{$t1{$k}}) ; 335 | if ($intg <= $iscg[1]) { 336 | $intg=min(@{$t1{$k}}) ; 337 | } 338 | my $interval="$iscp[0]:$iscp[1]-$intp--$iscg[0]:$iscg[1]-$intg"; 339 | $verifis{$k}{"INT"}=$interval; 340 | } 341 | 342 | } 343 | return(\%verifis, \%spreads) ; 344 | 345 | } 346 | 347 | 348 | ################################################################################ 349 | 350 | 1; 351 | --------------------------------------------------------------------------------