├── Manual ├── MANUAL.pdf └── MANUAL.rmd ├── TestDataset ├── Sim_R1.fq.gz ├── Sim_R2.fq.gz ├── Run_detectIS.sh ├── .nextflow.log └── CelTag.fa ├── Workflows ├── bin │ ├── detectIS.png │ ├── CreatePDFandHTML.sh │ ├── detectISlisting.tex │ ├── GeneralTools.pm │ ├── detectISlisting.css │ ├── detectIS.pl │ ├── DetectChmR.pm │ └── DetectSpltR.pm └── detectIS.nf ├── AUTHORS.md ├── utils ├── Generate_Integrations.sh ├── SimulateIntegration.pl └── detectIS.rec ├── detectIS_TestDataset.conf ├── CONTRIBUTING.md ├── README.md └── LICENSE /Manual/MANUAL.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraZeneca/detectIS/HEAD/Manual/MANUAL.pdf -------------------------------------------------------------------------------- /TestDataset/Sim_R1.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraZeneca/detectIS/HEAD/TestDataset/Sim_R1.fq.gz -------------------------------------------------------------------------------- /TestDataset/Sim_R2.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraZeneca/detectIS/HEAD/TestDataset/Sim_R2.fq.gz -------------------------------------------------------------------------------- /Workflows/bin/detectIS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AstraZeneca/detectIS/HEAD/Workflows/bin/detectIS.png -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | # Authors 2 | 3 | * [Luigi Grassi](mailto:luigi.grassi@astrazeneca.com) 4 | * [Claire Harris](mailto:claire.harris@astrazeneca.com) 5 | * [Jie Zhu](mailto:jie.zhu5@astrazeneca.com) 6 | * [Colin Hardman](mailto:colin.hardman@astrazeneca.com) 7 | * [Diane 
Hatton](mailto:diane.hatton@astrazeneca.com) 8 | 9 | # Maintainer 10 | 11 | * [Luigi Grassi](https://github.com/luigra) 12 | -------------------------------------------------------------------------------- /Workflows/bin/CreatePDFandHTML.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | DIR=$1 4 | 5 | wd="$( cd "$(dirname "$( readlink -f ${BASH_SOURCE[0]} )")" >/dev/null 2>&1 && pwd)" 6 | 7 | rep_head="${wd}/detectISlisting.tex" 8 | rep_css="${wd}/detectISlisting.css" 9 | 10 | singimg=${wd}"/../../utils/detectIS.simg" 11 | 12 | array=($(ls -d $DIR*.md)) 13 | 14 | 15 | for name in "${array[@]}" 16 | do 17 | name=${name%".md"} 18 | singularity exec $singimg bash -c "pandoc ${name}.md --listings -H ${rep_head} -o ${name}.pdf" 19 | singularity exec $singimg bash -c "pandoc --css=${rep_css} --mathjax --to=html5 ${name}.md -o ${name}.html" 20 | done 21 | -------------------------------------------------------------------------------- /utils/Generate_Integrations.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | #Script designed to simulate different plasmid integrations in a genomic reference sequence 4 | 5 | fastadir="../TestDataset/SimIntegration/simFASTA/" 6 | mkdir -p ${fastadir} 7 | 8 | perl SimulateIntegration.pl ../TestDataset/Cricetulus_griseus_chok1gshd.CHOK1GS_HDv1.dna.toplevel_Scaffold0.fa ../TestDataset/CelTag.fa 0.51 > ${fastadir}Scaffold0_0.5C_IS.fa 9 | perl SimulateIntegration.pl ../TestDataset/Cricetulus_griseus_chok1gshd.CHOK1GS_HDv1.dna.toplevel_Scaffold0.fa ../TestDataset/CelTag.fa 1.03 > ${fastadir}Scaffold0_1C_IS.fa 10 | perl SimulateIntegration.pl ../TestDataset/Cricetulus_griseus_chok1gshd.CHOK1GS_HDv1.dna.toplevel_Scaffold0.fa ../TestDataset/CelTag.fa 2.05 > ${fastadir}Scaffold0_2C_IS.fa 11 | -------------------------------------------------------------------------------- /detectIS_TestDataset.conf: 
-------------------------------------------------------------------------------- 1 | params.project_name='Test_dataset-detectIS' 2 | //The variable params.project_name specifies the name of the project 3 | 4 | //CLUSTER SETTINGS 5 | process.executor='sge' 6 | //The variable process.executor specifies the executor (see https://www.nextflow.io/docs/latest/executor.html# for further information) 7 | process.queue = 'infini.q' 8 | //The variable process.queue specifies the queue to use 9 | process.clusterOptions = '-S /bin/bash' 10 | //The variable process.clusterOptions specifies options specific of the used cluster 11 | process.penv = 'smp' 12 | //The variable process.penv specifies the parallel environment to be used when submitting a parallel task to the SGE resource manager (see https://www.nextflow.io/docs/latest/process.html#penv for further info) 13 | params.scratch='/scratch/' 14 | //The variable params.scratch specifies the scratch directory 15 | 16 | 17 | //SINGULARITY SETTINGS 18 | singularity.enabled = true 19 | process.container = "utils/detectIS.simg" 20 | //The variable process.container specifies the singularity image to use 21 | singularity.cacheDir = "/scratch/" 22 | //The variable singularity.cacheDir specifies the singularity.cacheDir 23 | 24 | 25 | //ANALYSIS SPECIFIC PARAMETERS: INPUT AND OUTPUT DIRECTORY 26 | params.reads = "TestDataset/*_{R1,R2}.fq.gz" 27 | //The variable params.reads specifies the sequencing reads 28 | params.outdir = "TestDataset/NextflowRes/" 29 | //The variable params.outdir specifies the output directory 30 | 31 | //ANALYSIS SPECIFIC PARAMETERS: MAPPING PARAMETERS 32 | params.cpu.minimap=32 33 | //The variable params.cpu.minimap specifies the cpu used for the mapping with Minimap2 34 | params.host_seq="TestDataset/Cricetulus_griseus_chok1gshd.CHOK1GS_HDv1.dna.toplevel_Scaffold0.fa" 35 | //The variable params.host_seq specifies the reference fasta file of the host genome 36 | params.vir_seq="TestDataset/CelTag.fa" 37 | 
//The variable params.vir_seq specifies the reference fasta file of the exogenous element (plasmid, viral agent, etc.) 38 | 39 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to detectIS 2 | 3 | This document contains the guidelines to contribute to the detectIS project, hosted in GitHub at [https://github.com/AstraZeneca/detectIS](https://github.com/AstraZeneca/detectIS) 4 | 5 | :pray: First of all, thank you so much for contributing! :pray: 6 | 7 | 8 | ## How can I contribute ? 9 | 10 | Please use the GitHub [issue](https://guides.github.com/features/issues/) tracker to share with maintainers and users questions, bug reports and feature requests. 11 | 12 | ### Related issues 13 | 14 | Before submitting an new issue request please check whether the same or a similar topic has already been treated in the existing closed issues at [https://github.com/AstraZeneca/detectIS/issues](https://github.com/AstraZeneca/detectIS/issues). 15 | 16 | If you are not able to find a solution to your problem in the existing closed issues please open a new one and, if possible, include a link to any related existing issue in the body of your new request. 17 | 18 | If your request is already treated in an open issue please do not raise a new issue request but add your comments to the existing open issue. 19 | 20 | #### Bug Report 21 | 22 | We really appreciate bug reports: 23 | 24 | please check if you can reproduce the problem in the latest version of the program and with default parameters. Please report in details the problem and include all the available information (command used, system information, environment variables, error messages, log files, reproducibility of the error, etc.). 
25 | 26 | 27 | ### Suggesting Enhancements 28 | 29 | Authors and maintainers are very open to requests of new features and minor improvements to existing functionality. Please describe and motivate your request, sharing test data sets if possible. 30 | 31 | 32 | ### Pull Requests 33 | 34 | If you want to submit a pull request please follow the "fork-and-pull" Git workflow and provide detailed code documentation and commit messages. 35 | 36 | 1. **Fork** the repo on GitHub 37 | 2. **Clone** the project to your local machine 38 | 3. **Commit** changes to your own branch 39 | 4. **Push** your work back up to your fork 40 | 5. Submit a **Pull request** so that we can review your changes 41 | 42 | -------------------------------------------------------------------------------- /TestDataset/Run_detectIS.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | #Script designed to run detectIS using 4 | #a host genome and a viral genome as references 5 | 6 | singimg="../utils/detectIS.simg" 7 | 8 | gref="./Cricetulus_griseus_chok1gshd.CHOK1GS_HDv1.dna.toplevel_Scaffold0.fa" 9 | extragenome1="./CelTag.fa" 10 | 11 | reads=( ./Sim_R1.fq.gz ./Sim_R2.fq.gz ) 12 | 13 | mkdir -p Res 14 | name="./Res/SimRead" 15 | paramscpu=4 16 | 17 | 18 | echo $(date) 19 | echo "Starting minimap2 mapping on viral ref" 20 | 21 | #Mapping Reads onto the plasmid and extracting the mapped ones 22 | /usr/local/singularity/bin/singularity exec $singimg bash -c "minimap2 -x sr -c ${extragenome1} ${reads[0]} -t ${paramscpu} > ${name}_mate1_vir.paf" 23 | /usr/local/singularity/bin/singularity exec $singimg bash -c "minimap2 -x sr -c ${extragenome1} ${reads[1]} -t ${paramscpu} > ${name}_mate2_vir.paf" 24 | /usr/local/singularity/bin/singularity exec $singimg bash -c "parallel -k cut -f1 ::: ${name}_mate1_vir.paf ${name}_mate2_vir.paf > ${name}_vir.lst" 25 | /usr/local/singularity/bin/singularity exec $singimg bash -c "sort --parallel ${paramscpu} 
${name}_vir.lst | uniq > ${name}_vir_mapping.lst" 26 | /usr/local/singularity/bin/singularity exec $singimg parallel --link seqtk subseq {1} ${name}_vir_mapping.lst ">" {2} ::: ${reads[0]} ${reads[1]} ::: ${name}_vir_R1.fq ${name}_vir_R2.fq 27 | 28 | echo $(date) 29 | echo "minimap2 mapping on plasmid ref completed, mapping reads on host genome" 30 | #Mapping Reads onto the host genome and run detectIS 31 | /usr/local/singularity/bin/singularity exec $singimg bash -c "minimap2 -x sr -c ${gref} ${name}_vir_R1.fq -t ${paramscpu} > ${name}_mate1_gnm.paf" 32 | /usr/local/singularity/bin/singularity exec $singimg bash -c "minimap2 -x sr -c ${gref} ${name}_vir_R2.fq -t ${paramscpu} > ${name}_mate2_gnm.paf" 33 | 34 | echo $(date) 35 | echo "minimap2 mapping complete, runnig detectIS" 36 | /usr/local/singularity/bin/singularity exec $singimg bash -c "perl ../Workflows/bin/detectIS.pl -h1 ${name}_mate1_gnm.paf -h2 ${name}_mate2_gnm.paf -v1 ${name}_mate1_vir.paf -v2 ${name}_mate2_vir.paf -o ${name}" 37 | 38 | echo $(date) 39 | echo "detectIS complete" 40 | 41 | ln -s ../Workflows/bin/detectIS.png detectIS.png 42 | ../Workflows/bin/CreatePDFandHTML.sh Res/ 43 | rm detectIS.png 44 | 45 | 46 | -------------------------------------------------------------------------------- /utils/SimulateIntegration.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | use diagnostics ; 5 | 6 | #perl SimulateIntegration.pl 7 | # 8 | # 9 | # 10 | ################################################ 11 | 12 | (defined $ARGV[2]) || die "ERROR: please specify Chromosome fasta file, plasmid fasta file and number of inserted copies\n" ; 13 | 14 | my $genome = $ARGV[0]; 15 | my $plm = $ARGV[1]; 16 | my $plmcp = $ARGV[2]; 17 | 18 | ($plmcp=~ /^\d+\.?\d*$/ && $plmcp > 0 && $plmcp<=5) || die "ERROR: please number of inserted copies has to be a numeric value [0.1-5]\n" ; 19 | 20 | my $chrseq=''; 21 | my $chrname=''; 22 | 
my $isgname=''; 23 | open(FILE, "<$genome") || die "ERROR: please check the $genome file\n" ; 24 | while (my $line= <FILE>) { 25 | chomp $line; 26 | if (length ($line)>0) { 27 | if ($line=~/^>/) { 28 | $chrname=$line; 29 | my @tmp=split(/\s+/, $line) ; 30 | $isgname=$tmp[0] ; 31 | $isgname=~s/>//; 32 | 33 | } 34 | else { 35 | $chrseq.=$line ; 36 | } 37 | } 38 | } 39 | close FILE ; 40 | 41 | 42 | 43 | my $plmseq=''; 44 | my $plname=''; 45 | open(FILEP, "<$plm") || die "ERROR: please check $plm file\n" ; 46 | while (my $line= <FILEP>) { 47 | chomp $line; 48 | if (length ($line)>0) { 49 | if ($line=~/^>/) { 50 | my @tmp=split(/ /, $line) ; 51 | $plname=$tmp[0] ; 52 | $plname=~s/>//; 53 | } 54 | else { 55 | $plmseq.=$line ; 56 | } 57 | } 58 | } 59 | close FILEP ; 60 | 61 | 62 | my $finalenght=int($plmcp*(length($plmseq))) ; 63 | my $V1=int(rand(length($plmseq))); 64 | 65 | my $mcplm1=substr($plmseq, $V1) ; 66 | my $mcplm2=$mcplm1.$plmseq.$plmseq.$plmseq.$plmseq.$plmseq ; 67 | my $mcplm=substr($mcplm2, 0, $finalenght ) ; 68 | my $V2; 69 | if ($finalenght <= length($mcplm1)) { 70 | $V2=$V1+($finalenght-1) ; 71 | } 72 | else { 73 | $V2=($finalenght-(length($mcplm1))-1) ; 74 | until ($V2<= length($plmseq)) { 75 | $V2=$V2-length($plmseq); 76 | } 77 | } 78 | 79 | my $G1 = length($chrseq); 80 | my $G2 = length($chrseq) ; 81 | until ($G1<length($chrseq) && $G1>0) { 82 | $G1=int( (length($chrseq)/2) -rand(1000)+rand(1000) ) ; 83 | } 84 | until ($G2<length($chrseq) && $G2>$G1) { 85 | $G2=int($G1+(rand(1)*length($mcplm))); 86 | } 87 | print ">$isgname $isgname:$G1--$plname:$V1 $plname:$V2--$isgname:$G2 | " ; 88 | 89 | my $mcgnm1=substr($chrseq,0, $G1) ; 90 | my $mcgnm2=substr($chrseq,$G2) ; 91 | 92 | my $mcgnm=$mcgnm1.$mcplm.$mcgnm2; 93 | my $flen=length($mcgnm); 94 | my $fleni=length($mcplm); 95 | print "Total_len:$flen | Plasmid_len:$fleni\n"; 96 | print "$mcgnm\n"; 97 | 98 | 99 | -------------------------------------------------------------------------------- /Workflows/bin/detectISlisting.tex: 
-------------------------------------------------------------------------------- 1 | %Contents of listings-setup.tex 2 | \usepackage{graphics} 3 | \usepackage{listings} 4 | \usepackage{forloop} 5 | \usepackage{wrapfig} 6 | \usepackage{datetime} 7 | \usepackage{graphicx} 8 | % \usepackage[abspath]{currfile} 9 | 10 | %\graphicspath{{\currfileabsdir}} 11 | 12 | \newcounter{ct} 13 | \makeatletter 14 | % Since we don't want an initial comma in the list, we use \@gobble 15 | \def\optionslist{\@gobble} 16 | % Using the counter ct, range its value from 0 to 191 17 | \forloop{ct}{0}{\value{ct} < 27}{% 18 | % \edef expands the definition before it actually 19 | % binds the two. Thus, \rlist expands like a chameleon's tongue. 20 | \edef\optionslist{\optionslist,-\alph{ct}, -\Alph{ct}} 21 | } 22 | 23 | 24 | \makeatother 25 | \def\listwithrs#1\relax{% 26 | \lstdefinelanguage{ngs}{alsoletter={--,-,.,>,&}, % also: alsoletter 27 | }} 28 | \expandafter\listwithrs\optionslist\relax 29 | 30 | \usepackage{listings} 31 | \usepackage{upquote} %to force Latex not substitute ' by ` 32 | 33 | \lstset{ 34 | language=ngs, 35 | basicstyle=\ttfamily, 36 | keywordstyle=\color[rgb]{0.13,0.29,0.53}\bfseries, 37 | keywordstyle=[2]\color[rgb]{0.56,0.35,0.01}\bfseries, 38 | stringstyle=\color[rgb]{0.31,0.60,0.02}, 39 | commentstyle=\color[rgb]{0.56,0.35,0.01}\itshape, 40 | numberstyle=\footnotesize, 41 | stepnumber=1, 42 | numbersep=5pt, 43 | backgroundcolor=\color[RGB]{248,248,248}, 44 | showspaces=false, 45 | showstringspaces=false, 46 | showtabs=false, 47 | tabsize=2, 48 | captionpos=b, 49 | breaklines=true, 50 | breakatwhitespace=true, 51 | breakautoindent=true, 52 | escapeinside={\%*}{*)}, 53 | linewidth=0.9\textwidth, 54 | basewidth=0.5em, 55 | columns=fullflexible 56 | } 57 | 58 | 59 | 60 | \newdateformat{monthyeardate}{\monthname[\THEMONTH], \THEYEAR} 61 | \newcommand*{\customrule}[1]{\textcolor{#1}{\rule{\textwidth}{1.5pt}}\par} 62 | 63 | \newcommand*{\titleAT}{% 64 | \begingroup 65 | 
\newlength{\drop} 66 | \drop=0.1\textheight % White sapce generated is 10% of the total text height 67 | 68 | \customrule{blue} 69 | \customrule{blue} 70 | 71 | 72 | \vspace{\drop} 73 | \centering 74 | \textcolor[rgb]{0,1,0}{{\Huge detectIS}\\[0.5\baselineskip]} 75 | 76 | \vspace{0.25\drop} 77 | \rule{0.3\textwidth}{0.4pt}\par % Short horizontal line under the title 78 | \vspace{\drop} 79 | 80 | 81 | \begin{figure}[h!] 82 | \centering 83 | \includegraphics[width=0.4\columnwidth]{detectIS.png} 84 | \end{figure} 85 | 86 | 87 | \vfill 88 | {\large \textsc{detectIS data analysis\\ 89 | \monthyeardate\today }}\par % Publisher 90 | 91 | \vspace*{\drop} 92 | \customrule{blue} 93 | \customrule{blue} 94 | \endgroup} 95 | -------------------------------------------------------------------------------- /Workflows/bin/GeneralTools.pm: -------------------------------------------------------------------------------- 1 | package GeneralTools; 2 | 3 | use List::Util qw( min max ); 4 | use POSIX; 5 | 6 | ################################################################################ 7 | #########SUB 8 | ################################################################################ 9 | 10 | sub average{ 11 | my($data) = @_; 12 | if (not @$data) { 13 | die("Empty arrayn"); 14 | } 15 | my $total = 0; 16 | foreach (@$data) { 17 | $total += $_; 18 | } 19 | my $average = $total / @$data; 20 | return $average; 21 | } 22 | 23 | sub stdev{ 24 | my($data) = @_; 25 | if(@$data == 1){ 26 | return 0; 27 | } 28 | my $average = &average($data); 29 | my $sqtotal = 0; 30 | foreach(@$data) { 31 | $sqtotal += ($average-$_) ** 2; 32 | } 33 | my $std = ($sqtotal / (@$data-1)) ** 0.5; 34 | return $std; 35 | } 36 | 37 | sub median 38 | { 39 | my @vals = sort {$a <=> $b} @_; 40 | my $len = @vals; 41 | if($len%2) #odd? 
42 | { 43 | return $vals[int($len/2)]; 44 | } 45 | else #even 46 | { 47 | return ($vals[int($len/2)-1] + $vals[int($len/2)])/2; 48 | } 49 | } 50 | 51 | sub merge_split_pairs 52 | { 53 | my $sp1 = $_[0]; 54 | my $sp2 = $_[1]; 55 | my $ms = $_[2]; 56 | my %ish1=(); # Key1:IS -> Element: number of fragment/read supporting it 57 | my %isr=(); # Key1:IS -> Key2:Read_ID -> Element: Read_occurrence 58 | my %isc=(); # Key IS -> Element: @ 0[Plm_chr] 1[Plm_pos] 2[Host_chr] 3[Host_pos] 59 | foreach my $n(keys %{$sp1}) { 60 | my @tmp=split(/--/, $n) ; 61 | my @tmp1=split(/:/, $tmp[0]) ; 62 | my @tmp2=split(/:/, $tmp[1]) ; 63 | $isc{$n}[0]=$tmp1[0] ; 64 | $isc{$n}[1]=$tmp1[1] ; 65 | $isc{$n}[2]=$tmp2[0] ; 66 | $isc{$n}[3]=$tmp2[1] ; 67 | foreach my $j( keys(%{$sp1->{$n}}) ) { 68 | $isr{$n}{$j}++ ; 69 | } 70 | } 71 | foreach my $n(keys %{$sp2}) { 72 | my @tmp=split(/--/, $n) ; 73 | my @tmp1=split(/:/, $tmp[0]) ; 74 | my @tmp2=split(/:/, $tmp[1]) ; 75 | $isc{$n}[0]=$tmp1[0] ; 76 | $isc{$n}[1]=$tmp1[1] ; 77 | $isc{$n}[2]=$tmp2[0] ; 78 | $isc{$n}[3]=$tmp2[1] ; 79 | foreach my $j( keys(%{$sp2->{$n}}) ) { 80 | $isr{$n}{$j}++; 81 | } 82 | } 83 | foreach my $n(keys %isr) { 84 | $ish1{$n}=scalar (keys %{$isr{$n}}) ; 85 | } 86 | foreach my $n(keys %ish1) { #Keeping IS with at least ms supporting splt reads 87 | if ($ish1{$n} < $ms) { 88 | delete $ish1{$n} ; 89 | delete $isr{$n} ; 90 | delete $isc{$n} ; 91 | } 92 | } 93 | return (\%ish1, \%isr, \%isc); #1K: Read name; El: @ 94 | } 95 | 96 | 1; 97 | -------------------------------------------------------------------------------- /utils/detectIS.rec: -------------------------------------------------------------------------------- 1 | BootStrap:docker 2 | From: ubuntu:18.04 3 | 4 | %help 5 | Singularity image for detectIS 6 | 7 | Create the img: sudo singularity build detectIS.simg detectIS.rec 8 | 9 | %runscript 10 | exec echo "The runscript is the containers default runtime command!" 
11 | 12 | %files 13 | 14 | %environment 15 | VARIABLE=MEATBALLVALUE 16 | export VARIABLE 17 | 18 | %labels 19 | AUTHOR luigi.grassi@astrazeneca.com 20 | 21 | %post 22 | apt-get update && apt-get install -y --no-install-recommends \ 23 | apt-utils \ 24 | sudo \ 25 | vim \ 26 | less \ 27 | build-essential \ 28 | curl \ 29 | git \ 30 | wget \ 31 | unzip \ 32 | locales \ 33 | default-jre \ 34 | g++ \ 35 | make \ 36 | libz-dev \ 37 | samtools \ 38 | bedtools \ 39 | python \ 40 | python-dev \ 41 | parallel \ 42 | python-pip \ 43 | cmake \ 44 | texlive-latex-extra \ 45 | texlive-fonts-recommended \ 46 | lmodern \ 47 | libx11-dev \ 48 | libbz2-dev 49 | apt-get clean 50 | 51 | locale-gen en_US.UTF-8 52 | 53 | export LC_ALL=C.UTF-8 54 | export LANG=C.UTF-8 55 | echo 'export LC_ALL=C.UTF-8' >> $SINGULARITY_ENVIRONMENT 56 | echo 'export LANG=C.UTF-8' >> $SINGULARITY_ENVIRONMENT 57 | echo "export PATH=/usr/local:/usr/local/bin:$PATH" >> $SINGULARITY_ENVIRONMENT 58 | 59 | 60 | #tex fonts 61 | wget --no-check-certificate http://mirrors.ctan.org/macros/generic/iftex/ifluatex.sty 62 | mv ifluatex.sty /usr/share/texmf/tex/generic 63 | wget --no-check-certificate http://mirrors.ctan.org/macros/latex/contrib/framed.zip 64 | unzip framed.zip 65 | mv framed /usr/share/texmf/tex/latex 66 | texhash 67 | 68 | #minimap2 69 | sudo wget --no-check-certificate https://github.com/lh3/minimap2/releases/download/v2.17/minimap2-2.17_x64-linux.tar.bz2 70 | tar -jxf minimap2-2.17_x64-linux.tar.bz2 71 | cd minimap2-2.17_x64-linux 72 | sudo mv k8 paftools.js minimap2 /usr/bin 73 | cd .. 74 | sudo rm -rf minimap2-2.17_x64-linux 75 | sudo chmod a+x /usr/bin/k8 76 | sudo chmod a+x /usr/bin/paftools.js 77 | sudo chmod a+x /usr/bin/minimap2 78 | 79 | #seqtk 80 | wget --no-check-certificate https://github.com/lh3/seqtk/archive/v1.3.tar.gz 81 | tar -xvf v1.3.tar.gz 82 | cd seqtk-1.3 83 | make install BINDIR=/usr/bin 84 | cd . 
85 | 86 | #Pandoc 87 | sudo wget --no-check-certificate https://github.com/jgm/pandoc/releases/download/2.9.1.1/pandoc-2.9.1.1-linux-amd64.tar.gz 88 | tar -xzf pandoc-2.9.1.1-linux-amd64.tar.gz 89 | cd pandoc-2.9.1.1/bin 90 | sudo chmod a+x pandoc 91 | sudo chmod a+x pandoc-citeproc 92 | sudo mv pandoc pandoc-citeproc /usr/bin 93 | cd ../.. 94 | sudo rm -rf pandoc* 95 | 96 | #to avoid warnings on hpc/pbs 97 | mkdir -p /extra 98 | mkdir -p /xdisk 99 | mkdir -p /rsgrps 100 | mkdir -p /cm/shared 101 | 102 | 103 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Maturity level-Prototype](https://img.shields.io/badge/Maturity%20Level-Prototype-red) 2 | 3 | # detectIS 4 | 5 | DetectIS is a pipeline specifically designed to detect exogenous DNA integration sites using DNA or RNA paired-end sequencing data. 6 | The workflow manager [nextflow](https://www.nextflow.io/) is used with a configuration file and a Singularity image 7 | 8 | 9 | ## Getting Started 10 | 11 | In order to run the workflow, the user has to create a configuration file, specifying: 12 | 13 | a)fasta file with the reference host genome; 14 | b)fasta file with the reference exogenous sequence; 15 | c)the directory containing the raw data, in FASTQ format 16 | d)the output directory. 17 | The analysis can be executed locally or in an HPC environment, in the latter scenario the user has also to specify the cluster executor. 18 | 19 | 20 | ### Prerequisites 21 | 22 | The detectIS software requirements are: 23 | - [Singularity](https://www.sylabs.io/docs/) V2.6 or higher. 24 | - [Nextflow](https://www.nextflow.io/), the workflow has been developed and tested with version 0.32.0.4897 25 | 26 | 27 | ### Creating a Singularity container 28 | 29 | A Singularity container with all the necessary software is required to run the pipeline. 
30 | The image can be created by using the recipe (file: "detectIS.rec" contained in the "utils" directory). Superuser privileges are necessary to generate a Singularity container with the command: 31 | 32 | ``` 33 | sudo singularity build detectIS.simg detectIS.rec 34 | ``` 35 | 36 | N.B. superuser privileges are necessary only to create the container but not to use it. This means you can create the container in your local pc/workstation and copy it to the system where you run analyses (e.g. your hpc or cluster). 37 | 38 | Alternatively, if you have problems in generating a Singularity container from the recipe you can download the image from [Singularity Hub](https://singularity-hub.org/) 39 | 40 | 41 | ### Running the workflow 42 | 43 | If you have installed Singularity, Nextflow, and [configured the Singularity](https://www.sylabs.io/guides/2.6/user-guide/faq.html?highlight=disk%20access#how-are-external-file-systems-and-paths-handled-in-a-singularity-container) granting the image access to the disk partitions to read and write you can run any workflow. 44 | 45 | ``` 46 | nextflow run Workflows/detectIS.nf -c detectIS_TestDataset.conf -with-report detectIS_TestDataset_nextflow_report.html 47 | ``` 48 | 49 | In the example Workflows/detectIS.nf is the workflow for the detectIS analysis and detectIS_TestDataset.conf is the configuration file with all the information needed for that given project. In the configuration file are specified input and output file directories, references (fasta) directories, and cluster specific parameters. 50 | 51 | 52 | ### Test data sets 53 | 54 | In the directory "TestDataset" are contained paired-end reads and reference files to run a detectIS analysis. 55 | The dataset simulates the integration of a plasmid in the genome of Chinese hamster ovary cell line (CHOK1) . 
56 | 57 | The analysis can be executed using the bash script "Run_detectIS.sh", also contained in the directory "TestDataset" or using nextflow: 58 | 59 | ``` 60 | nextflow run Workflows/detectIS.nf -c detectIS_TestDataset.conf -with-report detectIS_TestDataset_nextflow_report.html 61 | ``` 62 | 63 | The configuration file and the bash script can either be used as a template for other analyses. 64 | 65 | 66 | ## Deployment 67 | 68 | Please notice that Singularity containers can be [kernel-dependent](https://www.sylabs.io/guides/2.6/user-guide/faq.html?highlight=disk%20access#are-singularity-containers-kernel-dependent), this implies that the image recipes contained in this project will not necessarily produce an image able to run on your HPC system. If none of the available images is compatible with your system you might need to modify the recipe using an OS with a compatible kernel, please raise an issue if this is the case and you need support for it. 69 | 70 | ## Citation 71 | 72 | If you use detectIS in your research, please cite our latest [publication](https://doi.org/10.1093/bioinformatics/btab366). 73 | -------------------------------------------------------------------------------- /Workflows/detectIS.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | =============================================================== 5 | detectIS-nextflow 6 | =============================================================== 7 | Pipeline to identify Integration Sites (IS) in paired-end 8 | RNA-seq/DNA-seq experiments. 9 | 10 | NextFlow main file 11 | Jan 2020. 
12 | 13 | 14 | #### Homepage / Documentation 15 | https://github.com/AstraZeneca/detectIS 16 | #### Author 17 | Luigi Grassi 18 | --------------------------------------------------------------- 19 | */ 20 | 21 | 22 | def helpMessage() { 23 | log.info """ 24 | 25 | ================================================================================= 26 | detectIS pipeline 27 | https://github.com/AstraZeneca/detectIS 28 | ================================================================================= 29 | 30 | Usage: 31 | 32 | The typical command for running the pipeline is as follows: 33 | 34 | nextflow run detectIS-nextflow -c 20190515_downsampleRNAseq_detectISTEST.conf 35 | 36 | Mandatory arguments: -c Configuration file with all the parameters used in the analysis 37 | """.stripIndent() 38 | } 39 | 40 | 41 | // Show help message 42 | params.help = false 43 | if (params.help){ 44 | helpMessage() 45 | exit 0 46 | } 47 | 48 | if (params.project_name==null){ 49 | helpMessage() 50 | exit 0 51 | } 52 | 53 | Channel 54 | .fromFilePairs(params.reads) 55 | .ifEmpty {exit 1, error "Cannot find any reads matching: ${params.reads}" } 56 | .into { reads_minimap1 } 57 | 58 | 59 | // STEP 1 - MAPPING WITH MINIMAP2 READS AS SINGLE END ON PLSM/VIRUS 60 | 61 | vir_mini = Channel.fromPath(params.vir_seq) 62 | .ifEmpty { exit 1, "Viral/plasmid genome reference fasta file not found: please check ${params.vir_seq}" } 63 | 64 | process minimap_vir { 65 | errorStrategy 'retry' 66 | maxRetries 3 67 | scratch params.scratch 68 | 69 | cpus = params.cpu.minimap 70 | memory = 20.GB 71 | tag "$name" 72 | publishDir "${params.outdir}/MinimapPAF", mode: 'copy' , 73 | saveAs: {filename -> 74 | if (filename.indexOf("_vir.paf") > 0) "$filename" 75 | else if (filename.indexOf("_mapping.lst") > 0) "$filename" 76 | else null 77 | } 78 | 79 | input: 80 | set val(name), file(reads) from reads_minimap1 81 | file pref from vir_mini.collect() 82 | 83 | output: 84 | file "*_R1.fq.gz" into fastq_toremap1 85 | 
file "*_R2.fq.gz" into fastq_toremap2 86 | file "*_mate1_vir.paf" into viral1_paf 87 | file "*_mate2_vir.paf" into viral2_paf 88 | 89 | script: 90 | 91 | """ 92 | minimap2 -x sr -c ${pref} ${reads[0]} -t ${params.cpu.minimap} > ${name}_mate1_vir.paf 93 | minimap2 -x sr -c ${pref} ${reads[1]} -t ${params.cpu.minimap} > ${name}_mate2_vir.paf 94 | cut -f1 ${name}_mate1_vir.paf > ${name}_vir.lst 95 | cut -f1 ${name}_mate2_vir.paf >> ${name}_vir.lst 96 | less -S ${name}_vir.lst | sort | uniq > ${name}_vir_mapping.lst 97 | seqtk subseq ${reads[0]} ${name}_vir_mapping.lst | gzip -vc > ${name}_vir_R1.fq.gz 98 | seqtk subseq ${reads[1]} ${name}_vir_mapping.lst | gzip -vc > ${name}_vir_R2.fq.gz 99 | """ 100 | } 101 | 102 | // STEP 2 - MAPPING ALL READS WITH ANY VIRAL OVERLAP ONTO THE HOST GENOME 103 | 104 | host_mini = Channel.fromPath(params.host_seq) 105 | .ifEmpty { exit 1, "Host genome reference fasta file not found: please check ${params.host_seq}" } 106 | 107 | 108 | 109 | process minimap_genome_and_detectIS { 110 | errorStrategy 'retry' 111 | maxRetries 3 112 | scratch params.scratch 113 | 114 | cpus = params.cpu.minimap 115 | memory = 20.GB 116 | tag "$sample" 117 | publishDir "${params.outdir}", mode: 'copy' , 118 | saveAs: {filename -> 119 | if (filename.indexOf("_gnm.paf") > 0) "MinimapPAF/$filename" 120 | else "detectIS/$filename" 121 | } 122 | 123 | input: 124 | file read1 from fastq_toremap1 125 | file read2 from fastq_toremap2 126 | file gref from host_mini.collect() 127 | file vpaf1 from viral1_paf 128 | file vpaf2 from viral2_paf 129 | 130 | output: 131 | file "*_mate1_gnm.paf" into genom1_paf 132 | file "*_mate2_gnm.paf" into genom2_paf 133 | file("*.md") 134 | file("*.txt") 135 | 136 | script: 137 | sample = read1.toString() - ~/(_vir)?(_R1)?(\.fq)?(\.gz)?$/ 138 | 139 | """ 140 | minimap2 -x sr -c ${gref} ${read1} -t ${params.cpu.minimap} > ${sample}_mate1_gnm.paf 141 | minimap2 -x sr -c ${gref} ${read2} -t ${params.cpu.minimap} > ${sample}_mate2_gnm.paf 
142 | perl $baseDir/bin/detectIS.pl -h1 ${sample}_mate1_gnm.paf -h2 ${sample}_mate2_gnm.paf -v1 ${vpaf1} -v2 ${vpaf2} -o ${sample} 143 | """ 144 | } 145 | 146 | -------------------------------------------------------------------------------- /TestDataset/.nextflow.log: -------------------------------------------------------------------------------- 1 | Sep-08 10:32:48.133 [main] DEBUG nextflow.cli.Launcher - $> /home/grassil/bin/nextflow run Workflows/detectIS.nf -c TestDataset/detectIS.conf 2 | Sep-08 10:32:48.212 [main] INFO nextflow.cli.CmdRun - N E X T F L O W ~ version 0.32.0 3 | Sep-08 10:32:48.443 [main] DEBUG nextflow.scm.AssetManager - Listing projects in folder: /home/grassil/.nextflow/assets 4 | Sep-08 10:32:48.491 [main] INFO nextflow.cli.CmdRun - Pulling nextflow-io/Workflows ... 5 | Sep-08 10:32:48.492 [main] DEBUG nextflow.scm.RepositoryProvider - Request [credentials -:-] -> https://api.github.com/repos/nextflow-io/Workflows/contents/detectIS.nf 6 | Sep-08 10:32:49.156 [main] DEBUG nextflow.scm.RepositoryProvider - Request [credentials -:-] -> https://api.github.com/repos/nextflow-io/Workflows 7 | Sep-08 10:32:49.448 [main] DEBUG nextflow.cli.Launcher - Operation aborted 8 | java.io.FileNotFoundException: https://api.github.com/repos/nextflow-io/Workflows 9 | at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) 10 | at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) 11 | at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) 12 | at java.lang.reflect.Constructor.newInstance(Constructor.java:423) 13 | at sun.net.www.protocol.http.HttpURLConnection$10.run(HttpURLConnection.java:1950) 14 | at sun.net.www.protocol.http.HttpURLConnection$10.run(HttpURLConnection.java:1945) 15 | at java.security.AccessController.doPrivileged(Native Method) 16 | at sun.net.www.protocol.http.HttpURLConnection.getChainedException(HttpURLConnection.java:1944) 17 | 
at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1514) 18 | at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1498) 19 | at sun.net.www.protocol.https.HttpsURLConnectionImpl.getInputStream(HttpsURLConnectionImpl.java:268) 20 | at nextflow.scm.RepositoryProvider.invoke(RepositoryProvider.groovy:114) 21 | at nextflow.scm.RepositoryProvider.memoizedMethodPriv$invokeAndParseResponseString(RepositoryProvider.groovy:175) 22 | at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) 23 | at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 24 | at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) 25 | at java.lang.reflect.Method.invoke(Method.java:498) 26 | at org.codehaus.groovy.reflection.CachedMethod.invoke(CachedMethod.java:104) 27 | at groovy.lang.MetaMethod.doMethodInvoke(MetaMethod.java:326) 28 | at groovy.lang.MetaClassImpl.invokeMethod(MetaClassImpl.java:1235) 29 | at groovy.lang.MetaClassImpl.invokeMethod(MetaClassImpl.java:1041) 30 | at org.codehaus.groovy.runtime.InvokerHelper.invokePogoMethod(InvokerHelper.java:1018) 31 | at org.codehaus.groovy.runtime.InvokerHelper.invokeMethod(InvokerHelper.java:1001) 32 | at org.codehaus.groovy.runtime.InvokerHelper.invokeMethodSafe(InvokerHelper.java:97) 33 | at nextflow.scm.RepositoryProvider$_closure1.doCall(RepositoryProvider.groovy) 34 | at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) 35 | at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 36 | at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) 37 | at java.lang.reflect.Method.invoke(Method.java:498) 38 | at org.codehaus.groovy.reflection.CachedMethod.invoke(CachedMethod.java:104) 39 | at groovy.lang.MetaMethod.doMethodInvoke(MetaMethod.java:326) 40 | at org.codehaus.groovy.runtime.metaclass.ClosureMetaClass.invokeMethod(ClosureMetaClass.java:264) 
41 | at groovy.lang.MetaClassImpl.invokeMethod(MetaClassImpl.java:1041) 42 | at groovy.lang.Closure.call(Closure.java:421) 43 | at org.codehaus.groovy.runtime.memoize.Memoize$MemoizeFunction$1.provide(Memoize.java:139) 44 | at org.codehaus.groovy.runtime.memoize.ConcurrentCommonCache.getAndPut(ConcurrentCommonCache.java:147) 45 | at org.codehaus.groovy.runtime.memoize.ConcurrentCommonCache.getAndPut(ConcurrentCommonCache.java:123) 46 | at org.codehaus.groovy.runtime.memoize.Memoize$MemoizeFunction.call(Memoize.java:136) 47 | at groovy.lang.Closure.call(Closure.java:437) 48 | at nextflow.scm.RepositoryProvider.invokeAndParseResponse(RepositoryProvider.groovy) 49 | at nextflow.scm.RepositoryProvider.validateRepo(RepositoryProvider.groovy:213) 50 | at nextflow.scm.RepositoryProvider.validateFor(RepositoryProvider.groovy:206) 51 | at nextflow.scm.AssetManager.checkValidRemoteRepo(AssetManager.groovy:344) 52 | at nextflow.scm.AssetManager.download(AssetManager.groovy:551) 53 | at nextflow.scm.AssetManager.download(AssetManager.groovy) 54 | at nextflow.cli.CmdRun.getScriptFile(CmdRun.groovy:295) 55 | at nextflow.cli.CmdRun.run(CmdRun.groovy:210) 56 | at nextflow.cli.Launcher.run(Launcher.groovy:432) 57 | at nextflow.cli.Launcher.main(Launcher.groovy:590) 58 | Caused by: java.io.FileNotFoundException: https://api.github.com/repos/nextflow-io/Workflows 59 | at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1896) 60 | at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1498) 61 | at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:480) 62 | at sun.net.www.protocol.https.HttpsURLConnectionImpl.getResponseCode(HttpsURLConnectionImpl.java:352) 63 | at nextflow.scm.RepositoryProvider.checkResponse(RepositoryProvider.groovy:143) 64 | at nextflow.scm.RepositoryProvider.invoke(RepositoryProvider.groovy:111) 65 | ... 
37 common frames omitted 66 | -------------------------------------------------------------------------------- /TestDataset/CelTag.fa: -------------------------------------------------------------------------------- 1 | >CelTag Plasmid sequence 5574 bps #https://www.addgene.org/66562/sequences/#depositor-full 2 | GGGCGAATTGGGCCCGACGTCGCATGCTCCCGGCCGCCATGGCGGCCGCGGGAATTCGATTAATCGATAC 3 | ATATGCCCGGGTTAATTAACGGTGAACAAAAGCTAATCTCCGAGGAAGACTTGAACGGTGAACAAAAATT 4 | AATCTCAGAAGAAGACTTGAACGGACTCGACGGTGAACAAAAGTTGATTTCTGAAGAAGATTTGAACGGT 5 | GAACAAAAGCTAATCTCCGAGGAAGACTTGAACGGTGAACAAAAATTAATCTCAGAAGAAGACTTGAACG 6 | GACTCGACGGTGAACAAAAGTTGATTTCTGAAGAAGATTTGAACGGTGAACAAAAGCTAATCTCCGAGGA 7 | AGACTTGAACGGTGAACAAAAATTAATCTCAGAAGAAGACTTGAACGGACTCGACGGTGAACAAAAGTTG 8 | ATTTCTGAAGAAGATTTGAACGGTGAACAAAAGCTAATCTCCGAGGAAGACTTGAACGGTGAACAAAAAT 9 | TAATCTCAGAAGAAGACTTGAACGGACTCGACGGTGAACAAAAGTTGATTTCTGAAGAAGATTTGAACGG 10 | TGAACAAAAGCTAATCTCCGAGGAAGACTTGAACGGTGAACAAAAATTAATCAATCACGAGAACCTCTAC 11 | TTCCAAAGCGTATCAGGCAATTTGAAGGTTGAATTCTACAACAGCAATCCTTCAGATACTACTAACTCAA 12 | TCAATCCTCAGTTCAAGGTTACTAATACCGGAAGCAGTGCAATTGATTTGTCCAAACTCACATTGAGATA 13 | TTATTATACAGTAGACGGACAGAAAGATCAGACCTTCTGGTGTGACCATGCTGCAATAATCGGCAGTAAC 14 | GGCAGCTACAACGGAATTACTTCAAATGTAAAAGGAACATTTGTAAAAATGAGTTCCTCAACAAATAACG 15 | CAGACACCTACCTTGAAATAAGCTTTACAGGCGGAACTCTTGAACCGGGTGCACATGTTCAGATACAAGG 16 | TAGATTTGCAAAGAATGACTGGAGTAACTATACACAGTCAAATGACTACTCATTCAAGTCTGCTTCACAG 17 | TTTGTTGAATGGGATCAGGTAACAGCATACTTGAACGGTGTTCTTGTATGGGGTAAAGAATAGACTTCTA 18 | AATAAGCGAATTTCTTATGATTTATGATTTTTATTATTAAATAAGTTATAAAAAAAATAAGTGTATACAA 19 | ATTTTAAAGTGACTCTTAGGTTTTAAAACGAAAATTCTTATTCTTGAGTAACTCTTTCCTGTAGGTCAGG 20 | TTGCTTTCTCAGGTATAGTATGAGGTCGCTCTTATTGACCACACCTCTACCGGCAGATCCGCTAGGGATA 21 | ACAGGGTAATATAGATCTGTTTAGCTTGCCTCGTCCCCGCCGGGTCACCCGGCCAGCGACATGGAGGCCC 22 | AGAATACCCTCCTTGACAGTCTTGACGTGCGCAGCTCAGGGGCATGATGTGACTGTCGCCCGTACATTTA 23 | GCCCATACATCCCCATGTATAATCATTTGCATCCATACATTTTGATGGCCGCACGGCGCGAAGCAAAAAT 24 | 
TACGGCTCCTCGCTGCAGACCTGCGAGCAGGGAAACGCTCCCCTCACAGACGCGTTGAATTGTCCCCACG 25 | CCGCGCCCCTGTAGAGAAATATAAAAGGTTAGGATTTGCCACTGAGGTTCTTCTTTCATATACTTCCTTT 26 | TTAAATCTTGCTAGGATACAGTTCTCACATCACATCCGAACATAAACAACCATGGGTACCACTCTTGACG 27 | ACACGGCTTACCGGTACCGCACCAGTGTCCCGGGGGACGCCGAGGCCATCGAGGCACTGGATGGGTCCTT 28 | CACCACCGACACCGTCTTACTGGATGGGTCCTTCACCACCGACACCGTCTTCCGCGTCACCGCCACCGGG 29 | GACGGCTTCACCCTGCGGGAGGTGCCGGTGGACCCGCCCCTGACCAAGGTGTTCCCCGACGACGAATCGG 30 | ACGACGAATCGGACGACGGGGAGGACGGCGACCCGGACTCTCGGACGTTCGTCGCGTACGGGGACGACGG 31 | CGACCTGGCGGGCTTCGTGGTCGTCTCGTACTCCGGCTGGAACCGCCGGCTGACCGTCGAGGACATCGAG 32 | GTCGCCCCGGAGCACCGGGGGCACGGGGTCGGGCGCGCGTTGATGGGGCTCGCGACGGAGTTCGCCCGCG 33 | AGCGGGGTGCCGGGCACCTCTGGCTGGAGGTCACCAACGTCAACGCACCGGCGATCCACGCGTACCGGCG 34 | GATGGGGTTCACCCTCTGCGGCCTGGACACCGCCCTGTACGACGGCACCGCCTCGGACGGCGAGCAGGCG 35 | CTCTACATGAGCATGCCCTGCCCCTAATCAGTACTGACAATAAAAAGATTCTTGTTTTCAAGAACTTGTC 36 | ATTTGTATAGTTTTTTTATATTGTAGTTGTTCTATTTTAATCAAATGTTAGCGTGATTTATATTTTTTTT 37 | CGCCTCGACATCATCTGCCCAGATGCGAAGTTAAGTGCGCAGAAAGTAATATCATGCGTCAATCGTATGT 38 | GAATGCTGGTCGCTATACTGCTGTCGATTCGATACTAACGCCGCCATCCAGTTTAAACGAGCTCGAATTC 39 | ATCGATACCGTCGACCTCGAGCGTACGTAATCACTAGTGAATTCGCGGCCGCCTGCAGGTCGACCATATG 40 | GGAGAGCTCCCAACGCGTTGGATGCATAGCTTGAGTATTCTATAGTGTCACCTAAATAGCTTGGCGTAAT 41 | CATGGTCATAGCTGTTTCCTGTGTGAAATTGTTATCCGCTCACAATTCCACACAACATACGAGCCGGAAG 42 | CATAAAGTGTAAAGCCTGGGGTGCCTAATGAGTGAGCTAACTCACATTAATTGCGTTGCGCTCACTGCCC 43 | GCTTTCCAGTCGGGAAACCTGTCGTGCCAGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTT 44 | TGCGTATTGGGCGCTCTTCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGC 45 | GGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATG 46 | TGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCC 47 | GCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAG 48 | ATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATAC 49 | CTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGG 50 | 
TGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATC 51 | CGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAAC 52 | AGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACA 53 | CTAGAAGAACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTC 54 | TTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGA 55 | AAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCAC 56 | GTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAG 57 | TTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCA 58 | CCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGA 59 | TACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGA 60 | TTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCC 61 | ATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTG 62 | TTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCA 63 | ACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATC 64 | GTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTG 65 | TCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTAT 66 | GCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAA 67 | GTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTT 68 | CGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGC 69 | AAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTC 70 | TTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTA 71 | TTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGATGCGGTGTGAAA 72 | TACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGAAATTGTAAGCGTTAATATTTTGTTAAAATTC 73 | GCGTTAAATTTTTGTTAAATCAGCTCATTTTTTAACCAATAGGCCGAAATCGGCAAAATCCCTTATAAAT 74 | CAAAAGAATAGACCGAGATAGGGTTGAGTGTTGTTCCAGTTTGGAACAAGAGTCCACTATTAAAGAACGT 75 | GGACTCCAACGTCAAAGGGCGAAAAACCGTCTATCAGGGCGATGGCCCACTACGTGAACCATCACCCTAA 76 | 
TCAAGTTTTTTGGGGTCGAGGTGCCGTAAAGCACTAAATCGGAACCCTAAAGGGAGCCCCCGATTTAGAG 77 | CTTGACGGGGAAAGCCGGCGAACGTGGCGAGAAAGGAAGGGAAGAAAGCGAAAGGAGCGGGCGCTAGGGC 78 | GCTGGCAAGTGTAGCGGTCACGCTGCGCGTAACCACCACACCCGCCGCGCTTAATGCGCCGCTACAGGGC 79 | GCGTCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACG 80 | CCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTAACGCCAGGGTTTTCCCAGTCACGA 81 | CGTTGTAAAACGACGGCCAGTGAATTGTAATACGACTCACTATA 82 | -------------------------------------------------------------------------------- /Workflows/bin/detectISlisting.css: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * From https://gist.github.com/killercup/5917178 4 | * I add this to html files generated with pandoc. 5 | */ 6 | 7 | 8 | html { 9 | font-size: 100%; 10 | overflow-y: scroll; 11 | -webkit-text-size-adjust: 100%; 12 | -ms-text-size-adjust: 100%; 13 | } 14 | 15 | body { 16 | color: #444; 17 | font-family: Georgia, Palatino, 'Palatino Linotype', Times, 'Times New Roman', serif; 18 | font-size: 12px; 19 | line-height: 1.7; 20 | padding: 1em; 21 | margin: auto; 22 | max-width: 42em; 23 | background: #fefefe; 24 | } 25 | 26 | a { 27 | color: #0645ad; 28 | text-decoration: none; 29 | } 30 | 31 | a:visited { 32 | color: #0b0080; 33 | } 34 | 35 | a:hover { 36 | color: #06e; 37 | } 38 | 39 | a:active { 40 | color: #faa700; 41 | } 42 | 43 | a:focus { 44 | outline: thin dotted; 45 | } 46 | 47 | *::-moz-selection { 48 | background: rgba(255, 255, 0, 0.3); 49 | color: #000; 50 | } 51 | 52 | *::selection { 53 | background: rgba(255, 255, 0, 0.3); 54 | color: #000; 55 | } 56 | 57 | a::-moz-selection { 58 | background: rgba(255, 255, 0, 0.3); 59 | color: #0645ad; 60 | } 61 | 62 | a::selection { 63 | background: rgba(255, 255, 0, 0.3); 64 | color: #0645ad; 65 | } 66 | 67 | p { 68 | margin: 1em 0; 69 | } 70 | 71 | img { 72 | max-width: 100%; 73 | } 74 | 75 | h1, h2, h3, h4, h5, h6 { 76 | color: #111; 77 | line-height: 125%; 78 | margin-top: 2em; 79 | 
font-weight: normal; 80 | } 81 | 82 | h4, h5, h6 { 83 | font-weight: bold; 84 | } 85 | 86 | h1 { 87 | font-size: 2.5em; 88 | } 89 | 90 | h2 { 91 | font-size: 2em; 92 | } 93 | 94 | h3 { 95 | font-size: 1.5em; 96 | } 97 | 98 | h4 { 99 | font-size: 1.2em; 100 | } 101 | 102 | h5 { 103 | font-size: 1em; 104 | } 105 | 106 | h6 { 107 | font-size: 0.9em; 108 | } 109 | 110 | blockquote { 111 | color: #666666; 112 | margin: 0; 113 | padding-left: 3em; 114 | border-left: 0.5em #EEE solid; 115 | } 116 | 117 | hr { 118 | display: block; 119 | height: 2px; 120 | border: 0; 121 | border-top: 1px solid #aaa; 122 | border-bottom: 1px solid #eee; 123 | margin: 1em 0; 124 | padding: 0; 125 | } 126 | 127 | pre, code, kbd, samp { 128 | color: #000; 129 | font-family: monospace, monospace; 130 | _font-family: 'courier new', monospace; 131 | font-size: 0.98em; 132 | } 133 | 134 | pre { 135 | white-space: pre; 136 | white-space: pre-wrap; 137 | word-wrap: break-word; 138 | } 139 | 140 | b, strong { 141 | font-weight: bold; 142 | } 143 | 144 | dfn { 145 | font-style: italic; 146 | } 147 | 148 | ins { 149 | background: #ff9; 150 | color: #000; 151 | text-decoration: none; 152 | } 153 | 154 | mark { 155 | background: #ff0; 156 | color: #000; 157 | font-style: italic; 158 | font-weight: bold; 159 | } 160 | 161 | sub, sup { 162 | font-size: 75%; 163 | line-height: 0; 164 | position: relative; 165 | vertical-align: baseline; 166 | } 167 | 168 | sup { 169 | top: -0.5em; 170 | } 171 | 172 | sub { 173 | bottom: -0.25em; 174 | } 175 | 176 | ul, ol { 177 | margin: 1em 0; 178 | padding: 0 0 0 2em; 179 | } 180 | 181 | li p:last-child { 182 | margin-bottom: 0; 183 | } 184 | 185 | ul ul, ol ol { 186 | margin: .3em 0; 187 | } 188 | 189 | dl { 190 | margin-bottom: 1em; 191 | } 192 | 193 | dt { 194 | font-weight: bold; 195 | margin-bottom: .8em; 196 | } 197 | 198 | dd { 199 | margin: 0 0 .8em 2em; 200 | } 201 | 202 | dd:last-child { 203 | margin-bottom: 0; 204 | } 205 | 206 | img { 207 | border: 0; 208 | 
-ms-interpolation-mode: bicubic; 209 | vertical-align: middle; 210 | } 211 | 212 | figure { 213 | display: block; 214 | text-align: center; 215 | margin: 1em 0; 216 | } 217 | 218 | figure img { 219 | border: none; 220 | margin: 0 auto; 221 | } 222 | 223 | figcaption { 224 | font-size: 0.8em; 225 | font-style: italic; 226 | margin: 0 0 .8em; 227 | } 228 | 229 | table { 230 | margin-bottom: 2em; 231 | border-bottom: 1px solid #ddd; 232 | border-right: 1px solid #ddd; 233 | border-spacing: 0; 234 | border-collapse: collapse; 235 | } 236 | 237 | table th { 238 | padding: .2em 1em; 239 | background-color: #eee; 240 | border-top: 1px solid #ddd; 241 | border-left: 1px solid #ddd; 242 | } 243 | 244 | table td { 245 | padding: .2em 1em; 246 | border-top: 1px solid #ddd; 247 | border-left: 1px solid #ddd; 248 | vertical-align: top; 249 | } 250 | 251 | .author { 252 | font-size: 1.2em; 253 | text-align: center; 254 | } 255 | 256 | @media only screen and (min-width: 480px) { 257 | body { 258 | font-size: 14px; 259 | } 260 | } 261 | @media only screen and (min-width: 768px) { 262 | body { 263 | font-size: 16px; 264 | } 265 | } 266 | @media print { 267 | * { 268 | background: transparent !important; 269 | color: black !important; 270 | filter: none !important; 271 | -ms-filter: none !important; 272 | } 273 | 274 | body { 275 | font-size: 12pt; 276 | max-width: 100%; 277 | } 278 | 279 | a, a:visited { 280 | text-decoration: underline; 281 | } 282 | 283 | hr { 284 | height: 1px; 285 | border: 0; 286 | border-bottom: 1px solid black; 287 | } 288 | 289 | a[href]:after { 290 | content: " (" attr(href) ")"; 291 | } 292 | 293 | abbr[title]:after { 294 | content: " (" attr(title) ")"; 295 | } 296 | 297 | .ir a:after, a[href^="javascript:"]:after, a[href^="#"]:after { 298 | content: ""; 299 | } 300 | 301 | pre, blockquote { 302 | border: 1px solid #999; 303 | padding-right: 1em; 304 | page-break-inside: avoid; 305 | } 306 | 307 | tr, img { 308 | page-break-inside: avoid; 309 | } 310 | 
#!/usr/bin/perl

###### detectIS.pl
##
## Detects viral/plasmid integration sites (IS) in a host genome from PAF
## alignments of a read pair against both references (host genome and
## virus/plasmid).  Split reads are searched first; if none survive
## filtering, the script falls back to chimeric read pairs only.
##
## luigi.grassi@astrazeneca.com
##
## USAGE: perl detectIS.pl -h1 R1_gnm.paf -h2 R2_gnm.paf \
##                         -v1 R1_vir.paf -v2 R2_vir.paf -o prefix
################################################################################

use warnings ;
use strict ;
use diagnostics ;
use FindBin;                  # locate this script to locate pm files
use lib "$FindBin::Bin/" ;
use GeneralTools ;
use DetectSpltR ;
use DetectChmR ;
use Data::Dumper;
use List::Util qw( min max );
use Getopt::Long;

my $message_text = "detectIS usage:\nperl detectIS.pl -h1 name_mate1_gnm.paf -h2 name_mate2_gnm.paf -v1 name_mate1_vir.paf -v2 name_mate2_vir.paf -o name\n-h1: Aln results of R1 on host genome\n-h2: Aln results of R2 on host genome\n-v1: Aln results of R1 on virus/plasmid\n-v2: Aln results of R2 on virus/plasmid\n-o: Output prefix\nExtra options -mqual [1] -ovlwind [0.05]\n-mqual: minimum mapping quality [default 1, min:0 max:60]\n-ovlwind: Ovl tolerability window (fraction of the read length) [default 0.05, min:0 max:1]\n-mspr [default 2]";

# Defaults for the optional parameters.
my $mqual   = 1 ;     # minimum mapping quality (0-60)
my $mspr    = 2 ;     # minimum number of split reads supporting an IS
my $ovlwind = 0.05 ;  # overlap tolerability window, fraction of the read length
my $GNM1    = '' ;    # R1 vs host genome (PAF)
my $GNM2    = '' ;    # R2 vs host genome (PAF)
my $PLSM1   = '' ;    # R1 vs virus/plasmid (PAF)
my $PLSM2   = '' ;    # R2 vs virus/plasmid (PAF)
my $OUTPREF = '' ;    # output prefix

# BUG FIX: 'ovlwind=o' declared an extended-INTEGER option, which makes
# Getopt::Long reject fractional values such as the default 0.05;
# '=f' (real number) is the correct specifier.
GetOptions ('mspr=i'    => \$mspr,
            'mqual=i'   => \$mqual,
            'ovlwind=f' => \$ovlwind,
            'h1=s'      => \$GNM1,
            'h2=s'      => \$GNM2,
            'v1=s'      => \$PLSM1,
            'v2=s'      => \$PLSM2,
            'o=s'       => \$OUTPREF) ;

# All four alignment files and the output prefix are mandatory.
( $GNM1 ne '' && $GNM2 ne '' && $PLSM1 ne '' && $PLSM2 ne '' && $OUTPREF ne '' )
    || die $message_text ;

chomp $OUTPREF;
chomp $GNM1;
chomp $GNM2;
chomp $PLSM1;
chomp $PLSM2;

my $OUTPREFMD        = $OUTPREF.'.md' ;          # markdown report
my $OUTPREFTXT       = $OUTPREF.'.txt' ;         # tab-separated summary
my $OUTPREFSPREADTXT = $OUTPREF.'_SRlist.txt' ;  # per-read split-read list

# Three-argument open with lexical handles (was bareword two-argument open).
open(my $out_md,  '>', $OUTPREFMD)        || die "ERROR: not possible to open output file $OUTPREFMD\n" ;
open(my $out_txt, '>', $OUTPREFTXT)       || die "ERROR: not possible to open output file $OUTPREFTXT\n" ;
open(my $out_sr,  '>', $OUTPREFSPREADTXT) || die "ERROR: not possible to open output file $OUTPREFSPREADTXT\n" ;

# Report header: title macro consumed by the pandoc/LaTeX template, then the
# exact command line for reproducibility.
print $out_md '\\titleAT'."\n\n" ;
print $out_md '\\newpage'."\n\n" ;
print $out_md "# detectIS Results\n" ;
print $out_md "perl detectIS.pl -h1 $GNM1 -h2 $GNM2 -v1 $PLSM1 -v2 $PLSM2 -o $OUTPREF -mqual $mqual -ovlwind $ovlwind -mspr $mspr" ;
print $out_md "\n\n----\n\n\n" ;

print $out_txt "IS\tTotSpReads\tR1R2SpReads\tR1SpReads\tR2SpReads\tChimReads\tSingleSplitRead\tInterval\n" ;

print $out_sr "IS\tReadID\tSRType\n" ;

### Detect potential split reads in the genomic hits.
my $splt1 = DetectSpltR::detect_split_reads($GNM1, $mqual) ;
my $splt2 = DetectSpltR::detect_split_reads($GNM2, $mqual) ;

### Remove false split reads using the plasmidic alignments.
my $ishash1 = DetectSpltR::verify_split_reads($splt1, $PLSM1, $ovlwind) ;
my $ishash2 = DetectSpltR::verify_split_reads($splt2, $PLSM2, $ovlwind) ;

### Merge the results from both mates and filter by frequency.
my ($ca1, $ca2, $ca3) = GeneralTools::merge_split_pairs($ishash1, $ishash2, $mspr) ;
my %ishash  = %$ca1;  # Key1:IS -> number of fragments/reads supporting it
my %isres   = %$ca2;  # Key1:IS -> Key2:Read_ID -> read occurrence
my %iscoord = %$ca3;  # Key:IS  -> [Plm_chr, Plm_pos, Host_chr, Host_pos]

if ( scalar (keys %ishash) > 0 ) {
    my ($isfuss, $spreads) = DetectSpltR::verify_spreads_is($GNM1, $GNM2, $PLSM1, $PLSM2, \%isres, \%iscoord) ;
    my %hashres  = %$isfuss;
    my %hspreads = %$spreads;
    # Report integration sites from the most to the least supported.
    foreach my $n (sort { $ishash{$b} <=> $ishash{$a} } keys %ishash) {
        if (exists $hashres{$n}) {
            print $out_md "## $n\n" ;
            my $totsplit = $hashres{$n}{'R1'} + $hashres{$n}{'R2'} + $hashres{$n}{'R1R2'} ;
            print $out_md "SPLIT READS: $totsplit (R1R2:$hashres{$n}{'R1R2'}; R1:$hashres{$n}{'R1'}; R2:$hashres{$n}{'R2'})\n";
            print $out_md "CHIMERIC READS: $hashres{$n}{'CHIM'}\n";
            print $out_md "SINGLE SPLIT READ: $hashres{$n}{'RU'}\n";
            print $out_md "INTERVAL: $hashres{$n}{'INT'}\n";
            print $out_md "\n\n----\n\n\n" ;
            print $out_txt "$n\t$totsplit\t$hashres{$n}{'R1R2'}\t$hashres{$n}{'R1'}\t$hashres{$n}{'R2'}\t$hashres{$n}{'CHIM'}\t$hashres{$n}{'RU'}\t$hashres{$n}{'INT'}\n" ;
            # BUG FIX: the header of the _SRlist file declares the columns
            # "IS\tReadID\tSRType", but the read ID was printed first —
            # the IS now comes first, matching the header.
            foreach my $spr (keys %{$hspreads{$n}}) {
                print $out_sr "$n\t$spr\t$hspreads{$n}{$spr}\n";
            }
        }
    }
}
else {
    print $out_md "No split read identified! Looking only for chimeric reads\n" ;
    print $out_txt "No split read identified! Looking only for chimeric reads\n" ;

    # Detect genomic hits potentially belonging to a chimeric pair, then
    # confirm them against the plasmidic hits of the opposite mate.
    my ($chm1, $chm2) = DetectChmR::detect_chimeric_reads($GNM1, $GNM2);
    DetectChmR::filter_chimeric_reads($PLSM1, $PLSM2, $chm1, $chm2);

    # Count the confirmed pairs, collapsing coordinates into fixed-size bins.
    my ($ChimReads2, $len2) = DetectChmR::count_and_collapse_chimeric_reads($PLSM1, $GNM2, $chm2) ;
    my ($ChimReads1, $len1) = DetectChmR::count_and_collapse_chimeric_reads($PLSM2, $GNM1, $chm1) ;

    my %chmhash = ();
    my %ch1 = %$ChimReads1;
    my %ch2 = %$ChimReads2;

    foreach my $k (keys %ch1) {
        $chmhash{$k} += $ch1{$k} ;
    }
    foreach my $k (keys %ch2) {
        $chmhash{$k} += $ch2{$k} ;
    }

    # Keep only intervals supported by at least two chimeric pairs.
    foreach my $k (keys %chmhash) {
        if ($chmhash{$k} < 2) {
            delete $chmhash{$k} ;
        }
    }

    if ( scalar (keys %chmhash) > 0 ) {
        # BUG FIX: the bin width was always rebuilt from $len1, which is 0
        # when mate 1 contributed no chimeric reads; fall back to $len2 so
        # the reconstructed coordinates are not collapsed to zero.
        # NOTE(review): bins coming from the R2-side counts were created with
        # $len2*5; reconstructing every key with a single width is an
        # approximation inherited from the original design.
        my $binlen = ($len1 > 0) ? $len1 : $len2 ;
        foreach my $n (sort { $chmhash{$b} <=> $chmhash{$a} } keys %chmhash) {
            my @tmp  = split(/--/, $n) ;
            my @tmp1 = split(/:/, $tmp[0]) ;
            my @tmp2 = split(/:/, $tmp[1]) ;
            my $is1 = $tmp1[1] * $binlen * 5 ;
            my $is2 = $is1 + ($binlen * 5);
            my $is3 = $tmp2[1] * $binlen * 5 ;
            my $is4 = $is3 + ($binlen * 5);
            print $out_md  "## $tmp1[0]:$is1-$is2--$tmp2[0]:$is3-$is4\nCHIM:$chmhash{$n}\n" ;
            print $out_txt "## $tmp1[0]:$is1-$is2--$tmp2[0]:$is3-$is4\t0\t0\t0\t0\t$chmhash{$n}\t0\t0\n" ;
            print $out_md "\n\n----\n\n\n" ;
        }
    }
    else {
        print $out_md  "No chimeric hit identified!\n ## No IS found!\n" ;
        print $out_txt "No chimeric hit identified!\n ## No IS found!\n" ;
    }
}

close $out_md ;
close $out_txt ;
close $out_sr ;
package DetectChmR;

# Helper routines for detecting integration sites from chimeric read pairs
# (one mate mapping to the host genome, the other to the virus/plasmid).
#
# Input files are PAF alignments. Fields used here:
#   col 0 query name, col 1 query length, cols 5-8 target name/len/start/end,
#   col 9 residue matches, col 10 alignment block length, col 11 MAPQ.

use strict;
use warnings;
use List::Util qw( min max );
use POSIX;
use FindBin;                  # locate this script
use lib "$FindBin::Bin/";

################################################################################
######### SUB
################################################################################

################################################################################
##### Given two genomic PAF files (mate 1 and mate 2), identifies reads that
##### are potentially half of a chimeric pair: the read maps confidently with
##### ONE mate only, and the other mate is completely absent from the genomic
##### alignments. Returns two hash refs (candidates from PAF1, from PAF2),
##### mapping read name -> number of qualifying hits.
sub detect_chimeric_reads
{
    my ($paf1, $paf2) = @_;
    my %chimreads1 = ();   # confident genomic hits in PAF1
    my %chimf1     = ();   # non-confident hits in PAF1 (disqualify the read)
    my %chimreads2 = ();
    my %chimf2     = ();

    open(my $fh1, '<', $paf1) || die "ERROR in opening $paf1: please check the file name\n";
    while (my $line = <$fh1>) {
        chomp $line;
        if (length($line) > 0) {
            my @tmp = split("\t", $line);
            # Confident hit: >=90% of the read aligned, unique mapping
            # (MAPQ 60) and >99% matches over the aligned block.
            if ( ($tmp[10] >= ($tmp[1] * 0.90)) && ($tmp[11] == 60) && ($tmp[9] > ($tmp[10] * 0.99)) ) {
                $chimreads1{$tmp[0]}++;
            }
            else {
                $chimf1{$tmp[0]}++;
            }
        }
    }
    close $fh1;

    open(my $fh2, '<', $paf2) || die "ERROR in opening $paf2: please check the file name\n";
    while (my $line = <$fh2>) {
        chomp $line;
        if (length($line) > 0) {
            my @tmp = split("\t", $line);
            if ( ($tmp[10] >= ($tmp[1] * 0.90)) && ($tmp[11] == 60) && ($tmp[9] > ($tmp[10] * 0.99)) ) {
                $chimreads2{$tmp[0]}++;
            }
            else {
                $chimf2{$tmp[0]}++;
            }
        }
    }
    close $fh2;

    # Keep a PAF1 candidate only if its read never appears in PAF2 (either
    # confidently or not) and never had a non-confident hit in PAF1 itself;
    # symmetrically for PAF2.
    my %chimrd1 = ();
    my %chimrd2 = ();
    foreach my $k (keys %chimreads1) {
        if ( (not exists $chimreads2{$k}) && (not exists $chimf2{$k}) && (not exists $chimf1{$k}) ) {
            $chimrd1{$k} = $chimreads1{$k};
        }
    }
    foreach my $k (keys %chimreads2) {
        if ( (not exists $chimreads1{$k}) && (not exists $chimf1{$k}) && (not exists $chimf2{$k}) ) {
            $chimrd2{$k} = $chimreads2{$k};
        }
    }
    return (\%chimrd1, \%chimrd2);
}
################################################################################

##### Confirms the genomic candidates against the plasmidic alignments of the
##### OPPOSITE mate: a pair is chimeric when one mate hits the genome and the
##### other hits the plasmid, each confidently and exclusively. Candidates
##### that fail the check are deleted in place from the passed hash refs.
sub filter_chimeric_reads
{
    my ($paf1, $paf2, $potchimr1, $potchimr2) = @_;
    my %chimreads1 = ();
    my %chimreads2 = ();
    # BUG FIX: %chimf1/%chimf2 were undeclared package globals, leaking
    # state across calls; they are now lexical to this sub.
    my %chimf1 = ();
    my %chimf2 = ();

    open(my $fh1, '<', $paf1) || die "ERROR in opening $paf1: please check the file name\n";
    while (my $line = <$fh1>) {
        chomp $line;
        if (length($line) > 0) {
            my @tmp = split("\t", $line);
            # Plasmidic hit of mate 1 confirming a genomic candidate of mate 2.
            if ( (exists $potchimr2->{ $tmp[0] }) && ($tmp[10] >= ($tmp[1] * 0.90)) && ($tmp[9] > ($tmp[10] * 0.90)) ) {
                $chimreads1{$tmp[0]}++;
            }
            else {
                $chimf1{$tmp[0]}++;
            }
        }
    }
    close $fh1;

    open(my $fh2, '<', $paf2) || die "ERROR in opening $paf2: please check the file name\n";
    while (my $line = <$fh2>) {
        chomp $line;
        if (length($line) > 0) {
            my @tmp = split("\t", $line);
            if ( (exists $potchimr1->{ $tmp[0] }) && ($tmp[10] >= ($tmp[1] * 0.90)) && ($tmp[9] > ($tmp[10] * 0.90)) ) {
                $chimreads2{$tmp[0]}++;
            }
            else {
                $chimf2{$tmp[0]}++;
            }
        }
    }
    close $fh2;

    # Keep a candidate only if the opposite mate hits the plasmid confidently
    # and neither mate has any disqualifying plasmidic hit.
    foreach my $read (keys %$potchimr1) {
        unless ( (exists $chimreads2{$read}) && (not exists $chimreads1{$read}) && (not exists $chimf1{$read}) && (not exists $chimf2{$read}) ) {
            delete $potchimr1->{ $read };
        }
    }
    foreach my $read (keys %$potchimr2) {
        unless ( (exists $chimreads1{$read}) && (not exists $chimreads2{$read}) && (not exists $chimf2{$read}) && (not exists $chimf1{$read}) ) {
            delete $potchimr2->{ $read };
        }
    }
}

##### Counts the confirmed chimeric pairs, collapsing the midpoint of each
##### hit into bins of 5x the average read length. Returns a hash ref
##### "target1:bin1--target2:bin2" -> pair count, plus the average read length
##### used for the binning (0 when no read qualified).
sub count_and_collapse_chimeric_reads
{
    my ($paf1, $paf2, $potchimr) = @_;
    my %rpairs = ();  # read -> [tgt1, tlen1, start1, end1, tgt2, tlen2, start2, end2]
    my @rlen   = ();  # read lengths, used to derive the bin width

    open(my $fh1, '<', $paf1) || die "ERROR in opening $paf1: please check the file name\n";
    while (my $line = <$fh1>) {
        chomp $line;
        if (length($line) > 0) {
            my @tmp = split("\t", $line);
            if (exists $potchimr->{ $tmp[0] }) {
                push(@rlen, $tmp[1]);
                push(@{$rpairs{$tmp[0]}}, @tmp[5 .. 8]);
            }
        }
    }
    close $fh1;

    open(my $fh2, '<', $paf2) || die "ERROR in opening $paf2: please check the file name\n";
    while (my $line = <$fh2>) {
        chomp $line;
        if (length($line) > 0) {
            my @tmp = split("\t", $line);
            if (exists $potchimr->{ $tmp[0] }) {
                push(@rlen, $tmp[1]);
                push(@{$rpairs{$tmp[0]}}, @tmp[5 .. 8]);
            }
        }
    }
    close $fh2;

    my $len = 0;
    if (@rlen > 0) {
        # Deferred to runtime so the module loads without GeneralTools.pm;
        # average() is presumably the arithmetic mean of the array ref.
        require GeneralTools;
        $len  = GeneralTools::average(\@rlen);
        @rlen = ();
    }

    # Bin each alignment by the midpoint of its target interval; %rpairs is
    # non-empty only when @rlen was, so $len > 0 inside this loop.
    my %comb = ();
    foreach my $read (keys %rpairs) {
        my $v1   = floor( ($rpairs{$read}[2] + (($rpairs{$read}[3] - $rpairs{$read}[2]) / 2)) / ($len * 5) );
        my $v2   = floor( ($rpairs{$read}[6] + (($rpairs{$read}[7] - $rpairs{$read}[6]) / 2)) / ($len * 5) );
        my $int1 = $rpairs{$read}[0] . ":" . $v1;
        my $int2 = $rpairs{$read}[4] . ":" . $v2;
        $comb{"$int1--$int2"}++;
    }
    return (\%comb, $len);
}

1;
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Manual/MANUAL.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: DetectIS (v 0.1.1) 3 | author: Manual 4 | output: 5 | pdf_document: 6 | fig_caption: true 7 | number_sections: true 8 | --- 9 | 10 | 11 | \newpage 12 | 13 | \tableofcontents 14 | 15 | \newpage 16 | 17 | ```{r setup, include = FALSE} 18 | knitr::opts_chunk$set( 19 | collapse = TRUE, 20 | comment = "#>", 21 | library(knitr), 22 | library(formatR), 23 | knitr::opts_chunk$set(message=FALSE, warning=FALSE, tidy.opts=list(width.cutoff=60)) 24 | 25 | ) 26 | ``` 27 | 28 | DetectIS is a pipeline designed to detect the integration sites of exogenous DNA using DNA or RNA paired-end sequencing data. A [Singularity](https://sylabs.io/docs/) container has all the software necessary for the analysis. The workflow manager [nextflow](https://www.nextflow.io/) can be used to run the analysis locally or in an HPC environment. 
29 | 30 | # Installation 31 | 32 | You can use git to clone the detecIS repository from GitHub to your computer: 33 | 34 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 35 | git clone https://github.com/AstraZeneca/detectIS.git 36 | ``` 37 | or alternatively you can download the tarball from GitHub: 38 | 39 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 40 | wget https://github.com/AstraZeneca/detectIS/archive/0.1.1.tar.gz 41 | ``` 42 | 43 | and extract the files: 44 | 45 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 46 | tar -xvf 0.1.1.tar.gz 47 | ``` 48 | 49 | The repository is made of three directories: 50 | 51 | * Workflows; 52 | 53 | * utils; 54 | 55 | * TestDataset. 56 | 57 | The *Workflows* directory contains the *detectIS.nf* file, necessary to run the analysis with nextflow. 58 | The directory *TestDataset* contains fastq files and reference data that can be used to test detectIS. It also contains the script *Run_detectIS.sh*, that can be used to run the analysis without using nextflow. 59 | The directory utils contains the recipe *detectIS.rec*, necessary to build the Singularity container, and the bash and Perl script used to generate simulated data sets. 60 | 61 | The configuration file and the bash script have been written to analyse the example dataset present in the TestDataset directory and, can also be used as templates for other analyses. 62 | 63 | ## Prerequisites to run the analysis 64 | 65 | The detectIS software requires [Singularity](https://www.sylabs.io/docs/) version 2.6 or higher. 66 | The installation of [Nextflow](https://www.nextflow.io/), version 0.32.0.4897 or higher, is strongly advised. 67 | 68 | ### Creating a Singularity container 69 | 70 | [Singularity](https://www.sylabs.io/docs/) is a container platform that creates and runs containers with the required software in a way that is portable and reproducible. 
A container is a single file that can be generated using Singularity on a laptop, and executed on HPC clusters, local university or company clusters, servers or cloud. 71 | 72 | A Singularity container with all the necessary software is required to run the detectIS pipeline. 73 | The image can be created by using the recipe contained in the *utils* directory of the detectIS GitHub repository (https://github.com/AstraZeneca/detectIS/raw/master/utils/detectIS.rec). 74 | It is possible to generate a Singularity container from a recipe with the command: 75 | 76 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 77 | sudo singularity build detectIS.simg detectIS.rec 78 | ``` 79 | 80 | Superuser privileges are necessary only to create the container but not to use it. This means you can create the container on your local pc/workstation and copy it to the system where you run the detectIS analyses (e.g. your hpc or cluster). 81 | 82 | Singularity containers are [kernel-dependent](https://www.sylabs.io/guides/2.6/user-guide/faq.html?highlight=disk%20access#are-singularity-containers-kernel-dependent), this implies that the recipes contained in this project will not necessarily produce a container able to run on your HPC system. If none of the available recipes generates a container compatible with your system you might need to modify the recipe using an operating system with compatible kernel, please raise an issue on GitHub if this is the case and you need support for it. 83 | 84 | ### Nextflow 85 | 86 | [Nextflow](https://www.nextflow.io/) is a workflow manager that enables scalable and reproducible scientific workflows using software containers. It requires Bash 3.2 (or later) and Java 8 (or later, up to 15) and it is distributed as a self-contained executable package. See the [nextflow manual](https://www.nextflow.io/docs/latest/) for further information. 
87 | 88 | # Running the workflow using nextflow 89 | 90 | Once installed Singularity and Nextflow, if the container has been created and copied to the *utils* directory, you can run the nextflow workflow to analyse the test data set by using the command: 91 | 92 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 93 | nextflow run Workflows/detectIS.nf -c detectIS_TestDataset.conf --with-report \ 94 | detectIS_TestDataset_nextflow_report.html 95 | ``` 96 | The file *detectIS.nf* is the workflow for the detectIS analysis and *detectIS_TestDataset.conf* is the configuration file with all the information needed for the analysis. The workflow file is made by all the instructions used by nextflow for the analysis, it can be used without any change, unless you need to change the default parameters of the Perl script (see the *detectIS.pl script* section for further information). 97 | 98 | ## Structure of the configuration file 99 | The configuration file is made of two sections: the first one contains information of the HPC/cluster used for the analysis. 100 | 101 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 102 | params.project_name='Test_dataset-detectIS' 103 | ``` 104 | 105 | The variable *params.project_name* specifies the name of the project, in this case Test_dataset-detectIS. 106 | 107 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 108 | process.executor='sge' 109 | ``` 110 | 111 | The variable *process.executor* specifies the executor, there are different options depending by the HPC/cluster where you run the analysis (see https://www.nextflow.io/docs/latest/executor.html# for further information). If you run the process locally you don't need to specify it, or you can set it as 'local'. 
112 | 113 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 114 | process.queue = 'infini.q' 115 | ``` 116 | 117 | The variable *process.queue* specifies the queue to use, this depends by the used HPC/cluster. 118 | 119 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 120 | process.clusterOptions = '-S /bin/bash' 121 | ``` 122 | 123 | The variable *process.clusterOptions* specifies options specific of the used cluster, the one in the example forces the cluster to run the job with bash, in case it is not the default option. 124 | 125 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 126 | process.penv = 'smp' 127 | ``` 128 | 129 | The variable *process.penv* specifies the parallel environment to use submitting a parallel task to the SGE resource manager (see https://www.nextflow.io/docs/latest/process.html#penv for further info). 130 | 131 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 132 | params.scratch='/scratch/' 133 | ``` 134 | 135 | The variable *params.scratch* specifies the scratch directory. 136 | 137 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 138 | singularity.enabled = true 139 | process.container = "utils/detectIS.simg" 140 | ``` 141 | 142 | These variables are necessary to use Singularity and to specify the container file. 143 | 144 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 145 | singularity.cacheDir = "/scratch/" 146 | ``` 147 | The variable *singularity.cacheDir* specifies the singularity cache. 148 | 149 | The second part of the configuration file contains analysis specific information. 
150 | 151 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 152 | params.reads = "TestDataset/*_{R1,R2}.fq.gz" 153 | ``` 154 | 155 | The variable *params.reads* specifies the sequencing reads to process, the pattern used in the example is used to specify read pairs. 156 | 157 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 158 | params.outdir = "TestDataset/NextflowRes/" 159 | ``` 160 | 161 | The variable *params.outdir* specifies the output directory. 162 | 163 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 164 | params.cpu.minimap = 32 165 | ``` 166 | The variable *params.cpu.minimap* specifies the cpu used for the mapping with Minimap2. 167 | 168 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 169 | params.host_seq="TestDataset/Cricetulus_griseus_chok1gshd.CHOK1GS_HDv1.dna.toplevel_Scaffold0.fa" 170 | ``` 171 | The variable *params.host_seq specifies* the reference fasta file of the host genome. 172 | 173 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 174 | params.vir_seq="TestDataset/CelTag.fa" 175 | ``` 176 | 177 | The variable *params.vir_seq* specifies the reference fasta file of the exogenous element (plasmid, viral agent, etc.). 178 | 179 | 180 | # Test data sets and bash script 181 | 182 | In the directory "TestDataset" are contained paired-end reads and reference files to run a detectIS analysis.The dataset simulates the integration of a plasmid in the genome of Chinese hamster ovary cell line (CHOK1). 183 | 184 | ## Structure of the bash script 185 | A less valid alternative to Nextflow is a bash script able to run all the steps required by the pipeline. The directory *TestDataset* contains the bash script *Run_detectIS.sh* written for this aim. 
186 | 187 | The top part of the script specifies variables specific of the analysis: 188 | 189 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 190 | singimg="../utils/detectIS.simg" 191 | ``` 192 | The variable *singimg* specifies the singularity container used for the analysis 193 | 194 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 195 | gref="./Cricetulus_griseus_chok1gshd.CHOK1GS_HDv1.dna.toplevel_Scaffold0.fa" 196 | extragenome1="./CelTag.fa" 197 | ``` 198 | 199 | These variables specify respectively the host genome and the exogenous references 200 | 201 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 202 | reads=( ./Sim_R1.fq.gz ./Sim_R2.fq.gz ) 203 | ``` 204 | 205 | This array specifies the two fastq files to process 206 | 207 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 208 | mkdir -p Res 209 | name="./Res/SimRead" 210 | paramscpu=4 211 | ``` 212 | 213 | The second part of the script run the alignment to the host and exogenous reference and finally the detectIS.pl script to integrate the results. Finally the markdown file produced by the script is converted to pdf and html. 214 | 215 | ## detectIS.pl script 216 | The "Workflows" directory contains the detectIS.nf file, the nextflow workflow file and the "bin" sub directory with the detectIS.pl script and the modules with all the subroutines used by it. The script processes the 4 alignment results in paf format, looking for split and/or chimeric read pairs able to identify integration sites. 
217 | 218 | Script arguments: 219 | 220 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 221 | -h1 name_mate1_gnm.paf 222 | (Aligment results of R1 reads on the host genome) [Mandatory argument] 223 | -h2 name_mate2_gnm.paf 224 | (Aligment results of R2 reads on the host genome) [Mandatory argument] 225 | -v1 name_mate1_vir.paf 226 | (Aligment results of R1 reads on the exogenous reference) [Mandatory argument] 227 | -v2 name_mate2_vir.paf 228 | (Aligment results of R2 reads on the exogenous reference) [Mandatory argument] 229 | -o name_prefix (Output prefix) [Mandatory argument] 230 | -mqual 1 231 | (Minimum mapping quality to consider hits in the host genome) 232 | [Not mandatory, range: 0-60 default value: 1] 233 | -ovlwind 0.05 234 | (Overlap or distance, as fraction of the read length, tollerated to detect split reads) 235 | [Not mandatory, range: 0-1 default value: 0.05] 236 | -mspr 2 237 | (Minimum number of split reads to identify an integration site) 238 | [Not mandatory, range: 1+ default value: 2] 239 | ``` 240 | 241 | The script can be executed specifying the mandatory arguments and leaving as default the other arguments. Alternatively the ovlwind can be increased and the mspr can be reduced to increase the sensitivity of the tool, for experiments executed at low coverage. 242 | 243 | # Result interpretation 244 | 245 | The detectIS.pl script makes 2 final results: one text file (with the .txt extension) and one markdown file (with the .md extension). The text file can be visualized using the UNIX less command or edited by using any UNIX/MAC text editor like vim nano Emacs or in Windows, notepad. It can also be imported as spreadsheet in Excel or Open Office. The integration sites identified in the analysis are reported in rows with the following information: 246 | 247 | 1. IS: The integration site with chromosome and position of either host genome and exogenous element. 248 | 249 | 2. 
TotSpReads: The total number of split read pairs supporting the integration site. 250 | 251 | 3. R1R2SpReads: Number of split read pairs supporting the integration site with both read split. 252 | 253 | 4. R1SpReads: Number of split read pairs supporting the integration site having the R1 read split and the R2 read mapped within 5 read length of the integration site. 254 | 255 | 5. R2SpReads: Number of split read pairs supporting the integration site having the R2 read split and the R1 read mapped within 5 read length of the integration site. 256 | 257 | 6. ChimReads: The total number of chimeric read pairs supporting the integration site. 258 | 259 | 7. SingleSplitRead: Number of split read pairs made only one split read and the other not mapped within 5 read length of the integration site. 260 | 261 | 8. Interval: Extended interval supporting the integration site. It specifies the relative orientation of host genome and exogenous element and this information is fundamental to correctly design primers for PCR verification of the integration site. 262 | 263 | The same information are also contained in the markdown file that can be converted to pdf and/or html. In the directory Workflows/bin is contained a script to convert all the .md file present in the directory with the results to pdf and html files. 
264 | 265 | ```{r, engine = 'bash', eval = FALSE, tidy=TRUE, tidy.opts=list(width.cutoff=50)} 266 | cd detectIS/Workflows/bin/ 267 | bash CreatePDFandHTML.sh /MyResultDirectory 268 | ``` 269 | -------------------------------------------------------------------------------- /Workflows/bin/DetectSpltR.pm: -------------------------------------------------------------------------------- 1 | package DetectSpltR; 2 | 3 | use List::Util qw( min max ); 4 | use POSIX; 5 | 6 | ################################################################################ 7 | #########SUB 8 | ################################################################################ 9 | 10 | ################################################################################ 11 | #####Passing a genomic PAF file looks for potential split reads 12 | sub detect_split_reads 13 | { 14 | my $paf = $_[0]; 15 | my $mq = $_[1]; 16 | my %spltreads=() ; 17 | my %blklist=() ; 18 | open(PAF , "<", $paf) || die "ERROR in opening $paf: please check the file name\n" ; 19 | while (my $line=) { 20 | chomp $line; 21 | if (length ($line) >0) { 22 | my @tmp =split("\t" , $line) ; 23 | $blklist{$tmp[0]}++ ; 24 | } 25 | } 26 | close PAF; 27 | open(PAF , "<", $paf) || die "ERROR in opening $paf: please check the file name\n" ; 28 | while (my $line=) { 29 | chomp $line; 30 | if (length ($line) >0) { 31 | my @tmp =split("\t" , $line) ; 32 | if ( ($blklist{$tmp[0]}==1) && ($tmp[10]<($tmp[1])) && ($tmp[11]>=$mq) ) { #Filters to consider potential Split Reads: Univocally mapped; mapping lenght shorter than read length and Mapping Quality >= $mq 33 | $spltreads{$tmp[0]}[0]=$tmp[2]; #First mapped pos 34 | $spltreads{$tmp[0]}[1]=$tmp[3]; #Last mapped pos 35 | $spltreads{$tmp[0]}[2]=$tmp[4]; #STRAND 36 | $spltreads{$tmp[0]}[3]=$tmp[5]; #CHR 37 | $spltreads{$tmp[0]}[4]=$tmp[7]; #START 38 | $spltreads{$tmp[0]}[5]=$tmp[8]; #STOP 39 | } 40 | } 41 | } 42 | %blklist=() ; 43 | return (\%spltreads); #1K: Read name; El: @ 44 | } 45 | 
################################################################################ 46 | 47 | ################################################################################ 48 | ######Passing a Plasmidic PAF file verify the potentially split reads identified in the genome 49 | sub verify_split_reads 50 | { 51 | my $splt1 = $_[0]; 52 | my $paf = $_[1]; 53 | my $ovlwin = $_[2]; 54 | my %isarray = (); 55 | my %vircounts = (); 56 | 57 | open(PAF , "<", $paf) || die "ERROR in opening $paf: please check the file name\n" ; 58 | while (my $line=) { 59 | chomp $line; 60 | if (length ($line) >0) { 61 | my @tmp =split("\t" , $line) ; 62 | if ( (exists $splt1->{ $tmp[0] }) ) { 63 | $vircounts{$tmp[0]}++ ; 64 | } 65 | } 66 | } 67 | close PAF ; 68 | open(PAF , "<", $paf) || die "ERROR in opening $paf: please check the file name\n" ; 69 | while (my $line=) { 70 | chomp $line; 71 | if (length ($line) >0) { 72 | my @tmp =split("\t" , $line) ; 73 | if ( (exists $splt1->{ $tmp[0] }) && $vircounts{$tmp[0]}==1 ) { 74 | my $rliminv=$tmp[1]*$ovlwin; #Overlap window shift allowed 75 | if ( ($tmp[2]> ($splt1->{ $tmp[0] }[1] - $rliminv) && ($tmp[2]< ($splt1->{ $tmp[0] }[1] + $rliminv))) ) { 76 | if ( ($splt1->{ $tmp[0] }[2] eq "+") && ($tmp[4] eq "+") ){ 77 | my $is=$tmp[5].":".$tmp[7]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[5] ; 78 | $isarray{$is}{$tmp[0]}++; 79 | } 80 | elsif ( ($splt1->{ $tmp[0] }[2] eq "+") && ($tmp[4] eq "-") ){ 81 | my $is=$tmp[5].":".$tmp[8]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[5] ; 82 | $isarray{$is}{$tmp[0]}++; 83 | } 84 | elsif ( ($splt1->{ $tmp[0] }[2] eq "-") && ($tmp[4] eq "+") ){ 85 | my $is=$tmp[5].":".$tmp[7]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[4] ; 86 | $isarray{$is}{$tmp[0]}++; 87 | } 88 | elsif ( ($splt1->{ $tmp[0] }[2] eq "-") && ($tmp[4] eq "-") ){ 89 | my $is=$tmp[5].":".$tmp[8]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[4] ; 90 | $isarray{$is}{$tmp[0]}++; 91 | } 92 | } 93 | elsif ( (($tmp[3] - 
$rliminv) < $splt1->{ $tmp[0] }[0] ) && (($tmp[3] + $rliminv) > $splt1->{ $tmp[0] }[0]) ) { 94 | if ( ($splt1->{ $tmp[0] }[2] eq "+") && ($tmp[4] eq "+") ){ 95 | my $is=$tmp[5].":".$tmp[8]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[4] ; 96 | $isarray{$is}{$tmp[0]}++; 97 | } 98 | elsif ( ($splt1->{ $tmp[0] }[2] eq "+") && ($tmp[4] eq "-") ){ 99 | my $is=$tmp[5].":".$tmp[7]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[4] ; 100 | $isarray{$is}{$tmp[0]}++; 101 | } 102 | elsif ( ($splt1->{ $tmp[0] }[2] eq "-") && ($tmp[4] eq "+") ){ 103 | my $is=$tmp[5].":".$tmp[8]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[5] ; 104 | $isarray{$is}{$tmp[0]}++; 105 | } 106 | elsif ( ($splt1->{ $tmp[0] }[2] eq "-") && ($tmp[4] eq "-") ){ 107 | my $is=$tmp[5].":".$tmp[7]."--".$splt1->{ $tmp[0] }[3].":".$splt1->{ $tmp[0] }[5] ; 108 | $isarray{$is}{$tmp[0]}++; 109 | } 110 | 111 | } 112 | else { 113 | delete $splt1->{ $tmp[0] } ; 114 | } 115 | } 116 | } 117 | 118 | } 119 | close PAF ; 120 | $splt1=() ; 121 | return (\%isarray); #1K: IS; 2K: Supporting read 122 | } 123 | 124 | 125 | ################################################################################ 126 | ###### 2 genomic PAF file identifies potentially chimeric reads 127 | sub verify_spreads_is 128 | { 129 | my $pafg1 = $_[0]; #Genomic PAF Read1 130 | my $pafg2 = $_[1]; #Genomic PAF Read2 131 | my $pafv1 = $_[2]; #Viral PAF Read1 132 | my $pafv2 = $_[3]; #Viral PAF Read2 133 | my $ishash= $_[4]; #K1 IS; K2:Read_ID 134 | my $iscoord= $_[5]; #K: IS ->@ [VC] [VP] [GC] [GP] 135 | my %readstats = (); #K: Read_ID ->@ [G1 CHR] [G1 CO] [V1 CHR] [V1 CO] [G2 CHR] [G2 CO] [V2 CHR] [V2 CO] 136 | my %readcoord = (); #K1:Genomic_Chr; K2:Genomic_Pos -> IS 137 | my %readtois1 = (); #K:Read_ID -> IS 138 | my %readtois = (); #K:Read_ID -> IS 139 | my %pmreads1 = (); # 140 | my %pmreads2 = (); 141 | my %chmreads = (); 142 | my @t1 = (); 143 | my @t2 = (); 144 | foreach my $k (keys %{$ishash}) { 145 | foreach my $r (keys 
%{$ishash->{$k}}) { 146 | $readtois1{$r}{$k}++ ; 147 | } 148 | } 149 | 150 | foreach my $k (keys %{$ishash}) { 151 | my $vl=0; 152 | $readcoord{$iscoord->{$k}[2]}{$iscoord->{$k}[3]}=$k; 153 | foreach my $r (keys %{$ishash->{$k}}) { 154 | if (scalar (keys %{$readtois1{$r}}) ==1) { 155 | $vl++; 156 | $readtois{$r}=$k ; 157 | $readstats{$r}[0]=0; 158 | $readstats{$r}[1]=0; 159 | $readstats{$r}[2]=0; 160 | $readstats{$r}[3]=0; 161 | $readstats{$r}[4]=0; 162 | $readstats{$r}[5]=0; 163 | $readstats{$r}[6]=0; 164 | $readstats{$r}[7]=0; 165 | 166 | } 167 | else { 168 | delete $ishash->{$k}->{$r}; 169 | } 170 | } 171 | if ($vl==0) { 172 | delete $ishash->{$k}; 173 | } 174 | } 175 | open(PAF1 , "<", $pafg1) || die "ERROR in opening $pafg1: please check the file name\n" ; 176 | while (my $line=) { 177 | chomp $line; 178 | if (length ($line) >0) { 179 | my @tmp =split("\t" , $line) ; 180 | if (exists $readstats{$tmp[0]}) { 181 | if ( ($tmp[5] eq $iscoord->{$readtois{$tmp[0]}}[2]) && (($tmp[7]>=($iscoord->{$readtois{$tmp[0]}}[3]-($tmp[1]*5))) && ($tmp[8]<=($iscoord->{$readtois{$tmp[0]}}[3]+($tmp[1]*5)))) ) { 182 | $readstats{$tmp[0]}[0]=1; 183 | 184 | if ($tmp[7] == $iscoord->{$readtois{$tmp[0]}}[3]) { 185 | $readstats{$tmp[0]}[1]=1; 186 | push (@{$t1{$readtois{$tmp[0]}}}, $tmp[8]) ; 187 | } 188 | elsif ($tmp[8] == $iscoord->{$readtois{$tmp[0]}}[3]) { 189 | $readstats{$tmp[0]}[1]=1; 190 | push (@{$t1{$readtois{$tmp[0]}}}, $tmp[7]) ; 191 | 192 | } 193 | } 194 | } 195 | elsif (exists $readcoord{$tmp[5]}) { 196 | foreach my $k(keys %{$readcoord{$tmp[5]}}) { 197 | if ( ($tmp[7]>=($k-($tmp[1]*5))) && ($tmp[8]<=($k+($tmp[1]*5))) ) { # && ($tmp[10]>=(0.9*$tmp[1]))) { 198 | $pmreads1{$tmp[0]}{$readcoord{$tmp[5]}{$k}}=$tmp[4]; 199 | } 200 | } 201 | } 202 | } 203 | } 204 | close PAF1 ; 205 | open(PAF2 , "<", $pafg2) || die "ERROR in opening $pafg2: please check the file name\n" ; 206 | while (my $line=) { 207 | chomp $line; 208 | if (length ($line) >0) { 209 | my @tmp =split("\t" , 
$line) ; 210 | if (exists $readstats{$tmp[0]}) { 211 | if ( ($tmp[5] eq $iscoord->{$readtois{$tmp[0]}}[2]) && (($tmp[7]>=($iscoord->{$readtois{$tmp[0]}}[3]-($tmp[1]*5))) && ($tmp[8]<=($iscoord->{$readtois{$tmp[0]}}[3]+($tmp[1]*5)))) ) { 212 | $readstats{$tmp[0]}[4]=1; 213 | 214 | if ($tmp[7] == $iscoord->{$readtois{$tmp[0]}}[3]) { 215 | $readstats{$tmp[0]}[5]=1; 216 | push (@{$t1{$readtois{$tmp[0]}}}, $tmp[8]) ; 217 | } 218 | elsif ($tmp[8] == $iscoord->{$readtois{$tmp[0]}}[3]) { 219 | $readstats{$tmp[0]}[5]=1; 220 | push (@{$t1{$readtois{$tmp[0]}}}, $tmp[7]) ; 221 | } 222 | } 223 | } 224 | elsif (exists $readcoord{$tmp[5]}) { 225 | foreach my $k(keys %{$readcoord{$tmp[5]}}) { 226 | if ( ($tmp[7]>=($k-($tmp[1]*5))) && ($tmp[8]<=($k+($tmp[1]*5))) ) { #&& ($tmp[10]>=(0.9*$tmp[1]))) { 227 | $pmreads2{$tmp[0]}{$readcoord{$tmp[5]}{$k}}=$tmp[4]; 228 | } 229 | } 230 | } 231 | } 232 | } 233 | close PAF2 ; 234 | open(PAF3 , "<", $pafv1) || die "ERROR in opening $pafv1: please check the file name\n" ; 235 | while (my $line=) { 236 | chomp $line; 237 | if (length ($line) >0) { 238 | my @tmp =split("\t" , $line) ; 239 | if (exists $readstats{$tmp[0]}) { 240 | if ( ($tmp[5] eq $iscoord->{$readtois{$tmp[0]}}[0]) && (($tmp[7]>=($iscoord->{$readtois{$tmp[0]}}[1]-($tmp[1]*5))) && ($tmp[8]<=($iscoord->{$readtois{$tmp[0]}}[1]+($tmp[1]*5)))) ) { 241 | $readstats{$tmp[0]}[2]=1; 242 | if ($tmp[7] == $iscoord->{$readtois{$tmp[0]}}[1]) { 243 | $readstats{$tmp[0]}[3]=1; 244 | push (@{$t2{$readtois{$tmp[0]}}}, $tmp[8]) ; 245 | } 246 | elsif ($tmp[8] == $iscoord->{$readtois{$tmp[0]}}[1]) { 247 | $readstats{$tmp[0]}[3]=1; 248 | push (@{$t2{$readtois{$tmp[0]}}}, $tmp[7]) ; 249 | } 250 | } 251 | 252 | } 253 | elsif (exists $pmreads2{$tmp[0]}) { 254 | foreach my $is(keys %{$pmreads2{$tmp[0]}} ) { 255 | if ( ($tmp[5] eq $iscoord->{$is}[0]) && ($tmp[7]>=($iscoord->{$is}[1]-($tmp[1]*5))) && ($tmp[8]<=($iscoord->{$is}[1]+($tmp[1]*5))) && ($tmp[4] ne $pmreads2{$tmp[0]}{$is}) ) { 256 | 
$chmreads{$is}{$tmp[0]}++; 257 | 258 | } 259 | } 260 | } 261 | } 262 | } 263 | close PAF3 ; 264 | open(PAF4 , "<", $pafv2) || die "ERROR in opening $pafv2: please check the file name\n" ; 265 | while (my $line=) { 266 | chomp $line; 267 | if (length ($line) >0) { 268 | my @tmp =split("\t" , $line) ; 269 | if (exists $readstats{$tmp[0]}) { 270 | if ( ($tmp[5] eq $iscoord->{$readtois{$tmp[0]}}[0]) && (($tmp[7]>=($iscoord->{$readtois{$tmp[0]}}[1]-($tmp[1]*5))) && ($tmp[8]<=($iscoord->{$readtois{$tmp[0]}}[1]+($tmp[1]*5)))) ) { 271 | $readstats{$tmp[0]}[6]=1; 272 | if ($tmp[7] == $iscoord->{$readtois{$tmp[0]}}[1]) { 273 | $readstats{$tmp[0]}[7]=1; 274 | push(@{$t2{$readtois{$tmp[0]}}}, $tmp[8]) ; 275 | } 276 | elsif ($tmp[8] == $iscoord->{$readtois{$tmp[0]}}[1]) { 277 | $readstats{$tmp[0]}[7]=1; 278 | push(@{$t2{$readtois{$tmp[0]}}}, $tmp[7]) ; 279 | } 280 | 281 | } 282 | 283 | } 284 | elsif (exists $pmreads1{$tmp[0]}) { 285 | foreach my $is(keys %{$pmreads1{$tmp[0]}} ) { 286 | if ( ($tmp[5] eq $iscoord->{$is}[0]) && ($tmp[7]>=($iscoord->{$is}[1]-($tmp[1]*5))) && ($tmp[8]<=($iscoord->{$is}[1]+($tmp[1]*5))) && ($tmp[4] ne $pmreads1{$tmp[0]}{$is}) ) { 287 | $chmreads{$is}{$tmp[0]}++; 288 | 289 | } 290 | } 291 | } 292 | } 293 | } 294 | close PAF4 ; 295 | my $verifis=(); 296 | my $spreads=(); 297 | foreach my $k (keys %{$ishash}) { 298 | my $chim= scalar keys(%{$chmreads{$k}}) ; 299 | my $R1R2=0; 300 | my $R1=0; 301 | my $R2=0; 302 | my $R0=0; 303 | foreach my $r (keys %{$ishash->{$k}}) { 304 | if ($readstats{$r}[0]==1 && $readstats{$r}[1]==1 && $readstats{$r}[2]==1 && $readstats{$r}[3]==1 && $readstats{$r}[4]==1 && $readstats{$r}[5]==1 && $readstats{$r}[6]==1 && $readstats{$r}[7]==1) { 305 | $R1R2++ ; 306 | $spreads{$k}{$r}="R1R2" ; 307 | } 308 | elsif ( ($readstats{$r}[0]==1 && $readstats{$r}[1]==1 && $readstats{$r}[2]==1 && $readstats{$r}[3]==1) && ( ($readstats{$r}[4]==1 && $readstats{$r}[6]==0) || ($readstats{$r}[4]==0 && $readstats{$r}[6]==1) ) ) { 309 | $R1++ ; 310 | 
$spreads{$k}{$r}="R1" ; 311 | } 312 | elsif ( ($readstats{$r}[4]==1 && $readstats{$r}[5]==1 && $readstats{$r}[6]==1 && $readstats{$r}[7]==1) && ( ($readstats{$r}[0]==1 && $readstats{$r}[2]==0) || ($readstats{$r}[0]==0 && $readstats{$r}[2]==1) ) ) { 313 | $R2++ ; 314 | $spreads{$k}{$r}="R2" ; 315 | } 316 | else { 317 | $R0++ ; 318 | $spreads{$k}{$r}="R0" ; 319 | } 320 | } 321 | if (exists $t2{$k} && $t1{$k} ) { 322 | $verifis{$k}{"R1R2"}=$R1R2 ; 323 | $verifis{$k}{"R1"}=$R1 ; 324 | $verifis{$k}{"R2"}=$R2 ; 325 | $verifis{$k}{"CHIM"}=$chim ; 326 | $verifis{$k}{"RU"}=$R0 ; 327 | my @isc=split(/--/, $k) ; 328 | my @iscp=split(/:/, $isc[0]) ; 329 | my @iscg=split(/:/, $isc[1]) ; 330 | my $intp=max(@{$t2{$k}}) ; 331 | if ($intp<=$iscp[1]) { 332 | $intp=min(@{$t2{$k}}) ; 333 | } 334 | my $intg=max(@{$t1{$k}}) ; 335 | if ($intg <= $iscg[1]) { 336 | $intg=min(@{$t1{$k}}) ; 337 | } 338 | my $interval="$iscp[0]:$iscp[1]-$intp--$iscg[0]:$iscg[1]-$intg"; 339 | $verifis{$k}{"INT"}=$interval; 340 | } 341 | 342 | } 343 | return(\%verifis, \%spreads) ; 344 | 345 | } 346 | 347 | 348 | ################################################################################ 349 | 350 | 1; 351 | --------------------------------------------------------------------------------