├── FAQ
└── FAQ1.png
├── README.md
├── Seq.pm
├── annotate_bed.pl
├── extract_informative.pl
├── filter_insertion.pl
├── genotype_caculator.r
├── identity_inser_sites.pl
├── itis.pl
├── lean_fq.pl
├── mask_te_homo_in_genome.pl
├── modify_informative.pl
├── sam_to_fq.pl
├── sw.pl
├── te_realin_bwa.pl
├── test_dir
└── sample_data.tar.gz
├── transform_to_bed.pl
└── utilities
├── G_test.r
├── average_depth_for_temp.pl
├── cluster_for_venn_diff_tools.pl
├── count.gt.pl
├── count_fts_hits.pl
├── count_hit_at_gene_flank_inter.pl
├── count_line.pl
├── count_true_and_false.pl
├── enlarge_bed.pl
├── enlarge_bed_for_meme.pl
├── filter_TEMP.pl
├── flanking_primer.pl
├── generate_snp_batch_from_bed.pl
├── genotype_calculator.r
├── get_seq_for_logo.pl
├── gt_ratio.pl
├── part_in_venn.pl
├── perfomance_itis.r
├── plot_cov.r
├── plot_ins_matri.pl
├── plot_venn_medi.r
├── random_10000_insertion.pl
├── simulate_TE_inser.pl
├── trans_relocate_2_bed.pl
├── trans_retro_2_bed.pl
├── trans_temp_2_bed.pl
└── trans_tif_2_bed.pl
/FAQ/FAQ1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chuan-Jiang/ITIS/833f5893d0e3f2fa6e36fd1246c8c4046d078f50/FAQ/FAQ1.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | #ITIS:(Identify Transposon Insertion Sites)
2 | is a pipeline to identify novel TE insertion sites in genome
3 |
4 | It require three input files:
5 | (i) reference genome sequence,
6 | (ii)TE sequence,
7 | (iii)paired-end (PE) short reads, with no restriction on length,generated from the re-sequenced genome that contains novel TE insertions.
8 |
9 | By aligning read pairs to merged reference sequence, reference genome and TE sequence, ITIS will check each informative read pairs as long as it have more than 20bp overlap with TE sequence and determine if it supports the TE insertion around the location mapped by one of read pair. In theory, by inspecting both cross read pairs and clipped reads at the same time, ITIS will have a higher sensitivity than other tools
10 |
11 |
12 | NOTE: ITIS should be used to identy de novo insertions sites, It is unable to detect the lose event of one preexisting TEs.
13 | ---
14 | ---
15 |
16 |
17 | ##Table of Contents
18 | ### Dependencies
19 | ### Command line options
20 | ### Quick start with a sample dataset
21 | ### Report an Issue
22 |
23 | ---
24 | ---
25 | ### Dependencies:
26 |
27 | The following programs need to be installed and the executable commands should be in $PATH of system:
28 |
29 | samtools (v 0.1.19) #####******IMPORTANT******######
30 | bwa (v 0.7.7-r441)
31 | bedtools (v 2.17.0)
32 | Bio::Perl
33 | blast+
34 | R
35 |
36 | Other usefull tool:
37 | IGV
38 |
39 | -------------
40 |
41 | ### Comamnd Line Options
42 |
43 | --------------------
44 | USAGE:
45 | #### perl itis.pl
46 | /psc/home/jiangchuan/Dropbox/Code/Code_TE_inser/itis.pl
47 | REQUIRED -g the genome sequence file in fasta format
48 | OR -G prefix of bwa-indexed reference file ( genome + transposon)
49 | REQUIRED -t the TE sequence file in fasta format
50 | or -T prefix of bwa-indexed transposon sequence file
51 | REQUIRED -l the average length of fragments in library
52 | REQUIRED -N the name of project
53 | REQUIRED -1 the paired read file 1
54 | REQUIRED -2 the paired read file 2
55 |
56 | -f if provided, ITIS will check if TE inserted in gentic or intergeneic region
57 | -F run scripts in 'FAST' mode; It won't align all reads to reference genome,caculate the average bg depth,
58 | and estimate if insertion is homo or heter,[default N]
59 |
60 | ## parameters used with '-F N' :
61 | -B use previous sorted and indexed bam file of reads aligned to reference genome
62 | -d the depth range to filter raw insertion site, [default 2,200]
63 |
64 | -q the minimum average mapping quality of all supporting reads, [default 1]
65 |
66 | -e If reference genome contains this TE or it's homolog. using blast to hard mask these sequence is required, [default N]
67 |
68 | -a the number of bases allowed to be lost when transposing, [defualt 10]
69 |
70 | -b minimum required number of flanking reads , in the format of /Tag=Value/Tag=Value/Tag=Value/ , the avaliable tags:
71 | t: total supporting reads at detected insertion /t=3/
72 | CS:clipped reads cover TE start site /CS=0/
73 | CE:clipped reads cover TE end site /CE=0/
74 | cs:cross reads cover TE start /cs=0/
75 | ce:cross reads cover TE end /cs=0/
76 | TS:total reads cover TE start /TS=1/
77 | TE:total reads cover TE end /TE=1/
78 | [default /t=3/TS=1/TE=1/]
79 | -c cpu number for 'BWA mem', 'samtools view' and 'samtools sort', [defualt 8,2,2]
80 |
81 | -w window size used to cluster supportting reads, [default library_length/2]
82 |
83 | -D use this specifed temperate directory, [default[project].[aStringOfNumbers]]
84 |
85 | -m Only print out all commands to STDERR, [default N]
86 |
87 | -h print this help message
88 |
89 |
90 | eg: perl /psc/home/jiangchuan/Dropbox/Code/Code_TE_inser/itis.pl -g genome.fa -t tnt1.fa -l 300 -N test_run -1 reads.fq1 -2 reads.fq2 -f medicago.gff3
91 |
92 | -------------
93 |
94 |
95 | ### Quick start with a sample dataset
96 | this test dataset derived from the genome resequencing project of Japonica A123(SRR631734), which have transposon mping be activted.
97 |
98 | All PE reads mapped at chr1:1-2000000 were extracted and saved in file sample.fq1 and sample.fq2.
99 |
100 | First of all, untar the sample dataset:
101 |
102 | cd test_dir
103 | tar xvzf sample_data.tar.gz
104 |
105 | ####Input Files
106 | PE reads:
107 | sample.fq1
108 | sample.fq1
109 | reference genome:
110 | rice_chr1_200k.fa
111 | Transposon sequence:
112 | mping.fa
113 |
114 | ####command to detect mping insertions in reference genome
115 | perl path/to/itis.pl -g rice_chr1_200k.fa -t mping.fa -l 500 -N test -1 sample.fq1 -2 sample.fq2 -e Y
116 |
117 | \#-e Y : to tell itis.pl that there are mping homologous sequence in reference genome
118 |
119 | ####Output Files
120 | itis will produce a lot of files in a directory named test.[aStringOfNumbers]
121 |
122 | The important files included:
123 |
124 | test.mping.filtered.bed
125 | This is a list of reliable insertion sites.
126 | TAGS in column 4:
127 | SR=(NO. of supporting library fragment),(No. supporting reads),(No. clipped reads at TE start),(No. clipped reads at TE end),(No. cross reads at TE start),(No. cross reads at TE end)
128 | MQ=average mapping quality of all supporting reads
129 | NM=name of TE
130 | GT=(No. of supporting reads),(No. of background reads):(Heter|Homo)
131 | PV=the p-value ofbinomial test for zygosity based on the GT values
132 | DP=average depth of 100bp region flanking the TE insertion site
133 | TS=the joint position at the begin of TE
134 | TE=the joint pisition at the end of TE
135 | result contents with test_data:
136 | Chr1:0-2000000 174497 174500 SR=8,9,6,2,0,1;MQ=57;NM=mping;GT=8,0:Heter;PV=0.03125;DP=16;TS=1;TE=430;NB=N . +
137 | Chr1:0-2000000 214352 214355 SR=15,17,9,5,1,2;MQ=57;NM=mping;GT=15,0:Homo;PV=0.00390625;DP=17;TS=1;TE=430;NB=N . +
138 | Chr1:0-2000000 316534 316537 SR=9,14,3,11,0,0;MQ=57;NM=mping;GT=9,0:Heter;PV=0.03125;DP=20;TS=1;TE=430;NB=N . -
139 | Chr1:0-2000000 639972 639975 SR=8,8,4,3,0,1;MQ=51;NM=mping;GT=8,14:Heter;PV=0.9903946;DP=28;TS=1;TE=430;NB=N . +
140 | Chr1:0-2000000 1193504 1193507 SR=9,12,5,7,0,0;MQ=56;NM=mping;GT=9,0:Heter;PV=0.03125;DP=18;TS=1;TE=430;NB=N . +
141 | Chr1:0-2000000 1374936 1374939 SR=8,10,7,2,0,1;MQ=59;NM=mping;GT=8,10:Heter;PV=0.9407654;DP=28;TS=1;TE=430;NB=N . +
142 | test.mping.raw.bed
143 | This is all candidate insertion sites, some of which may be false
144 | test.mping.support.reads.sam and test.mping.support.reads.sorted.bam
145 | This is alignment file of all supportive reads
146 | test.all_reads_aln_ref_and_te.sort.bam
147 | This is alignment file of all reads
148 | commands_rcd
149 | A record of all the command used by itis.pl to identify TE insertions.
150 | test.ref_and_te.fa
151 | the reference sequence, containing genome and mping sequence, used by bwa to align reads.
152 |
153 | *bam, *bed and reference sequence can be visuallized in IGV
154 | If you want to filter the raw insertion list by personalized criteria, you can rerun the script filter_insertion.pl, just as shown in command_rcd
155 |
156 |
157 |
158 | -------------
159 |
160 | ### Report an Issue
161 | If you have any questions or suggestion, please feel free to contact me :chuan-j@foxmail.com or chjiang at sibs.ac.cn
162 | -----------
163 |
164 |
165 |
166 |
167 |
--------------------------------------------------------------------------------
/Seq.pm:
--------------------------------------------------------------------------------
1 | package Seq;
2 | use warnings; use strict;
3 | use Bio::SeqIO;
4 | use Bio::Seq;
5 |
6 | sub seq_hash{
7 | my $file = shift @_;
8 | my $seq_in = Bio::SeqIO -> new (-file => $file,-format => "fasta");
9 | my %hash;
10 | while (my $seq_obj = $seq_in -> next_seq){
11 | my $id = $seq_obj -> id;
12 | my $seq = $seq_obj -> seq;
13 | $hash{$id} = $seq;
14 | }
15 | return %hash;
16 | }
17 |
18 | sub rev_com{
19 | my $seq = shift @_;
20 | (my $seq_com = $seq )=~ tr/ATCGatcg/TAGCtacg/;
21 | my $rev_com = reverse $seq_com;
22 | return $rev_com;
23 | }
24 |
25 | sub translate{
26 | my $seq = shift @_;
27 | my $seq_obj = Bio::Seq -> new (-alphabet=>"dna",-seq => $seq);
28 | my $pro = $seq_obj -> translate;
29 | my $pro_seq = $pro -> seq;
30 | return $pro_seq;
31 | }
32 | 1;
33 |
34 |
--------------------------------------------------------------------------------
/annotate_bed.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 | use Seq;
4 | use Getopt::Std;
5 |
6 | my %opt;
7 |
8 | my $usage = "USAGE:
9 | $0 :
10 | -b bed file of insertion locs {REQUIRED}
11 | -a annotation file in gff3 format { If not provied , only generate snpbatch filt for IGV}
12 | -g merged genome sequence with TE {REQUIRED}
13 | -n the name of your retrotransposon name {REQUIRED}
14 | -p the prefix you project {REQUIRED}
15 | -h print this help infor
16 | -d the directory contain your results Default: Current directory
17 | " ;
18 | die $usage if (@ARGV == 0);
19 | getopts("b:a:g:i:n:p:d:h",\%opt);
20 | die $usage if ($opt{h});
21 | my($bed_file,$gff_file,$genome_file,$te,$proj) = ($opt{b},$opt{a},$opt{g},$opt{n},$opt{p});
22 | my $folder = $opt{d}?$opt{d}:'.';
23 | my $wd = $ENV{'PWD'};
24 |
25 | ################ generate igv batch #############
26 | open FH,"> $folder/$proj.$te.igv.bat" or die $!;
27 | open BED,"$bed_file" or die $!;
28 | print FH "snapshotDirectory $wd/$proj.$te.snap.dir\n";
29 | while (){
30 | chomp;
31 | my ($chr,$s,$e) = split /\t/,$_;
32 | if (($e - $s) > 300){
33 | print STDERR "A large region: $chr,$s,$e\n";
34 | print FH "goto $chr:$s-$e\n";
35 | print FH "snapshot ${chr}_${s}_${e}.png\n";
36 | }else{
37 | my $slop = 150 - ($e - $s);
38 | my $s_p = $s - int($slop/2);
39 | my $e_p = $e + int($slop/2);
40 | print FH "goto $chr:$s_p-$e_p\n";
41 | print FH "snapshot ${chr}_${s}_${e}_slop$slop.png\n";
42 | }
43 | }
44 |
45 | if($gff_file){ # if provided gff file, then process the following code util the end
46 |
47 | ################# generate usefull gff file containg intergenic region ##########
48 |
49 | open OUT, ">$folder/$proj.$te.annotation.tsv" or die $!;
50 | open TEM,">$proj.tem" or die $!;
51 | open GFF,"awk 'BEGIN{IGNORECASE=1} {if(\$3 ~ /gene/){print \$0}}' $gff_file |" or die $!;
52 | chomp (my @gff = );
53 | for (my $i = 1;$i<@gff;$i++){
54 | print TEM "$gff[$i-1]\n";
55 | my $l = $gff[$i-1];
56 | my $n = $gff[$i];
57 | my @las = split /\t/,$l;
58 | my @nos = split /\t/,$n;
59 | if (($nos[3] - $las[4]) > 0 and $nos[0] eq $las[0]){
60 | my $s = $las[4] +1;
61 | my $e = $nos[3] -1;
62 |
63 | my %la;
64 | foreach (split /;/,$las[8]){
65 | my ($k,$v) = split /=/,$_;
66 | $la{$k} = $v;
67 | }
68 |
69 | my %ne;
70 | foreach (split /;/,$nos[8]){
71 | my ($k,$v) = split /=/,$_;
72 | $ne{$k} = $v;
73 | }
74 |
75 | my $la_ge = $la{ID};
76 | my $la_no = $la{Note};
77 |
78 | my $ne_ge = $ne{ID};
79 | my $ne_no = $ne{Note};
80 | print TEM "$las[0]\t$las[1]\tIntergenic\t$s\t$e\t.\t.\t.\tID=${la_ge}_${ne_ge};Note=Intergenic_${la_no}_INSERT_$ne_no\n";
81 | }
82 | }
83 | ##################### put genome in hash##################################
84 |
85 | my %genome = Seq::seq_hash($genome_file);
86 | ##########################################################################
87 |
88 |
89 |
90 | ################### intersect using bedtools ###########################
91 |
92 | open BEDTOOL, "bedtools intersect -a $bed_file -b $proj.tem -wa -wb |" or die $!;
93 |
94 | my $la=0;
95 | my $la_t;
96 | while (){
97 | chomp;
98 | #print "$_\n";
99 | my($chr,$s,$e,$name,$d,$t,$an) = (split /\t/,$_)[0,1,2,3,5,8,14];
100 |
101 | my ($up,$down);
102 |
103 | my $te_seq = ($d =~ /\+/)?$genome{$te}:Seq::rev_com($genome{$te});
104 |
105 | $up = substr ($genome{$chr},$e-1005,1005);
106 | $down = substr ($genome{$chr},$s,1005);
107 |
108 | my $seq = $up.lc($te_seq).$down;
109 |
110 | print OUT "$chr\t$s\t$e\t$name\t$d\t$t\t$an\t$seq\n";
111 |
112 | $la = "$chr:$s:$e";
113 | $la_t = $t;
114 |
115 | }
116 |
117 | unlink "$proj.tem";
118 |
119 | }
120 |
--------------------------------------------------------------------------------
/extract_informative.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 | use Getopt::Std;
4 | use Bio::SeqIO;
5 |
6 | my %opt;
7 | my $usage = "USAGE:
8 | $0
9 | -s sam file of short reads aligned to reference if not specifed , it will read from stdin
10 | -n the ID of TE seq
11 | -p out_put file prefix
12 | -h print this help
13 | -g genome file
14 | " ;
15 |
16 |
17 | die $usage if ( @ARGV == 0);
18 | getopts("s:n:g:p:h",\%opt);
19 | die $usage if ($opt{h});
20 |
21 |
22 |
23 | my $id = $opt{n};
24 | my $pre = $opt{p};
25 | my $sam = $opt{s};
26 | my $genome = $opt{g};
27 |
28 | open my $fh,">$pre.$id.informative.sam" or die $!;
29 | my $seq_in = Bio::SeqIO -> new ( -file => $genome,-format=>"fasta");
30 | my %gid;
31 | while(my $seq = $seq_in -> next_seq){
32 | my $id = $seq -> id;
33 | $gid{$id} =1 ;
34 | }
35 | my %reads; # hash to assistant to detect paired reads in sam file
36 | my @rs; # array contain the pairs of one fragment
37 |
38 | open SAM, $sam or die $!;
39 |
40 |
41 | my $num_infor =0;
42 | while (){ # reading sam file one by one
43 | chomp;
44 | if (/^@/){ # save header
45 | print $fh "$_\n";
46 | next;
47 | }
48 |
49 | my ($title,$chr,$cig,$rnext) = (split /\t/,$_)[0,2,5,6];
50 |
51 | if (! keys(%reads ) or exists ($reads{$title})){ #if have no defined hash %reads or the key of %reads is equal to the $title, then I am reading another pair read
52 | push @rs,$_;
53 | $reads{$title} = "1";
54 | if(eof(SAM)){
55 | print_clu(@rs);
56 | }
57 | }else{
58 |
59 | print_clu(@rs);
60 |
61 | undef(%reads);
62 | $reads{$title} = 1;
63 | @rs = ($_);
64 | print_clu(@rs) if (eof(SAM));
65 | }
66 | }
67 |
68 | if($num_infor == 0){
69 | system ("touch $pre.$id.empty") == 0 or die $!;
70 | #die "NO insertions can be found. Exit!\n";
71 | }
72 |
73 | sub print_clu{
74 | my @va = @_;
75 | my $pt = join "\n",@va; # $pt is ready to print
76 | #my $map_q; # used to check map_q of reads at genome
77 |
78 | my $gboo;
79 | my $tboo;
80 | foreach my $it (@va){
81 | my ($title,$flag,$chr,$mq,$cig,$rnext) = (split /\t/,$it)[0,1,2,4,5,6];
82 | return unless ( $gid{$chr} or $chr eq $id);
83 | return unless ( $gid{$rnext} or $rnext eq $id or $rnext eq "=" );
84 | if($gid{$chr} or $gid{$rnext}){
85 | $gboo = 1;
86 | }
87 | if ($chr eq $id or $rnext eq $id){
88 | $tboo = 1;
89 | }
90 | }
91 | if ($gboo and $tboo){
92 | print $fh "$pt\n" ;
93 | $num_infor ++;
94 | }
95 | }
96 |
97 |
--------------------------------------------------------------------------------
/filter_insertion.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 | use Getopt::Std;
4 |
5 | my $help = "$0
6 | -i : insertion list in bed format
7 | -l : postion list of te homo in ref genome
8 | -n : in the form /t=3/TS=1/TE=1/ , the minimum requried:
9 | t:total reads supporting insertion /3/
10 | CS:clipped reads cover TE start site /0/
11 | CE:clipped reads cover TE end site /0/
12 | cs:cross reads cover TE start /0/
13 | ce:cross reads cover TE end /0/
14 | TS:total reads cover TE start /1/
15 | TE:total reads cover TE end /1/
16 | -q : degault <1>, the minimum required average mapping quality
17 | -d : default <2,200>, the reads depth range
18 | -b : the treshhold of NB tag default : 100;
19 | -h : help message
20 | ";
21 |
22 | die $help unless ( @ARGV);
23 |
24 | my %opt;
25 | getopts("i:l:b:c:n:q:d:r:h",\%opt);
26 | die $help if($opt{h});
27 |
28 | ######## parameters ###########
29 | my $nb = $opt{b}?$opt{b}:100;
30 | my $ins_file = $opt{i};
31 | my $lst = $opt{l};
32 |
33 | my $sr = $opt{n}? $opt{n} : '/t=3/TS=1/TE=1/';
34 | my %paras = parse_sr($sr);
35 |
36 | my $MQ = defined($opt{q})?$opt{q}:1;
37 | my ($DP_L,$DP_H) = $opt{d}?(split ',',$opt{d}):(split ',',"2,200");
38 |
39 | open INS, "$ins_file" or die $!;
40 |
41 |
42 | ## all homo in hash %homos
43 | my %homos;
44 | if($lst){
45 | open LST, $lst or die $!;
46 | while(){
47 | chomp;
48 | my($chr,$s,$e,$te) = split /\t/;
49 | $s = $s - $nb;
50 | $e = $e + $nb;
51 | foreach my $i ($s..$e){
52 | $homos{$te}{$chr}{$i} = 1;
53 | }
54 | }
55 | }
56 |
57 |
58 | while(){
59 |
60 | my $boo = 1;
61 |
62 | my($chr,$s,$e,$t,$rest) = (split /\t/, $_,5);
63 |
64 | my@tags = split /;/, $t;
65 |
66 | ### parse the rest key and values ###
67 | #####################################
68 | my %other;
69 | foreach my $r (@tags){
70 | my($k,$v) = split /=/,$r;
71 | $other{$k} = $v;
72 | }
73 |
74 | # filter support reads number
75 | if(exists $other{SR}){
76 | my($cf,$tot,$r1,$r2,$r3,$r4) = split /,/,$other{SR};
77 |
78 | if($tot < $paras{t} or $r1 < $paras{CS} or $r2 < $paras{CE} or $r3 < $paras{cs} or $r4 < $paras{ce}
79 | or ($r1+$r3) < $paras{TS} or ($r2+$r4) < $paras{TE}){
80 | $boo = 0;
81 | }
82 | }
83 | # filter eveage mapping valeu
84 | if(exists $other{MQ} and $other{MQ} < $MQ){
85 | $boo = 0;
86 | }
87 |
88 | # filter bg depth
89 | if(exists $other{DP} ){
90 | my $dp = $other{DP};
91 | if ($dp < $DP_L or $dp > $DP_H){
92 | $boo = 0;
93 | }
94 | }
95 | # mark ins near known site
96 | if ($lst){
97 | my $near = 0 ;
98 | for my $i ($s..$e){
99 | if ($i ~~ %{$homos{$other{NM}}{$chr}}){
100 | $near = 1;
101 | }
102 | }
103 | if($near){
104 | $t .= ";NB=Y";
105 | }else{
106 | $t .= ";NB=N";
107 | }
108 | }
109 | print join "\t", ($chr,$s,$e,$t,$rest) if $boo;
110 | }
111 |
112 | sub parse_sr{
113 | my $p = shift @_;
114 | my %paras = (
115 | t => 0,
116 | CS => 0,
117 | CE => 0,
118 | cs => 0,
119 | ce => 0,
120 | TS => 0,
121 | TE => 0,
122 | );
123 | my @ps = split /\//,$p;
124 | foreach my $t (@ps){
125 | next unless($t);
126 | my($k,$v) = split /=/,$t;
127 | $paras{$k} = $v;
128 | }
129 | return %paras;
130 | }
131 |
--------------------------------------------------------------------------------
/genotype_caculator.r:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | opts <- as.numeric(commandArgs(trailingOnly = TRUE))
4 | t <-binom.test(opts,p=0.5,alternative="greater")
5 | #t <-binom.test(opts,p=2/3)
6 | p <- t$p.value
7 | cat(p)
8 |
9 |
--------------------------------------------------------------------------------
/identity_inser_sites.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 | use Seq;
4 | use Getopt::Std;
5 | use Bio::AlignIO;
6 |
7 | ########## get parameters #################
8 | my %opts;
9 | my $usage = "$0
10 | -h : help message
11 | -s : informative sam file
12 | -g : genome file
13 | -l : library insertion size
14 | -n : the id of te seq
15 | -a : , allow how many bases can be lost when transpositon,
16 | -p : 'directory/prefix' of your output files, relative to your working direcroty;
17 | this script generate two output files : '\$WD/directory/prefix.ins.loc.lst' and '\$WD/directory/prefix.supported.reads.sam
18 | -r : tnt realian sam
19 | -d : debug mode on
20 | ";
21 |
22 | die "$usage\n" if (@ARGV==0);
23 | getopts("hs:g:l:n:a:p:r:d",\%opts);
24 | if ($opts{h}){ print "$usage"; exit};
25 |
26 | my ($sam_file,$genome_file,$ins_size,$te) = ($opts{s},$opts{g},$opts{l},$opts{n});
27 | my $lost = $opts{a}?$opts{a}:0;
28 | my $sam_te = $opts{r};
29 | my $project = $opts{p};
30 |
31 |
32 |
33 |
34 | ############# put genome seq in hash #########
35 | my %genomes = Seq::seq_hash($genome_file);
36 | my $tnt_len = length ($genomes{$te}); # transposon length
37 |
38 |
39 |
40 | ############# files used to save results ##
41 | #open OUT, ">/dev/stdout" or die $!;
42 | open OUT ,">${project}.$te.ins.loc.lst" or die $!;
43 | open SUPP,">${project}.$te.support.reads.sam" or die $!;
44 | open NSUPP,">${project}.$te.unsupport.reads.sam" or die $!;
45 |
46 | ############ parsing the te realn sam file ##### because TE have LTRs at both end
47 |
48 | my %guanxi = te_aln($sam_te);
49 |
50 |
51 |
52 | ###########################################################
53 | ################ the mainbody of code######################
54 | ###########################################################
55 | ###########################################################
56 | my @aligns = scan_sam ($sam_file);
57 |
58 | =head
59 | foreach my $it ( @aligns){
60 | print "@$it\n" if $$it[0] =~ (/ACB053:89:D260YACXX:7:1107:12510:30746/);
61 | }
62 | =cut
63 |
64 | my $rds;
65 | foreach my $grp (@aligns){ # parse each group of reads
66 | my %cors;
67 | my @hits = @$grp;
68 | $rds = join "\n",@hits;
69 |
70 | my %te_ha = read_2_ha($hits[0] );
71 | $cors{$te} = \%te_ha;
72 | my %chr_ha = read_2_ha($hits[1] );
73 | $cors{tar} = \%chr_ha;
74 |
75 |
76 | #print "THERE :$rds\n";
77 | my $tar_cig = $cors{tar}{cig};
78 | my $te_cig = $cors{$te}{cig};
79 |
80 | my $boo_ns = 0;
81 | if ( $tar_cig =~ /M/ and $te_cig =~ /M/){
82 | $boo_ns = 1 if( cross(%cors));
83 | }
84 | if($te_cig =~ /S/){
85 | $boo_ns = 1 if(te_start(%cors));
86 | }
87 | if($te_cig =~ /E/){
88 | $boo_ns = 1 if(te_end(%cors));
89 | }
90 | if($tar_cig =~ /S/){
91 | $boo_ns = 1 if (ge_start(%cors));
92 | }
93 | if($tar_cig =~ /E/){
94 | $boo_ns = 1 if (ge_end(%cors));
95 | }
96 | print NSUPP "$rds\n" unless($boo_ns);
97 | }
98 | #################### the end of mainbody of code ######################
99 |
100 |
101 |
102 |
103 | ####### sub functions #######
104 | sub te_aln{
105 | my $sam_te = shift @_;
106 |
107 | my %guanxi;
108 | my %te_rcd;
109 | open my $fh, "samtools view -S -X $sam_te|" or die $!;
110 | my $l_seq;
111 | my $l_as;
112 | while (<$fh>){
113 | chomp;
114 | my @ar = (split /\t/,$_,12);
115 | $ar[5] =~ s/H/S/;
116 | my ($id,$flag,$chr,$pos,$cig,$seq,$tags) = @ar[0,1,2,3,5,9,11];
117 |
118 |
119 | my $direc = ($flag =~ /r/)?-1:1;
120 | my ($as) = $tags =~ /AS:i:(\d+)/;
121 |
122 | ## firstly, save the full infor to $l_seq and $l_as
123 | unless($te_rcd{$id}){
124 | $guanxi{$id} = [];
125 | if ($direc == -1){
126 | $l_seq = Seq::rev_com($seq);
127 | }else{
128 | $l_seq = $seq;
129 | }
130 | $l_as = $as;
131 | $te_rcd{$id} = 1;
132 | }
133 | next if ($chr !~ $te);
134 | if($direc == -1){
135 | $seq = Seq::rev_com($l_seq);
136 | }else{
137 | $seq = $l_seq;
138 | }
139 | $ar[9] = "$seq";
140 | my $p = join "\t", @ar;
141 |
142 |
143 | if ($cig =~ /^\d+M$/){
144 | push @{$guanxi{$id}}, $p;
145 | }elsif($cig =~ /^\d+M(\d+)S$/){
146 | if (($tnt_len - (length($l_seq) - $1) +1 - $lost - 10) <= $pos){
147 | push @{$guanxi{$id}} ,$p;
148 | }
149 | }elsif($cig =~ /^\d+S\d+M$/){
150 | if ( $pos <= $lost + 10 ){
151 | push @{$guanxi{$id}} , $p;
152 | }
153 | }
154 | }
155 | return %guanxi;
156 | }
157 |
158 |
159 |
160 | sub scan_sam{ # put the pair reads in to one element of one array @re
161 | my $file = shift @_;
162 | open my $fh , $file or die $!;
163 |
164 | my @re;
165 | while (<$fh>){
166 | chomp;
167 | if (/^@/){
168 | print SUPP "$_\n";
169 | next;
170 | }
171 | my($id,$flag,$chr) = (split /\t/,$_)[0,1,2];
172 | next if ( $chr =~ /$te/);
173 |
174 | my $r = ($flag =~ /1/)?1:2;
175 | $_ =~ s/$id\t/$id:$r\t/;
176 | my $r_a_t = ($r =~ /2/)?1:2;
177 |
178 | if( defined $guanxi{"$id:$r_a_t"}){
179 | foreach my $te_aln (@{$guanxi{"$id:$r_a_t"}}){
180 | #print "$te_aln\n$_\n" if ($id eq "SRR556175.40406632");
181 | push @re, [$te_aln,$_];
182 | }
183 | }
184 | }
185 | return @re;
186 | }
187 |
188 | sub read_2_ha{ # this subroutine used to
189 | my %cors;
190 | my $hit =shift @_;
191 | my($id,$flag,$chr,$pos,$mq,$cig,$nchr,$npos,$seq) = (split /\t/,$hit)[0,1,2,3,4,5,6,7,9];
192 | my $rc = (($flag =~ /r/)? -1:1);
193 | my $cs = cigar($cig);
194 | ($id,my $r) = $id =~ /(.+)\:(\d)$/;
195 | $cors{cig} = $cs;
196 | $cors{direc} = $rc;
197 | $cors{id} = $id;
198 | $cors{pos} = $pos;
199 | $cors{seq} = $seq;
200 | $cors{chr} = $chr;
201 | $cors{mq} = $mq;
202 | #print "$cs\t$rc\t$id\n" if ($id =~ /SRR556175\.40406632/);
203 | return %cors;
204 | }
205 |
206 |
207 | sub cigar {
208 | my $cig = shift @_;
209 | return "Z" if ($cig =~ /\*/);
210 | my $len; # reads length
211 | my $cs; # simplified cigars
212 | my ($m_l); # match length
213 |
214 | while ($cig =~ /(\d+)([MSIH])/g){
215 | my $n = $1;
216 | my $c = $2;
217 | $len += $n;
218 | $m_l += $1 if ($c eq "M");
219 | }
220 | if ($m_l/$len > 95/100 ){ # matcha and mismatch > 0.95 ; it will be considered as totally matched reads
221 | $cs = "M";
222 | }else{
223 | if ( $cig =~ /^(\d+)[SH]/ and $1 >= 5){
224 | $cs .= "S:$1";
225 | }
226 | if ( $cig =~ /(\d+)[SH]$/ and $1 >= 5){
227 | $cs .= "E:$1";
228 | }
229 | }
230 | $cs = ($cs?$cs:"Z");
231 | return $cs;
232 | }
233 |
234 | sub cross {
235 | my %cors = @_;
236 | my $len_tar = length ($cors{tar}{seq});
237 | my $len_te = length ($cors{$te}{seq});
238 | my $len_inter = int($ins_size/2);
239 | if ( $cors{$te}{direc} == 1 and $cors{$te}{pos} > ($tnt_len - $ins_size - $lost)){
240 |
241 | # ---------------------------->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>------------------------------
242 | # s-----> <--------
243 | #
244 | # ----------------------------<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<------------------------------
245 | # --------> <------s
246 | my ($ins_direc,$jun);
247 | if ($cors{tar}{direc} == 1){
248 | $ins_direc = "R";
249 | $jun = $cors{tar}{pos} + $len_tar + $len_inter; # assume the ins site is the end of match of reads at genome
250 | }else{
251 | $ins_direc = "S";
252 | $jun = $cors{tar}{pos} - $len_inter; # assume the ins site at the start of match of read at genome
253 | }
254 | my $te_jun = $cors{$te}{pos} + $len_te;
255 | print OUT "$cors{$te}{id}\t$ins_direc\t$cors{tar}{chr}\t$jun\tCE:$te_jun\t$cors{tar}{mq}\n";
256 | print SUPP "$rds\n";
257 | return 1;
258 | }elsif ( $cors{$te}{direc} == -1 and $cors{$te}{pos} < ($ins_size - $len_te + $lost )){
259 |
260 | # ---------------------------->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>------------------------------
261 | # --------> s<--------
262 | #
263 | # ----------------------------<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<------------------------------
264 | # ------>s <-----
265 | my $len_tar = length ($cors{tar}{seq});
266 | my ($ins_direc,$jun);
267 | if ( $cors{tar}{direc} == 1 ){
268 | $ins_direc = "S";
269 | $jun = $cors{tar}{pos} + $len_tar + $len_inter;
270 | }else{
271 | $ins_direc = "R";
272 | $jun = $cors{tar}{pos} - $len_inter;
273 | }
274 | my $te_jun = $cors{$te}{pos};
275 | print OUT "$cors{$te}{id}\t$ins_direc\t$cors{tar}{chr}\t$jun\tCS:$te_jun\t$cors{tar}{mq}\n";
276 | print SUPP "$rds\n";
277 | return 1;
278 | }else{
279 | print STDERR "error:cross fail:$cors{$te}{id}\tte_direc:$cors{$te}{direc}\tte_pos:$cors{$te}{pos}\ttar_direc:$cors{tar}{direc}\n" if ($opts{d});
280 | }
281 | }
282 |
283 | sub te_start{ # postion limitor in guanxi
284 | my %cors = @_;
285 | if($cors{$te}{cig} =~ /S:(\d+)/ and $cors{$te}{direc} == -1 ){
286 |
287 | # ---------------------------->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>------------------------------
288 | # -------> <-------
289 |
290 | #
291 | # --------------------------- <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<------------------------------
292 | # -------> <-----
293 |
294 | my $l = $1;
295 |
296 | my $que = uc(substr($cors{$te}{seq},0,$l));
297 | ( my $que_r = $que) =~ tr/ATCG/TAGC/;
298 | $que_r = reverse $que_r;
299 | my $ins_direc;
300 | my $sub;
301 | my $chr_t = $cors{tar}{chr};
302 | my $pos_t = $cors{tar}{pos};
303 | my $len_t = length $cors{tar}{seq};
304 | my $jun_te = $cors{$te}{pos};
305 | if($cors{tar}{direc} == 1){
306 | $ins_direc = "S";
307 | $sub = substr($genomes{$chr_t},$pos_t-1,$ins_size);
308 |
309 | my ($diff,@juns) = mat($que,$sub);
310 |
311 | if($diff/$l < 0.05){
312 | #@juns = map { $_ + $pos_t+ $l -1 } @juns;
313 | #my $jun = join ":", @juns;
314 | my $jun = $juns[0] + $pos_t+ $l -1;
315 | my $mark = @juns >1 ?"ts":"TS";
316 | print OUT "$cors{$te}{id}\t$ins_direc\t$chr_t\t$jun\t$mark:$jun_te\t$cors{tar}{mq}\n";
317 | print SUPP "$rds\n";
318 | return 1;
319 | }
320 | }else{
321 | $ins_direc = "R";
322 | my $sub_start = $pos_t-$ins_size +$len_t;
323 | $sub = substr($genomes{$chr_t},$sub_start-1,$ins_size);
324 |
325 | my($diff,@juns) = mat($que_r,$sub);
326 | if($diff/$l < 0.05){
327 | #@juns = map{ $_ + $sub_start} @juns ;
328 | #my $jun = join ":",@juns;
329 | my $jun = $juns[0] + $sub_start;
330 | my $mark = @juns >1 ?"ts":"TS";
331 | print OUT "$cors{$te}{id}\t$ins_direc\t$chr_t\t$jun\t$mark:$jun_te\t$cors{tar}{mq}\n";
332 | print SUPP "$rds\n";
333 | return 1;
334 | }
335 | }
336 |
337 | }else{
338 | print STDERR "error: te_start_direc:$cors{$te}{id}\t$cors{$te}{cig}\t$cors{$te}{direc}\n" if ($opts{d});
339 | }
340 | }
341 |
342 | sub te_end{
343 | my %cors = @_;
344 | if ($cors{$te}{cig} =~ /E:(\d+)/ and $cors{$te}{direc} == 1){
345 |
346 | #---------------------------->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>-------------------------------
347 | # --------> <-------
348 | #
349 | #----------------------------<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<-------------------------------
350 | # ------> <--------
351 | my $l = $1;
352 | my $end_pos = $cors{$te}{pos} + length($cors{$te}{seq}) - $l -1;
353 |
354 | my $que = uc(substr($cors{$te}{seq},-$l));
355 | ( my $que_r = $que) =~ tr/ATCG/TAGC/;
356 | $que_r = reverse $que_r;
357 | my $ins_direc;
358 | my $sub;
359 | my $chr_t = $cors{tar}{chr};
360 | my $pos_t = $cors{tar}{pos};
361 | my $len_t = length $cors{tar}{seq};
362 | if($cors{tar}{direc} == 1){
363 | $ins_direc = "R";
364 | $sub = substr($genomes{$chr_t},$pos_t-1,$ins_size);
365 |
366 | my ($diff,@juns) = mat($que_r,$sub);
367 |
368 | if( $diff/$l <0.05 ){
369 | #@juns = map {$_ + $pos_t + $l -1} @juns;
370 | #my $jun = join ":",@juns;
371 | my $jun = $juns[0] + $pos_t + $l -1;
372 | my $mark = @juns > 1 ? "te":"TE";
373 | print OUT "$cors{$te}{id}\t$ins_direc\t$chr_t\t$jun\t$mark:$end_pos\t$cors{tar}{mq}\n";
374 | print SUPP "$rds\n";
375 | return 1;
376 | }
377 | }else{
378 | $ins_direc = "S";
379 | my $sub_start = $pos_t+$len_t-$ins_size;
380 | $sub = substr($genomes{$chr_t},$sub_start-1,$ins_size);
381 |
382 | my ($diff,@juns) = mat($que,$sub);
383 | if($diff/$l < 0.05 ){
384 | #@juns = map {$_ + $sub_start} @juns;
385 | #my $jun = join ":",@juns;
386 | my $jun = $juns[0] + $sub_start;
387 | my $mark = @juns > 1 ? "te":"TE";
388 | print OUT "$cors{$te}{id}\t$ins_direc\t$chr_t\t$jun\t$mark:$end_pos\t$cors{tar}{mq}\n";
389 | print SUPP "$rds\n";
390 | return 1;
391 | }
392 | }
393 | }else{
394 | print "error te_end_direc:$cors{$te}{id}\t$cors{$te}{cig}\t$cors{$te}{direc} \n"if ($opts{d});
395 | }
396 | }
397 |
398 |
399 |
400 | sub ge_start{
401 | my %cors = @_;
402 | if($cors{tar}{cig} =~ /S:(\d+)/ ){
403 |
404 | my $l = $1;
405 |
406 | my $que = uc(substr($cors{tar}{seq},0,$l));
407 | my $ry = uc(substr($cors{tar}{seq},$l));
408 | ( my $que_r = $que) =~ tr/ATCG/TAGC/;
409 | ( my $ry_r = $ry) =~ tr/ATCG/TAGC/;
410 | $que_r = reverse $que_r;
411 | $ry_r = reverse $ry_r;
412 |
413 | my $sub_h = "NNNNNNNNNN".substr($genomes{$te},0,$l+$lost+10);
414 | my $sub_t = substr($genomes{$te},-($l+$lost+10))."NNNNNNNNNN";
415 |
416 | my %ma_hash = match($que,$que_r,$sub_h,$sub_t);
417 | if (%ma_hash){
418 | if ($ma_hash{3} and $cors{$te}{direc} == 1 and $cors{$te}{pos} > ($tnt_len - $ins_size - $lost - 10 ) ){
419 |
420 | # --------------------------------------->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>-------------------------------------------------
421 | # --------> <---------
422 | #
423 | my $adj = $ma_hash{3}[-1];
424 | my $seq_te = substr($sub_t,$adj+$l);
425 | my $r = 0;
426 | for my $i (0..length($seq_te)){
427 | my $s = substr($seq_te,$i,1);
428 | my $g = substr($ry,$i,1);
429 | if ($s eq $g){
430 | $r = $i +1;
431 | }else{
432 | last;
433 | }
434 | }
435 | my $rr = length($seq_te) - 10 - $r;
436 | #my $mat = substr($sub_t,$adj,$l);
437 | #print OUT "RAW:$mat\t$seq_te\t$que\t$ry\n";
438 | my $ins_direc = "S";
439 | my $jun = $cors{tar}{pos} + $r;
440 | my $te_jun = $tnt_len - $rr;
441 | #my $te_jun = $tnt_len -$lost + $adj + $r;
442 | print OUT "$cors{tar}{id}\t$ins_direc\t$cors{tar}{chr}\t$jun\tGE:$te_jun\t$cors{tar}{mq}\n";
443 | print SUPP "$rds\n";
444 | return 1;
445 | }elsif($ma_hash{-2} and $cors{$te}{direc} == -1 and $cors{$te}{pos} < ($ins_size - length($cors{$te}{seq}) + $lost + 10)) {
446 |
447 | #
448 | # ---------------------------------------<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<-------------------------------------------------
449 | # --------> <---------
450 | my $adj =$ma_hash{-2}[0];
451 | my $seq_te = substr($sub_h,0,$adj);
452 |
453 | #my $mat = substr($sub_h,$adj,$l);
454 | #print OUT "RAW:$mat\t$seq_te\t$que_r\t$ry_r\n";
455 |
456 | my $r = 0;
457 | for my $i (0..length($seq_te)){
458 | my $s = substr($seq_te,-($i+1));
459 | my $g = substr($ry_r,-($i+1));
460 | if($s eq $g){
461 | $r = $i+1;
462 | }else{
463 | last;
464 | }
465 | }
466 | my $rr = length($seq_te) - 10 - $r;
467 | my $ins_direc = "R";
468 | my $jun = $cors{tar}{pos} + $r ;
469 | my $te_jun = $rr +1;
470 | #my $te_jun = $adj + $rr -10 +1;
471 | print OUT "$cors{tar}{id}\t$ins_direc\t$cors{tar}{chr}\t$jun\tGS:$te_jun\t$cors{tar}{mq}\n";
472 | print SUPP "$rds\n";
473 | return 1;
474 | }
475 | }else{
476 | print "error ge_start_mism:$cors{tar}{id}:$que\t$que_r\n"if ($opts{d});
477 | }
478 | }
479 | }
480 |
481 | sub ge_end{
482 | my %cors = @_;
483 | if($cors{tar}{cig} =~ /E:(\d+)/){
484 |
485 | my $l = $1;
486 | my $tar_l = length($cors{tar}{seq});
487 |
488 | my $que = uc(substr($cors{tar}{seq},-$l));
489 | my $ry = uc(substr($cors{tar}{seq},0,$tar_l-$l));
490 | ( my $que_r = $que) =~ tr/ATCG/TAGC/;
491 | ( my $ry_r = $ry) =~ tr/ATCG/TAGC/;
492 | $que_r = reverse $que_r;
493 | $ry_r = reverse $ry_r;
494 |
495 | my $sub_h = "NNNNNNNNNN".substr($genomes{$te},0,$l+$lost+10);
496 | my $sub_t = substr($genomes{$te},-($l+$lost+10))."NNNNNNNNNN";
497 |
498 | my %ma_hash = match($que,$que_r,$sub_h,$sub_t);
499 |
500 | if (%ma_hash){
501 |
502 | #
503 | # ---------------------------------------<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<-------------------------------------------------
504 | # --------> <----------
505 |
506 |
507 | if ( $ma_hash{-3} and $cors{$te}{direc} == 1 and $cors{$te}{pos} > ($tnt_len - $ins_size - $lost -10 )){
508 | my $adj = $ma_hash{-3}[-1];
509 | my $seq_te = substr($sub_t,$adj+$l);
510 | my $r = 0;
511 | for my $i (0..length($seq_te)){
512 | my $s = substr($seq_te,$i,1);
513 | my $g = substr($ry_r,$i,1);
514 | if ($s eq $g){
515 | $r = $i +1;
516 | }else{
517 | last;
518 | }
519 | }
520 | #my $mat = substr($sub_t,$adj,$l);
521 | #print OUT "RAW:$mat\t$seq_te\t$que_r\t$ry_r\n";
522 | my $rr = length($seq_te) - 10 - $r;
523 | my $ins_direc = "R";
524 | my $jun = $cors{tar}{pos} + (length($cors{tar}{seq})-$l) - $r -1 ;
525 | my $te_jun = $tnt_len - $rr;
526 | #my $te_jun = $tnt_len -$lost +$adj + $rr;
527 |
528 | print OUT "$cors{tar}{id}\t$ins_direc\t$cors{tar}{chr}\t$jun\tGE:$te_jun\t$cors{tar}{mq}\n";
529 | print SUPP "$rds\n";
530 | return 1;
531 | }elsif ( $ma_hash{2} and $cors{$te}{direc} == -1 and $cors{$te}{pos} < ($ins_size - length($cors{$te}{seq}) + $lost + 10)){
532 | my $adj = $ma_hash{2}[0];
533 |
534 | # --------------------------------------->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>-------------------------------------------------
535 | # ---------> <---------
536 | #
537 |
538 | my $seq_te = substr($sub_h,0,$adj);
539 |
540 | #my $mat = substr($sub_h,$adj,$l);
541 | #print OUT "RAW:$mat\t$seq_te\t$que\t$ry\n";
542 |
543 | my $r = 0 ;
544 | for my $i (0..length($seq_te)){
545 | my $s = substr($seq_te,-($i+1));
546 | my $g = substr($ry,-($i+1));
547 | if($s eq $g){
548 | $r = $i+1;
549 | }else{
550 | last;
551 | }
552 | }
553 | my $rr = length($seq_te) - $r -10;
554 |
555 | my $ins_direc = "S";
556 | my $jun = $cors{tar}{pos} + (length($cors{tar}{seq}) - $l) - $r -1;
557 | my $te_jun = $rr +1;
558 | #my $te_jun = $adj + $rr -10 +1;
559 |
560 | print OUT "$cors{tar}{id}\t$ins_direc\t$cors{tar}{chr}\t$jun\tGS:$te_jun\t$cors{tar}{mq}\n";
561 | print SUPP "$rds\n";
562 | return 1;
563 | }
564 | }else{
565 | print "error: ge_end_mism:$cors{tar}{id}\t$que\tsub:$sub_t\t$sub_h\n"if ($opts{d});
566 | }
567 | }
568 | }
569 |
570 | sub mat {
571 | my ($que,$sub) = @_;
572 | $que = uc($que);
573 | $sub = uc($sub);
574 | my $q_l = length $que;
575 | my $s_l = length $sub;
576 |
577 | my %ref_dif;
578 | for my $i (0..($s_l-$q_l)){
579 | my $tgt = substr($sub,$i,$q_l);
580 | my $diffcount = () = ($que ^ $tgt) =~ /[^\x00]/g;
581 | $ref_dif{$diffcount} = [] unless(exists $ref_dif{$diffcount});
582 | push @{$ref_dif{$diffcount}}, $i;
583 | }
584 | foreach my $k (sort {$a <=> $b} keys %ref_dif){
585 | my @pos = @{$ref_dif{$k}};
586 | return ($k, @pos); # 0 based return position
587 | last;
588 | }
589 | }
590 |
591 |
592 | # return one hash with key if direction and value is start position of match on subject
593 | sub match {
594 | my %relas;
595 |
596 | for my $j (0..1){
597 | for my $k (2..3){
598 | my $que = $_[$j];
599 | my $sub = $_[$k];
600 | my ($diff,@locs) = mat($que,$sub);
601 | my $ratio = $diff/(length $que);
602 | my $direc = ($j==0?1:-1);
603 |
604 | my $dir_pos = $direc * $k;
605 | $relas{$ratio}{$dir_pos} = [@locs] # head of tail of sequence matched
606 | }
607 | }
608 |
609 | # exact the most similarity region and must be small than 0,05
610 | foreach (sort {$a<=>$b} keys %relas){
611 | if ( $_ <= 0.05){
612 | my $v = $relas{$_};
613 | my %m_h = %$v;
614 | return (%m_h); #### return value SH ST RH RT
615 | }
616 | last;
617 | }
618 | }
619 |
620 | sub count_m {
621 | my $cig = shift @_;
622 | my $num;
623 | while($cig =~ /(\d+)M/g){
624 | $num += $1;
625 | }
626 | return $num;
627 | }
628 |
--------------------------------------------------------------------------------
/itis.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | use warnings; use strict;
3 | use FindBin;
4 | use Getopt::Std;
5 | use File::Basename;
6 | use Time::localtime;
7 |
8 | ######################### parameters #####################
9 | my $usage = "USAGE:
10 | $0
11 | REQUIRED -g the genome sequence file in fasta format
12 | OR -G prefix of bwa-indexed reference file ( genome + transposon)
13 | REQUIRED -t the TE sequence file in fasta format
14 | ### (deprecated)or -T prefix of bwa-indexed transposon sequence file
15 | REQUIRED -l the average length of fragments in library
16 | REQUIRED -N the name of project
17 | REQUIRED -1 the paired read file 1
18 | REQUIRED -2 the paired read file 2
19 |
20 | -f if provided, ITIS will check if TE inserted in gentic or intergeneic region
21 | -F run scripts in 'FAST' mode; It won't align all reads to reference genome,caculate the average bg depth,
22 | and estimate if insertion is homo or heter,[default N]
23 |
24 | ## parameters used with '-F N' :
25 | -B use previous sorted and indexed bam file of reads aligned to reference genome
26 | -d the depth range to filter raw insertion site, [default 2,200]
27 |
28 | -q the minimum average mapping quality of all supporting reads, [default 1]
29 |
30 | -e If reference genome contains this TE or it's homolog. using blast to hard mask these sequence is required, [default N]
31 |
32 | -a the number of bases allowed to be lost when transposing, [defualt 10]
33 |
34 | -b minimum required number of flanking reads , in the format of /Tag=Value/Tag=Value/Tag=Value/ , the avaliable tags:
35 | t: total supporting reads at detected insertion /t=3/
36 | CS:clipped reads cover TE start site /CS=0/
37 | CE:clipped reads cover TE end site /CE=0/
38 | cs:cross reads cover TE start /cs=0/
39 | ce:cross reads cover TE end /cs=0/
40 | TS:total reads cover TE start /TS=1/
41 | TE:total reads cover TE end /TE=1/
42 | [default /t=3/TS=1/TE=1/]
43 | -c cpu number for 'BWA mem', 'samtools view' and 'samtools sort', [defualt 8,2,2]
44 |
45 | -w window size used to cluster supportting reads, [default library_length/2]
46 |
47 | -D use this specifed temperate directory, [default[project].[aStringOfNumbers]]
48 |
49 | -m Only print out all commands to STDERR, [default N]
50 |
51 | -h print this help message
52 |
53 |
54 | eg: perl $0 -g genome.fa -t tnt1.fa -l 300 -N test_run -1 reads.fq1 -2 reads.fq2 -f medicago.gff3
55 |
56 |
57 | ";
58 |
59 | die "$usage\n" if (@ARGV == 0);
60 | my %opt;
61 | getopts("g:t:l:N:1:2:f:b:R:B:d:G:T:c:q:e:a:F:D:w:m:h",\%opt);
62 |
63 | die "$usage\n" if ($opt{h});
64 |
65 | ########===============================
66 |
67 | my $genome = $opt{g};
68 | my $te_seq = $opt{t};
69 |
70 | my $index_te = $opt{T};
71 | my $index_ref = $opt{G};
72 |
73 | my $lib_len = $opt{l};
74 | my $proj = $opt{N};
75 | my $rs1_ori = $opt{1};
76 | my $rs2_ori = $opt{2};
77 | my $gff = $opt{f}? "-a $opt{f}" : " " ;
78 |
79 | my $min_reads = $opt{b}?$opt{b}:"/t=3/TS=1/TE=1/";
80 | my $bam = $opt{B}?$opt{B}:0;
81 | my $depth_range= $opt{d}?$opt{d}:"2,200";
82 | my $cpu = $opt{c}?$opt{c}:"8,2,2";
83 | my($cpu_bwa,$cpu_view,$cpu_sort) = split /,/,$cpu;
84 | my $exists = $opt{e}?$opt{e}:"N";
85 | my $fast = $opt{F}?$opt{F}:"N";
86 | my $tmp_dir = $opt{D}? $opt{D} : "$proj.".time();
87 | my $window = $opt{w}?$opt{w}:$lib_len/2;
88 | my $only_cmd = $opt{m}?$opt{m}:"N";
89 | my $cmd;
90 | my $bindir = "$FindBin::Bin";
91 | my $map_q = $opt{q}?$opt{q}:1;
92 | my $lost = $opt{a}?$opt{a}:10;
93 | my $bwa = "bwa";
94 | ##########################################################
95 |
96 |
97 |
98 | if(-e $tmp_dir){
99 | print "using dir: $tmp_dir\n";
100 | }else{
101 | $cmd = "mkdir $tmp_dir";
102 | system($cmd) == 0 or die $!;
103 | }
104 |
105 | open CMD, ">$tmp_dir/commands_rcd" or die $!;
106 |
107 |
108 | if($index_ref){
109 | $genome = $index_ref;
110 | }else{
111 | ####################### prepare reference #########
112 | if($exists =~ /N/i){
113 | $cmd = "cat $genome $te_seq >$tmp_dir/$proj.ref_and_te.fa";
114 | }else{
115 | $cmd = "perl -I $bindir $bindir/mask_te_homo_in_genome.pl -g $genome -t $te_seq -o $tmp_dir/$proj.ref_and_te.fa";
116 | }
117 |
118 | process_cmd($cmd); # cat sequence together
119 |
120 |
121 | ###################### Index sequence file #######
122 |
123 | $cmd = "$bwa index $tmp_dir/$proj.ref_and_te.fa";
124 | if ( -e "$tmp_dir/$proj.ref_and_te.fa.bwt"){
125 | print STDERR "Seems like the Indexes for merged sequence exists. Skipped\n";
126 | }else{
127 | process_cmd($cmd); # index merged sequence
128 | }
129 |
130 | $index_ref = "$tmp_dir/$proj.ref_and_te.fa";
131 | }
132 | if($index_te){
133 | 1;
134 | }else{
135 | $cmd = "cp $te_seq $tmp_dir/";
136 | my $te_base = basename $te_seq;
137 | process_cmd($cmd); # copy te sequence to tmp/
138 | $cmd = "$bwa index $tmp_dir/$te_base";
139 | process_cmd($cmd); # index te sequence
140 | $index_te = "$tmp_dir/$te_base";
141 | }
142 |
143 |
144 | ##### align original reads to reference genome ######
145 | my $transformtobed_bam ;
146 | if($fast =~ /N/i and $bam == 0){
147 | $cmd = "$bwa mem -T 20 -t $cpu_bwa $index_ref $rs1_ori $rs2_ori 2>/dev/null | samtools view -@ $cpu_view -buS - | samtools sort -@ $cpu_sort - $tmp_dir/$proj.all_reads_aln_ref_and_te.sort";
148 | process_cmd($cmd);
149 | $transformtobed_bam = "-b $tmp_dir/$proj.all_reads_aln_ref_and_te.sort.bam";
150 |
151 | $cmd = "samtools index $tmp_dir/$proj.all_reads_aln_ref_and_te.sort.bam";
152 | process_cmd($cmd);
153 |
154 | }elsif($fast =~ /N/i and $bam){
155 | $transformtobed_bam = "-b $bam";
156 | }elsif($fast =~ /Y/i){
157 | $transformtobed_bam = "" ; # run transform_to_bed/pl withot bam file provided
158 | }
159 |
160 |
161 |
162 | ##### firstly extracting reads aligned at TE#####
163 |
164 | $cmd = "perl -I $bindir $bindir/lean_fq.pl -1 $rs1_ori -2 $rs2_ori -p $tmp_dir/rds_te -i $index_te -c $cpu_bwa ";
165 | process_cmd($cmd);
166 | my $rds = "$tmp_dir/rds_te.fq1 $tmp_dir/rds_te.fq2";
167 |
168 |
169 | ####### align reads associate with TE to the merged reference sequence #######
170 |
171 | $cmd = "$bwa mem -T 20 -v 1 -t $cpu_bwa $index_ref $rds 2>/dev/null |tee $tmp_dir/$proj.ref_and_te.sam | samtools view -@ $cpu_view -buS - | samtools sort -@ $cpu_sort - $tmp_dir/$proj.ref_and_te.sorted";
172 |
173 |
174 | if (-e "$tmp_dir/$proj.ref_and_te.sam"){
175 | print STDERR "Seems alignment file exists. Skipped\n";
176 | }else{
177 | process_cmd($cmd);
178 | }
179 |
180 | $cmd = "samtools index $tmp_dir/$proj.ref_and_te.sorted.bam";
181 | process_cmd($cmd);
182 |
183 | ###### check te seq id in te seq file####
184 |
185 | open TE,$te_seq or die $!;
186 |
187 | my @tes;
188 | while(){
189 | chomp;
190 | if($_ =~ /^>(\S+)/){
191 | push @tes,$1;
192 | }
193 | }
194 |
195 | foreach my $te (@tes){
196 |
197 | ########## extract informative reads from sam file ###############
198 |
199 | $cmd = "perl -I $bindir $bindir/extract_informative.pl -g $genome -s $tmp_dir/$proj.ref_and_te.sam -n $te -p $tmp_dir/$proj ";
200 | process_cmd($cmd);
201 | if ( -e "$tmp_dir/$proj.$te.empty"){
202 | print STDERR "$te no insertions\n";
203 | last;
204 | }
205 |
206 | $cmd = "perl -I $bindir $bindir/modify_informative.pl $tmp_dir/$proj.$te.informative.sam > $tmp_dir/$proj.$te.informative.full.sam";
207 | process_cmd($cmd);
208 | $cmd = "samtools view -buS $tmp_dir/$proj.$te.informative.sam | samtools sort - $tmp_dir/$proj.$te.informative.sorted";
209 | process_cmd($cmd);
210 | $cmd = "samtools index $tmp_dir/$proj.$te.informative.sorted.bam";
211 | process_cmd($cmd);
212 |
213 |
214 | ####### raln the informative reads back to the TE sequence ###########
215 | $cmd = "perl -I $bindir $bindir/te_realin_bwa.pl -n $te -s $tmp_dir/$proj.$te.informative.full.sam -i $index_te -p $tmp_dir/$proj";
216 | process_cmd($cmd);
217 |
218 |
219 | ###### identify the reads support insertion #####
220 | $cmd = "perl -I $bindir $bindir/identity_inser_sites.pl -s $tmp_dir/$proj.$te.informative.full.sam -g $index_ref -l $lib_len -n $te -r $tmp_dir/$proj.$te.alnte.sam -p $tmp_dir/$proj -a $lost";
221 | process_cmd($cmd);
222 |
223 | $cmd = "samtools view -buS $tmp_dir/${proj}.$te.support.reads.sam | samtools sort - $tmp_dir/${proj}.$te.support.reads.sorted ";
224 | process_cmd($cmd);
225 | $cmd = "samtools index $tmp_dir/${proj}.$te.support.reads.sorted.bam " ;
226 | process_cmd($cmd);
227 | ######
228 |
229 | ###### sort the support reads and generate bed files ######
230 | $cmd = "sort -k 3,3 -k 4,4n $tmp_dir/${proj}.$te.ins.loc.lst >$tmp_dir/${proj}.$te.ins.loc.sorted.lst";
231 | process_cmd($cmd);
232 | $cmd = "perl -I $bindir $bindir/transform_to_bed.pl $transformtobed_bam -e $index_te -n $te -p $tmp_dir/$proj.$te -i $tmp_dir/$proj.$te.ins.loc.sorted.lst -l $lib_len -w $window ";
233 | process_cmd($cmd);
234 |
235 | my $par = ($exists =~ /N/i)?" ":"-l $index_ref.list";
236 | $cmd = "perl -I $bindir $bindir/filter_insertion.pl $par -i $tmp_dir/$proj.$te.raw.bed -n $min_reads -q $map_q -d $depth_range >$tmp_dir/$proj.$te.filtered.bed";
237 | process_cmd($cmd);
238 |
239 | ##### Intergrate gene information in GFF and generate IGV snpshot batch file ####
240 | $cmd = "perl -I $bindir $bindir/annotate_bed.pl -b $tmp_dir/$proj.$te.filtered.bed $gff -g $index_ref -n $te -p $proj -d $tmp_dir ";
241 | process_cmd($cmd);
242 | }
243 |
244 | ######
245 |
246 |
247 |
248 |
249 | # subfuctions derived from trinity package
250 | sub process_cmd {
251 | my ($cmd) = @_;
252 |
253 | if($only_cmd =~ /Y/i){
254 | print STDERR "$cmd\n\n";
255 | print CMD " $cmd\n\n";
256 | }else{
257 | print STDERR &mytime."CMD: $cmd\n";
258 | print CMD "$cmd\t#".&mytime."\n";
259 | my $start_time = time();
260 | my $ret = system($cmd);
261 | my $end_time = time();
262 |
263 | if ($ret) {
264 | die "Error, cmd: $cmd died with ret $ret";
265 | }
266 |
267 | print STDERR "CMD finished (" . ($end_time - $start_time) . " seconds)\n";
268 |
269 | return;
270 | }
271 | }
272 |
273 | sub mytime() {
274 | my @mabbr = qw(January February March April May June July August September October November December);
275 | my @wabbr = qw(Sunday Monday Tuesday Wednesday Thursday Friday Saturday);
276 | my $sec = localtime->sec() < 10 ? '0' . localtime->sec() : localtime->sec();
277 | my $min = localtime->min() < 10 ? '0' . localtime->min() : localtime->min();
278 | my $hour = localtime->hour() < 10 ? '0' . localtime->hour() : localtime->hour();
279 | my $wday = $wabbr[localtime->wday];
280 | my $mday = localtime->mday;
281 | my $mon = $mabbr[localtime->mon];
282 | my $year = localtime->year() + 1900;
283 | return "$wday, $mon $mday, $year: $hour:$min:$sec\t";
284 | }
285 |
--------------------------------------------------------------------------------
/lean_fq.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 | use Getopt::Std;
4 |
5 | my $usage = "usage: $0
6 | -1 : read1
7 | -2 : reads
8 | -p : prefix of out put reads
9 | -i : reference index file
10 | -c : cpu number for bwa
11 | -h : print this help
12 |
13 | BWA should in your PATH
14 | ";
15 | die $usage if (@ARGV ==0);
16 | my %opt;
17 | getopts("1:2:p:i:c:h",\%opt);
18 |
19 | my($r1,$r2,$pre,$index,$cpu) = ($opt{1},$opt{2},$opt{p},$opt{i},$opt{c});
20 | my $bwa = "bwa";
21 | open R1,">$pre.fq1" or die $!;
22 | open R2, ">$pre.fq2" or die $!;
23 |
24 | open ALN, "$bwa mem -MT 20 -t $cpu $index $r1 $r2 2>/dev/null | samtools view -XS - |" or die $!;
25 | my %reads;
26 | my $last ;
27 |
28 | while (){
29 | chomp;
30 | next if (/^@/);
31 |
32 | my($id,$flag,$seq,$q) = (split /\t/)[0,1,9,10];
33 |
34 | if($last and $id ne $last){
35 | prt($last);
36 | }
37 | if(eof(ALN)){
38 | prt($id);
39 | }
40 |
41 |
42 | next if ($flag =~ /uU/);
43 | next if ($flag =~ /s/);
44 | my $r = $flag =~ /1/? 1:2;
45 |
46 | if($flag =~ /r/){
47 | $seq = reverse($seq);
48 | $seq =~ tr/atcgATCG/tagcTAGC/;
49 | }
50 | $reads{$id}{$r} =">$id\n$seq\n";
51 | $last = $id;
52 | }
53 |
54 | sub prt{
55 | my $id = shift @_;
56 | my $r1 = $reads{$id}{1};
57 | my $r2 = $reads{$id}{2};
58 | if($r1 and $r2){
59 | print R1 $r1;
60 | print R2 $r2;
61 | }
62 | undef $reads{$id};
63 | }
64 |
65 |
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/mask_te_homo_in_genome.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 | use Seq;
4 | use Bio::SeqIO;
5 | use Getopt::Std;
6 |
7 | my %opt;
8 | getopts("g:t:o:h",\%opt) ;
9 |
10 | die "USAGE $0
11 | -g genome seq file
12 | -t te seq file
13 | -o name of output file with combined sequence (*.fa)
14 | -h help
15 |
16 | " if ( $opt{h});
17 |
18 | # put genome seq in hash
19 | my $seq_in = Bio::SeqIO -> new (-file => $opt{g},-format => "fasta");
20 | my %genome ;
21 | my @order;
22 | while (my $seq_obj = $seq_in -> next_seq){
23 | my $id = $seq_obj -> id;
24 | my $seq = $seq_obj -> seq;
25 | $genome{$id} = $seq;
26 | push @order,$id;
27 | }
28 | # put te seq in hash
29 | my %te = Seq::seq_hash($opt{t});
30 |
31 | # using blast2seq to identify the te homolog
32 | open LIS, ">$opt{o}.list" or die $!;
33 | open BLA, "blastn -query $opt{t} -subject $opt{g} -outfmt 6 |" or die $!;
34 | while(){
35 | chomp;
36 | my ($te,$chr,$s,$e) = (split /\t/,$_)[0,1,8,9];
37 | ($s,$e) = sort {$a<=>$b}($s,$e);
38 | print LIS "$chr\t$s\t$e\t$te\n";
39 | my $l = $e-$s+1;
40 | #print STDERR "substracting...\nalignment $_\n";
41 | substr($genome{$chr},$s-1,$l) = "N"x$l;
42 | }
43 |
44 |
45 |
46 | # put masked genome seq and te seq together
47 | open OUT, ">$opt{o}" or die $!;
48 | foreach my $k ( @order){
49 | print OUT ">$k\n$genome{$k}\n";
50 | }
51 |
52 | foreach my $k(keys %te){
53 | print OUT ">$k\n$te{$k}\n";
54 | }
55 |
--------------------------------------------------------------------------------
/modify_informative.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 | my ($diff_file) = @ARGV;
4 | use Seq;
5 |
6 | my %reads;
7 | open my $fh, "samtools view -h -S -X $diff_file|" or die $!;
8 |
9 | while (<$fh>){
10 |
11 | chomp;
12 | if (/^@/){
13 | print "$_\n";
14 | next;
15 | }
16 |
17 | my ($id,$flag,$chr,$pos,$mq,$cig,$nchr,$npos,$seq) = (split /\t/,$_)[0,1,2,3,4,5,6,7,9];
18 |
19 |
20 | if (! keys %reads or exists $reads{$id}){
21 | push @{$reads{$id}},$_;
22 | if(eof($fh)){
23 | modify(\%reads);
24 | }
25 | }else{
26 |
27 | ################ main body ###########
28 |
29 | modify(\%reads);
30 |
31 |
32 | ################## END ####################
33 | undef(%reads);
34 | push @{$reads{$id}}, $_;
35 |
36 | }
37 | }
38 |
39 | sub modify {
40 | my $f = shift @_;
41 | my %reads = %$f;
42 |
43 | my ($g) = values %reads;
44 | my @hits = @$g;
45 | my %seq_ha;
46 | my %qua_ha;
47 |
48 | foreach my $hit (@hits){
49 | my ($id,$flag,$chr,$pos,$mq,$cig,$nchr,$npos,$dis,$seq,$qua,$keys) = (split /\t/,$hit,12)[0,1,2,3,4,5,6,7,8,9,10,11];
50 | my $r = ($flag =~ /1/?1:2);
51 | if ($cig !~ /H/){
52 | my ($s,$q);
53 | if($flag =~ /r/){
54 | $s = Seq::rev_com($seq);
55 | $q = reverse($qua);
56 | }else{
57 | $s = $seq;
58 | $q = $qua;
59 | }
60 | $seq_ha{$r} = $s;
61 | $qua_ha{$r} = $q;
62 | print "$hit\n";
63 | }else{
64 | my $s;
65 | my $q;
66 | if ($flag =~ /r/){
67 | $s = Seq::rev_com($seq_ha{$r});
68 | $q = reverse($qua_ha{$r});
69 | }else{
70 | $s = $seq_ha{$r};
71 | $q = $qua_ha{$r};
72 | }
73 | $cig =~ s/H/S/g;
74 | print "$id\t$flag\t$chr\t$pos\t$mq\t$cig\t$nchr\t$npos\t$dis\t$s\t$q\t$keys\n";
75 | }
76 | }
77 | }
78 |
--------------------------------------------------------------------------------
/sam_to_fq.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 | while(<>){
4 | my ($id,$seq) = (split /\t/,$_)[0,9];
5 | my $q = "J"x(length($seq));
6 | print "\@$id\n$seq\n\+\n$q\n";
7 | }
8 |
--------------------------------------------------------------------------------
/sw.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 |
3 | #use strict; use warnings;
4 |
5 | die "usage: $0 \\n" unless @ARGV == 2;
6 |
7 | my ($seq1, $seq2) = @ARGV;
8 |
9 | my $MATCH = 1;
10 | my $MISMATCH = -1;
11 | my $GAP = 1;
12 |
13 | my @matrix;
14 |
15 | $matrix[0][0]{score} = 0;
16 | $matrix[0][0]{pointer} = "none";
17 |
18 | for (my $j = 1; $j <= length($seq1); $j++) {
19 | $matrix[0][0]{score} = 0;
20 | $matrix[0][$j]{pointer} = "none";
21 | }
22 |
23 | for (my $i = 1; $i <= length($seq2); $i++) {
24 | $matrix[$i][0]{score} = 0;
25 | $matrix[$i][0]{pointer} = "none";
26 | }
27 |
28 | # fill
29 | my $max_i = 0;
30 | my $max_j = 0;
31 | my $max_score = 0;
32 |
33 | for (my $i = 1; $i <= length($seq2); $i++) {
34 | for (my $j = 1; $j <= length($seq1); $j++) {
35 | my ($diagonal_score, $left_score, $up_score);
36 |
37 | # calculate match score
38 | my $letter1 = substr($seq1, $j-1, 1);
39 | my $letter2 = substr($seq2, $i-1, 1);
40 | if ($letter1 eq $letter2) {
41 | $diagonal_score = $matrix[$i-1][$j-1]{score} + $MATCH; #TODO: this needs to be initialized prior to incrementing
42 | }
43 | else {
44 | $diagonal_score = $matrix[$i-1][$j-1]{score} + $MISMATCH;
45 | }
46 |
47 | # calculate gap scores
48 | $up_score = $matrix[$i-1][$j]{score} + $GAP; #TODO: same as above, need to initialize this value
49 | $left_score = $matrix[$i][$j-1]{score} + $GAP;
50 |
51 | if ($diagonal_score <= 0 and $up_score <= 0 and $left_score <= 0) {
52 | $matrix[$i][$j]{score} = 0;
53 | $matrix[$i][$j]{pointer} = "none";
54 | next; # terminate this iteration of the loop
55 | }
56 |
57 | # choose best score
58 | if ($diagonal_score >= $up_score) {
59 | if ($diagonal_score >= $left_score) {
60 | $matrix[$i][$j]{score} = $diagonal_score;
61 | $matrix[$i][$j]{pointer} = "diagonal";
62 | }
63 | else {
64 | $matrix[$i][$j]{score} = $left_score;
65 | $matrix[$i][$j]{pointer} = "left";
66 | }
67 | }
68 | else {
69 | if ($up_score >= $left_score) {
70 | $matrix[$i][$j]{score} = $up_score;
71 | $matrix[$i][$j]{pointer} = "up";
72 | }
73 | else {
74 | $matrix[$i][$j]{score} = $left_score;
75 | $matrix[$i][$j]{pointer} = "left";
76 | }
77 | }
78 |
79 | # set maximum score
80 | if ($matrix[$i][$j]{score} > $max_score) {
81 | $max_i = $i;
82 | $max_j = $j;
83 | $max_score = $matrix[$i][$j]{score};
84 | }
85 | }
86 | }
87 |
88 | # trace-back
89 | my $align1 = "";
90 | my $align2 = "";
91 |
92 | my $j = $max_j;
93 | my $i = $max_i;
94 |
95 | while (1) {
96 | last if $matrix[$i][$j]{pointer} eq "none";
97 |
98 | if ($matrix[$i][$j]{pointer} eq "diagonal") {
99 | $align1 .= substr($seq1, $j-1, 1);
100 | $align2 .= substr($seq2, $i-1, 1);
101 | $i--; $j--;
102 | }
103 | elsif ($matrix[$i][$j]{pointer} eq "left") {
104 | $align1 .= substr($seq1, $j-1, 1);
105 | $align2 .= "-";
106 | $j--;
107 | }
108 | elsif ($matrix[$i][$j]{pointer} eq "up") {
109 | $align1 .= "-";
110 | $align2 .= substr($seq2, $i-1, 1);
111 | $i--;
112 | }
113 | }
114 |
115 | $align1 = reverse $align1;
116 | $align2 = reverse $align2;
117 |
118 | print "$align1\n";
119 | print "$align2\n";
120 |
--------------------------------------------------------------------------------
/te_realin_bwa.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 | use Seq;
4 | use Getopt::Std;
5 |
6 | my %opt;
7 |
8 | my $usage = "USAGE $0
9 | -n te seq ID in fasta
10 | -s full informative sam of TE
11 | -i full path bwa index of te sequence
12 | -h print this help
13 | -p prefix of you output file
14 | " ;
15 |
16 | die "$usage\n" if (@ARGV == 0);
17 | getopts("n:s:i:p:h",\%opt);
18 | die "$usage\n" if ($opt{h});
19 |
20 | my ($te,$file) = ($opt{n},$opt{s});
21 | my $bwa = "bwa";
22 |
23 | open my $fh, $file or die $!;
24 | my $tmp = "tmp_bwa".time();
25 | open my $out, ">$tmp" or die $!;
26 |
27 | while (<$fh>){ # open full informative sam file
28 | chomp;
29 | next if (/^@/);
30 | my ($id,$flag,$chr,$seq) = (split /\t/,$_)[0,1,2,9];
31 | (my$r) = $flag =~ /(\d)/;
32 | print "ERRO:::$id\n" unless $seq;
33 | $seq = Seq::rev_com($seq) if ( $flag =~ /r/);
34 | #print "$id\t$seq\n" if ( $flag =~ /r/);
35 | if ($chr =~ /$te/ ){
36 | my $q = "J"x(length($seq));
37 | print $out "\@$id:$r\n$seq\n\+\n$q\n";
38 | }
39 | }
40 |
41 | my $o = $opt{p};
42 |
43 | system ("$bwa mem -T 20 $opt{i} $tmp -a 2>/dev/null >$o.$te.alnte.sam" )== 0 or die $!;
44 |
45 | system("rm -rf $tmp") == 0 or die $!;
46 |
47 |
--------------------------------------------------------------------------------
/test_dir/sample_data.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Chuan-Jiang/ITIS/833f5893d0e3f2fa6e36fd1246c8c4046d078f50/test_dir/sample_data.tar.gz
--------------------------------------------------------------------------------
/transform_to_bed.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 | use Getopt::Std;
4 | use Seq;
5 | use FindBin;
6 |
7 | ############## parameters ##############
8 | my %opt;
9 | my $usage = "$0
10 | -p : project_name/ will be prefix of your output file
11 | -i : reads list file support insertion
12 | -w : default 100, windows size to cluster reads
13 | -n : the name of you te seq'
14 | -l : the lenght of library insertion
15 | -b : original bam file; used to calculate the bg depth; bam file should been sorted and indexed
16 | All reads should been aligned to mergered reference genome file with bwa mem
17 |
18 | Sub parameter:
19 | -e : the .fa file of TE
20 | -h : help
21 | -d : swithh on debug model
22 | ";
23 |
24 | my $bindir="$FindBin::Bin";
25 |
26 | die $usage if (@ARGV == 0);
27 | getopts("p:i:b:w:n:l:e:hd",\%opt);
28 | die $usage if ($opt{h});
29 |
30 | my $proj = $opt{p};
31 |
32 | my $ins_file = $opt{i};
33 |
34 |
35 | my $lib_l = $opt{l}?$opt{l}:500;
36 |
37 | my $window = $opt{w}?$opt{w}:$lib_l/2;
38 |
39 | my $bam = $opt{b}?$opt{b}:0;
40 |
41 | my $te_file = $opt{e};
42 | my %te_ha = Seq::seq_hash($te_file);
43 |
44 | my $te = $opt{n};
45 |
46 | my $db = $opt{d};
47 |
48 | ###############parameters################
49 | #########################################
50 | #########################################
51 |
52 |
53 |
54 | open INS,$ins_file or die $!;
55 | open OUT_R, ">$proj.raw.bed" or die $!;
56 | open OUT_BG, ">$proj.bg.sam" or die $!;
57 |
58 | open HEA,"samtools view -H $bam |";
59 | print OUT_BG $_ while ();
60 | close HEA;
61 |
62 | ###################################
63 | ####### read through ins.loc.lst###
64 | ###################################
65 | ###################################
66 |
67 | # to get this two variables:
68 | my @lsts;
69 | my @tsds;
70 | # this two variables will be used in the final step
71 |
72 | my %rcder;
73 | my @clus;
74 | my %dirs;
75 | while(){ # iterate the lst file and get tsd information and candidate insertions
76 | chomp;
77 | my ($id,$dir,$chr,$pos,$ty) = split /\t/;
78 |
79 | my $win_s;# determine using which window step
80 |
81 | if(! %rcder ){
82 | rcd_it($_,$chr,$pos,$ty,$dir);
83 | next;
84 | }
85 |
86 | # determine the window step to cluster reads
87 | if ($ty=~ /C/ and $rcder{ty} =~ /C/){
88 | $win_s = $window ;
89 | }elsif($ty =~ /G|T/ and $rcder{ty} =~ /G|T/){
90 | $win_s = 50 ;
91 | }else{
92 | $win_s = $window;
93 | }
94 | # get a cluter of support reads
95 |
96 | if(($chr eq $rcder{chr}) and ($pos - $rcder{pos} <= $win_s)){
97 | rcd_it($_,$chr,$pos,$ty,$dir);
98 |
99 | if(eof(INS)){
100 | my($tsd,$inf) = collect_infor(\@clus,\%dirs);
101 | print "CLU:@clus\n" if $db;
102 | push @tsds,$tsd if $tsd != -999;
103 | push @lsts, $inf;
104 | }
105 | }else{
106 | my($tsd,$inf) = collect_infor(\@clus,\%dirs);
107 | print "CLU:@clus\n" if $db;
108 | push @tsds,$tsd if $tsd != -999;
109 | push @lsts, $inf;
110 |
111 | undef(%dirs);
112 | undef(%rcder) ;
113 | undef(@clus);
114 | rcd_it($_,$chr,$pos,$ty,$dir);
115 | if (eof(INS)){
116 | print "CLU:@clus\n" if $db;
117 | my($tsd,$inf) = collect_infor(\@clus,\%dirs);
118 | push @tsds,$tsd if $tsd != -999;
119 | push @lsts, $inf;
120 | }
121 | }
122 | }
123 | close( INS );
124 |
125 | sub rcd_it{
126 | my($it,$chr,$pos,$ty,$d) = @_;
127 | push @clus, $it;
128 | $dirs{$d} ++;
129 | $rcder{chr} = $chr;
130 | $rcder{pos} = $pos;
131 | $rcder{ty} = $ty;
132 | }
133 |
134 | ######### the final step to print each candidate#######
135 | # 1, used esimated tsd length to determine the exact insertion site
136 | #
137 | # 2,calculate the bg depth and estimate Genome Type if BAM file is used #
138 | ##########################
139 |
140 |
141 | # get the most of tsd length
142 | my $tsd_l ;
143 | if(@tsds){
144 | $tsd_l = mode(@tsds);
145 | $tsd_l++;
146 | print STDERR "Estimate the length of TSD is $tsd_l bp\n";
147 | $tsd_l--;
148 | }else{
149 | $tsd_l = 1;
150 | print STDERR "Can't determine the length TSD, because there are no clipped aligned reads\n";
151 | }
152 |
153 |
154 | # check each candidate insertion
155 | for my $it ( @lsts){ # iterate to integrate other information
156 |
157 | my($chr,$ss,$ee,$in_ha_ref,$sc,$clp_s,$clp_e,$crs_s,$crs_e,$tags,$t_s,$t_e,$dir) = @$it;
158 | if($dir eq "."){
159 | print STDERR "Can't determine the direction of insertion at $chr\t$ss\t$sc\n";
160 | next ; # if a cluster of reads have different diretion, discarded
161 | }
162 |
163 |
164 | my $sign = $dir eq "+"? 1:-1;
165 |
166 | ### esitimate the exact insertion site ####
167 | my ($s_p,$e_p);
168 | if($clp_s and $clp_e and $sign*($ss-$ee) == $tsd_l){
169 | ($s_p,$e_p) = ($ss,$ee);
170 | }elsif($clp_s >= 1 ){
171 | ($s_p,$e_p) = ($ss,$ss-$sign*$tsd_l);
172 | }elsif($clp_e >= 1 ){
173 | ($s_p,$e_p) = ($ee+$sign*$tsd_l,$ee);
174 | }else{
175 | ($s_p,$e_p) = deter_ord($ss,$ee);
176 | }
177 |
178 | if($sign == 1){
179 | $e_p --;
180 | }elsif($sign == -1){
181 | $s_p --;
182 | }
183 | ($s_p,$e_p) = sort {$a <=> $b} ($s_p,$e_p);
184 |
185 | ## if bam exists, calculate bg depth and esimate homo or hetero
186 | if ($bam){
187 |
188 | ###### esitmate ratio #####
189 | my ($sup,$nsup,$noise) = ("NA","NA","NA");
190 | ($sup,$nsup) = estimate_homo($chr,$s_p,$e_p,$in_ha_ref,$dir,$t_s,$t_e) if($clp_s or $clp_e);
191 | #
192 | #$in_ha_ref : supportive reads
193 | #
194 |
195 | my $pva;
196 | if($sup =~ /NA/ or $nsup =~ /NA/){
197 | $pva = "NA";
198 | }else{
199 | my $half = int($sup / 2 + 1);
200 | $pva = `Rscript $bindir/genotype_caculator.r $half $nsup`;
201 | if($?){
202 | print " Calculate genome type pvalue error : $sup $nsup!\n";
203 | }
204 | }
205 | my $gt;
206 | if($pva eq "NA"){
207 | $gt = "NA";
208 | }elsif($pva >= 0.01){
209 | $gt="Heter";
210 | }elsif($pva < 0.01){
211 | $gt="Homo";
212 | }
213 | $tags .= ";GT=$sup,$nsup:$gt;PV=$pva";
214 | ######## pick the bg depth ######
215 |
216 | my $pad = int((100- ($e_p-$s_p))/2);
217 | my ($s_r,$e_r)=$pad>0 ? ($s_p-$pad,$e_p+$pad):($s_p,$e_p);
218 |
219 | $s_r = 0 if $s_r < 0;
220 |
221 | open SV,"samtools depth $bam -r $chr:$s_r-$e_r 2>/dev/null| " or die $!;
222 | my $dep;
223 | my $m_l;
224 | while(){
225 | my ($id,$pos,$d)= (split /\t/,$_)[0,1,2];
226 | $dep += $d;
227 | $m_l++;
228 | }
229 | close SV;
230 | if( $m_l and $dep){
231 | $dep = int($dep/$m_l);
232 | }else{
233 | $dep = 0;
234 | }
235 | $tags .= ";DP=$dep";
236 | }
237 | print "$chr\t$s_p\t$e_p\t$tags\t.\t$dir\n" if $db;
238 | print OUT_R "$chr\t$s_p\t$e_p\t$tags;TS=$t_s;TE=$t_e\t.\t$dir\n";
239 | }
240 |
241 | ######################
242 | #######################
243 |
244 |
245 |
246 |
247 |
248 | # the aim of this subroutine is to collect the information of candidate sites
249 | sub collect_infor{ # use a cluster of support reads to determine if it is a ture TE insertion
250 |
251 | my @clu = @{$_[0]};
252 | my %dirs = %{$_[1]};
253 |
254 |
255 | ## use ratio to determine the direction of insertion
256 | my $dir;
257 | if ($dirs{S} and $dirs{R}){
258 | if($dirs{S}/($dirs{R}+$dirs{S}) > 0.8){
259 | $dir = "+";
260 | }elsif( $dirs{R}/($dirs{R}+$dirs{S}) > 0.8){
261 | $dir = "-";
262 | }else{
263 | $dir = ".";
264 | }
265 | }else{
266 | ($dir) = keys %dirs;
267 | $dir = ($dir eq "S")?"+":"-";
268 | }
269 | # direction is now in variable $dir
270 |
271 | ### determine the insertion site
272 | my $sc = scalar @clu; # $sc : the number of support reads
273 |
274 | my @sit_s; # exact start site
275 | my @sit_e; # exact end site
276 | my @rou_s; # uncertain start site
277 | my @rou_e; # uncertain end site
278 | my @te_s; # the start site of junction at te
279 | my @te_e; # the end site of junction at te
280 |
281 | my %in_ha; # used to store read id
282 | my ($id,$d,$chr,$pos,$ty);
283 | my $map_q;
284 | foreach my $pos (@clu){
285 | ($id,$d,$chr,$pos,$ty,my $mq) = split /\t/,$pos;
286 | if ($ty =~ /(GS:|TS:)(-?\d+)/ ){
287 | push @sit_s,$pos;
288 | push @te_s, $2;
289 | }elsif ($ty =~ /(GE:|TE:)(-?\d+)/ ){
290 | push @sit_e,$pos;
291 | push @te_e, $2;
292 | }elsif( $ty =~ /CS/ or $ty =~ /ts/){
293 | push @rou_s,$pos;
294 | }elsif( $ty =~ /CE/ or $ty =~ /te/){
295 | push @rou_e,$pos;
296 | }
297 | $in_ha{$id} .= $ty;
298 | $map_q += $mq;
299 | }
300 | $map_q = int($map_q/@clu);
301 |
302 | my $num_fg = keys %in_ha; # the total num of read pairs suppor insertion
303 |
304 | ### collet exact insert pos
305 |
306 | my $ss; # site in genome cover TE start
307 | if (@sit_s){
308 | $ss = mode(@sit_s);
309 | }elsif(@rou_s) {
310 | $ss = median(@rou_s);
311 | }else{
312 | $ss = 0;
313 | }
314 |
315 | my $ee; # site in genome cover TE end
316 | if (@sit_e){
317 | $ee = mode(@sit_e);
318 | }elsif (@rou_e){
319 | $ee = median(@rou_e);
320 | }else{
321 | $ee = 0;
322 | }
323 |
324 | my $t_s; # start site at TE
325 | if(@te_s){
326 | $t_s = mode(@te_s);
327 | }else{
328 | $t_s = "NA";
329 | }
330 | my $t_e; # end site at TE
331 | if(@te_e){
332 | $t_e = mode (@te_e);
333 | }else{
334 | $t_e = "NA";
335 | }
336 |
337 | ##################
338 | my($clp_s,$clp_e,$crs_s,$crs_e) = (scalar@sit_s,scalar@sit_e,scalar@rou_s,scalar@rou_e);
339 | my $SR = join ",", $num_fg,$sc,$clp_s,$clp_e,$crs_s,$crs_e; # in the order of 'Reads support Start and End'. Fragment suported Start and End
340 |
341 | my $tsd_l;
342 | if($dir eq "+" and @sit_s and @sit_e){
343 | $tsd_l = $ss- $ee;
344 | }elsif($dir eq "-" and @sit_s and @sit_e){
345 | $tsd_l = $ee - $ss;
346 | }else{
347 | $tsd_l = -999;
348 | }
349 | return($tsd_l,[$chr,$ss,$ee,\%in_ha,$sc,$clp_s,$clp_e,$crs_s,$crs_e,"SR=$SR;MQ=$map_q;NM=$te",$t_s,$t_e,$dir]);
350 | }
351 |
352 | # SR : counts of total and every type of supporting reads
353 | # MQ : the average mapping quality
354 | # NM : the name of te
355 | # TS : the start site of te
356 | # TE : the end site of te
357 |
358 |
359 |
360 | sub deter_ord{
361 | my ($aa,$bb) = @_;
362 | my ($ar,$br);
363 | if ($aa * $bb == 0){ # if juction of one side is determined
364 | $ar = ($aa != 0)?abs($aa):abs($bb);
365 | $br = ($bb != 0)?abs($bb):abs($aa);
366 | return($ar,$br);
367 | }else{
368 | ($ar,$br) = sort {$a <=> $b} (abs($aa),abs($bb));
369 | return ($ar,$br);
370 | }
371 | }
372 |
373 |
374 |
375 | sub mode {
376 | my @nums = @_;
377 | my %ha;
378 | foreach my $i (@nums){
379 | $ha{$i} ++;
380 | }
381 | my ($mode) = (sort {$ha{$a} <=> $ha{$b}} keys %ha)[-1];
382 | return $mode;
383 | }
384 |
385 |
386 |
387 | sub median {
388 | my @num = @_;
389 | @num = sort {$a <=> $b} @num;
390 | my $len = scalar @num;
391 | my $me = $num[int($len/2)];
392 | return $me;
393 | }
394 |
395 |
396 |
397 | sub estimate_homo { # check each read pair around the candidate insert sites
398 | my($chr,$s_r,$e_r,$in_ha_ref,$dir,$t_s,$t_e) = @_;
399 | # $s_r 1 based start site of insertion
400 | # $e_r 1 based end site of insertion
401 | $t_s = 0 if ($t_s eq "NA");
402 | $t_e = length($te_ha{$te}) if ($t_e eq "NA");
403 |
404 | print "\nEstimate:$chr\t$s_r\t$e_r\n" if $db;
405 | my %in_ha = %$in_ha_ref;
406 | my $sam_s = $s_r - $lib_l;
407 | my $sam_e = $e_r + $lib_l;
408 |
409 | $sam_s = 0 if $sam_s < 0;
410 |
411 | if($db){
412 | while(my($k,$v) = each %in_ha){
413 | print "SUPP: $k\t$v\n";
414 | }
415 | }
416 | ###iterate each read pair
417 | my %reads;
418 | print "SAMVIEW:samtools view -h -X $bam $chr:$sam_s-$sam_e\n" if $db;
419 | open my $sam, "samtools view -h -X $bam $chr:$sam_s-$sam_e | " or die $!;
420 | while(<$sam>){
421 | chomp;
422 | if (/^@/){
423 | next;
424 | }
425 | my $r = $_;
426 | my ( $id,$tag,$chr,$pos,$mq,$cig,$nchr,$npos,$tlen,$seq) = (split /\t/,$r)[0,1,2,3,4,5,6,7,8,9];
427 | print "RD: $r\n" if $db;
428 | #next if ($mq == 0);
429 |
430 | next if ($tag =~ /u/);
431 | $reads{$id} = 9;
432 |
433 | # 1 : support insertion
434 | # 2 : support excision
435 | # 3 : flank reads
436 | # 4 : discarded reads
437 |
438 | if (exists $in_ha{$id}){
439 | $reads{$id} = 1;
440 | }elsif($tlen < 0 and abs($tlen) < 2*$lib_l and $cig =~ /(\d+)S$/ and $1 >= 20){
441 | ####alined reads with soft clipped ends to test if it support insertion###
442 | my $l = $1;
443 | my $que = substr($seq,-$l);
444 | my $sub;
445 | if($dir eq "+"){
446 | $sub = substr($te_ha{$te},$t_s-1,$l+5);
447 | }else{
448 | $sub = Seq::rev_com(substr($te_ha{$te},$t_e-$l-5,$l+5));
449 | }
450 | #print "$id\t$que\t$sub\n";
451 | if(check_te($que,$sub)){
452 | #print STDERR "Have find a new support reads $r\n";
453 | $reads{$id} = 1 ;
454 | }
455 | }elsif( $tlen > 0 and abs($tlen) < 2*$lib_l and $cig =~ /^(\d+)S/ and $1 >= 20){ # at least 20 bp soft clipped to check if it is from TE
456 | ####aligned reads with sfor clipped ends to test if it support insertion###
457 | my $l = $1;
458 | my $que = substr($seq,0,$l);
459 | my $sub;
460 | if($dir eq "+"){
461 | $sub = substr($te_ha{$te},$t_e-$l-5,$l+5);
462 | }else{
463 | $sub = Seq::rev_com(substr($te_ha{$te},$t_s-1,$l+5));
464 | }
465 | #print "$id\t$que\t$sub\n" if ( check_te($que,$sub));
466 | if( check_te($que,$sub)){
467 | #print STDERR "Have find a new support reads $r\n";
468 | $reads{$id} = 1;
469 | }
470 | }else{
471 | if($nchr eq "=" and $tlen != 0 and abs($tlen) < 2*$lib_l ){
472 |
473 | my @range; # determine the range of pair
474 |
475 | if($tlen > 0){
476 | @range = ($pos,$pos+$tlen-1);
477 | }elsif($tlen < 0){
478 | @range = ($npos,$npos-$tlen-1) ;
479 | }
480 |
481 | # check if the range overlap with TE insertion site
482 | if ($s_r >= $range[0] + 20 and $range[1] >= $e_r + 20 and $reads{$id} >= 2 ){
483 | $reads{$id} = 2;
484 | print OUT_BG "$r\n";
485 | }
486 | }
487 | }
488 | }
489 | my $sup = 0;
490 | my $nsup = 0;
491 | my (@sup_r,@nsup_r) if $db;
492 | while(my ($k,$v) = each %reads){
493 | if($v == 1){
494 | push @sup_r, $k if $db;
495 | $sup++;
496 | }elsif($v == 2){
497 | push @nsup_r, $k if $db;
498 | $nsup ++;
499 | }
500 | }
501 | if($db){
502 | print "SUP: @sup_r\n";
503 | print "NSUP: @nsup_r\n";
504 |
505 | }
506 | return ($sup,$nsup);
507 |
508 | }
509 |
510 |
511 | sub check_te{
512 | my ($que,$sub) = @_;
513 | $que = uc($que);
514 | $sub = uc($sub);
515 | my $q_l = length $que;
516 | my $s_l = length $sub;
517 |
518 | for my $i (0..($s_l-$q_l)){
519 | my $tgt = substr($sub,$i,$q_l);
520 | my $diffcount = () = ($que ^ $tgt) =~ /[^\x00]/g;
521 | return 1 if ($diffcount/$q_l <= 0.1);
522 | }
523 | return 0;
524 | }
525 |
526 | =head
527 | sub com_pos {
528 | my ($fir,$sec,$win) = @_;
529 | if($fir != /:/ and $sec != /:/){
530 | return ($fir,$sec) if($fir - $sec < $win);
531 | }else{
532 | my @fir = split /:/, $fir;
533 | my @sec = split /:/, $sed;
534 | }
535 | # ready to use
536 | }
537 |
538 |
539 |
540 |
541 |
542 |
--------------------------------------------------------------------------------
/utilities/G_test.r:
--------------------------------------------------------------------------------
1 | # Log-likelihood tests of independence & goodness of fit
2 | # Does Williams' and Yates' correction
3 | # does Monte Carlo simulation of p-values, via gtestsim.c
4 | #
5 | # G & q calculation from Sokal & Rohlf (1995) Biometry 3rd ed.
6 | # TOI Yates' correction taken from Mike Camann's 2x2 G-test fn.
7 | # GOF Yates' correction as described in Zar (2000)
8 | # more stuff taken from ctest's chisq.test()
9 | #
10 | # V3.3 Pete Hurd Sept 29 2001. phurd@ualberta.ca
11 |
12 | g.test <- function(x, y = NULL, correct="williams",
13 | p = rep(1/length(x), length(x)), simulate.p.value = FALSE, B = 2000)
14 | #can also use correct="none" or correct="yates"
15 | {
16 | DNAME <- deparse(substitute(x))
17 | if (is.data.frame(x)) x <- as.matrix(x)
18 | if (is.matrix(x)) {
19 | if (min(dim(x)) == 1)
20 | x <- as.vector(x)
21 | }
22 | if (!is.matrix(x) && !is.null(y)) {
23 | if (length(x) != length(y))
24 | stop("x and y must have the same length")
25 | DNAME <- paste(DNAME, "and", deparse(substitute(y)))
26 | OK <- complete.cases(x, y)
27 | x <- as.factor(x[OK])
28 | y <- as.factor(y[OK])
29 | if ((nlevels(x) < 2) || (nlevels(y) < 2))
30 | stop("x and y must have at least 2 levels")
31 | x <- table(x, y)
32 | }
33 | if (any(x < 0) || any(is.na(x)))
34 | stop("all entries of x must be nonnegative and finite")
35 | if ((n <- sum(x)) == 0)
36 | stop("at least one entry of x must be positive")
37 | #If x is matrix, do test of independence
38 | if (is.matrix(x)) {
39 | #Test of Independence
40 | nrows<-nrow(x)
41 | ncols<-ncol(x)
42 | if (correct=="yates"){ # Do Yates' correction?
43 | if(dim(x)[1]!=2 || dim(x)[2]!=2) # check for 2x2 matrix
44 | stop("Yates' correction requires a 2 x 2 matrix")
45 | if((x[1,1]*x[2,2])-(x[1,2]*x[2,1]) > 0)
46 | {
47 | x[1,1] <- x[1,1] - 0.5
48 | x[2,2] <- x[2,2] - 0.5
49 | x[1,2] <- x[1,2] + 0.5
50 | x[2,1] <- x[2,1] + 0.5
51 | }
52 | else
53 | {
54 | x[1,1] <- x[1,1] + 0.5
55 | x[2,2] <- x[2,2] + 0.5
56 | x[1,2] <- x[1,2] - 0.5
57 | x[2,1] <- x[2,1] - 0.5
58 | }
59 | }
60 |
61 | sr <- apply(x,1,sum)
62 | sc <- apply(x,2,sum)
63 | E <- outer(sr,sc, "*")/n
64 | # are we doing a monte-carlo?
65 | # no monte carlo GOF?
66 | if (simulate.p.value){
67 | METHOD <- paste("Log likelihood ratio (G-test) test of independence\n\t with simulated p-value based on", B, "replicates")
68 | tmp <- .C("gtestsim", as.integer(nrows), as.integer(ncols),
69 | as.integer(sr), as.integer(sc), as.integer(n), as.integer(B),
70 | as.double(E), integer(nrows * ncols), double(n+1),
71 | integer(ncols), results=double(B), PACKAGE= "ctest")
72 | g <- 0
73 | for (i in 1:nrows){
74 | for (j in 1:ncols){
75 | if (x[i,j] != 0) g <- g + x[i,j] * log(x[i,j]/E[i,j])
76 | }
77 | }
78 | STATISTIC <- G <- 2 * g
79 | PARAMETER <- NA
80 | PVAL <- sum(tmp$results >= STATISTIC)/B
81 | }
82 | else {
83 | # no monte-carlo
84 | # calculate G
85 | g <- 0
86 | for (i in 1:nrows){
87 | for (j in 1:ncols){
88 | if (x[i,j] != 0) g <- g + x[i,j] * log(x[i,j]/E[i,j])
89 | }
90 | }
91 | q <- 1
92 | if (correct=="williams"){ # Do Williams' correction
93 | row.tot <- col.tot <- 0
94 | for (i in 1:nrows){ row.tot <- row.tot + 1/(sum(x[i,])) }
95 | for (j in 1:ncols){ col.tot <- col.tot + 1/(sum(x[,j])) }
96 | q <- 1+ ((n*row.tot-1)*(n*col.tot-1))/(6*n*(ncols-1)*(nrows-1))
97 | }
98 | STATISTIC <- G <- 2 * g / q
99 | PARAMETER <- (nrow(x)-1)*(ncol(x)-1)
100 | PVAL <- 1-pchisq(STATISTIC,df=PARAMETER)
101 | if(correct=="none")
102 | METHOD <- "Log likelihood ratio (G-test) test of independence without correction"
103 | if(correct=="williams")
104 | METHOD <- "Log likelihood ratio (G-test) test of independence with Williams' correction"
105 | if(correct=="yates")
106 | METHOD <- "Log likelihood ratio (G-test) test of independence with Yates' correction"
107 | }
108 | }
109 | else {
110 | # x is not a matrix, so we do Goodness of Fit
111 | METHOD <- "Log likelihood ratio (G-test) goodness of fit test"
112 | if (length(x) == 1)
113 | stop("x must at least have 2 elements")
114 | if (length(x) != length(p))
115 | stop("x and p must have the same number of elements")
116 | E <- n * p
117 |
118 | if (correct=="yates"){ # Do Yates' correction
119 | if(length(x)!=2)
120 | stop("Yates' correction requires 2 data values")
121 | if ( (x[1]-E[1]) > 0.25) {
122 | x[1] <- x[1]-0.5
123 | x[2] <- x[2]+0.5
124 | }
125 | else if ( (E[1]-x[1]) > 0.25){
126 | x[1] <- x[1]+0.5
127 | x[2] <- x[2]-0.5
128 | }
129 | }
130 | names(E) <- names(x)
131 | g <- 0
132 | for (i in 1:length(x)){
133 | if (x[i] != 0) g <- g + x[i] * log(x[i]/E[i])
134 | }
135 | q <- 1
136 | if (correct=="williams"){ # Do Williams' correction
137 | q <- 1+(length(x)+1)/(6*n)
138 | }
139 | STATISTIC <- G <- 2*g/q
140 | PARAMETER <- length(x) - 1
141 | PVAL <- pchisq(STATISTIC, PARAMETER, lower = FALSE)
142 | }
143 | names(STATISTIC) <- "Log likelihood ratio statistic (G)"
144 | names(PARAMETER) <- "X-squared df"
145 | names(PVAL) <- "p.value"
146 | structure(list(statistic=STATISTIC,parameter=PARAMETER,p.value=PVAL,
147 | method=METHOD,data.name=DNAME, observed=x, expected=E),
148 | class="htest")
149 | }
--------------------------------------------------------------------------------
/utilities/average_depth_for_temp.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 | my $dep = 0;
5 | while(<>){
6 | chomp;
7 | my @ar = split /\t/;
8 | my ($chr,$s,$e,$p,$d) = @ar[0,1,2,6,7];
9 | my $len = $e - $s;
10 | if($p == $len){
11 | $dep += $d;
12 | my $ever = $dep/$p;
13 | print "$chr:$s-$e\t$ever\n";
14 | $dep = 0;
15 | }else{
16 | $dep += $d;
17 | }
18 | }
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/utilities/cluster_for_venn_diff_tools.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 |
5 | my %rcd;
6 | while(<>){
7 | chomp;
8 | my($chr,$s,$t) = (split /\t/, $_)[0,1,3];
9 | if($t =~ /^(\S+?),/){
10 | $t = $1;
11 | }
12 |
13 | $t = "ITIS" if ($t =~ /SR/);
14 | $rcd{$chr}{$s}{$t} = 1;
15 | }
16 |
17 | my %lst;
18 | foreach my $chr(keys %rcd){
19 | my $st = $rcd{$chr};
20 | my %sub_ha = %$st;
21 |
22 | my $l_k=0;
23 | foreach my $k( sort {$a<=>$b} keys %sub_ha){ # $k contain the name of chromosome
24 | my $v = $sub_ha{$k};
25 | my @ts = keys (%$v);
26 | if ($k - $l_k < 1000){
27 | foreach my $t (@ts){
28 | push @{$lst{$t} }, "$chr:$l_k";
29 | }
30 | }else{
31 | foreach my $t ( @ts){
32 | push @{$lst{$t} }, "$chr:$k";
33 | }
34 | $l_k = $k;
35 | }
36 | }
37 | }
38 |
39 | foreach my $tool ( keys %lst){
40 | my @locs = @{$lst{$tool}};
41 | my $num = @locs;
42 | print STDERR "#$tool\t$num\n";
43 | print "$tool\t".(join "\t", @locs) . "\n";
44 | }
45 |
46 |
47 |
--------------------------------------------------------------------------------
/utilities/count.gt.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 | my ($su,$nsu);
5 | while(<>){
6 | chomp;
7 | my($aa,$bb) = $_ =~ /GT=(\d+),(\d+)/;
8 | $su += $aa;
9 | $nsu += $bb;
10 | }
11 | print "$su\t$nsu\n";
12 |
13 |
--------------------------------------------------------------------------------
/utilities/count_fts_hits.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | use warnings; use strict;
3 | use File::Basename;
4 |
5 | my($fts_f,$bed_list_f) = @ARGV;
6 |
7 |
8 | open BED, "$bed_list_f" or die $!;
9 | chomp(my @bed_list = );
10 | foreach my $bed (@bed_list){
11 | my $fn = basename($bed);
12 | system ("enlarge_bed.pl $bed >$fn.f1000.bed" ) == 0 or die $!;
13 | system ("bedtools getfasta -fi ~/jiangchuan/genome/medicago/JCVI.clean.fa -bed $fn.f1000.bed -fo $fn.f1000.fa") == 0 or die $!;
14 | open BLA, ("blastn -evalue 1e-10 -query $fts_f -subject $fn.f1000.fa -outfmt 6 | " ) or die $1;
15 | my %list;
16 | while (){
17 | chomp;
18 | my($id) = (split /\t/,$_)[0];
19 | $list{$id} = 1;
20 | }
21 | my @k = keys %list;
22 | print $fn."\t".scalar@k."\n";
23 | }
24 |
25 |
--------------------------------------------------------------------------------
/utilities/count_hit_at_gene_flank_inter.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 | my $file = shift @ARGV;
5 |
6 |
7 | open GENE, "bedtools window -w 0 -a ~/jiangchuan/genome/medicago/Mt4.0v1_genes_20130731_1800.gff3 -b $file | " or die $!;
8 |
9 | my %hits;
10 |
11 | while (){
12 | my@ar = split /\t/;
13 | if ($ar[2] =~ /exon/){
14 | $hits{$ar[9],$ar[10],$ar[11]}{0} = "exon";
15 | next;
16 | }elsif($ar[2] =~ /gene/){
17 | $hits{$ar[9],$ar[10],$ar[11]}{0} = "gene";
18 | }
19 | }
20 |
21 | count_hit("-r 2500 -l 0","r25");
22 | count_hit("-l 2500 -r 0 ","l25");
23 | #count_hit("-l 5000","l50");
24 | #count_hit("-r 5000","r50");
25 |
26 |
27 | my ($exon,$intron,$l25,$r25,$inter) ;
28 | open RE, "$file" or die $!;
29 | while(){
30 | my@ar = split /\t/;
31 | my $va = $hits{$ar[0],$ar[1],$ar[2]};
32 | unless($va){
33 | $inter ++;
34 | }else{
35 | my %h = %$va;
36 | #print keys %h;
37 | #print ": $ar[0],$ar[1],$ar[2]\n";
38 | foreach my $k ( sort {$a <=> $b} keys %h){
39 | my $v = $h{$k};
40 | if($v =~ /exon/){
41 | $exon ++;
42 | }elsif($v =~ /gene/){
43 | $intron ++;
44 | }elsif($v =~ /l25/){
45 | $l25 ++;
46 | }elsif($v =~ /r25/){
47 | $r25 ++;
48 | }
49 | last;
50 | }
51 | }
52 | }
53 | print "$exon,$intron,$l25,$r25,$inter\n";
54 |
55 |
56 | sub count_hit{
57 | my($pa,$v) = @_;
58 | open IN , "bedtools window $pa -a ~/jiangchuan/genome/medicago/Mt4.0v1_genes_20130731_1800.gff3 -b $file |" or die $!;
59 | while (){
60 | my $dis;
61 | my@ar = split /\t/;
62 | if($ar[2] =~ /gene/ and ! exists $hits{$ar[9],$ar[10],$ar[11]}{0} ){
63 | my $h = $ar[3] - $ar[11];
64 | my $t = $ar[10] - $ar[4];
65 | $dis = $h>0?$h:$t;
66 | die if ($dis <0);
67 | $hits{$ar[9],$ar[10],$ar[11]}{$dis}= "$v";
68 | }
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/utilities/count_line.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 | my @files = @ARGV;
5 |
6 | foreach my $f(@files){
7 | open my $hd, $f or die $!;
8 |
9 | my @lines = <$hd>;
10 | print "$f\t".scalar @lines."\n";
11 | }
12 |
13 |
--------------------------------------------------------------------------------
/utilities/count_true_and_false.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 | my($ref_list,$list) = @ARGV;
5 |
6 | my $ref_ln;
7 | open REF, $ref_list or die $!;
8 | $ref_ln ++ while([);
9 |
10 | my $ln;
11 | open LIS, $list or die $!;
12 | $ln++ while ();
13 |
14 |
15 | my($t_500,$e_500) = count(500);
16 | my($t_100,$e_100) = count(100);
17 |
18 | print "$ref_ln\t$ln\t$t_500\t$t_100\t$e_100\n";
19 |
20 | sub count{
21 | my $w = shift @_;
22 |
23 | open WIN, " bedtools window -a $ref_list -w $w -b $list | " or die $!;
24 |
25 | my $tot_over = 0;
26 | my $tot_exact = 0;
27 | my $rec=0;
28 | while (){
29 | chomp;
30 | my($s_r,$e_r,$d_r,$s,$e,$d) = (split /\t/,$_)[1,2,3,5,6,9];
31 | if($s != $rec){
32 | my $sign = ($d_r == 1)? "\\+":"\\-";
33 | if ($d =~ /^$sign/ or $d =~ /NA|\./){
34 | $tot_over ++;
35 | if($s_r == $s and $e_r == $e){
36 | $tot_exact ++;
37 | }
38 | }
39 | }else{
40 | next;
41 | }
42 | $rec = $s;
43 | }
44 | return("$tot_over","$tot_exact");
45 | }
46 |
47 |
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/utilities/enlarge_bed.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | use warnings; use strict;
3 |
4 | die "$0 bed_file window_size\n" unless @ARGV;
5 |
6 | open BED, shift @ARGV or die $!;
7 | my $w = shift @ARGV;
8 | while(){
9 | chomp;
10 | my($chr,$s,$e,$rest) = split (/\t/,$_,4);
11 | my $dis = $e-$s+1;
12 | if($dis<$w){
13 | my $pad = int(($w-$dis+1)/2);
14 | $s -= $pad;
15 | $e += $pad;
16 | }
17 | print "$chr\t$s\t$e\t$rest\n";
18 | }
19 | print "chr1\t1\t4\t.\t.\t.\n";
20 |
21 |
--------------------------------------------------------------------------------
/utilities/enlarge_bed_for_meme.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 | my($file,$step) = @ARGV;
4 |
5 | open my $fh, "$file " or die $!;
6 |
7 | while (<$fh>){
8 | my ($chr,$s,$e,$dir) = split /\t/ , $_, 4 ;
9 | if ($e - $s == 5){
10 | $s -= $step;
11 | $e += $step;
12 | print "$chr\t$s\t$e\t$dir";
13 | }
14 | }
15 |
16 |
--------------------------------------------------------------------------------
/utilities/filter_TEMP.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 | while(<>){
5 | chomp;
6 | my($p,$s,$e) = (split /\t/, $_)[5,12,13];
7 | my $boo = 1;
8 | if ($p ne "1p1"){
9 | $boo = 0;
10 | }elsif($s+$e <3){
11 | $boo = 0;
12 | }
13 |
14 | print "$_\n" if ($boo);
15 | }
16 |
17 |
18 |
--------------------------------------------------------------------------------
/utilities/flanking_primer.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 | my($ins_bed,$sup_bam)= @ARGV;
5 |
6 | open INS, $ins_bed or die $!;
7 | while(){
8 | chomp;
9 | my ($chr,$s,$e,$ann) = split /\t/;
10 | $e++;
11 |
12 | if($ann =~ /GT=NA/){
13 | $s = $s - 100 ;
14 | $e = $e + 100;
15 | }
16 |
17 | ### find the forward primer
18 | open SAM, "samtools view $sup_bam $chr:$s-$e |" or die $!;
19 | print "samtools view $sup_bam $chr:$s-$e \n";
20 | my @reads = ;
21 | unless (@reads){
22 | print "$_\tNA\tNA\n";
23 | }else{
24 | my $fir = shift @reads;
25 | my $las = pop @reads;
26 |
27 | my ($l5,$p5) = ext_seq(5,$fir);
28 | my ($l3,$p3) = ext_seq(3,$las);
29 |
30 | if($l5 >= $s){
31 | $p5 = "NA" ;
32 | }
33 |
34 | if($l3 <= $s){
35 | $p3 = "NA";
36 | }else{
37 | $p3 = reverse($p3);
38 | $p3 =~ tr/ATGC/TACG/;
39 | }
40 | print "$_\t$p5\t$p3\n";
41 | }
42 | }
43 |
44 | sub ext_seq{
45 | my ($ty,$aln) = @_;
46 | my @arr = split /\t/, $aln;
47 | my $pos = $arr[3];
48 | my $cig = $arr[5];
49 | my $seq = $arr[9];
50 | my $seq_r = $seq;
51 | if ($ty == 5 and $cig =~ /(\d+)S$/){
52 | $seq_r = substr($seq,0,-$1);
53 | }elsif($ty == 3 and $cig =~ /^(\d+)S/){
54 | $seq_r = substr($seq,$1-1);
55 | }
56 | return ($pos,$seq_r);
57 | }
58 |
--------------------------------------------------------------------------------
/utilities/generate_snp_batch_from_bed.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 | unless (-e "snap_shot"){
5 | print "snapshotDirectory ./snap_shot\n" ;
6 | }else{
7 | die "dir exists\n";
8 | }
9 |
10 | while (<>){
11 | chomp;
12 | my ($chr,$s,$e) = split /\t/,$_;
13 | if (($e - $s) > 300){
14 | die "$chr,$s,$e,haha\n";
15 | }else{
16 | my $slop = 150 - ($e - $s);
17 | my $s_p = $s - int($slop/2);
18 | my $e_p = $e + int($slop/2);
19 | print "goto $chr:$s_p-$e_p\n";
20 | print "snapshot ${chr}_${s}_${e}_slop$slop.png\n";
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/utilities/genotype_calculator.r:
--------------------------------------------------------------------------------
1 | paras <- as.numeric(commandArgs(trailingOnly = T))
2 |
3 | p <- paras[3]
4 | num <- paras[c(1,2)]
5 |
6 | test <- binom.test(num,p=p)
7 | pv <- test$p.value
8 |
9 | Q <- -10*log10(pv)
10 |
11 | cat(Q)
12 |
--------------------------------------------------------------------------------
/utilities/get_seq_for_logo.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 |
--------------------------------------------------------------------------------
/utilities/gt_ratio.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 | while(<>){
5 | chomp;
6 | if($_ =~ /GT=(\d+),(\d+)/){
7 | print "$1\t$2\n";
8 | }
9 | }
10 |
11 |
--------------------------------------------------------------------------------
/utilities/part_in_venn.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 |
5 |
--------------------------------------------------------------------------------
/utilities/perfomance_itis.r:
--------------------------------------------------------------------------------
1 | svg("recoverate_on_simu.svg")
2 |
3 |
4 | itis <- read.table("itis_sort",as.is=T,row.names=1)
5 | relocTE <- read.table("relocate_sort",as.is=T,row.names=1)
6 | retroseq <- read.table("retro_sort",as.is=T,row.names=1)
7 | temp <- read.table("TEMP_sort",as.is=T,row.names=1)
8 | tif <- read.table("tif_sort",as.is=T,row.names=1)
9 |
10 | total <- matrix(c(itis[,2],relocTE[,2],retroseq[,2],temp[,2],tif[,2]),ncol=5)
11 |
12 | overlap <- matrix(c(itis[,3],relocTE[,3],retroseq[,3],temp[,3],tif[,3]),ncol=5)
13 | exact_overlap <- matrix(c(itis[,4],relocTE[,4],retroseq[,4],temp[,4],tif[,4]),ncol=5)
14 |
15 | nm <- c("ITIS","RelocaTE","RetroSeq","TEMP","TIF")
16 |
17 | bp <- barplot(total,beside=T,col="red",names.arg=nm,main="Recover Rate of Insertins Using Simulated Data")
18 | barplot(overlap,beside=T,add=T,col="blue")
19 | barplot(exact_overlap,beside=T,col="green",add=T)
20 |
21 | abline(h=52,lty="13")
22 | text(2,54,labels="REF:52")
23 | legend("topleft",legend=c("Total","Overlap 100bp","Exact overlap"),fill=c("red","blue","green"),bty="n")
24 |
25 | mtext(side=1,cex=0.7,text= c("3x","5x","10x","15x","20x","25x"),at=bp,las=2)
26 |
27 | dev.off()
28 |
--------------------------------------------------------------------------------
/utilities/plot_cov.r:
--------------------------------------------------------------------------------
1 | itis <- read.table("nf54_itis_counts.txt",as.is=T,row.names=2)
2 | tif <- read.table("nf54_tif_counts.txt",as.is=T,row.names=2)
3 | retro <- read.table("nf54_retro_counts.txt",as.is=T,row.names=2)
4 | reloc <- read.table("nf54_reloca_counts.txt",as.is=T,row.names=2)
5 |
6 |
7 | xpos = c(3,5,10,15,20,25,30,40,50,60,70,80,90,100)
8 |
9 | plot(x=xpos,y=itis[,1],xlim=c(0,100),type="b",col="blue",xlab="Coverage",ylab="No. of Insertion",main="No. of Insertion under different coverage",xaxp=c(0,100,10),yaxp=c(0,100,10),axes=F)
10 |
11 | lines(x=xpos,y=tif[,1],type="b",col="red")
12 | lines(x=xpos,y=retro[,1],type="b",col="green")
13 | lines(x=xpos,y=reloc[,1],type="b",col="black")
14 | box()
15 | axis(side=1,at=xpos,labels=F,tcl=-0.3)
16 | axis(side=2,at=seq(10,100,by=10))
17 |
18 | text(xpos,par("usr")[3]-1,srt=-45,adj=0,labels=paste(xpos,rep("x",14)),xpd=T,cex=0.8)
19 |
20 | legend("topleft",legend=c("ITIS","TIF","retroSeq","relocaTE"),col=c("blue","red","'green","black"),lty=1)
21 |
22 |
23 |
--------------------------------------------------------------------------------
/utilities/plot_ins_matri.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 | my ($ref,@rest) = @ARGV;
5 |
6 | open REF, $ref or die $!;
7 |
8 | my %ref; # save each point in hash, value is the insert position
9 | my %pri; # key is the insert position, value is the support information
10 | my @rec; # save the order of insert position;
11 | while(][){
12 | chomp;
13 | my ($chr,$s,$e,$t) = split /\t/, $_;
14 |
15 | foreach my $i ( $s..$e){
16 | $ref{$chr}{$i} = "$chr:$s:$e";
17 | }
18 | $pri{"$chr:$s:$e"} = $t;
19 | push @rec,"$chr:$s:$e";
20 | }
21 |
22 | close(REF);
23 |
24 | my %other;
25 | for my $i (@rest){ # process other file
26 | open OT, $i or die $!;
27 | my %tem_ha; # temperate hash for save information for individual file
28 | while(){
29 | chomp;
30 | my ($c,$s,$e,$t) = split /\t/, $_;
31 | my $site; # the position in the ref list
32 | for my $j ($s..$e){
33 | if (exists $ref{$c}{$j}){
34 | $site = $ref{$c}{$j};
35 | last;
36 | }
37 | }
38 | $tem_ha{$site} = "$t" if ($site); # correspond the type with site in ref
39 | }
40 | $other{$i} = \%tem_ha; # save in %other hash
41 | }
42 |
43 | my @list;
44 | print (join "\t",("POS",$ref,@rest,"\n") );
45 | for my $i (@rec){ # $i is the insert position in ref file
46 | my $p = "$i\t$pri{$i}"; # the support record in ref file
47 | for my $j (@rest){ # extract support record in other files
48 | my %ha = %{$other{$j}}; # dereference of hash
49 | $p .= $ha{$i}? "\t$ha{$i}":"\tNA"; # print
50 | }
51 | print "$p\n";
52 | }
53 |
54 |
--------------------------------------------------------------------------------
/utilities/plot_venn_medi.r:
--------------------------------------------------------------------------------
1 | library("VennDiagram")
2 | c <- readLines("count_ins_tools.tsv")
3 | cl <- strsplit(c,split="\t")
4 | nm <- lapply(cl,function(x){x <- x[1]})
5 | cl <- lapply(cl,function(x){x <- x[-1]})
6 | names(cl) <- nm
7 |
8 |
9 |
10 | #venn.diagram(cl,cat.cex=0.6,margin=0.2,filename="venn.png",imagetype="png",fill=rainbow(5))
11 | venn.diagram(cl,cat.cex=0.7,cat.dist=c(0.1,0.08,0.1,0.1,0.08),alpha = 0.50,margin=0.2,filename="venn.png",imagetype="png",fill=rainbow(5), cat.fontface = "bold")
12 |
--------------------------------------------------------------------------------
/utilities/random_10000_insertion.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 | use Bio::SeqIO;
4 | # this scripts use to randomly insert 10000; to calculate the distributuon
5 | #
6 | #
7 | my $genome_file = shift @ARGV;
8 | my $seq_in = Bio::SeqIO -> new (-file => $genome_file, -format => "fasta");
9 |
10 | my @chrs;
11 | my %geno;
12 | my $tot;
13 | while(my $seq_obj = $seq_in -> next_seq()){
14 | my $id = $seq_obj -> id();
15 | my $len = $seq_obj -> length();
16 |
17 | push @chrs,$id;
18 | $tot += $len;
19 | $geno{$id} = $tot;
20 | }
21 |
22 |
23 | srand(100);
24 | my @nums;
25 | for (1..10000){
26 | push @nums ,int(rand($tot-5));
27 | }
28 | @nums = sort {$a <=> $b} @nums;
29 |
30 |
31 |
32 | my $pp = 0;
33 |
34 | LOOP: foreach my $id (@chrs){
35 |
36 | while(1){
37 | if(@nums == 0){
38 | last;
39 | }elsif($nums[0] <= $geno{$id}-5){
40 | my $first = $nums[0] - $pp;
41 | my $last = $first + 5;
42 | print "$id\t$first\t$last\n";
43 | shift @nums;
44 | }else{
45 | $pp = $geno{$id}- 1;
46 | next LOOP;
47 | }
48 | }
49 | }
50 |
51 |
52 |
53 |
54 |
55 |
--------------------------------------------------------------------------------
/utilities/simulate_TE_inser.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | use warnings; use strict;
3 | use Bio::SeqIO;
4 | use Seq;
5 | use Bio::Seq;
6 |
7 | my($genome_file,$te_file) = @ARGV;
8 | my $seq_in = Bio::SeqIO -> new(-file=>$genome_file,-format=>"fasta");
9 | my $seq_out = Bio::SeqIO -> new(-file=> ">$genome_file.sim.fa",-format=> "fasta");
10 |
11 | my %te_ha = Seq::seq_hash($te_file);
12 |
13 | my $seed = 10;
14 | my $step = 4e5;
15 | #my $step= 8e6; # simulate 52 insertion
16 |
17 | srand(100);
18 | while(my $inseq = $seq_in -> next_seq){
19 | my $len = $inseq -> length;
20 | my $seq = $inseq -> seq;
21 | my $id = $inseq -> id;
22 | my $num = int($len/$step+1);
23 |
24 | my @ins_site;
25 | for(my $i=0; $i<$num;$i++){
26 | my $r = int(rand($len));
27 | my $tsd = substr($seq,$r-300,600);
28 | if($tsd =~ /n/i){
29 | redo;
30 | }else{
31 | push @ins_site,$r;
32 | }
33 | }
34 |
35 | my $pre = 0;
36 | my @frags;
37 | my $seq_p;
38 | foreach my $s ( sort {$a<=>$b} @ins_site){
39 | my $sr = (rand(1)<0.5)?"-1":"1";
40 |
41 | my ($te,$teid) = ran_te(\%te_ha);
42 | print "$id\t".$s."\t".($s+5)."\t$sr:$teid\n";
43 | if($sr eq "-1"){
44 | $te = reverse($te);
45 | $te =~ tr/ATCG/TAGC/;
46 | }
47 |
48 | my $l = $s - $pre;
49 | my $f = substr($seq,0,$l+5);
50 | $seq = substr($seq,$l);
51 | $seq_p .= $f.lc($te);
52 | $pre = $s;
53 | }
54 |
55 | $seq_p .= $seq;
56 | $inseq ->seq($seq_p);
57 | $seq_out -> write_seq($inseq);
58 |
59 | }
60 |
61 | sub ran_te {
62 | my $ref = shift @_;
63 | my @keys = keys (%{$ref});
64 | my $n = scalar @keys;
65 | my $r = int(rand($n));
66 | my $rseq = $$ref{$keys[$r]};
67 | return($rseq,$keys[$r]);
68 | }
69 |
--------------------------------------------------------------------------------
/utilities/trans_relocate_2_bed.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 |
5 | while (<>){
6 | chomp;
7 | next if (/^#/);
8 |
9 | my( $chr,$s,$e,$d) = (split /\t/,$_)[0,3,4,6];
10 | $s++;
11 | $e += 2;
12 | print "$chr\t$s\t$e\tRelocaTE\t.\t$d\n";
13 |
14 | }
15 |
16 |
--------------------------------------------------------------------------------
/utilities/trans_retro_2_bed.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 | while(<>){
5 | chomp;
6 | my($chr,$s,$e,$d) = (split /\t/)[0,1,2,5];
7 | print "$chr\t$s\t$e\tRetroSeq\t.\t$d\n";
8 | }
9 |
10 |
--------------------------------------------------------------------------------
/utilities/trans_temp_2_bed.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 | while(<>){
5 | chomp;
6 | my($chr,$s,$e,$d,$type,$head,$tail) = (split /\t/,$_)[0,1,2,4,5,12,13];
7 |
8 | next unless ($s =~ /\d/);
9 | my $boo = 1;
10 | if ($type ne "1p1"){
11 | $boo =0;
12 | }elsif($head+$tail <3){
13 | $boo = 0;
14 | }
15 |
16 | $s --;
17 | $d = ($d=~ /antisense/)?"-":"+";
18 | print "$chr\t$s\t$e\tTEMP\t.\t$d\n" if ($boo);
19 | }
20 |
21 |
--------------------------------------------------------------------------------
/utilities/trans_tif_2_bed.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use warnings; use strict;
3 |
4 | while(<>){
5 | chomp;
6 | my($chr,$start,$end,$res) = split(/\t/,$_,4);
7 | $res =~ s/\t/,/g;
8 | my $d;
9 | if ($res =~ /forward/){
10 | $d = "+";
11 | }else{
12 | $d = "-";
13 | }
14 |
15 | ($start,$end) = sort{$a<=>$b} ($start,$end);
16 | $start --;
17 | print "$chr\t$start\t$end\tTIF,$res\t.\t$d\n";
18 | }
19 |
20 |
--------------------------------------------------------------------------------
]