├── .gitattributes ├── README.md ├── ascii2num ├── bam2fasta.pl ├── calc_pop_divs.pl ├── calc_vcf_diversity.pl ├── convert_fastq_quality.pl ├── convert_trf.pl ├── count_vcf_pairs.pl ├── detect_mutations.pl ├── downsample_vcf.pl ├── extract_bam_pairs.pl ├── extract_split_seqs.pl ├── fasta_process.pl ├── gff2fasta.pl ├── gff2tables.pl ├── ggplot-liner-fit.R ├── ggplot-windows-count.R ├── ggplot-windows-multi.R ├── intervals2bed ├── map_pos2intervals.pl ├── map_records.pl ├── maskedSEQ2bed.pl ├── paintGenomeBlocks.pl ├── parallel_baseml.pl ├── parallel_codeml.pl ├── parseSEQs.pl ├── plot-scatter_with_lines.R ├── reference_align.pl ├── rename_fasta.pl ├── sam2fastq.pl ├── site_perl └── MyPerl │ ├── Align.pm │ ├── Compare.pm │ ├── Convert.pm │ ├── FileIO.pm │ ├── README.md │ ├── Statistics.pm │ └── Vcf.pm ├── subtotal_stats.pl ├── vcf2tables.pl └── vcf_process.pl /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /ascii2num: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # ascii2num.pl -- Convert ASCII codes to numbers. 4 | # 5 | # Author: Nowind 6 | # Created: 2010-09-29 7 | # Updated: 2015-06-11 8 | # Version: 1.0.0 9 | # 10 | # Change logs: 11 | # Version 1.0.0 15/06/11: The initial version. 12 | 13 | use strict; 14 | 15 | use Bio::SeqIO; 16 | 17 | 18 | ######################## Main ######################## 19 | 20 | my $CMDLINE = "perl $0 @ARGV"; 21 | my $VERSION = '1.0.0'; 22 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 23 | 24 | my ($input) = @ARGV; 25 | 26 | 27 | unless( $input ) { 28 | print < 35 | 36 | EOF 37 | 38 | exit(1); 39 | } 40 | 41 | 42 | my @ascii_codes = split //, $input; 43 | my @numbers = (); 44 | for (@ascii_codes) 45 | { 46 | push @numbers, ord($_); 47 | } 48 | 49 | ###print STDOUT join "\t", @ascii_codes, "\n"; 50 | print STDOUT join "\t", @numbers, "\n"; 51 | 52 | -------------------------------------------------------------------------------- /bam2fasta.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # bam2fasta.pl -- Convert aligned sequences from a SAM format file 4 | # 5 | # 6 | # Author: Nowind 7 | # Created: 2012-02-21 8 | # Updated: 2016-02-25 9 | # Version: 1.0.0 10 | # 11 | # Change logs: 12 | # Version 1.0.0 16/02/25: The initial version. 13 | 14 | 15 | 16 | 17 | use strict; 18 | 19 | use Data::Dumper; 20 | use Getopt::Long; 21 | 22 | use MyPerl::FileIO qw(:all); 23 | 24 | ##################### Main #################### 25 | 26 | 27 | my $CMDLINE = "perl $0 @ARGV"; 28 | my $VERSION = '1.0.0'; 29 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 30 | 31 | my $SOURCE = (scalar localtime()) . 
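# Example invocations (hypothetical file names; `samtools` must be on PATH,
# since the BAM is streamed through `samtools view`):
#   perl bam2fasta.pl --bam sample.bam > reads.fa
#   perl bam2fasta.pl --bam sample.bam --format fastq --min-len 50 -o reads.fq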
" Version: $VERSION"; 32 | 33 | my $out_format = 'fasta'; 34 | my (@bam_files, $no_rc, $output, $extend_size, $use_rg_id, $samtools_opts, 35 | $min_seq_len, $max_clipping, $min_insert_size, $max_insert_size); 36 | GetOptions( 37 | "bam=s{,}" => \@bam_files, 38 | 39 | "output=s" => \$output, 40 | "format=s" => \$out_format, 41 | "no-rc" => \$no_rc, 42 | 43 | "samtools=s" => \$samtools_opts, 44 | "min-insert=i" => \$min_insert_size, 45 | "max-insert=i" => \$max_insert_size, 46 | "min-len=i" => \$min_seq_len, 47 | "max-clipping=i" => \$max_clipping, 48 | "use-rg" => \$use_rg_id, 49 | ); 50 | 51 | my $show_help = ($CMDLINE =~ /\-help/) ? 0 : 1; 52 | 53 | unless( (@bam_files >= 1) && $show_help ) { 54 | print < 65 | bam file(s), required 66 | 67 | Output Options: 68 | 69 | -o, --output 70 | output filename if output in sam format, or output filename 71 | prefix if output if fastq format 72 | -f, --format 73 | output format, default output in sam format, can be set to 74 | fastq format 75 | 76 | -n, --no-rc 77 | do not reverse complement sequence with negtive strand 78 | -u, --use-rg 79 | add read group id to extracted records 80 | 81 | Filtering Options: 82 | -s, --samtools 83 | directly pass samtools view options to this script, e.g. 84 | "-f 4 -F 8" 85 | 86 | --min-len 87 | minium sequence length 88 | 89 | --max-clipping 90 | maximum allowed clipping length, include both soft and hard 91 | clipping bases 92 | 93 | --min-insert 94 | --max-insert 95 | screen out records with insert size wihtin this range 96 | 97 | EOF 98 | 99 | exit(1); 100 | } 101 | 102 | 103 | 104 | 105 | $|++; 106 | 107 | 108 | 109 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 110 | 111 | if ($output) { 112 | open (STDOUT, "> $output") || die $!; 113 | } 114 | 115 | 116 | bam2fasta(); 117 | 118 | 119 | print STDERR "# " . (scalar localtime()) . "\n"; 120 | 121 | 122 | ######################### Sub ######################### 123 | 124 | 125 | =head2 bam2fasta 126 | 127 | About : Convert SAM records to fasta/fastq sequences 128 | Usage : bam2fasta(); 129 | Args : Null 130 | Returns : Null 131 | 132 | =cut 133 | sub bam2fasta 134 | { 135 | for (my $i=0; $i<@bam_files; $i++) 136 | { 137 | my $pipe_str = "samtools view $bam_files[$i] |"; 138 | 139 | if ($samtools_opts) { 140 | $pipe_str = "samtools view $samtools_opts $bam_files[$i] |"; 141 | } 142 | 143 | open (my $fh, $pipe_str) || die $!; 144 | while (<$fh>) 145 | { 146 | next if (/^@/ || /^\s+$/); ## skip header 147 | 148 | chomp(my $record = $_); 149 | 150 | my ($QNAME, $FLAG, $RNAME, $POS, $MAPQ, $CIGAR, 151 | $MRNM, $NPOS, $TLEN, $SEQ, $QUAL, @OPT) = (split /\s+/, $record); 152 | 153 | 154 | ## 155 | ## check if this record should be update 156 | ## 157 | my $rg_id = ($record =~ /RG:Z:(.*?)\s+/); 158 | 159 | $QNAME =~ s/\/\d$//; 160 | 161 | my $read_id = $use_rg_id ? "$rg_id:$QNAME" : $QNAME; 162 | 163 | next if ($min_seq_len && length($SEQ) < $min_seq_len); 164 | 165 | if (defined $max_clipping) { 166 | my $soft_clipped = ($CIGAR =~ /(\d+)S/) ? $1 : 0; 167 | my $hard_clipped = ($CIGAR =~ /(\d+)H/) ? 
$1 : 0; 168 | 169 | next if ($soft_clipped + $hard_clipped > $max_clipping); 170 | } 171 | 172 | 173 | if (($FLAG & 16) && !$no_rc) { ## reverse strand 174 | $SEQ =~ tr/ATGCatgc/TACGtacg/; 175 | $SEQ = reverse $SEQ; 176 | 177 | $QUAL = reverse $QUAL; 178 | } 179 | 180 | my $pair_id = 0; 181 | if ($FLAG & 64) { ## first in pair 182 | $pair_id = 1; 183 | } 184 | elsif ($FLAG & 128) { ## second in pair 185 | $pair_id = 2; 186 | } 187 | 188 | if ($out_format eq 'fastq') { 189 | print "\@$read_id/$pair_id\n$SEQ\n\+\n$QUAL\n"; 190 | } 191 | else { 192 | print ">$read_id/$pair_id\n$SEQ\n"; 193 | } 194 | 195 | } 196 | } 197 | } 198 | 199 | 200 | -------------------------------------------------------------------------------- /calc_pop_divs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # calc_pop_divs.pl -- Calculating population pi values. 4 | # 5 | # 6 | # Author: Nowind 7 | # Created: 2012-05-31 8 | # Updated: 2016-10-05 9 | # Version: 1.1.0 10 | # 11 | # Change logs: 12 | # Version 1.0.0 16/02/10: The initial version. 13 | # Version 1.1.0 16/10/05: Updated: add option "--all-pairs" to output corrected diversities for all pairs. 14 | 15 | 16 | 17 | 18 | use strict; 19 | 20 | use Data::Dumper; 21 | use Getopt::Long; 22 | use File::Find::Rule; 23 | use File::Basename; 24 | 25 | use MyPerl::FileIO qw(:all); 26 | 27 | ################################# Main ############################### 28 | 29 | 30 | my $CMDLINE = "perl $0 @ARGV"; 31 | my $VERSION = '1.1.0'; 32 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 33 | 34 | 35 | my $min_info_perc = 50; 36 | my ($query_file, $subject_file, $output, $out_all_pairs); 37 | GetOptions( 38 | "query=s" => \$query_file, 39 | "subject=s" => \$subject_file, 40 | "output=s" => \$output, 41 | "min-info-perc=f" => \$min_info_perc, 42 | 43 | "all-pairs" => \$out_all_pairs, 44 | ); 45 | 46 | unless( $query_file && $subject_file ) { 47 | print < 57 | nucleotide differences between each pairs, required 58 | -s, --subject 59 | informative sites between each pairs required, required 60 | -o, --output 61 | output file, default to STDOUT 62 | 63 | -m, --min-info-perc 64 | windows with informative sites less than this percentage will be 65 | discarded from calculation [default: 50] 66 | 67 | -a, --all-pairs 68 | output corrected diversities for all pairs 69 | 70 | EOF 71 | 72 | exit(1); 73 | } 74 | 75 | $|++; 76 | 77 | 78 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 79 | 80 | if ($output) { 81 | open (STDOUT, "> $output") || die $!; 82 | } 83 | 84 | 85 | ## 86 | ## read and parsing query file 87 | ## 88 | print STDERR ">> Start parsing in $query_file ... "; 89 | my %pair_diffs = (); 90 | my @query_pair_ids = (); 91 | my $fh1 = getInputFilehandle($query_file); 92 | while (<$fh1>) 93 | { 94 | next if (/\#\#/ || /^\s+$/); 95 | 96 | my ($bin_id, $chrom, $bin_start, $bin_end, @pairs) = (split /\s+/); 97 | 98 | if (/^\#BIN_ID/) { 99 | for (my $i=0; $i<@pairs; $i++) 100 | { 101 | push @query_pair_ids, $pairs[$i]; 102 | } 103 | next; 104 | } 105 | 106 | for (my $i=0; $i<@pairs; $i++) 107 | { 108 | $pair_diffs{"$chrom\t$bin_start\t$bin_end"}->{$query_pair_ids[$i]} = $pairs[$i] * ($bin_end - $bin_start + 1); 109 | } 110 | } 111 | print STDERR "done!\n"; 112 | 113 | 114 | ## 115 | ## retrieving query records in subject file 116 | ## 117 | print STDERR ">> Start parsing $subject_file ... "; 118 | print STDOUT "$HEADER##" . (scalar localtime()) . 
"\n"; 119 | unless ($out_all_pairs) { 120 | print STDOUT "#BIN_ID\tCHROM\tBIN_START\tBIN_END\tNo_Of_Pairs\tPop_Divers\n"; 121 | } 122 | my @sub_pair_ids = (); 123 | my $fh2 = getInputFilehandle($subject_file); 124 | while (<$fh2>) 125 | { 126 | next if (/\#\#/ || /^\s+$/); 127 | 128 | my ($bin_id, $chrom, $bin_start, $bin_end, @pairs) = (split /\s+/); 129 | 130 | if (/^\#BIN_ID/) { 131 | if ($out_all_pairs) { 132 | print STDOUT; 133 | } 134 | 135 | for (my $i=0; $i<@pairs; $i++) 136 | { 137 | push @sub_pair_ids, $pairs[$i]; 138 | } 139 | next; 140 | } 141 | 142 | my $bin_size = $bin_end - $bin_start + 1; 143 | 144 | my %pair_divs = (); 145 | $pair_divs{sum} = 0; 146 | $pair_divs{num} = 0; 147 | 148 | for (my $i=0; $i<@pairs; $i++) 149 | { 150 | if (100 * $pairs[$i] / $bin_size >= $min_info_perc) { 151 | my $divs = $pair_diffs{"$chrom\t$bin_start\t$bin_end"}->{$sub_pair_ids[$i]} / $pairs[$i];; 152 | 153 | $pair_divs{sum} += $divs; 154 | $pair_divs{num} ++; 155 | 156 | push @{$pair_divs{all}}, $divs; 157 | } 158 | else { 159 | push @{$pair_divs{all}}, -1; 160 | } 161 | } 162 | 163 | my $out_divs = "$pair_divs{num}\t"; 164 | 165 | if ($out_all_pairs) { 166 | $out_divs = join "\t", @{$pair_divs{all}}; 167 | } 168 | else { 169 | $out_divs .= $pair_divs{num} > 0 ? $pair_divs{sum} / $pair_divs{num} : -1; 170 | } 171 | 172 | print "$bin_id\t$chrom\t$bin_start\t$bin_end\t$out_divs\n"; 173 | } 174 | print STDERR "done!\n"; 175 | 176 | 177 | 178 | print STDERR "# " . (scalar localtime()) . "\n"; 179 | -------------------------------------------------------------------------------- /convert_fastq_quality.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # convert_fastq_qaulity.pl -- Convert Illumina quality (+64) to Sanger quality (+33) or reverse. 4 | # 5 | # Author: Nowind 6 | # Created: 2012-02-21 7 | # Updated: 2016-03-13 8 | # Version: 1.0.0 9 | # 10 | # Change logs: 11 | # Version 1.0.0 16/03/13: The initial version. 12 | 13 | 14 | 15 | use strict; 16 | 17 | use Data::Dumper; 18 | use Getopt::Long; 19 | 20 | use MyPerl::FileIO qw(:all); 21 | 22 | ##################### Main #################### 23 | 24 | 25 | my $CMDLINE = "perl $0 @ARGV"; 26 | my $VERSION = '1.0.0'; 27 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 28 | 29 | my $SOURCE = (scalar localtime()) . " Version: $VERSION"; 30 | 31 | my $convert_type = 'illumina2sanger'; 32 | my ($fastq_file, $output); 33 | GetOptions( 34 | "input=s" => \$fastq_file, 35 | 36 | "output=s" => \$output, 37 | 38 | "convert-type" => \$convert_type, 39 | ); 40 | 41 | my $show_help = ($CMDLINE =~ /\-help/) ? 0 : 1; 42 | 43 | unless( $fastq_file && $show_help ) { 44 | print < 55 | fastq file, default from STDIN, required 56 | 57 | -o, --output 58 | output file, default to STDOUT 59 | 60 | -c, --convert-type 61 | illumina2sanger: Convert Illumina quality (+64) to Sanger quality (+33) 62 | sanger2illumina: Convert Sanger quality (+33) to Illumina quality (+64) 63 | default: illumina2sanger 64 | 65 | EOF 66 | 67 | exit(1); 68 | } 69 | 70 | 71 | 72 | 73 | $|++; 74 | 75 | 76 | 77 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 78 | 79 | 80 | 81 | 82 | print STDERR ">> Start converting $fastq_file ... "; 83 | convert_quality($fastq_file); 84 | print STDERR "done!\n"; 85 | 86 | 87 | 88 | print STDERR "# " . (scalar localtime()) . 
"\n"; 89 | 90 | 91 | ######################### Sub ######################### 92 | 93 | 94 | =head2 illumina2sanger 95 | 96 | About : Convert Illumina quality (+64) to Sanger quality (+33) or reverse. 97 | Usage : convert_quality($fastq_file) 98 | Args : Fastq file. 99 | Returns : Null 100 | Reference: http://wiki.bioinformatics.ucdavis.edu/index.php/IllQ2SanQ.pl 101 | https://github.com/jstjohn/KentLib/tree/master/examples/qseqToFastq 102 | 103 | =cut 104 | sub convert_quality 105 | { 106 | my ($in) = @_; 107 | 108 | my $fh = getInputFilehandle($in); 109 | while (<$fh>) 110 | { 111 | chomp(my $seq_id = $_); 112 | chomp(my $seq = <$fh>); 113 | chomp(my $fill = <$fh>); 114 | chomp(my $qual = <$fh>); 115 | 116 | my @prev_quals = split(//,$qual); 117 | my @curr_quals = (); 118 | 119 | for (my $i=0; $i<=$#prev_quals; $i++) { 120 | if ($convert_type eq 'illumina2sanger') { 121 | ## Convert a phred64 string into a phred64 string assuming illumina's minimum 'B' score thing 122 | my $char = $prev_quals[$i] eq 'B' ? '!' : chr(ord($prev_quals[$i]) - 31); 123 | push @curr_quals, $char; 124 | } 125 | else { 126 | ## Convert a phred33 string into a phred64 string, doing illumina's minimum 'B' score thing 127 | my $char = $prev_quals[$i] eq '!' ? 'B' : chr(ord($prev_quals[$i]) + 31); 128 | push @curr_quals, $char; 129 | } 130 | } 131 | 132 | my $curr_qual = join '', @curr_quals; 133 | 134 | if (length($qual) != length($curr_qual)) { 135 | print STDERR "$seq_id\n$seq\n$fill\n$qual\n$curr_qual\n"; 136 | exit; 137 | } 138 | 139 | print STDOUT "$seq_id\n$seq\n$fill\n$curr_qual\n"; 140 | } 141 | } 142 | 143 | -------------------------------------------------------------------------------- /convert_trf.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # convert_trf.pl -- Convert results from TRF (Tandem Repeats Finder) to other format. 4 | # 5 | # Author: Nowind 6 | # Created: 2011-09-18 7 | # Updated: 2016-05-25 8 | # Version: 1.0.0 9 | # References: 10 | # http://tandem.bu.edu/trf/trf.html 11 | # https://gist.github.com/lexnederbragt/3689ee2301493c34c8ab#file-trf2gff-py 12 | # 13 | # Change logs: 14 | # Version 1.0.0 16/05/25: The initial version. 15 | 16 | 17 | =head1 NAME 18 | 19 | convert_trf.pl 20 | 21 | 22 | =head1 SYNOPSIS 23 | 24 | convert_trf.pl --help/? 25 | 26 | =head1 DESCRIPTION 27 | 28 | Convert results from TRF (Tandem Repeats Finder) to other format like gff3 or bed. 29 | 30 | =cut 31 | 32 | 33 | use strict; 34 | 35 | use Data::Dumper; 36 | use Getopt::Long; 37 | use File::Find::Rule; 38 | 39 | use MyPerl::FileIO qw(:all); 40 | 41 | ################## Main ################## 42 | 43 | my $CMDLINE = "perl $0 @ARGV"; 44 | my $VERSION = '1.0.0'; 45 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 46 | my $SOURCE = (scalar localtime()) . " Version: $VERSION"; 47 | 48 | my $out_format = 'gff'; 49 | my ($input, $output); 50 | GetOptions( 51 | "input=s" => \$input, 52 | "output=s" => \$output, 53 | "format=s" => \$out_format, 54 | ); 55 | 56 | unless( $input ) { 57 | print < 70 | input dat file generate from TRF, required 71 | -o,--output 72 | output file, default to STDOUT 73 | 74 | -f,--format 75 | output format, could be set as "gff", "bed" or "tabular" 76 | [default: gff] 77 | EOF 78 | 79 | exit(1); 80 | } 81 | 82 | $|++; 83 | 84 | 85 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 86 | 87 | if ($output) { 88 | open (STDOUT, "> $output") || die $!; 89 | } 90 | 91 | print STDERR "Start parsing $input ... 
"; 92 | parse_trf($input); 93 | print STDERR "done!\n"; 94 | 95 | 96 | ################# Sub ################# 97 | 98 | sub parse_trf 99 | { 100 | my ($in) = shift; 101 | 102 | if ($out_format eq 'gff') { 103 | print STDOUT "##gff-version 3\n"; 104 | } 105 | elsif ($out_format eq 'tabular') { 106 | print STDOUT "$HEADER##" . (scalar localtime()) . "\n"; 107 | print STDOUT "#source\tsequence_id\ttype\tstart\tend\tperiod\tcopies\tconsensus_size\tperc_match\t" . 108 | "perc_indels\talign_score\tperc_A\tperc_C\tperc_G\tperc_T\tentropy\tcons_seq\trepeat_seq\n"; 109 | } 110 | 111 | 112 | my $fh = getInputFilehandle($in); 113 | my $sequence_id = ''; 114 | my %counts = (); 115 | while(<$fh>) 116 | { 117 | next if (/\#/ || /^\s+$/); 118 | 119 | if (/^Sequence\:/) { 120 | (undef, $sequence_id) = (split /\s+/); 121 | $counts{$sequence_id} = 0; 122 | } 123 | elsif (/^\d+/) { 124 | unless($sequence_id) { 125 | print STDERR "Error: No sequence id found!"; exit(2); 126 | } 127 | 128 | $counts{$sequence_id} ++; 129 | 130 | my ($start, $end, $period, $copies, $consensus_size, $perc_match, 131 | $perc_indels, $align_score, $perc_A, $perc_C, $perc_G, $perc_T, 132 | $entropy, $cons_seq, $repeat_seq) = (split /\s+/); 133 | 134 | 135 | if ($out_format eq 'gff') { 136 | my $tr_id = $sequence_id . "." . $counts{$sequence_id}; 137 | 138 | print "$sequence_id\tTRF\ttandem_repeat\t$start\t$end\t$align_score\t+\t.\t" . 139 | "ID=$tr_id;Name=($cons_seq)$copies;Target=$repeat_seq\n"; 140 | } 141 | elsif ($out_format eq 'bed') { 142 | my $bed_start = $start - 1; 143 | print "$sequence_id\t$bed_start\t$end\t$repeat_seq([$cons_seq]x$copies)\n"; 144 | } 145 | elsif ($out_format eq 'tabular') { 146 | print "TRF\t$sequence_id\ttandem_repeat\t$start\t$end\t$period\t$copies\t$consensus_size\t$perc_match\t" . 147 | "$perc_indels\t$align_score\t$perc_A\t$perc_C\t$perc_G\t$perc_T\t$entropy\t$cons_seq\t$repeat_seq\n"; 148 | } 149 | } 150 | } 151 | } 152 | 153 | -------------------------------------------------------------------------------- /count_vcf_pairs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # count_vcf_pairs.pl -- Count informative sites between each pairs. 4 | # 5 | # 6 | # Author: Nowind 7 | # Created: 2011-01-08 8 | # Updated: 2016-02-10 9 | # 10 | # Change logs: 11 | # Version 1.0.0 16/02/10: The initial version. 12 | 13 | 14 | 15 | 16 | 17 | =head1 NAME 18 | 19 | count_vcf_pairs.pl 20 | 21 | 22 | =head1 SYNOPSIS 23 | 24 | count_vcf_pairs.pl --help/? 25 | 26 | 27 | =head1 DESCRIPTION 28 | 29 | Count informative sites between each pairs. 
30 | 31 | =cut 32 | 33 | 34 | use strict; 35 | 36 | use File::Find::Rule; 37 | use Getopt::Long; 38 | use Data::Dumper; 39 | use Data::Random qw(:all); 40 | use Parallel::ForkManager; 41 | 42 | use MyPerl::FileIO qw(:all); 43 | use MyPerl::Vcf qw(:all); 44 | use Statistics::Descriptive; 45 | 46 | 47 | ############################# Main ########################### 48 | 49 | my $CMDLINE = "perl $0 @ARGV"; 50 | my $VERSION = '1.0.0'; 51 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 52 | 53 | 54 | my %options = (); 55 | my $max_threads = 1; 56 | my $window_size = 250; 57 | my ($vcf_file, @filters, $interval_file, $step_size, $show_help, $user_type, 58 | $skip_unphased, $output, 59 | $ref_for_missing, $hete_as_missing, $hete_as_alt, $ref_sample); 60 | GetOptions( 61 | "vcf=s" => \$vcf_file, 62 | "intervals=s" => \$interval_file, 63 | 64 | "length-file=s" => \$options{length_file}, 65 | 66 | "L|window-size=i" => \$window_size, 67 | "S|step-size=i" => \$step_size, 68 | 69 | "filter=s{,}" => \@filters, 70 | "phased" => \$skip_unphased, 71 | 72 | "R|ref-for-missing" => \$ref_for_missing, 73 | "H|hete-as-missing" => \$hete_as_missing, 74 | "A|hete-as-alt" => \$hete_as_alt, 75 | 76 | "output=s" => \$output, 77 | 78 | 79 | "cmp-to-sample=s" => \$ref_sample, 80 | 81 | "threads=i" => \$max_threads, 82 | 83 | "help|?" => \$show_help, 84 | 85 | "V|var-type=s" => \$user_type, 86 | 87 | "exclude=s{,}" => \@{$options{exclude_ids}}, 88 | ); 89 | 90 | unless( !$show_help && $vcf_file ) { 91 | print < 101 | multiple-Samples vcf file, compressed by bgzip and index by tabix, 102 | required 103 | 104 | --intervals 105 | file contans one or more genomic intervals over which to operate, 106 | calculate diversities only in those specified intervals instead 107 | of slide-windows along the whole chromosome, each interval contains 108 | 4 values "id chrom start end", delimited by space or tabs e.g. 109 | interval01 chr01 100 200 110 | interval01 chr01 500 700 111 | ... 112 | 113 | --informative 114 | calculating pairwise diversity by dividing informative sites rather 115 | than interval length, note this option requires the vcf file contain 116 | all informative sites, only windows with effective sites over the 117 | specified percentage would be considered as informative 118 | 119 | -l, --length-file 120 | a file contains chromosome length should be specified here in the 121 | format: 122 | 123 | #CHROM LENGTH 124 | chr01 43270923 125 | chr02 35937250 126 | chr03 36413819 127 | chr04 35502694 128 | ... 129 | 130 | required while chromosome info is absent in vcf header 131 | 132 | -e, --exclude 133 | exclude unwanted chromosomes or scaffolds while calculating, all 134 | chromosomes with ids match strings specified here would be ignored 135 | 136 | -o, --output 137 | output filename, default to STDOUT 138 | 139 | 140 | -L, --window-size 141 | window size(bp) [default:250] 142 | -S, --step-size 143 | step size(bp), default same as window size, indicate non-overlapping 144 | windows 145 | 146 | 147 | 148 | -c, --cmp-to-sample 149 | compare all other samples to one certain sample specified 150 | 151 | -f, --filter 152 | skip filter loci, can have multiple values, separate by blanks, e.g. 153 | "LowQual SNPFilter" ... [default: no filtering] 154 | -p, --phased 155 | skip unphased sites 156 | 157 | -R, --ref-for-missing 158 | use the REF allele instead of the default missing genotype. Because it 159 | is not obvious what ploidy should be used, a user-defined string is used 160 | instead (e.g. 
0/0) 161 | -H, --hete-as-missing 162 | assume no heterozygous sites should be present here and treat those 163 | heterozygous sites as missing alleles 164 | -A, --hete-as-alt 165 | assume heterozygous calls as homozygous alternative calls 166 | 167 | -t, --threads 168 | how many data threads should be allocated to running this analysis 169 | [default: 1] 170 | 171 | -V, --var-type 172 | set "snp" to process snp sites only, or set "indel" to process indels only 173 | 174 | -?, --help 175 | show this help message 176 | 177 | EOF 178 | 179 | exit(1); 180 | } 181 | 182 | $|++; 183 | 184 | 185 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 186 | 187 | $step_size = $window_size if (!$step_size); 188 | 189 | if ($output) { 190 | open (STDOUT, "> $output") || die $!; 191 | } 192 | 193 | my %filters = (); 194 | my $filter_str = join '|', @filters; 195 | 196 | 197 | unless( -f "$vcf_file.tbi" ) { 198 | print STDERR "[Error]: no index found for vcf file!\n"; exit(2); 199 | } 200 | 201 | 202 | 203 | ## 204 | ## get sample names 205 | ## 206 | print STDERR ">> Start parsing vcf file headers ... "; 207 | my %CHROM_LENGTHS = (); 208 | my @CHROM_IDS = (); 209 | my @SAMPLE_NAMES = (); 210 | my $CONTIG_INFO = ''; 211 | my %pair_ids = (); 212 | my @id_orders = (); 213 | parse_vcf_header($vcf_file); 214 | if ($options{length_file}) { 215 | get_genome_length(\@CHROM_IDS, \%CHROM_LENGTHS, $options{length_file}, \@{$options{exclude_ids}}); 216 | } 217 | print STDERR "done!\n"; 218 | 219 | if (!$interval_file && @CHROM_IDS == 0 && !$options{length_file}) { 220 | print STDERR "Error: Chromosome info not found, please check vcf header or specify a length file!\n"; 221 | exit(2); 222 | } 223 | 224 | 225 | 226 | ## 227 | ## parse intervals 228 | ## 229 | print STDERR ">> Generating intervals needed to calculate diversities ... "; 230 | my $ra_intervals = parse_intervals($interval_file); 231 | print STDERR "done!\n"; 232 | 233 | 234 | 235 | ## get all results from each childs 236 | my %pair_infos_all = (); 237 | my $pm = new Parallel::ForkManager($max_threads); 238 | if ($max_threads > 1) { 239 | $pm->run_on_finish( 240 | sub{ 241 | my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $data) = @_; 242 | 243 | my ($interval, $rh_pair_infos) = @{$data}; 244 | 245 | $pair_infos_all{$interval} = $rh_pair_infos; 246 | } 247 | ); 248 | } 249 | 250 | 251 | my $total_jobs_cnt = scalar @{$ra_intervals}; 252 | my $curr_job_num = 0; 253 | 254 | for (my $i=0; $i<@{$ra_intervals}; $i++) 255 | { 256 | my $interval = $ra_intervals->[$i]; 257 | 258 | $curr_job_num++; 259 | 260 | print STDERR "\r>> Start calculating diversities ... $curr_job_num/$total_jobs_cnt"; 261 | 262 | my $pid = $pm->start and next if ($max_threads > 1); 263 | 264 | ## 265 | ## calculate diversities 266 | ## 267 | ##print STDERR ">> Start calculating pairwise distances ... "; 268 | my $rh_pair_infos = count_pairwise_informative($vcf_file, $interval); 269 | ##print STDERR "done!\n"; 270 | 271 | if ($max_threads > 1) { 272 | $pm->finish(0, [$interval, $rh_pair_infos]); 273 | } 274 | else { 275 | $pair_infos_all{$interval} = $rh_pair_infos; 276 | } 277 | } 278 | $pm->wait_all_children; 279 | print STDERR "\tdone!\n"; 280 | 281 | 282 | ## 283 | ## generate final results 284 | ## 285 | print STDERR ">> Start writing results ... 
"; 286 | my @out_orders = (); 287 | if ($ref_sample) { 288 | for my $id (@id_orders) 289 | { 290 | next if ($id eq $ref_sample); 291 | my $pair_id = join "\-", (sort ($id, $ref_sample)); 292 | push @out_orders, $pair_id; 293 | } 294 | } 295 | else { 296 | for (my $i=0; $i<@id_orders; $i++) 297 | { 298 | for (my $j=$i+1; $j<@id_orders; $j++) { 299 | my $pair_id = join "\-", ($id_orders[$i], $id_orders[$j]); 300 | push @out_orders, $pair_id; 301 | } 302 | } 303 | } 304 | 305 | ###print STDERR Dumper(@id_orders); 306 | ###print STDERR Dumper(@out_orders);exit; 307 | 308 | my $out_pair_ids = join "\t", @out_orders; 309 | 310 | print STDOUT "$HEADER##" . (scalar localtime()) . "\n"; 311 | print STDOUT "#BIN_ID\tCHROM\tBIN_START\tBIN_END\t$out_pair_ids\n"; 312 | for my $interval (@{$ra_intervals}) 313 | { 314 | generate_results($pair_infos_all{$interval}, $interval); 315 | } 316 | 317 | print STDERR "done!\n"; 318 | 319 | print STDERR "# " . (scalar localtime()) . "\n"; 320 | 321 | ########################## Sub ######################### 322 | 323 | 324 | =head2 parse_vcf_header 325 | 326 | About : Parse vcf header informations. 327 | Usage : parse_vcf_header($interval_file); 328 | Args : Vcf file. 329 | Returns : Null 330 | 331 | =cut 332 | sub parse_vcf_header 333 | { 334 | my ($in) = @_; 335 | 336 | ## exclude unwanted chromosomes or scaffolds while simulating, all 337 | ## chromosomes with ids match strings specified here would be ignored 338 | my $exclude_str = ''; 339 | if ($options{exclude_ids} && @{$options{exclude_ids}} > 0) { 340 | $exclude_str = join '|', @{$options{exclude_ids}}; 341 | } 342 | 343 | my $fh = getInputFilehandle($in); 344 | while (<$fh>) 345 | { 346 | if (/\#\#contig=/) { 347 | my ($chrom_id, $chrom_len) = ($1, $2); 348 | 349 | if ($options{exclude_ids} && @{$options{exclude_ids}} > 0) { 350 | next if ($chrom_id =~ /($exclude_str)/); 351 | } 352 | 353 | $CHROM_LENGTHS{$chrom_id} = $chrom_len; 354 | 355 | push @CHROM_IDS, $chrom_id; 356 | 357 | $CONTIG_INFO .= $_; 358 | } 359 | elsif (/#CHROM/) { 360 | my @line = (split /\s+/); 361 | for (my $i=9; $i<@line; $i++) 362 | { 363 | $line[$i] =~ s/-/_/g; ## replace sample id if contains "-" 364 | 365 | $SAMPLE_NAMES[$i-9] = $line[$i]; 366 | } 367 | 368 | @id_orders = @line[9..$#line]; 369 | 370 | return 0; 371 | } 372 | } 373 | } 374 | 375 | 376 | 377 | =head2 parse_intervals 378 | 379 | About : Parse interval list or generate windows along the whole genome. 380 | Usage : parse_intervals($interval_file); 381 | Args : File contains all intervals (optional). 382 | Returns : Array reference to all intervals or windows. 
383 | 384 | =cut 385 | sub parse_intervals 386 | { 387 | my ($interval_file) = @_; 388 | 389 | my @intervals = (); 390 | 391 | ## read intervals from a file 392 | if ($interval_file) { 393 | my $fh = getInputFilehandle($interval_file); 394 | 395 | while(<$fh>) 396 | { 397 | next if (/\#/ || /^\s+$/); 398 | 399 | my ($id, $chrom, $start, $end) = (split /\s+/); 400 | 401 | push @intervals, "$id\t$chrom\t$start\t$end"; 402 | } 403 | } 404 | else { 405 | ## use sliding-windows 406 | for my $chrom (@CHROM_IDS) 407 | { 408 | my $bin_id = 1; 409 | my $bin_start = 1; 410 | 411 | while ($bin_start <= $CHROM_LENGTHS{$chrom}) 412 | { 413 | my $bin_end = $bin_start + $window_size - 1; 414 | $bin_end = $CHROM_LENGTHS{$chrom} if ($bin_end > $CHROM_LENGTHS{$chrom}); ## the last window 415 | 416 | push @intervals, "$bin_id\t$chrom\t$bin_start\t$bin_end"; 417 | 418 | $bin_start += $step_size; 419 | $bin_id ++; 420 | } 421 | } 422 | } 423 | 424 | return \@intervals; 425 | } 426 | 427 | 428 | =head2 count_pairwise_informative 429 | 430 | About : Calculate pairwise distances. 431 | Usage : my $rh_pairwise_divers = count_pairwise_informative(\%pairwise_differs, $interval); 432 | Args : Vcf file; 433 | Interval string. 434 | Returns : Reference to a hase contains pairwise diversities 435 | 436 | =cut 437 | sub count_pairwise_informative 438 | { 439 | my ($in, $interval) = @_; 440 | 441 | my ($id, $chrom, $start, $end) = (split /\t/, $interval); 442 | 443 | my $region = "$chrom:$start-$end"; 444 | 445 | my %info_sites = (); 446 | 447 | open (my $fh, "tabix $in $region |") or die $!; 448 | while (<$fh>) 449 | { 450 | next if (/\#/ || /^\s+$/); 451 | 452 | next if ($skip_unphased && /\//); ## only considering phased genotypes 453 | next if ($filter_str && /$filter_str/i); ## keep variants in repeat regions 454 | 455 | my ($CHROM, $POS, $ID, $REF, $ALT, $QUAL, $FILTER, $INFO, 456 | $FORMAT, @Samples) = (split /\s+/); 457 | 458 | my @vars = ($REF, (split /\,/, $ALT)); 459 | 460 | my $var_type = get_var_type($REF, $ALT); 461 | 462 | next if ($user_type && $var_type !~ /$user_type/); ## process snp or indel only 463 | 464 | my @tags = (split /\:/, $FORMAT); 465 | my %tags = (); 466 | for (my $i=0; $i<@tags; $i++) { $tags{$tags[$i]} = $i; } 467 | 468 | for (my $i=0; $i<@Samples; $i++) 469 | { 470 | my $sample1 = $SAMPLE_NAMES[$i]; 471 | 472 | my $GT1 = (split /\:/, $Samples[$i])[$tags{GT}]; 473 | 474 | next if ($GT1 eq '.' || $GT1 eq './.'); ## missing calls 475 | 476 | ## pairwise comparison 477 | for (my $j=$i+1; $j<@Samples; $j++) 478 | { 479 | my $sample2 = $SAMPLE_NAMES[$j]; 480 | 481 | next if ($ref_sample && ($sample1 ne $ref_sample) ## all samples compare to a reference sample 482 | && ($sample2 ne $ref_sample)); 483 | 484 | my $GT2 = (split /\:/, $Samples[$j])[$tags{GT}]; 485 | 486 | next if ($GT2 eq '.' || $GT2 eq './.'); 487 | 488 | $info_sites{$chrom}->{$id}->{"$sample1\-$sample2"}++; 489 | } 490 | } 491 | } 492 | 493 | return \%info_sites; 494 | } 495 | 496 | 497 | 498 | 499 | =head2 generate_results 500 | 501 | About : Generate results. 502 | Usage : my $rh_group_divers = generate_results(\%group_divers, $interval); 503 | Args : Hash of group diversities; 504 | Interval string. 505 | Returns : Null 506 | 507 | =cut 508 | sub generate_results 509 | { 510 | my ($rh_infos, $interval) = @_; 511 | 512 | my ($id, $chrom, $start, $end) = (split /\t/, $interval); 513 | 514 | my @pair_info_sites = (); 515 | 516 | for my $pair_id (@out_orders) 517 | { 518 | my $pair_infos = $rh_infos->{$chrom}->{$id}->{$pair_id} ? 
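## pairs with no co-called sites in this interval default to a count of 0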
519 | $rh_infos->{$chrom}->{$id}->{$pair_id} : 0; 520 | push @pair_info_sites, $pair_infos; 521 | } 522 | 523 | my $pair_info_sites = join "\t", @pair_info_sites; 524 | 525 | print STDOUT "$id\t$chrom\t$start\t$end\t$pair_info_sites\n"; 526 | } 527 | 528 | 529 | 530 | -------------------------------------------------------------------------------- /downsample_vcf.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # downsample_vcf.pl -- generate certain number of loci by random. 4 | # 5 | # Author: Nowind 6 | # Created: 2011-10-19 7 | # Updated: 2014-03-11 8 | # Version: 1.0.1 9 | # 10 | # Change logs: 11 | # Version 1.0.0 14/03/05: The initial version. 12 | # Version 1.0.1 14/03/11: Bug fixed in generate suffix of file names; add option 13 | # "--pos-only". 14 | 15 | 16 | 17 | 18 | 19 | use strict; 20 | 21 | use Data::Dumper; 22 | use Getopt::Long; 23 | use File::Find::Rule; 24 | use Statistics::Descriptive; 25 | use Statistics::PointEstimation; 26 | use Data::Random qw(:all); 27 | 28 | use MyPerl::FileIO qw(:all); 29 | use MyPerl::Statistics; 30 | 31 | ################### Main ################# 32 | 33 | my $CMDLINE = "perl $0 @ARGV"; 34 | my $VERSION = '1.0.1'; 35 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 36 | my $SOURCE = (scalar localtime()) . " Version: $VERSION"; 37 | 38 | my %opts = (); 39 | $opts{times} = 1; 40 | $opts{number} = 100; 41 | my ($out_pos_only); 42 | GetOptions( 43 | "input=s" => \$opts{input}, 44 | "output=s" => \$opts{prefix}, 45 | "times=i" => \$opts{times}, 46 | "size=i" => \$opts{size}, 47 | 48 | "pos-only" => \$out_pos_only, 49 | ); 50 | 51 | unless( $opts{input} ) { 52 | print < 62 | file contains chromosome lengths 63 | -o, --output 64 | output file prefix, default: rand[001.vcf ...] 65 | 66 | -t, --times 67 | random times [default: 1] 68 | -s, --size 69 | numbers of loci [default: 100] 70 | 71 | -p, --pos-only 72 | only output chromosome and positions 73 | 74 | EOF 75 | 76 | exit(0); 77 | } 78 | 79 | $|++; 80 | 81 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 82 | 83 | unless ($opts{prefix}) { 84 | $opts{prefix} = 'rand'; 85 | } 86 | 87 | print STDERR ">> Start parsing $opts{input} ... "; 88 | my $vcf_header = ''; 89 | my $sample_line = ''; 90 | my @records_all = (); 91 | parse_vcf($opts{input}); 92 | print STDERR "\tdone!\n"; 93 | 94 | 95 | gen_random_vars($opts{times}, $opts{size}); 96 | 97 | 98 | print STDERR "# " . (scalar localtime()) . "\n"; 99 | 100 | ######################### Sub ######################### 101 | 102 | sub parse_vcf 103 | { 104 | my ($in) = shift; 105 | 106 | my $fh = getInputFilehandle($in); 107 | while (<$fh>) 108 | { 109 | if (/\#\#/) { 110 | $vcf_header .= $_; 111 | } 112 | elsif (/^\#CHROM/) { 113 | if ($out_pos_only) { 114 | $sample_line = "#CHROM\tPOS\n"; 115 | } 116 | else { 117 | $sample_line = $_; 118 | } 119 | } 120 | 121 | next if (/\#/ || /^\s+$/); 122 | 123 | if ($out_pos_only) { 124 | my ($chrom, $pos) = (split /\s+/, $_)[0,1]; 125 | push @records_all, "$chrom\t$pos"; 126 | } 127 | else { 128 | chomp; 129 | push @records_all, $_; 130 | } 131 | 132 | } 133 | } 134 | 135 | 136 | sub gen_random_vars 137 | { 138 | my ($rand_times, $rand_size) = @_; 139 | 140 | my %rand_values = (); 141 | 142 | my $i = 0; 143 | my $tag = '0' x (length($rand_times)); 144 | while (++$i <= $rand_times) 145 | { 146 | print STDERR "\r>> Start random process ... 
duplicate $i\/$rand_times"; 147 | 148 | open (my $fh, "> $opts{prefix}.$tag.vcf") || die $!; 149 | 150 | print {$fh} "$vcf_header"; 151 | print {$fh} "##source=$SOURCE $CMDLINE\n"; 152 | print {$fh} "$sample_line"; 153 | 154 | my @rand_vars = rand_set( set => [@records_all], size => $rand_size, shuffle => 0 ); 155 | 156 | print {$fh} (join "\n", @rand_vars); 157 | 158 | $tag++; 159 | } 160 | print STDERR "\tdone!\n"; 161 | } 162 | 163 | 164 | 165 | -------------------------------------------------------------------------------- /extract_bam_pairs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # extract_bam_pairs.pl -- extract aligned sequences and related reference sequence 4 | # from a SAM format file 5 | # 6 | # Author: Nowind 7 | # Created: 2012-02-21 8 | # Updated: 2015-01-21 9 | # Version: 1.1.2 10 | # 11 | # Change logs: 12 | # Version 1.0.0 14/02/23: The initial version. 13 | # Version 1.0.1 14/02/25: Add support for multiple bam files. 14 | # Version 1.0.2 14/03/02: Add option "--extend". 15 | # Version 1.1.0 14/05/29: Change input file format; add option "--rows"; add options 16 | # for filtering bam records. 17 | # Version 1.1.1 14/06/01: Add option "--use-rg" to add readgroup id to bam records 18 | # instead of bam file index. 19 | # Version 1.1.2 15/01/21: Add option "--replace-file" to replace extracted sam records 20 | # which matches those in the replace file. 21 | 22 | 23 | 24 | use strict; 25 | 26 | use Data::Dumper; 27 | use Getopt::Long; 28 | 29 | use MyPerl::FileIO qw(:all); 30 | 31 | ##################### Main #################### 32 | 33 | 34 | my $CMDLINE = "perl $0 @ARGV"; 35 | my $VERSION = '1.1.2'; 36 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 37 | 38 | my $SOURCE = (scalar localtime()) . " Version: $VERSION"; 39 | 40 | my $out_format = 'sam'; 41 | my ($input, @bam_files, $no_rc, @rows, $output, $patch_file, 42 | $extend_size, $use_rg_id, $samtools_opts, 43 | $min_seq_len, $max_clipping, $min_insert_size, $max_insert_size); 44 | GetOptions( 45 | "I|input=s" => \$input, 46 | "R|rows=i{,}" => \@rows, 47 | 48 | "extend=i" => \$extend_size, 49 | 50 | "bam=s{,}" => \@bam_files, 51 | 52 | "output=s" => \$output, 53 | "format=s" => \$out_format, 54 | "no-rc" => \$no_rc, 55 | 56 | "samtools=s" => \$samtools_opts, 57 | "min-insert=i" => \$min_insert_size, 58 | "max-insert=i" => \$max_insert_size, 59 | "min-len=i" => \$min_seq_len, 60 | "max-clipping=i" => \$max_clipping, 61 | "use-rg" => \$use_rg_id, 62 | 63 | "patches=s" => \$patch_file, 64 | ); 65 | 66 | my $show_help = ($CMDLINE =~ /\-help/) ? 
0 : 1; 67 | 68 | unless( $input && (@bam_files > 0) && $show_help ) { 69 | print < 80 | input file of query positions, required 81 | 82 | -R, --rows 83 | specify the row fields of chromosome, start position and end 84 | position (0-based), in the query block file [default: 0 1 2] 85 | 86 | -p, --patches 87 | update sam records with same QNAME and FLAG found in this file 88 | 89 | -e, --extend 90 | extend regions of this size to retrieve for read pairs 91 | 92 | -b, --bam 93 | bam file(s), at least one bam file should be specified 94 | 95 | 96 | Output Options: 97 | 98 | -o, --output 99 | output filename if output in sam format, or output filename 100 | prefix if output if fastq format 101 | -f, --format 102 | output format, default output in sam format, can be set to 103 | fastq format 104 | 105 | -n, --no-rc 106 | do not reverse complement sequence with negtive strand 107 | -u, --use-rg 108 | add read group id to extracted records 109 | 110 | Filtering Options: 111 | -s, --samtools 112 | directly pass samtools view options to this script, e.g. 113 | "-f 4 -F 8" 114 | 115 | --min-len 116 | minium sequence length 117 | 118 | --max-clipping 119 | maximum allowed clipping length, include both soft and hard 120 | clipping bases 121 | 122 | --min-insert 123 | --max-insert 124 | screen out records with insert size wihtin this range 125 | 126 | EOF 127 | 128 | exit(1); 129 | } 130 | 131 | 132 | 133 | 134 | $|++; 135 | 136 | 137 | 138 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 139 | 140 | 141 | unless(@rows > 0){ @rows = qw(0 1 2) }; 142 | 143 | if ($out_format eq 'fastq') { 144 | open (FQ1, "> $output" . "_1.fq") || die $!; 145 | open (FQ2, "> $output" . "_2.fq") || die $!; 146 | } 147 | else { 148 | if ($output) { 149 | open (STDOUT, "> $output") || die $!; 150 | } 151 | } 152 | 153 | 154 | 155 | my %new_sam_records = (); 156 | if ($patch_file) { 157 | print STDERR ">> Patch data found, start parsing records in $patch_file ... "; 158 | my $patch_fh = getInputFilehandle($patch_file); 159 | while (<$patch_fh>) 160 | { 161 | next if (/^@/ || /^\s+$/); ## skip header 162 | 163 | chomp; 164 | my ($QNAME, $FLAG) = (split /\s+/)[0,1]; 165 | 166 | $new_sam_records{"$QNAME\t$FLAG"} = $_; 167 | } 168 | print STDERR "done!\n"; 169 | 170 | ###print STDERR Dumper(%new_sam_records);exit; 171 | } 172 | 173 | 174 | 175 | print STDERR ">> Read in query ids in $input ... "; 176 | my %query_ids = (); 177 | my %counts_all = (); 178 | my $fh = getInputFilehandle($input); 179 | while (<$fh>) 180 | { 181 | next if (/^#/ || /^\s+$/); ## skip header 182 | 183 | my ($chrom, $start, $end) = (split /\s+/)[@rows]; 184 | 185 | my %reads = (); 186 | 187 | search_read_id(\%reads, $chrom, $start, $end); 188 | extract_pairs(\%reads, $chrom, $start, $end); 189 | 190 | 191 | for my $id (sort keys %{$reads{pairs}}) 192 | { 193 | $counts_all{reads} ++; 194 | 195 | next unless(keys %{$reads{pairs}->{$id}} == 2); 196 | 197 | $counts_all{paired} ++; 198 | 199 | if ($out_format eq 'fastq') { 200 | print FQ1 "\@" . "$id/1\n" . $reads{pairs}->{$id}->{1} . "\n"; 201 | print FQ2 "\@" . "$id/2\n" . $reads{pairs}->{$id}->{2} . "\n"; 202 | } 203 | else { 204 | print $reads{pairs}->{$id}->{1} . "\n"; 205 | print $reads{pairs}->{$id}->{2} . "\n"; 206 | } 207 | } 208 | } 209 | print STDERR "done!\n"; 210 | 211 | my $find_perc = ($counts_all{reads} > 0) ? 
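## percentage of extracted read ids for which both mates were recovered;
## the ternary guards against dividing by zero when nothing matched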
($counts_all{paired} / $counts_all{reads} * 100) : 0; 212 | $find_perc = sprintf("%.2f", $find_perc); 213 | 214 | print STDERR <) 253 | { 254 | next if (/^@/ || /^\s+$/); ## skip header 255 | 256 | chomp(my $record = $_); 257 | 258 | my ($QNAME, $FLAG, $RNAME, $POS, $MAPQ, $CIGAR, 259 | $MRNM, $NPOS, $TLEN, $SEQ, $QUAL, @OPT) = (split /\s+/, $record); 260 | 261 | 262 | ## 263 | ## check if this record should be update 264 | ## 265 | if ($new_sam_records{"$QNAME\t$FLAG"}) { 266 | $record = $new_sam_records{"$QNAME\t$FLAG"}; 267 | 268 | ($QNAME, $FLAG, $RNAME, $POS, $MAPQ, $CIGAR, 269 | $MRNM, $NPOS, $TLEN, $SEQ, $QUAL, @OPT) = (split /\s+/, $record); 270 | } 271 | 272 | 273 | my $rg_id = ($record =~ /RG:Z:(.*?)\s+/); 274 | 275 | next if ($min_seq_len && length($SEQ) < $min_seq_len); 276 | 277 | next if ($min_insert_size && abs($TLEN) < $min_insert_size); 278 | next if ($max_insert_size && abs($TLEN) > $max_insert_size); 279 | 280 | if (defined $max_clipping) { 281 | my $soft_clipped = ($CIGAR =~ /(\d+)S/) ? $1 : 0; 282 | my $hard_clipped = ($CIGAR =~ /(\d+)H/) ? $1 : 0; 283 | 284 | next if ($soft_clipped + $hard_clipped > $max_clipping); 285 | } 286 | 287 | $QNAME =~ s/\/\d$//; 288 | 289 | my $read_id = $use_rg_id ? "$rg_id:$QNAME" : $QNAME; 290 | 291 | $rh_reads->{id}->{$read_id} = 1; 292 | } 293 | } 294 | 295 | 296 | } 297 | 298 | =head2 search_bam_records 299 | 300 | About : Extract sequences from SAM format file 301 | Usage : extract_seqs(\%Reference_SEQs, $sam_file); 302 | Args : Hash of reference sequences 303 | File in SAM format 304 | Returns : Null 305 | 306 | =cut 307 | sub extract_pairs 308 | { 309 | my ($rh_reads, $chrom, $start, $end) = @_; 310 | 311 | if ($extend_size) { 312 | $start -= $extend_size; 313 | $end += $extend_size; 314 | 315 | $start = 0 if $start < 0; 316 | } 317 | 318 | for (my $i=0; $i<@bam_files; $i++) 319 | { 320 | my $pipe_str = "samtools view $bam_files[$i] $chrom:$start-$end |"; 321 | 322 | if ($samtools_opts) { 323 | $pipe_str = "samtools view $samtools_opts $bam_files[$i] $chrom:$start-$end |"; 324 | } 325 | 326 | open (my $fh, $pipe_str) || die $!; 327 | while (<$fh>) 328 | { 329 | next if (/^@/ || /^\s+$/); ## skip header 330 | 331 | chomp(my $record = $_); 332 | 333 | my ($QNAME, $FLAG, $RNAME, $POS, $MAPQ, $CIGAR, 334 | $MRNM, $NPOS, $TLEN, $SEQ, $QUAL, @OPT) = (split /\s+/, $record); 335 | 336 | 337 | ## 338 | ## check if this record should be update 339 | ## 340 | if ($new_sam_records{"$QNAME\t$FLAG"}) { 341 | $record = $new_sam_records{"$QNAME\t$FLAG"}; 342 | 343 | ($QNAME, $FLAG, $RNAME, $POS, $MAPQ, $CIGAR, 344 | $MRNM, $NPOS, $TLEN, $SEQ, $QUAL, @OPT) = (split /\s+/, $record); 345 | } 346 | 347 | 348 | my $rg_id = ($record =~ /RG:Z:(.*?)\s+/); 349 | 350 | $QNAME =~ s/\/\d$//; 351 | 352 | my $read_id = $use_rg_id ? "$rg_id:$QNAME" : $QNAME; 353 | 354 | next unless($rh_reads->{id}->{$read_id}); 355 | 356 | next if ($min_seq_len && length($SEQ) < $min_seq_len); 357 | 358 | if (defined $max_clipping) { 359 | my $soft_clipped = ($CIGAR =~ /(\d+)S/) ? $1 : 0; 360 | my $hard_clipped = ($CIGAR =~ /(\d+)H/) ? 
$1 : 0; 361 | 362 | next if ($soft_clipped + $hard_clipped > $max_clipping); 363 | } 364 | 365 | 366 | if ($rh_reads->{id}->{$read_id}) { 367 | if (($FLAG & 16) && !$no_rc) { ## reverse strand 368 | $SEQ =~ tr/ATGCatgc/TACGtacg/; 369 | $SEQ = reverse $SEQ; 370 | 371 | $QUAL = reverse $QUAL; 372 | } 373 | 374 | my $pair_id = 0; 375 | if ($FLAG & 64) { ## first in pair 376 | $pair_id = 1; 377 | } 378 | elsif ($FLAG & 128) { ## second in pair 379 | $pair_id = 2; 380 | } 381 | 382 | if ($out_format eq 'fastq') { 383 | $rh_reads->{pairs}->{$read_id}->{$pair_id} = "$SEQ\n\+\n$QUAL"; 384 | } 385 | else { 386 | $rh_reads->{pairs}->{$read_id}->{$pair_id} = $use_rg_id ? "$rg_id:$record" : $record; 387 | } 388 | } 389 | } 390 | } 391 | } 392 | 393 | 394 | -------------------------------------------------------------------------------- /extract_split_seqs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # extract_split_seqs.pl -- Query fasta sequences 4 | # 5 | # 6 | # Author: Nowind 7 | # Created: 2012-05-31 8 | # Updated: 2017-02-05 9 | # Version: 1.0.0 10 | # 11 | # Change logs: 12 | # Version 1.0.0 17/02/05: The initial version. 13 | 14 | 15 | =head1 NAME 16 | 17 | extract_split_seqs.pl 18 | 19 | 20 | =head1 SYNOPSIS 21 | 22 | extract_split_seqs.pl --help/? 23 | 24 | =head1 DESCRIPTION 25 | 26 | Fasta format file related processes. 27 | 28 | =cut 29 | 30 | 31 | use strict; 32 | 33 | use Data::Dumper; 34 | use Getopt::Long; 35 | use File::Find::Rule; 36 | use File::Basename; 37 | 38 | use MyPerl::FileIO qw(:all); 39 | use MyPerl::Convert qw(:all); 40 | 41 | ################################# Main ############################### 42 | 43 | 44 | my $CMDLINE = "perl $0 @ARGV"; 45 | my $VERSION = '1.0.0'; 46 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 47 | my $SOURCE = (scalar localtime()) . " Version: $VERSION"; 48 | 49 | my $convert_pos = 'no'; 50 | my ($fasta_file, $output, $word_wrap, $query_file); 51 | GetOptions( 52 | "fasta=s" => \$fasta_file, 53 | "output=s" => \$output, 54 | 55 | "query=s" => \$query_file, 56 | 57 | "wordwrap=i" => \$word_wrap, ## Line feed for print 58 | 59 | "convert-pos=s" => \$convert_pos, 60 | ); 61 | 62 | unless( $query_file && $fasta_file ) { 63 | print < 74 | sequences file(s) in fasta format, required 75 | -q, --query 76 | query file with sequence ids need to be extracted 77 | 78 | -o, --output 79 | output file, default to STDOUT 80 | 81 | -w, --wordwrap 82 | line feed for print, only valid while output in fasta format 83 | 84 | -c, --convert-pos 85 | convert protein positions to cds positions "pro2cds" or 86 | vice versa "cds2pro" 87 | 88 | EOF 89 | 90 | exit(1); 91 | } 92 | 93 | $|++; 94 | 95 | 96 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 97 | 98 | if ($output) { 99 | open (STDOUT, "> $output") || die $!; 100 | } 101 | 102 | ## read into all sequences 103 | my %input_seqs = (); 104 | print STDERR ">> Start reading sequences from $fasta_file ... "; 105 | parse_fasta_SEQs(\%input_seqs, $fasta_file); 106 | print STDERR "done!\n"; 107 | 108 | 109 | ## 110 | ## query sequences 111 | ## 112 | print STDERR ">> Start querying $query_file ... "; 113 | extract_split_seqs($query_file); 114 | print STDERR "done!\n"; 115 | 116 | print STDERR "# " . (scalar localtime()) . "\n"; 117 | 118 | 119 | 120 | 121 | ######################### Sub ######################### 122 | 123 | 124 | =head2 extract_seqs 125 | 126 | About : Query sequence data. 
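          Each query line gives a sequence id and a range list such as
          "start1-end1;start2-end2"; overlapping ranges are merged before the
          sub-sequences are extracted and concatenated.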
127 | Usage : extract_seqs($fasta_file); 128 | Args : Source fasta file. 129 | Returns : Null 130 | 131 | =cut 132 | sub extract_split_seqs 133 | { 134 | my ($in) = @_; 135 | 136 | my $fh = getInputFilehandle($in); 137 | while (<$fh>) 138 | { 139 | next if (/\#/ || /^\s+$/); 140 | 141 | my ($query_id, $ranges) = (split /\s+/, $_)[0,1]; 142 | 143 | if ($input_seqs{$query_id}) { 144 | my $seq = $input_seqs{$query_id}; 145 | $seq =~ s/\*$//; 146 | 147 | my @ranges = (split /\;/, $ranges); 148 | @ranges = sort {(split /\-/, $a)[0] <=> (split /\-/, $b)[0]} @ranges; 149 | 150 | for (my $i=0; $i+1<@ranges; $i++) 151 | { 152 | my ($prev_start, $prev_end) = (split /\-/, $ranges[$i]); 153 | my ($next_start, $next_end) = (split /\-/, $ranges[$i+1]); 154 | 155 | if ( $prev_end >= $next_start ) { 156 | $ranges[$i] = "$prev_start\-$next_end"; 157 | splice(@ranges, $i+1, 1); 158 | } 159 | } 160 | 161 | my @out_ranges = (); 162 | 163 | my @seqs = (); 164 | for my $range (@ranges) 165 | { 166 | my ($start, $end) = (split /\-/, $range); 167 | 168 | if ($convert_pos eq 'cds2pro') { 169 | $start = ($start+2)/3; 170 | $end = $end/3; 171 | } 172 | elsif ($convert_pos eq 'pro2cds') { 173 | $start = $start*3-2; 174 | $end = $end*3; 175 | } 176 | 177 | push @out_ranges, "$start-$end"; 178 | 179 | my $sub_seq = substr($seq, $start-1, $end-$start+1); 180 | push @seqs, $sub_seq; 181 | } 182 | 183 | unless (@seqs > 0) { 184 | print STDERR "$query_id\t$ranges\n";exit; 185 | } 186 | 187 | my $joined_seq = join '', @seqs; 188 | my $out_ranges = join ';', @out_ranges; 189 | 190 | print STDOUT ">$query_id $out_ranges\n$joined_seq\n"; 191 | } 192 | else { 193 | print STDERR "no record found: $query_id\n"; 194 | } 195 | } 196 | } 197 | 198 | 199 | -------------------------------------------------------------------------------- /gff2fasta.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # gff2fasta.pl -- Extract sequences from gff3 file to fasta file 4 | # 5 | # 6 | # Author: Nowind 7 | # Created: 2012-05-31 8 | # Updated: 2017-05-25 9 | # Version: 1.1.2 10 | # 11 | # Change logs: 12 | # Version 1.0.0 13/11/13: The initial version. 13 | # Version 1.1.0 15/02/12: Skip loci where chromosome sequence is not given; add 14 | # and change several options. 15 | # Version 1.1.1 16/05/06: Bug fixed: use Parent id instead of ID field in CDS. 16 | # Version 1.1.2 17/05/25: Bug fixed: unused features other than mRNA or CDS would cause 17 | # script abort due to no IDs present; cds sequences contain 18 | # lowercased nucleotides would cause complement failure in reverse strand; 19 | # Update: output sequences in the same order as input gff file. 20 | 21 | 22 | 23 | 24 | 25 | =head1 NAME 26 | 27 | gff2fasta.pl 28 | 29 | 30 | =head1 SYNOPSIS 31 | 32 | gff2fasta.pl --help/? 33 | 34 | =head1 DESCRIPTION 35 | 36 | Extract sequences from gff3 file to fasta file 37 | 38 | =cut 39 | 40 | 41 | 42 | 43 | use strict; 44 | 45 | use Data::Dumper; 46 | use Getopt::Long; 47 | use File::Find::Rule; 48 | use File::Basename; 49 | 50 | use MyPerl::FileIO qw(:all); 51 | use MyPerl::Convert; 52 | use MyPerl::Compare; 53 | 54 | ######################### Main ######################### 55 | 56 | my $CMDLINE = "perl $0 @ARGV"; 57 | my $VERSION = '1.1.2'; 58 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 59 | my $SOURCE = (scalar localtime()) . 
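# Example invocations (hypothetical file names):
#   perl gff2fasta.pl --gff genes.gff3 --seqs genome.fa -R mRNA -o mrna.fa
#   perl gff2fasta.pl --gff genes.gff3 --seqs genome.fa -R cds -F tabular > cds.tsv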
" Version: $VERSION"; 60 | 61 | 62 | my $out_features = "mRNA"; 63 | my $out_format = "fasta"; 64 | my ($gff_file, $output, $ref_seq, $out_details, $word_wrap, $show_help); 65 | GetOptions( 66 | "i|gff=s" => \$gff_file, 67 | "s|seqs=s" => \$ref_seq, 68 | 69 | "output=s" => \$output, 70 | 71 | "F|format=s" => \$out_format, 72 | 73 | "R|features=s" => \$out_features, 74 | 75 | "wordwrap=i" => \$word_wrap, 76 | 77 | "details" => \$out_details, 78 | 79 | "help|?" => \$show_help, 80 | ); 81 | 82 | unless( !$show_help && $gff_file && $ref_seq ) { 83 | print < 93 | annotation file in gff3 format, required 94 | -s, --seqs 95 | reference sequences in fasta format, required 96 | 97 | -o, --output 98 | output filename, default to STDOUT 99 | 100 | -F, --format 101 | output file format, can be set to fasta or tabular, 102 | default: fasta 103 | 104 | -R, --features 105 | output features, mRNA or cds, default: mRNA 106 | 107 | -w, --wordwrap 108 | line feed for print 109 | 110 | -d, --details 111 | output verbose info in header line 112 | 113 | -?, --help 114 | show this help message 115 | 116 | EOF 117 | 118 | exit(1); 119 | } 120 | 121 | $|++; 122 | 123 | 124 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 125 | 126 | 127 | if ($output) { 128 | open (STDOUT, "> $output") || die $!; 129 | } 130 | 131 | $out_features = lc($out_features); 132 | 133 | print STDERR ">> Start reading $ref_seq ... "; 134 | my %SEQs = (); 135 | my @ids = parse_fasta_SEQs(\%SEQs, $ref_seq); 136 | print STDERR "done!\n"; 137 | 138 | 139 | 140 | 141 | print STDERR ">> Start converting $gff_file ... "; 142 | convert_gff2seqs($gff_file); 143 | print STDERR "done!\n"; 144 | 145 | print STDERR "# " . (scalar localtime()) . "\n"; 146 | 147 | ######################### Sub ######################### 148 | 149 | 150 | 151 | =head2 convert_gff2seqs 152 | 153 | About : Convert annotation file from gff format to table-delimited format. 154 | Usage : convert_gff2tables($gff_file, \%representative_trans); 155 | Args : Annotation file in gff3 format. 
156 | Returns : Null 157 | =cut 158 | sub convert_gff2seqs 159 | { 160 | my ($in) = @_; 161 | 162 | my %gff_features = (); 163 | 164 | my @out_chrs = (); 165 | my %out_ids = (); 166 | my %dups = (); 167 | my $fh = getInputFilehandle($in); 168 | while (<$fh>) 169 | { 170 | chomp; 171 | my ($chrom, $source, $feature, $start, $end, 172 | $score, $strand, $frame, $attribute) = (split /\t/); 173 | 174 | next unless($SEQs{$chrom} && (($feature eq 'mRNA') || ($feature eq 'CDS'))); 175 | 176 | my $ID = ''; 177 | 178 | if ($feature eq 'CDS' && $attribute =~ /Parent\=(.*?)(,|;|$)/) { ## CDS 179 | $ID = $1; 180 | } 181 | elsif ($attribute =~ /^ID\=(.*?)(,|;|$)/) { ## mRNA 182 | $ID = $1; 183 | } 184 | 185 | unless( $ID ) { 186 | print STDERR "Error: No ID info found for line $.: $_\n"; exit; 187 | } 188 | 189 | if ($feature eq 'mRNA') { 190 | $gff_features{gene}->{$chrom}->{$ID} = [$start, $end, $strand, $frame]; 191 | } 192 | elsif ($feature eq 'CDS') { 193 | push @{$gff_features{cds}->{$chrom}->{$ID}}, [$start, $end, $strand, $frame]; 194 | } 195 | 196 | unless($dups{chrom}->{$chrom}) { 197 | push @out_chrs, $chrom; $dups{chrom}->{$chrom}++; 198 | } 199 | 200 | unless($dups{$chrom}->{$ID}) { 201 | push @{$out_ids{$chrom}}, $ID; $dups{$chrom}->{$ID}++; 202 | } 203 | } 204 | 205 | if ($out_features eq 'mrna') { 206 | for my $chrom (@out_chrs) ## sort keys %{$gff_features{gene}} 207 | { 208 | #my @ids = sort { $gff_features{gene}->{$chrom}->{$a}->[0] <=> 209 | # $gff_features{gene}->{$chrom}->{$b}->[0] } 210 | # keys %{$gff_features{gene}->{$chrom}}; 211 | 212 | for my $id (@{$out_ids{$chrom}}) 213 | { 214 | my ($start, $end, $strand) = @{$gff_features{gene}->{$chrom}->{$id}}; 215 | 216 | my $strand_info = ($strand eq "+") ? "FORWARD" : "REVERSE"; 217 | my $seq_length = $end - $start + 1; 218 | 219 | my $out_id = $out_details ? 
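## with --details the fasta header also carries chromosome coordinates,
## strand and sequence length; otherwise only the feature ID is printed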
"$id\t$chrom($start..$end)\t$strand_info\tLENGTH=$seq_length" : $id; 220 | 221 | my $seq = substr($SEQs{$chrom}, $start-1, $end-$start+1); 222 | 223 | if ($strand eq '-') { 224 | $seq =~ tr/ATGC/TACG/; 225 | $seq = reverse $seq; 226 | } 227 | 228 | if ($out_format eq 'tabular') { 229 | print STDOUT "$out_id\t$seq\n"; 230 | } 231 | else { 232 | print STDOUT format_fasta_SEQs($out_id, \$seq, $word_wrap); 233 | } 234 | 235 | } 236 | } 237 | } 238 | elsif ($out_features eq 'cds') { 239 | for my $chrom (@out_chrs) ## sort keys %{$gff_features{cds}} 240 | { 241 | my %cds_seqs = (); 242 | 243 | for my $id (keys %{$gff_features{cds}->{$chrom}}) 244 | { 245 | my @cds_parts = @{$gff_features{cds}->{$chrom}->{$id}}; 246 | 247 | if ($cds_parts[0]->[2] eq '+') { 248 | @cds_parts = sort { $a->[0] <=> $b->[0] } @cds_parts; 249 | 250 | $cds_seqs{$id}->{start} = $cds_parts[0]->[0]; 251 | $cds_seqs{$id}->{strand} = '+'; 252 | 253 | for (my $i=0; $i<@cds_parts; $i++) 254 | { 255 | my ($start, $end, $strand, $frame) = @{$cds_parts[$i]}; 256 | 257 | if ($i == 0 && $frame > 0) { 258 | $start += $frame; 259 | } 260 | 261 | my $seq = substr($SEQs{$chrom}, $start-1, $end-$start+1); 262 | 263 | push @{$cds_seqs{$id}->{seq}}, $seq; 264 | push @{$cds_seqs{$id}->{pos}}, "$start..$end"; 265 | } 266 | } 267 | elsif ($cds_parts[0]->[2] eq '-') { 268 | @cds_parts = sort { $b->[0] <=> $a->[0] } @cds_parts; 269 | 270 | $cds_seqs{$id}->{start} = $cds_parts[-1]->[0]; 271 | $cds_seqs{$id}->{strand} = '-'; 272 | 273 | 274 | for (my $i=0; $i<@cds_parts; $i++) 275 | { 276 | my ($start, $end, $strand, $frame) = @{$cds_parts[$i]}; 277 | 278 | if ($i == 0 && $frame > 0) { 279 | $end -= $frame; 280 | } 281 | 282 | my $seq = substr($SEQs{$chrom}, $start-1, $end-$start+1); 283 | $seq =~ tr/atgcATGC/tacgTACG/; 284 | $seq = reverse $seq; 285 | 286 | push @{$cds_seqs{$id}->{seq}}, $seq; 287 | push @{$cds_seqs{$id}->{pos}}, "$start..$end"; 288 | } 289 | } 290 | } 291 | 292 | 293 | #my @cds_ids = sort { $cds_seqs{$a}->{start} <=> 294 | # $cds_seqs{$b}->{start} } keys %cds_seqs; 295 | for my $id (@{$out_ids{$chrom}}) 296 | { 297 | next unless($cds_seqs{$id}); 298 | 299 | my $cds_seq = join '', @{$cds_seqs{$id}->{seq}}; 300 | my $cds_pos = join ',', @{$cds_seqs{$id}->{pos}}; 301 | my $strand = $cds_seqs{$id}->{strand}; 302 | 303 | my $strand_info = ($strand eq "+") ? "FORWARD" : "REVERSE"; 304 | my $seq_length = length($cds_seq); 305 | 306 | my $out_id = $out_details ? "$id\t$chrom($cds_pos)\t$strand_info\tLENGTH=$seq_length" : $id; 307 | 308 | if ($out_format eq 'tabular') { 309 | print STDOUT "$out_id\t$cds_seq\n"; 310 | } 311 | else { 312 | print STDOUT format_fasta_SEQs($out_id, \$cds_seq, $word_wrap); 313 | } 314 | } 315 | } 316 | } 317 | } 318 | 319 | 320 | 321 | -------------------------------------------------------------------------------- /gff2tables.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # gff2tables.pl -- Convert gff3 file to table-delimited file 4 | # 5 | # 6 | # Author: Nowind 7 | # Created: 2012-05-31 8 | # Updated: 2017-05-22 9 | # Version: 1.0.5 10 | # 11 | # Change logs: 12 | # Version 1.0.0 13/04/14: The initial version. 13 | # Version 1.0.1 13/05/15: Add locus id and strand info in output results. 14 | # Version 1.0.2 13/06/28: Add options "--all-features", "--up-length" and "--down-length" to 15 | # output all features including intergenic regions. 
16 | # Version 1.0.3 13/06/29: Add function annotations to output; bug fixed: parent ids not assigned 17 | # for each mRNA record while "--remove-alt" is omitted. 18 | # Version 1.0.4 15/12/14: Bug fixed: skip header lines; Updated: output promoter regions. 19 | # Version 1.0.5 17/05/22: Bug fixed: skip features without ID present. 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | =head1 NAME 28 | 29 | gff2tables.pl 30 | 31 | 32 | =head1 SYNOPSIS 33 | 34 | gff2tables.pl --help/? 35 | 36 | =head1 DESCRIPTION 37 | 38 | Convert gff3 file to table-delimited file 39 | 40 | =cut 41 | 42 | 43 | 44 | 45 | use strict; 46 | 47 | use Data::Dumper; 48 | use Getopt::Long; 49 | use File::Find::Rule; 50 | use File::Basename; 51 | 52 | use MyPerl::FileIO qw(:all); 53 | use MyPerl::Convert; 54 | use MyPerl::Compare; 55 | 56 | ######################### Main ######################### 57 | 58 | my $CMDLINE = "perl $0 @ARGV"; 59 | my $VERSION = '1.0.5'; 60 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 61 | my $SOURCE = (scalar localtime()) . " Version: $VERSION"; 62 | 63 | 64 | my $promoter_len = 500; 65 | my ($gff_file, $output, $remove_alt, $out_all_features, 66 | $no_functions, @flank_lens, $show_help); 67 | GetOptions( 68 | "i|gff=s" => \$gff_file, 69 | 70 | "output=s" => \$output, 71 | 72 | "remove-alt" => \$remove_alt, 73 | 74 | "all-features" => \$out_all_features, 75 | 76 | "flank-length=i{,}" => \@flank_lens, 77 | 78 | "promoter=i" => \$promoter_len, 79 | 80 | "no-funcs" => \$no_functions, 81 | 82 | "help|?" => \$show_help, 83 | ); 84 | 85 | unless( !$show_help && $gff_file ) { 86 | print <<EOF; 96 | annotation file in gff3 format, required 97 | 98 | -o, --output 99 | output filename, default to STDOUT 100 | 101 | -r, --remove-alt 102 | remove alternative splicings, only retain the longest one as the 103 | representative transcript 104 | 105 | -a, --all-features 106 | output all features including intergenic regions, upstream and downstream 107 | regions of genes 108 | 109 | -p, --promoter 110 | regions within the specified length upstream of transcription start sites 111 | will be annotated as promoter regions, default: 500bp 112 | 113 | -f, --flank-length 114 | length(s) of flanking regions, default: 5000bp 115 | this option is valid only when the -a option is specified 116 | 117 | -n, --no-funcs 118 | do not output function annotations 119 | 120 | -?, --help 121 | show this help message 122 | 123 | EOF 124 | 125 | exit(1); 126 | } 127 | 128 | $|++; 129 | 130 | 131 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 132 | 133 | 134 | if ($output) { 135 | open (STDOUT, "> $output") || die $!; 136 | } 137 | 138 | unless( @flank_lens > 0 ) { 139 | @flank_lens = (5000); 140 | } 141 | 142 | 143 | my %Locus_ids = (); 144 | my %representative_trans = (); 145 | print STDERR ">> Start checking alternative splicings ... "; 146 | get_represent_trans($gff_file); 147 | print STDERR "done!\n"; 148 | 149 | 150 | print STDERR ">> Start converting $gff_file ... "; 151 | convert_gff2tables($gff_file); 152 | print STDERR "done!\n"; 153 | 154 | print STDERR "# " . (scalar localtime()) . "\n"; 155 | 156 | ######################### Sub ######################### 157 | 158 | 159 | 160 | =head2 get_represent_trans 161 | 162 | About : Get the longest transcript as the representative transcript when 163 | alternative splicing is present. 164 | Usage : get_represent_trans($gff_file); 165 | Args : Annotation file in gff format.
166 | Returns : Null 167 | 168 | =cut 169 | sub get_represent_trans 170 | { 171 | my ($in) = @_; 172 | 173 | my %mRNAs_all = (); 174 | 175 | my $fh = getInputFilehandle($in); 176 | while (<$fh>) 177 | { 178 | next if (/^\#/ || /^\s+$/); 179 | 180 | chomp; 181 | my ($chrom, $source, $feature, $start, $end, 182 | $score, $strand, $frame, $attribute) = (split /\t/); 183 | 184 | 185 | if ( $feature =~ /(mRNA|pseudogenic\_transcript)/ ) { ## mRNA 186 | m{ 187 | ^(.*?)\s+.*? # Chromosome ID 188 | \s+(\d+) # Start Position 189 | \s+(\d+)\s+.* # End Position 190 | \s+(\-|\+)\s+\. # Strand 191 | \s+ID\=(.*?)\;.* # ID 192 | (Parent|Locus_id)\=(.*?)(;|$) # Parent 193 | }x; 194 | 195 | my ($ID, $Parent) = ($5, $7); 196 | 197 | $Locus_ids{$ID} = $Parent; 198 | 199 | unless( $ID ) { 200 | #print STDERR "Error: No ID info found for line $.: $_"; 201 | next; 202 | } 203 | 204 | ###unless( $Parent ) { 205 | ### print STDERR "Error: No Parent ID info found for line $.: $_"; exit; 206 | ###} 207 | 208 | my $length = $end - $start + 1; 209 | 210 | if ($mRNAs_all{$Parent}->{represent}) { 211 | if ($mRNAs_all{$Parent}->{length} < $length) { 212 | $mRNAs_all{$Parent}->{represent} = $ID; 213 | $mRNAs_all{$Parent}->{length} = $length; 214 | } 215 | } 216 | else { 217 | $mRNAs_all{$Parent}->{represent} = $ID; 218 | $mRNAs_all{$Parent}->{length} = $length; 219 | } 220 | } 221 | } 222 | 223 | for my $mRNA (keys %mRNAs_all) 224 | { 225 | my $represent_id = $mRNAs_all{$mRNA}->{represent}; 226 | 227 | $representative_trans{$represent_id} = $mRNA; 228 | } 229 | } 230 | 231 | 232 | 233 | =head2 convert_gff2tables 234 | 235 | About : Convert annotation file from gff format to table-delimited format. 236 | Usage : convert_gff2tables($gff_file, \%representative_trans); 237 | Args : Annotation file in gff3 format. 238 | Returns : Null 239 | Note : This function can be easily replaced by using awk, e.g. 240 | awk -F'\t' 'BEGIN{OFS="\t"} {print $1,$3,$4,$5,$9;}' $gff_file | \ 241 | sed 's/Parent=//' | \ 242 | sed -r 's/ID=(.*);Name.*Locus_id=(\w+);.*/\1/g' 243 | 244 | =cut 245 | sub convert_gff2tables 246 | { 247 | my ($in) = @_; 248 | 249 | print "$HEADER##" . (scalar localtime()) . 
"\n"; 250 | print "#chrom\tfeature\tstart\tend\tID\tstrand\tLocus\n"; 251 | 252 | my $fh = getInputFilehandle($in); 253 | while (<$fh>) 254 | { 255 | next if (/^\#/ || /^\s+$/); 256 | 257 | chomp; 258 | my ($chrom, $source, $feature, $start, $end, 259 | $score, $strand, $frame, $attribute) = (split /\t/); 260 | 261 | next if ($feature eq 'gene'); 262 | 263 | my $ID = ''; 264 | 265 | if ($feature eq 'mRNA' && $attribute =~ /^ID\=(.*?)\;/) { ## mRNA 266 | $ID = $1; 267 | } 268 | elsif ($attribute =~ /Parent\=(.*?)(;|$)/) { 269 | $ID = $1; 270 | } 271 | 272 | unless( $ID && $Locus_ids{$ID} ) { 273 | #print STDERR "Error: No ID info found for line $.: $_\n"; 274 | next; 275 | } 276 | 277 | next if ($remove_alt && !$representative_trans{$ID}); 278 | 279 | if ($feature eq 'mRNA') { 280 | if ($out_all_features) { 281 | for my $f_len (@flank_lens) 282 | { 283 | my $promoter_start = $end + 1; 284 | my $promoter_end = $start - 1; 285 | 286 | my $aft_start = $end + 1; 287 | my $bef_end = $start - 1; 288 | 289 | 290 | if ($strand eq '+') { 291 | $promoter_start = $promoter_end - $promoter_len + 1; 292 | $bef_end = $promoter_start - 1; 293 | } 294 | else { 295 | $promoter_end = $promoter_start + $promoter_len - 1; 296 | $aft_start = $promoter_end + 1; 297 | } 298 | 299 | my $bef_start = $bef_end - $f_len + 1; 300 | my $aft_end = $aft_start + $f_len - 1; 301 | 302 | $bef_start = 1 if ($bef_start <= 0); 303 | 304 | 305 | my $bef_flag = ($strand eq '+') ? 'upstream' : 'downstream'; 306 | my $aft_flag = ($strand eq '+') ? 'downstream' : 'upstream'; 307 | 308 | $bef_flag .= '_' . $f_len/1000 . 'k'; 309 | $aft_flag .= '_' . $f_len/1000 . 'k'; 310 | 311 | print "$chrom\t$bef_flag\t$bef_start\t$bef_end\t$ID\t$strand\t$Locus_ids{$ID}\n"; 312 | print "$chrom\tpromoter_$promoter_len\t$promoter_start\t$promoter_end\t$ID\t$strand\t$Locus_ids{$ID}\n"; 313 | print "$chrom\t$aft_flag\t$aft_start\t$aft_end\t$ID\t$strand\t$Locus_ids{$ID}\n"; 314 | } 315 | } 316 | 317 | my @attrs = (split /;/, $attribute); 318 | my @funcs = (); 319 | for my $attr (@attrs) 320 | { 321 | if ($attr =~ /(Note=|GO=|InterPro=|CGSNL)/){ 322 | push @funcs, $attr; 323 | } 324 | } 325 | 326 | if (@funcs > 0 && !$no_functions) { 327 | my $funcs = join ";", @funcs; 328 | print "$chrom\t$feature\t$start\t$end\t$ID\t$strand\t$Locus_ids{$ID}\t$funcs\n"; 329 | next; 330 | } 331 | } 332 | 333 | print "$chrom\t$feature\t$start\t$end\t$ID\t$strand\t$Locus_ids{$ID}\n"; 334 | } 335 | } 336 | 337 | 338 | 339 | -------------------------------------------------------------------------------- /ggplot-liner-fit.R: -------------------------------------------------------------------------------- 1 | #******************************************************** 2 | # ggplot-windows-count.R -- Plot number of varaints in each window along each chromosomes using ggplot. 3 | # 4 | # 5 | # Author: Nowind 6 | # Created: 2013-07-10 7 | # Updated: 2014-11-28 8 | # Version: 1.0.0 9 | # 10 | # Change logs: 11 | # Version 1.0.0 14/11/28: The initial version. 
12 | #********************************************************* 13 | 14 | 15 | library(lsr) 16 | library(reshape2) 17 | library(ggplot2) 18 | library(grid) 19 | 20 | 21 | 22 | 23 | 24 | #********************************************************* 25 | ## plot distributions of markers 26 | infile <- commandArgs(TRUE)[1] 27 | out_tiff <- commandArgs(TRUE)[2] 28 | 29 | dat <- as.data.frame(read.table(infile, header=T, sep="\t")) 30 | 31 | out_height <- 3200 32 | tiff(out_tiff, width=3200, height=out_height, units='px', res=600, compression='lzw', bg='transparent') 33 | 34 | 35 | ggplotRegression <- function (fit) { 36 | 37 | require(ggplot2) 38 | 39 | ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + 40 | geom_point(shape=1) + 41 | xlab("Divergence") + 42 | ylab("Diversity") + 43 | stat_smooth(method = "lm", se=FALSE, col = "red") + 44 | labs(title = paste( 45 | "R2 = ",signif(summary(fit)$r.squared, 3), 46 | ", Adj R2 = ",signif(summary(fit)$adj.r.squared, 3), 47 | ", Intercept =",signif(fit$coef[[1]], 3), 48 | ", Slope =",signif(fit$coef[[2]], 3), 49 | ", P =",signif(summary(fit)$coef[2,4], 3), sep = "")) + 50 | theme(plot.title = element_text(size = 10)) 51 | } 52 | 53 | fit <- lm(Divergence ~ Diversity, data = dat) 54 | ggplotRegression(fit) 55 | #********************************************************* 56 | 57 | 58 | #lm_eqn = function(df){ 59 | # m = lm(y ~ x, df); 60 | # eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, 61 | # list(a = format(coef(m)[1], digits = 2), 62 | # b = format(coef(m)[2], digits = 2), 63 | # r2 = format(summary(m)$r.squared, digits = 3))) 64 | # as.character(as.expression(eq)); 65 | #} 66 | # 67 | # 68 | # 69 | #ggplot(dat, aes(x=x, y=y)) + 70 | # geom_point(shape=1) + 71 | # scale_colour_hue(l=50) + 72 | # xlab("Divergence(1Mbp,all)") + 73 | # ylab("Diversity(1Mbp,all)") + 74 | # geom_smooth(method='lm',se=FALSE, color="black", formula=y ~ x) + 75 | # theme_bw() + 76 | # theme(legend.justification=c(1,0), legend.position=c(1,0)) + 77 | # geom_text(aes(x = 0.03, y = 0.0045, label = lm_eqn(dat)), parse = TRUE) 78 | 79 | warnings() 80 | 81 | -------------------------------------------------------------------------------- /ggplot-windows-count.R: -------------------------------------------------------------------------------- 1 | #******************************************************** 2 | # ggplot-windows-count.R -- Plot number of varaints in each window along each chromosomes using ggplot. 3 | # 4 | # 5 | # Author: Nowind 6 | # Created: 2013-07-10 7 | # Updated: 2017-05-30 8 | # Version: 1.0.0 9 | # 10 | # Change logs: 11 | # Version 1.0.0 14/11/28: The initial version. 12 | # Version 1.1.0 17/05/30: Add support for other figure format. 
13 | #********************************************************* 14 | 15 | 16 | library(lsr) 17 | library(reshape2) 18 | library(ggplot2) 19 | library(grid) 20 | library(tools) 21 | 22 | 23 | 24 | #********************************************************* 25 | ## plot distributions of markers 26 | varfile <- commandArgs(TRUE)[1] 27 | out_fig <- commandArgs(TRUE)[2] 28 | 29 | variants <- as.data.frame(read.table(varfile, header=T, sep="\t")) 30 | 31 | chrom_num <- length(unique(variants$CHROM)) 32 | out_height <- chrom_num * 600 33 | 34 | if (file_ext(out_fig) == "png") { 35 | png(out_fig, width=4800, height=out_height, units='px', res=600, bg='transparent') 36 | } else if (file_ext(out_fig) == "tiff") { 37 | tiff(out_fig, width=4800, height=out_height, units='px', res=600, compression='lzw', bg='transparent') 38 | } else if (file_ext(out_fig) == "jpg") { 39 | jpeg(out_fig, width=4800, height=out_height, units='px', res=600, bg='white') 40 | } else if (file_ext(out_fig) == "bmp") { 41 | bmp(out_fig, width=4800, height=out_height, units='px', res=600, bg='transparent') 42 | } else if (file_ext(out_fig) == "svg") { 43 | svg(out_fig, width=12, height=out_height/400, pointsize=12, onefile=TRUE, family="sans", bg='transparent') 44 | } else if (file_ext(out_fig) == "pdf") { 45 | cairo_pdf(out_fig, width=12, height=out_height/400, pointsize=12, onefile=TRUE, family="sans", bg='transparent', fallback_resolution = 600) 46 | } else if (file_ext(out_fig) == "ps") { 47 | cairo_ps(out_fig, width=12, height=out_height/400, pointsize=12, onefile=TRUE, family="sans", bg='transparent', fallback_resolution = 600) 48 | } 49 | 50 | ggplot(variants, aes(x=BIN_ID, y=COUNT, colour=Sample, group=Sample)) + 51 | geom_line() + 52 | xlab("Windows(x500kbp)") + 53 | ylab("Number of Markers") + 54 | scale_colour_hue(name="Sample", 55 | breaks=unique(variants$Sample), 56 | labels=unique(variants$Sample), 57 | l=40) + 58 | theme_bw() + 59 | theme(legend.justification=c(1,0), legend.position=c(1,0)) + 60 | facet_grid(CHROM ~ .) 61 | #********************************************************* 62 | 63 | warnings() 64 | 65 | -------------------------------------------------------------------------------- /ggplot-windows-multi.R: -------------------------------------------------------------------------------- 1 | #******************************************************** 2 | # ggplot-windows-multi.R -- Plot number of each windows along chromosomes using ggplot, support multi-sample input. 3 | # 4 | # 5 | # Author: Nowind 6 | # Created: 2013-07-10 7 | # Updated: 2014-11-29 8 | # Version: 1.0.0 9 | # 10 | # Change logs: 11 | # Version 1.0.0 14/11/29: The initial version. 
12 | #********************************************************* 13 | 14 | 15 | library(lsr) 16 | library(reshape2) 17 | library(ggplot2) 18 | library(grid) 19 | 20 | 21 | 22 | #********************************************************* 23 | ## plot distributions of markers 24 | varfile <- commandArgs(TRUE)[1] 25 | out_tiff <- commandArgs(TRUE)[2] 26 | 27 | variants <- as.data.frame(read.table(varfile, header=T, sep="\t")) 28 | 29 | chrom_num <- length(unique(variants$chrom)) 30 | out_height <- chrom_num * 600 31 | tiff(out_tiff, width=4800, height=out_height, units='px', res=600, compression='lzw', bg='transparent') 32 | 33 | variants_melted <- melt(variants, id.vars=c("chrom", "interval", "bin")) 34 | 35 | ggplot(variants_melted, aes(x=bin, y=value, colour=variable, group=variable)) + 36 | geom_line() + 37 | xlab("Windows(x500kbp)") + 38 | ylab("Number of Variants") + 39 | scale_colour_hue(name="variable", 40 | breaks=unique(variants_melted$variable), 41 | labels=unique(variants_melted$variable), 42 | l=40) + 43 | theme_bw() + 44 | theme(legend.justification=c(1,0), legend.position=c(1,0)) + 45 | facet_grid(chrom ~ ., scales="free_y") 46 | #********************************************************* 47 | 48 | warnings() 49 | 50 | -------------------------------------------------------------------------------- /intervals2bed: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # intervals2bed.sh - convert samtools intervals into bed format 3 | # 4 | # Author: Nowind 5 | # Created: 2013-07-26 6 | # Updated: 2013-07-26 7 | # 8 | # Change logs: 9 | # Version 1.0.0 13/07/26: The initial version. 10 | 11 | if [ -z $1 ]; then 12 | echo "usage: `basename $0` <*.intervals> [threshold]"; 13 | exit 0; 14 | fi 15 | 16 | intervals_file=$1 17 | 18 | if [ -z $2 ]; then 19 | grep "-" $intervals_file | sed 's/\-/\t/' | sed 's/\:/\t/' | \ 20 | awk 'BEGIN{OFS="\t"} {print $1,($2-1),$3;}' 21 | else 22 | grep "-" $intervals_file | sed 's/\-/\t/' | sed 's/\:/\t/' | \ 23 | awk -v threshold=$2 'BEGIN{OFS="\t"} {len=$3-$2+1; if (len >= threshold) print $1,($2-1),$3;}' 24 | fi 25 | -------------------------------------------------------------------------------- /map_pos2intervals.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # map_pos2intervals.pl -- Get the information of intervals where the query position reside. 4 | # 5 | # 6 | # Author: Nowind 7 | # Created: 2012-05-31 8 | # Updated: 2016-06-04 9 | # Version: 1.2.1 10 | # 11 | # Change logs: 12 | # Version 1.0.0 15/08/18: The initial version. 13 | # Version 1.0.1 15/11/14: Bug fixed: no default value assigned for row numbers. 14 | # Version 1.1.0 16/05/25: Updated: Remove prerequisites of sample rows. 15 | # Version 1.2.0 16/05/28: Updated: Output all original records in query file; 16 | # write all mapped subject records in a single line. 17 | # Version 1.2.1 16/06/04: Updated: update output file header lines. 
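Before the code of map_pos2intervals.pl, a minimal standalone sketch of the lookup it performs, reporting every subject interval that contains a query position; the chromosome names, coordinates and column layout below are invented for illustration and are not part of the original script:

```perl
#!/usr/bin/perl
# Illustrative only: report each (chrom, start, end) interval containing a query position.
use strict;
use warnings;

my @intervals = ( ["chr01", 100, 500], ["chr01", 450, 900], ["chr02", 10, 200] );
my @queries   = ( ["chr01", 480], ["chr02", 300] );

for my $q (@queries) {
    my ($chrom, $pos) = @$q;
    my @hits = grep { $_->[0] eq $chrom && $pos >= $_->[1] && $pos <= $_->[2] } @intervals;
    my $mapped = @hits ? join("|", map { join(",", @$_) } @hits) : "N/A";
    print "$chrom\t$pos\t$mapped\n";    # chr01:480 falls in both overlapping intervals
}
```

The script itself additionally groups queries by an optional sample column and echoes the full original query line next to the mapped intervals.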
18 | 19 | 20 | use strict; 21 | 22 | use Data::Dumper; 23 | use Getopt::Long; 24 | use File::Find::Rule; 25 | use File::Basename; 26 | 27 | use MyPerl::FileIO qw(:all); 28 | 29 | ################################# Main ############################### 30 | 31 | 32 | my $CMDLINE = "perl $0 @ARGV"; 33 | my $VERSION = '1.2.1'; 34 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 35 | 36 | 37 | my ($query_file, $subject_file, $output, @query_rows, @subject_rows); 38 | GetOptions( 39 | "query=s" => \$query_file, 40 | "subject=s" => \$subject_file, 41 | "output=s" => \$output, 42 | "Q|rows1=i{,}" => \@query_rows, 43 | "S|rows2=i{,}" => \@subject_rows, 44 | ); 45 | 46 | unless( $query_file && $subject_file ) { 47 | print < 57 | query file, required 58 | -s, --subject 59 | subject file, required 60 | -o, --output 61 | output file, default to STDOUT 62 | 63 | -Q, --rows1 64 | specify chromosome, position and sample(optional) rows in query file; 65 | if the sample row was used, the subject file should specify the same 66 | field [default: 0 1] 67 | -S, --rows2 68 | specify rows of range (chromosome, start, end, [sample]) in subject 69 | file [default: 0 1 2] 70 | 71 | EOF 72 | 73 | exit(1); 74 | } 75 | 76 | $|++; 77 | 78 | 79 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 80 | 81 | if ($output) { 82 | open (STDOUT, "> $output") || die $!; 83 | } 84 | 85 | 86 | unless(@query_rows >= 1){ @query_rows = (0, 1) }; 87 | unless(@subject_rows >= 1){ @subject_rows = (0, 1, 2) }; 88 | 89 | 90 | 91 | ## 92 | ## read and parsing query file 93 | ## 94 | print STDERR ">> Start parsing $query_file ... "; 95 | my %query_records = (); 96 | my @query_details = (); 97 | my $query_header = 'Query_record'; 98 | my $fh1 = getInputFilehandle($query_file); 99 | while (<$fh1>) 100 | { 101 | if(/^\#\w+/) { 102 | chomp($query_header = $_); 103 | } 104 | 105 | next if (/\#/ || /^\s+$/); 106 | 107 | chomp(my $line = $_); 108 | 109 | my ($chrom, $pos, $sample) = (split /\s+/, $line)[@query_rows]; 110 | 111 | $sample ||= "ALL"; 112 | 113 | push @{$query_records{$sample}->{$chrom}}, $pos; 114 | 115 | push @query_details, $line; 116 | } 117 | print STDERR "done!\n"; 118 | 119 | ## 120 | ## retrieving query records in subject file 121 | ## 122 | print STDERR ">> Start searching records in $subject_file ... "; 123 | my %subject_records = (); 124 | my $fh2 = getInputFilehandle($subject_file); 125 | while (<$fh2>) 126 | { 127 | if (/\#/ || /^\s+$/) { 128 | next; 129 | } 130 | 131 | chomp(my $interval_info = $_); 132 | 133 | my ($chrom, $start, $end, $sample) = (split /\s+/, $interval_info)[@subject_rows]; 134 | 135 | $sample ||= "ALL"; 136 | 137 | next unless($query_records{$sample}->{$chrom}); 138 | 139 | my @mapped_pos = grep {$_ >= $start && $_ <= $end} @{$query_records{$sample}->{$chrom}}; 140 | 141 | for my $pos (@mapped_pos) 142 | { 143 | $interval_info =~ s/\s+/,/g; 144 | $subject_records{$sample}->{$chrom}->{$pos}->{$interval_info} ++; 145 | } 146 | } 147 | print STDERR "done!\n"; 148 | 149 | 150 | 151 | ## 152 | ## writing results 153 | ## 154 | print STDOUT "$HEADER##" . (scalar localtime()) . 
"\n"; 155 | print STDOUT "$query_header\tMapped_Intervals\n"; 156 | for (my $i=0; $i<@query_details; $i++) 157 | { 158 | my ($chrom, $pos, $sample) = (split /\s+/, $query_details[$i])[@query_rows]; 159 | 160 | $sample ||= "ALL"; 161 | 162 | if ($subject_records{$sample}->{$chrom}->{$pos}) { 163 | my $mapped_records = join "|", (sort keys %{$subject_records{$sample}->{$chrom}->{$pos}}); 164 | 165 | print STDOUT "$query_details[$i]\t$mapped_records\n"; 166 | } 167 | else { 168 | print STDOUT "$query_details[$i]\tN/A\n"; 169 | } 170 | } 171 | 172 | 173 | print STDERR "# " . (scalar localtime()) . "\n"; 174 | -------------------------------------------------------------------------------- /map_records.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # map_records.pl -- mapping records from one file to another file 4 | # 5 | # 6 | # Author: Nowind 7 | # Created: 2012-05-31 8 | # Updated: 2023-10-30 9 | # Version: 1.2.0 10 | # 11 | # Change logs: 12 | # Version 1.0.0 12/09/18: The initial version. 13 | # Version 1.1.0 12/11/19: Add option "--rows2" to specify rows in subject file. 14 | # Version 1.1.1 12/12/07: Query and subject rows now can have only a single value. 15 | # Version 1.1.2 12/12/29: Change the way to import functions from MyPerl::FileIO. 16 | # Version 1.1.3 13/01/23: Add option "--interval" and "--flanking" to map records in intervals. 17 | # Version 1.1.4 13/03/14: Change output header. 18 | # Version 1.1.5 13/03/15: Bug fixed while output file header. 19 | # Version 1.1.6 13/04/11: Bug fixed while reading query file from STDIN. 20 | # Version 1.1.7 16/05/31: Updated: Add option "--no-secondary"; remove mapped subject rows to reduce redundancy. 21 | # Version 1.1.8 16/06/13: Bug fixed while no remain fields present in subject file. 22 | # Version 1.1.9 16/02/16: Deprecated non-functional "-q" and "-s" options. 23 | # Version 1.2.0 23/10/30: Bug fixed: --no-dups prints empty results; add support for setting delimiters separately for query and subject files. 
24 | 25 | use strict; 26 | 27 | use Data::Dumper; 28 | use Getopt::Long; 29 | use File::Find::Rule; 30 | use File::Basename; 31 | 32 | use MyPerl::FileIO qw(:all); 33 | 34 | ################################# Main ############################### 35 | 36 | 37 | my $CMDLINE = "perl $0 @ARGV"; 38 | my $VERSION = '1.2.0'; 39 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 40 | 41 | 42 | my $qdelimiter = '\s+'; 43 | my $sdelimiter = '\s+'; 44 | my $flank_len = 0; 45 | my ($query_file, $subject_file, $output, $interval, 46 | @query_rows, @subject_rows, $match_str, @sub_set, $out_first_only, $merge_sub_dups); 47 | GetOptions( 48 | "query=s" => \$query_file, 49 | "subject=s" => \$subject_file, 50 | "output=s" => \$output, 51 | "qdelimiter=s" => \$qdelimiter, 52 | "sdelimiter=s" => \$sdelimiter, 53 | "interval" => \$interval, 54 | "flanking=i" => \$flank_len, 55 | "match=s" => \$match_str, 56 | "Q|rows1=i{,}" => \@query_rows, 57 | "S|rows2=i{,}" => \@subject_rows, 58 | "no-dups" => \$out_first_only, 59 | "merge-dups" => \$merge_sub_dups, 60 | ); 61 | 62 | unless( $query_file && $subject_file ) { 63 | print < 73 | query file, required 74 | --subject 75 | subject file, required 76 | -o, --output 77 | output file, default to STDOUT 78 | 79 | -Q, --rows1 80 | specify rows for comparing in query file [default: 0 1] 81 | -S, --rows2 82 | specify rows for comparing in subject file [default: 0 1] 83 | 84 | -i, --interval 85 | use this option to map all records in the interval between 86 | query start and end, while this option is set, the -Q option 87 | should be specified with following rows in the order: 88 | 89 | start position, end position, [other keys...] 90 | 91 | while the -S option should be specified as: 92 | 93 | position, [other keys...] 94 | 95 | -f, --flanking 96 | flanking region length to extend the interval, only valid while 97 | -i option is specified 98 | 99 | --qdelimiter 100 | --sdelimiter 101 | specify a delimiter while reading input files, such as ",", 102 | "\\t", multiple delimiters can be set such as ",|\\t" 103 | [default: "\\s+" for both query and subject] 104 | --match 105 | only considering lines matching a pattern, support perl 106 | regular expression 107 | 108 | -n, --no-dups 109 | if query record have multiple hits in subject file, only the first 110 | one would be written to output 111 | --merge-dups 112 | if query record have multiple hits in subject file, all hits would 113 | be written in a single record and seperate by semicolon 114 | 115 | EOF 116 | 117 | exit(1); 118 | } 119 | 120 | $|++; 121 | 122 | 123 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 124 | 125 | if ($output) { 126 | open (STDOUT, "> $output") || die $!; 127 | } 128 | 129 | unless(@query_rows >= 1){ @query_rows = (0, 1) }; 130 | unless(@subject_rows >= 1){ @subject_rows = (0, 1) }; 131 | 132 | 133 | ## 134 | ## read and parsing query file 135 | ## 136 | print STDERR ">> Read in $query_file ... 
"; 137 | my %Query = (); 138 | my @Query_Records = (); 139 | my $query_header = ''; 140 | my $fh1 = getInputFilehandle($query_file); 141 | while (<$fh1>) 142 | { 143 | push @Query_Records, $_; 144 | 145 | if (/^\#\w/) { 146 | chomp($query_header = $_); 147 | } 148 | 149 | next if (/\#/ || /^\s+$/); 150 | next if ($match_str && !/$match_str/); 151 | 152 | if ($interval) { 153 | my ($start, $end, @cmp_rows) = (split /$qdelimiter/, $_)[@query_rows]; 154 | 155 | for my $pos (($start-$flank_len)..($end+$flank_len)) 156 | { 157 | my $cmp_rows = join "\t", ($pos, @cmp_rows); 158 | $Query{$cmp_rows}->{query} = 1; 159 | } 160 | } 161 | else { 162 | my $cmp_rows = join "\t", ((split /$qdelimiter/, $_)[@query_rows]); 163 | 164 | $Query{$cmp_rows}->{query} = 1; 165 | } 166 | } 167 | print STDERR "done!\n"; 168 | 169 | ## 170 | ## retrieving query records in subject file 171 | ## 172 | print STDERR ">> Search records in $subject_file ... "; 173 | my $subject_header = ''; 174 | my $fh2 = getInputFilehandle($subject_file); 175 | while (<$fh2>) 176 | { 177 | if (/\#\#/ || /^\s+$/) { 178 | next; 179 | } 180 | 181 | chomp; 182 | 183 | my @all_rows = (split /$sdelimiter/, $_); 184 | 185 | my $cmp_rows = join "\t", @all_rows[@subject_rows]; 186 | 187 | @all_rows[@subject_rows] = (); 188 | my @remain_rows = grep defined, @all_rows; 189 | 190 | my $remain_rows = (@remain_rows > 0) ? join "\t", @remain_rows : "Found"; 191 | 192 | if (/^\#\w/) { 193 | $subject_header = $remain_rows; 194 | next; 195 | } 196 | 197 | next unless( $Query{$cmp_rows}->{query} ); 198 | 199 | push @{$Query{$cmp_rows}->{cmp}}, $remain_rows; 200 | } 201 | print STDERR "done!\n"; 202 | 203 | 204 | ## 205 | ## generate results 206 | ## 207 | print STDERR ">> Start generate results ... "; 208 | print STDOUT "$HEADER##" . (scalar localtime()) . 
"\n"; 209 | foreach (@Query_Records) 210 | { 211 | if (/\#\#/ || /^\s+$/) { 212 | print STDOUT; next; 213 | } 214 | elsif (/^\#\w/) { 215 | print STDOUT "$query_header\t$subject_header\n"; 216 | } 217 | 218 | next if (/\#/ || /^\s+$/); 219 | next if ($match_str && !/$match_str/); 220 | 221 | chomp; 222 | 223 | if ($interval) { 224 | my ($start, $end, @cmp_rows) = (split /$qdelimiter/, $_)[@query_rows]; 225 | 226 | for my $pos (($start-$flank_len)..($end+$flank_len)) 227 | { 228 | my $cmp_rows = join "\t", ($pos, @cmp_rows); 229 | 230 | unless( $Query{$cmp_rows}->{cmp} ) { 231 | print STDOUT "$_\tN/A\n"; 232 | next; 233 | } 234 | 235 | my @out_records = (); 236 | for my $record (@{$Query{$cmp_rows}->{cmp}}) 237 | { 238 | 239 | if ($out_first_only) { 240 | print STDOUT "$_\t$record\n"; 241 | last; 242 | } 243 | elsif ($merge_sub_dups) { 244 | chomp($record); 245 | push @out_records, $record; 246 | } 247 | else { 248 | print STDOUT "$_\t$record\n"; 249 | } 250 | } 251 | 252 | if ($merge_sub_dups) { 253 | my $merged_record = join ";", @out_records; 254 | 255 | print STDOUT "$_\t$merged_record\n"; 256 | } 257 | } 258 | } 259 | else { 260 | my $cmp_rows = join "\t", ((split /$qdelimiter/, $_)[@query_rows]); 261 | 262 | unless( $Query{$cmp_rows}->{cmp} ) { 263 | print STDOUT "$_\tN/A\n"; 264 | next; 265 | } 266 | 267 | my @out_records = (); 268 | for my $record (@{$Query{$cmp_rows}->{cmp}}) 269 | { 270 | 271 | if ($out_first_only) { 272 | print STDOUT "$_\t$record\n"; 273 | last; 274 | } 275 | elsif ($merge_sub_dups) { 276 | chomp($record); 277 | push @out_records, $record; 278 | } 279 | else { 280 | print STDOUT "$_\t$record\n"; 281 | } 282 | } 283 | 284 | if ($merge_sub_dups) { 285 | my $merged_record = join ";", @out_records; 286 | 287 | print STDOUT "$_\t$merged_record\n"; 288 | } 289 | } 290 | 291 | } 292 | print STDERR "done!\n"; 293 | 294 | 295 | print STDERR "# " . (scalar localtime()) . "\n"; 296 | -------------------------------------------------------------------------------- /maskedSEQ2bed.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # maskedSEQ2bed.pl -- sequence related analysis 4 | # 5 | # Author: Nowind 6 | # Created: 2012-02-21 7 | # Updated: 2014-06-04 8 | # Version: 1.0.1 9 | # 10 | # Change logs: 11 | # Version 1.0.0 12/05/24: The initial version. 12 | # Version 1.0.1 14/06/04: Add some comments. 13 | 14 | use strict; 15 | 16 | use Data::Dumper; 17 | 18 | 19 | ##################### Main #################### 20 | 21 | my $CMDLINE = "perl $0 @ARGV"; 22 | my $VERSION = '1.0.1'; 23 | my $HEADER = "# $CMDLINE\n# Version: $VERSION\n"; 24 | 25 | 26 | unless( @ARGV == 1 ) { 27 | print < 33 | 34 | EOF 35 | 36 | exit(0); 37 | } 38 | 39 | $|++; 40 | 41 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 42 | 43 | print STDERR ">> Read in $ARGV[0]..."; 44 | my %SEQs = (); 45 | parseFasta(\%SEQs, $ARGV[0]); 46 | print STDERR "done!\n"; 47 | 48 | #print "$HEADER# " . (scalar localtime()) . "\n"; 49 | 50 | ## BED -- BED type, a general purpose format for representing 51 | ## genomic interval data, useful for masks and other interval 52 | ## outputs. Please note that the bed format is 0-based (most 53 | ## other formats are 1-based). 
54 | 55 | print STDERR ">> Start analyzing sequences..."; 56 | for my $id (sort keys %SEQs) 57 | { 58 | my $seq = $SEQs{$id}; 59 | 60 | #my @nts = split //, $seq; 61 | # 62 | #for my $nt (@nts) 63 | #{ 64 | # 65 | #} 66 | 67 | ## the matching process is really slow and a faster solution is needed 68 | while ( $seq =~ m/(N|[a-z])+/g ) 69 | { 70 | my $end = (pos $seq); ## BED ends are one-based 71 | my $len = length $&; 72 | my $start = $end - $len; ## BED starts are zero-based 73 | 74 | print STDOUT "$id\t$start\t$end\n"; 75 | } 76 | } 77 | print STDERR "done!\n"; 78 | 79 | 80 | print STDERR "# " . (scalar localtime()) . "\n"; 81 | 82 | ####################### Sub ########################### 83 | 84 | #************************************************** 85 | # Save all the sequences in the fasta 86 | # file into a hash or an array 87 | #************************************************** 88 | sub parseFasta 89 | { 90 | my ($rf_seq, $in, $desc) = @_; 91 | 92 | my $match_str = '^(.*?)\s+'; 93 | 94 | if ($desc) { 95 | if ($desc eq 'full') { 96 | $match_str = '^(.*?)\n' 97 | } 98 | elsif ($desc eq 'gbk') { 99 | $match_str = 'gb\|(.*?)\.'; 100 | } 101 | } 102 | 103 | open (F, "< $in") or die $!; 104 | my $fas = do { local $/; <F> }; 105 | close F; 106 | 107 | my @fas = split /\>/, $fas; 108 | 109 | shift @fas; 110 | 111 | for my $str (@fas) 112 | { 113 | $str =~ /$match_str/; 114 | my $id = $1; 115 | 116 | $str =~ s/.*\n//; 117 | #$str =~ s/\>//; 118 | $str =~ s/\s+//g; 119 | 120 | if (ref($rf_seq) eq 'HASH') { 121 | $rf_seq->{$id} = $str; 122 | } 123 | elsif (ref($rf_seq) eq 'ARRAY') { 124 | push @{$rf_seq}, $str; 125 | } 126 | 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /parallel_baseml.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # parallel_baseml.pl -- A wrapper to run baseml in parallel. 4 | # 5 | # 6 | # Author: Nowind 7 | # Created: 2012-05-31 8 | # Updated: 2018-03-30 9 | # Version: 1.0.1 10 | # 11 | # Change logs: 12 | # Version 1.0.0 15/05/01: The initial version. 13 | # Version 1.0.1 18/03/30: Remove ambiguous sites when measuring sequence length. 14 | 15 | 16 | 17 | 18 | =head1 NAME 19 | 20 | parallel_baseml.pl 21 | 22 | 23 | =head1 SYNOPSIS 24 | 25 | parallel_baseml.pl --help/? 26 | 27 | =head1 DESCRIPTION 28 | 29 | A wrapper to run baseml in parallel. 30 | 31 | =cut 32 | 33 | 34 | 35 | 36 | use strict; 37 | 38 | use Data::Dumper; 39 | use Getopt::Long; 40 | use File::Find::Rule; 41 | use File::Basename; 42 | use File::Temp; 43 | use Parallel::ForkManager; 44 | 45 | use MyPerl::FileIO qw(:all); 46 | use MyPerl::Convert; 47 | use MyPerl::Compare; 48 | 49 | ######################### Main ######################### 50 | 51 | my $CMDLINE = "perl $0 @ARGV"; 52 | my $VERSION = '1.0.1'; 53 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 54 | my $SOURCE = (scalar localtime()) . " Version: $VERSION"; 55 | 56 | 57 | my $max_threads = 1; 58 | my @models = (); 59 | my $min_aln_len = 100; 60 | my ($aln_file, $output, $out_directly, $show_help); 61 | GetOptions( 62 | "input=s" => \$aln_file, 63 | 64 | "output=s" => \$output, 65 | 66 | "threads=i" => \$max_threads, 67 | 68 | "models=i{,}" => \@models, 69 | 70 | "align-length=i" => \$min_aln_len, 71 | 72 | "help|?"
=> \$show_help, 73 | ); 74 | 75 | unless( !$show_help && $aln_file ) { 76 | print < 86 | input alignment files in fasta format, required 87 | 88 | -o, --output 89 | output filename, default to STDOUT 90 | 91 | -m, --model 92 | specifies one or more models of nucleotide substitution, 0:JC69, 93 | 1:K80, 2:F81, 3:F84, 4:HKY85, 5:T92, 6:TN93, 7:REV, 8:UNREST 94 | [default: 0] 95 | 96 | -a, --align-length 97 | minimum alignment length to process, [default: 100 (bp)] 98 | 99 | -t, --threads 100 | how many data threads should be allocated to running this analysis 101 | [default: 1] 102 | 103 | -?, --help 104 | show this help message 105 | 106 | EOF 107 | 108 | exit(1); 109 | } 110 | 111 | $|++; 112 | 113 | 114 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 115 | 116 | 117 | if ($output) { 118 | open (STDOUT, "> $output") || die $!; 119 | } 120 | 121 | unless(@models > 0) { 122 | @models = (0); 123 | } 124 | 125 | my %substitution_models = ( 126 | 0 => "JC69", 127 | 1 => "K80", 128 | 2 => "F81", 129 | 3 => "F84", 130 | 4 => "HKY85", 131 | 5 => "T92", 132 | 6 => "TN93", 133 | 7 => "REV", 134 | 8 => "UNREST", 135 | ); 136 | 137 | my @model_names = (); 138 | for my $model (@models) 139 | { 140 | push @model_names, $substitution_models{$model}; 141 | } 142 | my $model_names = join "\t", @model_names; 143 | 144 | 145 | ## 146 | ## read reference sequences 147 | ## 148 | my $pm = new Parallel::ForkManager($max_threads) if $max_threads > 1; 149 | 150 | print STDOUT "$HEADER##" . (scalar localtime()) . "\n"; 151 | print "#ref_id\tcmp_id\tref_len\tcmp_len\ttaln_length\t$model_names\n"; 152 | 153 | my $curr_job_num = 0; 154 | my $aln_fh = getInputFilehandle($aln_file); 155 | while (<$aln_fh>) 156 | { 157 | chomp(my $ref_id = $_); 158 | chomp(my $ref_seq = <$aln_fh>); 159 | chomp(my $cmp_id = <$aln_fh>); 160 | chomp(my $cmp_seq = <$aln_fh>); 161 | 162 | $ref_id =~ s/^\>//; 163 | $cmp_id =~ s/^\>//; 164 | 165 | my $aln_len = ($ref_seq =~ tr/ATGCatgc-/ATGCatgc-/); 166 | my $ref_len = $aln_len - ($ref_seq =~ tr/-/-/); 167 | my $cmp_len = $aln_len - ($cmp_seq =~ tr/-/-/); 168 | 169 | next if ($min_aln_len && $aln_len < $min_aln_len); 170 | 171 | $curr_job_num++; 172 | 173 | my $pid = $pm->start and next if ($max_threads > 1); 174 | 175 | print STDERR "\r>>Start calculating ... 
$curr_job_num"; 176 | 177 | my $tmp_dir = File::Temp->newdir(CLEANUP => 1); 178 | my $tmp_in_fh = File::Temp->new(DIR => $tmp_dir, UNLINK => 1); 179 | my $tmp_in = $tmp_in_fh->filename; 180 | 181 | open (my $tmp_fh, "> $tmp_in") || die $!; 182 | 183 | print {$tmp_fh} "2\t$aln_len\n"; 184 | print {$tmp_fh} "$ref_id\n $ref_seq\n"; 185 | print {$tmp_fh} "$cmp_id\n $cmp_seq\n"; 186 | 187 | my @results = (); 188 | for my $model (@models) 189 | { 190 | write_baseml_ctl("$tmp_in.ctl", $tmp_in, $model); 191 | 192 | my $run_baseml = "baseml $tmp_in.ctl"; 193 | 194 | my $return = system "$run_baseml >/dev/null 2>&1"; 195 | 196 | my $result_fh = getInputFilehandle("$tmp_in.out"); 197 | while (<$result_fh>) 198 | { 199 | next unless(m/$cmp_id\s+(\d+\.\d+)(\(|$)/); 200 | 201 | push @results, $1; 202 | 203 | last; 204 | 205 | ###print; 206 | } 207 | } 208 | 209 | my $results = join "\t", @results; 210 | 211 | print "$ref_id\t$cmp_id\t$ref_len\t$cmp_len\t$aln_len\t$results\n"; 212 | 213 | 214 | if ($max_threads > 1) { 215 | $pm->finish; 216 | } 217 | } 218 | print STDERR "\tdone!\n"; 219 | 220 | 221 | ######################### Sub ######################### 222 | 223 | 224 | sub write_baseml_ctl 225 | { 226 | my ($config_file, $seqfile, $select_model) = @_; 227 | 228 | my $treefile = $seqfile . ".tree"; 229 | my $outfile = $seqfile . ".out"; 230 | 231 | open (OUT, "> $config_file") || die $!; 232 | print OUT <0) or ancestral states 261 | 262 | Small_Diff = 7e-6 263 | cleandata = 1 * remove sites with ambiguity data (1:yes, 0:no)? 264 | * icode = 0 * (with RateAncestor=1. try "GC" in data,model=4,Mgene=4) 265 | * fix_blength = -1 * 0: ignore, -1: random, 1: initial, 2: fixed 266 | method = 0 * Optimization method 0: simultaneous; 1: one branch a time 267 | EOF 268 | close OUT; 269 | } 270 | 271 | 272 | 273 | -------------------------------------------------------------------------------- /parallel_codeml.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # parallel_codeml.pl -- A wrapper to run codeml in parallel. 4 | # 5 | # 6 | # Author: Nowind 7 | # Created: 2012-05-31 8 | # Updated: 2021-04-07 9 | # Version: 1.0.1 10 | # 11 | # Change logs: 12 | # Version 1.0.0 18/03/30: The initial version. 13 | # Version 1.0.1 21/04/07: Correct some descriptions. 14 | 15 | 16 | 17 | 18 | =head1 NAME 19 | 20 | parallel_codeml.pl 21 | 22 | 23 | =head1 SYNOPSIS 24 | 25 | parallel_codeml.pl --help/? 26 | 27 | =head1 DESCRIPTION 28 | 29 | A wrapper to run codeml in parallel. 30 | 31 | =cut 32 | 33 | 34 | 35 | 36 | use strict; 37 | 38 | use Data::Dumper; 39 | use Getopt::Long; 40 | use File::Find::Rule; 41 | use File::Basename; 42 | use File::Temp; 43 | use Parallel::ForkManager; 44 | 45 | use MyPerl::FileIO qw(:all); 46 | use MyPerl::Convert; 47 | use MyPerl::Compare; 48 | 49 | ######################### Main ######################### 50 | 51 | my $CMDLINE = "perl $0 @ARGV"; 52 | my $VERSION = '1.0.1'; 53 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 54 | my $SOURCE = (scalar localtime()) . " Version: $VERSION"; 55 | 56 | 57 | my $max_threads = 1; 58 | my @models = (); 59 | my $min_aln_len = 30; 60 | my ($aln_file, $output, $out_directly, $show_help); 61 | GetOptions( 62 | "input=s" => \$aln_file, 63 | 64 | "output=s" => \$output, 65 | 66 | "threads=i" => \$max_threads, 67 | 68 | "models=i{,}" => \@models, 69 | 70 | "align-length=i" => \$min_aln_len, 71 | 72 | "help|?" 
=> \$show_help, 73 | ); 74 | 75 | unless( !$show_help && $aln_file ) { 76 | print < 86 | input alignment files in fasta format, required 87 | 88 | -o, --output 89 | output filename, default to STDOUT 90 | 91 | -m, --model 92 | currently not used 93 | 94 | -a, --align-length 95 | minimum alignment length to process, [default: 30 (bp)] 96 | 97 | -t, --threads 98 | how many data threads should be allocated to running this analysis 99 | [default: 1] 100 | 101 | -?, --help 102 | show this help message 103 | 104 | EOF 105 | 106 | exit(1); 107 | } 108 | 109 | $|++; 110 | 111 | 112 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 113 | 114 | 115 | if ($output) { 116 | open (STDOUT, "> $output") || die $!; 117 | } 118 | 119 | #unless(@models > 0) { 120 | @models = (0); 121 | #} 122 | 123 | 124 | 125 | ## 126 | ## read reference sequences 127 | ## 128 | my $pm = new Parallel::ForkManager($max_threads) if $max_threads > 1; 129 | 130 | print STDOUT "$HEADER##" . (scalar localtime()) . "\n"; 131 | print "#Sample1\tSample2\taln_len\tref_len\tcmp_len\tt\tS\tN\tdNdS\tdN\tdS\n"; 132 | 133 | 134 | my $curr_job_num = 0; 135 | my $aln_fh = getInputFilehandle($aln_file); 136 | while (<$aln_fh>) 137 | { 138 | chomp(my $ref_id = $_); 139 | chomp(my $ref_seq = <$aln_fh>); 140 | chomp(my $cmp_id = <$aln_fh>); 141 | chomp(my $cmp_seq = <$aln_fh>); 142 | 143 | $ref_id =~ s/^\>//; 144 | $cmp_id =~ s/^\>//; 145 | 146 | my $aln_len = ($ref_seq =~ tr/ATGCatgc-/ATGCatgc-/); 147 | my $ref_len = $aln_len - ($ref_seq =~ tr/-/-/); 148 | my $cmp_len = $aln_len - ($cmp_seq =~ tr/-/-/); 149 | 150 | next if ($min_aln_len && $aln_len < $min_aln_len); 151 | 152 | $curr_job_num++; 153 | 154 | my $pid = $pm->start and next if ($max_threads > 1); 155 | 156 | print STDERR "\r>>Start calculating ... $curr_job_num"; 157 | 158 | my $tmp_dir = File::Temp->newdir(CLEANUP => 1); 159 | my $tmp_in_fh = File::Temp->new(DIR => $tmp_dir, UNLINK => 1); 160 | my $tmp_in = $tmp_in_fh->filename; 161 | 162 | open (my $tmp_fh, "> $tmp_in") || die $!; 163 | 164 | print {$tmp_fh} "2\t$aln_len\n"; 165 | print {$tmp_fh} "$ref_id\n $ref_seq\n"; 166 | print {$tmp_fh} "$cmp_id\n $cmp_seq\n"; 167 | 168 | my @results = (); 169 | for my $model (@models) 170 | { 171 | write_codeml_ctl("$tmp_in.ctl", $tmp_in, $model); 172 | 173 | my $run_codeml = "codeml $tmp_in.ctl"; 174 | 175 | my $return = system "$run_codeml >/dev/null 2>&1"; 176 | 177 | my $result_fh = getInputFilehandle("$tmp_in.out"); 178 | while (<$result_fh>) 179 | { 180 | ##print STDOUT; 181 | 182 | next unless(/t=\s+(.*?)\s+S=\s+(.*?)\s+N=\s+(.*?)\s+dN\/dS=\s+(.*?)\s+dN\s+=\s+(.*?)\s+dS\s+=\s+(.*?)\s+/); 183 | 184 | my ($t, $S, $N, $dNdS, $dN, $dS) = ($1, $2, $3, $4, $5, $6); 185 | 186 | push @results, $1; 187 | 188 | print STDOUT "$ref_id\t$cmp_id\t$aln_len\t$ref_len\t$cmp_len\t$t\t$S\t$N\t$dNdS\t$dN\t$dS\n"; 189 | 190 | last; 191 | } 192 | } 193 | 194 | if ($max_threads > 1) { 195 | $pm->finish; 196 | } 197 | } 198 | print STDERR "\tdone!\n"; 199 | 200 | 201 | ######################### Sub ######################### 202 | 203 | 204 | sub write_codeml_ctl 205 | { 206 | my ($config_file, $seqfile, $select_model) = @_; 207 | 208 | my $treefile = $seqfile . ".tree"; 209 | my $outfile = $seqfile . 
".out"; 210 | 211 | open (OUT, "> $config_file") || die $!; 212 | print OUT <AAs 223 | CodonFreq = 2 * 0:1/61 each, 1:F1X4, 2:F3X4, 3:codon table 224 | 225 | * ndata = 10 226 | clock = 0 * 0:no clock, 1:clock; 2:local clock; 3:CombinedAnalysis 227 | aaDist = 0 * 0:equal, +:geometric; -:linear, 1-6:G1974,Miyata,c,p,v,a 228 | aaRatefile = dat/jones.dat * only used for aa seqs with model=empirical(_F) 229 | * dayhoff.dat, jones.dat, wag.dat, mtmam.dat, or your own 230 | 231 | model = 2 232 | * models for codons: 233 | * 0:one, 1:b, 2:2 or more dN/dS ratios for branches 234 | * models for AAs or codon-translated AAs: 235 | * 0:poisson, 1:proportional, 2:Empirical, 3:Empirical+F 236 | * 6:FromCodon, 7:AAClasses, 8:REVaa_0, 9:REVaa(nr=189) 237 | 238 | NSsites = 0 * 0:one w;1:neutral;2:selection; 3:discrete;4:freqs; 239 | * 5:gamma;6:2gamma;7:beta;8:beta&w;9:betaγ 240 | * 10:beta&gamma+1; 11:beta&normal>1; 12:0&2normal>1; 241 | * 13:3normal>0 242 | 243 | icode = 0 * 0:universal code; 1:mammalian mt; 2-10:see below 244 | Mgene = 0 245 | * codon: 0:rates, 1:separate; 2:diff pi, 3:diff kapa, 4:all diff 246 | * AA: 0:rates, 1:separate 247 | 248 | fix_kappa = 0 * 1: kappa fixed, 0: kappa to be estimated 249 | kappa = 2 * initial or fixed kappa 250 | fix_omega = 0 * 1: omega or omega_1 fixed, 0: estimate 251 | omega = .4 * initial or fixed omega, for codons or codon-based AAs 252 | 253 | fix_alpha = 1 * 0: estimate gamma shape parameter; 1: fix it at alpha 254 | alpha = 0. * initial or fixed alpha, 0:infinity (constant rate) 255 | Malpha = 0 * different alphas for genes 256 | ncatG = 8 * # of categories in dG of NSsites models 257 | 258 | getSE = 0 * 0: don't want them, 1: want S.E.s of estimates 259 | RateAncestor = 1 * (0,1,2): rates (alpha>0) or ancestral states (1 or 2) 260 | 261 | Small_Diff = .5e-6 262 | cleandata = 1 * remove sites with ambiguity data (1:yes, 0:no)? 263 | * fix_blength = -1 * 0: ignore, -1: random, 1: initial, 2: fixed 264 | method = 0 * Optimization method 0: simultaneous; 1: one branch a time 265 | 266 | * Genetic codes: 0:universal, 1:mammalian mt., 2:yeast mt., 3:mold mt., 267 | * 4: invertebrate mt., 5: ciliate nuclear, 6: echinoderm mt., 268 | * 7: euplotid mt., 8: alternative yeast nu. 9: ascidian mt., 269 | * 10: blepharisma nu. 270 | * These codes correspond to transl_table 1 to 11 of GENEBANK. 271 | EOF 272 | close OUT; 273 | } 274 | 275 | 276 | 277 | -------------------------------------------------------------------------------- /parseSEQs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # parseSEQs.pl -- sequence related analysis 4 | # 5 | # Author: Nowind 6 | # Created: 2012-02-21 7 | # Updated: 2018-12-20 8 | # Version: 1.2.5 9 | # 10 | # Change logs: 11 | # Version 1.0.0 12/02/21: The initial version. 12 | # Version 1.0.1 12/05/29: Add stat of masked info. 13 | # Version 1.0.2 12/05/30: Error correct in stat repeat regions. 14 | # Version 1.1.0 12/06/21: Add support to gzipped or bzipped files. 15 | # Version 1.1.1 13/03/13: Change the way to import functions from MyPerl::FileIO. 16 | # Version 1.1.2 13/06/23: Change the output order to the same as the input fasta file. 17 | # Version 1.2.0 14/01/03: Add options to count codons. 18 | # Version 1.2.1 14/05/29: Add option "--rate" to calculate GC and repeat contents. 19 | # Version 1.2.2 14/06/17: Bug fixed while "--strict" option is specified; add option "--skip". 
20 | # Version 1.2.3 15/12/07: Bug fixed: use informative length to calculate GC content and repeat content. 21 | # Version 1.2.4 15/12/15: Updated: add option "--sum-all" to output overall stats; add control 22 | # for verbosity. 23 | # Version 1.2.5 18/12/20: Bug fixed: add check for sequences with no informative length. 24 | 25 | 26 | 27 | use strict; 28 | 29 | use Data::Dumper; 30 | use Getopt::Long; 31 | use File::Find::Rule; 32 | use File::Basename; 33 | 34 | 35 | use MyPerl::FileIO qw(:all); 36 | 37 | ##################### Main #################### 38 | 39 | my $CMDLINE = "perl $0 @ARGV"; 40 | my $VERSION = '1.2.5'; 41 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 42 | 43 | 44 | my $verbosity = 1; 45 | my ($fasta_file, @filter_strings, $output, $count_codon, $strict_mode, 46 | $calc_ratio, $summary_all); 47 | GetOptions( 48 | "fasta=s" => \$fasta_file, 49 | "output=s" => \$output, 50 | 51 | "count-codon" => \$count_codon, 52 | "K|strict" => \$strict_mode, 53 | 54 | "rate" => \$calc_ratio, 55 | 56 | "T|skip=s{,}" => \@filter_strings, 57 | 58 | "A|sum-all" => \$summary_all, 59 | 60 | "verbosity=i" => \$verbosity, 61 | ); 62 | unless( $fasta_file ) { 63 | print < 73 | sequences file in fasta format, required 74 | 75 | -o, --output 76 | output file, default to STDOUT 77 | 78 | -c, --count-codon 79 | count codon triplets 80 | -K, --strict 81 | check and skip sequences not a multiple of 3 while count codons 82 | 83 | -r, --rate 84 | calculate gc and repeat contents instead of output numbers of all 85 | nucleotides 86 | 87 | -T, --skip 88 | skip sequences with ids match these strings 89 | 90 | -A, --sum-all 91 | output summaries over all given sequences 92 | 93 | -v, --verbosity 94 | change verbosity levels, 0 indicates no verbosity to STDERR 95 | [default: 1] 96 | 97 | EOF 98 | 99 | exit(1); 100 | } 101 | 102 | $|++; 103 | 104 | 105 | 106 | if ($output) { 107 | open (STDOUT, "> $output") || die $!; 108 | } 109 | 110 | if ($verbosity > 0) { 111 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 112 | } 113 | 114 | 115 | 116 | my $filter_str = join '|', @filter_strings; 117 | 118 | print STDERR ">> Start reading $fasta_file ... " if ($verbosity > 0); 119 | my %SEQs = (); 120 | my @ids = parse_fasta_SEQs(\%SEQs, $fasta_file, "full"); 121 | print STDERR "done!\n" if ($verbosity > 0); 122 | 123 | 124 | 125 | my @chars = qw(A T G C N a t g c n); 126 | my %codons = (); 127 | my %stats_all = (); 128 | 129 | 130 | 131 | if ($verbosity > 0) { 132 | print "$HEADER##" . (scalar localtime()) . "\n"; 133 | } 134 | 135 | if ($count_codon) { 136 | print "#Codon\tCount\n"; 137 | } 138 | elsif ($calc_ratio) { 139 | print "#ID\tLength\tInformative_Length\tGC_Num\tGC_Ratio\tRepeat_Len\tRepeat_Ratio\n"; 140 | } 141 | else { 142 | print "#ID\tLength\tCount:A\tT\tG\tC\tN\ta\tt\tg\tc\tn\n"; 143 | } 144 | for my $id (@ids) 145 | { 146 | if (@filter_strings > 0) { 147 | next if ($id =~ /($filter_str)/); 148 | } 149 | 150 | print STDERR "\r>> Start parsing sequences $id ... 
" if $verbosity > 0; 151 | 152 | my $seq = $SEQs{$id}; 153 | my $len = length $seq; 154 | 155 | $stats_all{len} += $len; 156 | 157 | my @count = (); 158 | 159 | if ($count_codon) { 160 | if ($strict_mode && ($len % 3)) { 161 | print STDERR "WARNING: $id sequence length not a multiple of 3!\n"; 162 | next; 163 | } 164 | 165 | my $i = 0; 166 | while ($i+3 <= $len) 167 | { 168 | my $codon = substr($seq, $i, 3); 169 | $i += 3; 170 | 171 | $codons{$codon} ++ ; 172 | } 173 | } 174 | elsif ($calc_ratio) { 175 | my $info_len = ($seq =~ tr/ATGCatgc/ATGCatgc/); 176 | 177 | if ($info_len == 0) { 178 | print "$id\t$len\t$info_len\t-\t-\t-\t-\n"; next; 179 | } 180 | 181 | my $gc_num = ($seq =~ tr/GCgc/GCgc/); 182 | my $gc_rate = $gc_num / $info_len; 183 | 184 | my $repeat_len = ($seq =~ tr/atgc/atgc/); 185 | my $repeat_rate = $repeat_len / $info_len; 186 | 187 | print "$id\t$len\t$info_len\t$gc_num\t$gc_rate\t$repeat_len\t$repeat_rate\n"; 188 | 189 | $stats_all{gc_num} += $gc_num; 190 | $stats_all{repeat} += $repeat_len; 191 | $stats_all{info_len} += $info_len; 192 | } 193 | else { 194 | for my $nt (@chars) 195 | { 196 | my $cnt = ($seq =~ s/$nt/$nt/g); 197 | $cnt = $cnt ? $cnt : 0; 198 | 199 | push @count, $cnt; 200 | 201 | $stats_all{$nt} += $cnt; 202 | } 203 | 204 | my $count = join "\t", @count; 205 | 206 | print "$id\t$len\t$count\n"; 207 | } 208 | } 209 | 210 | 211 | print STDERR "done!\n" if $verbosity > 0; 212 | 213 | 214 | 215 | 216 | if ($count_codon) { 217 | print STDERR ">> Start generating results ... " if $verbosity > 0; 218 | for my $codon (sort keys %codons) 219 | { 220 | my $count = $codons{$codon} ? $codons{$codon} : 0; 221 | print "$codon\t$count\n"; 222 | } 223 | print STDERR "done!\n" if $verbosity > 0; 224 | } 225 | 226 | 227 | ## 228 | ## output summaries upon all sequences 229 | ## 230 | if ($summary_all) { 231 | if ($calc_ratio) { 232 | my $gc_rate_all = $stats_all{gc_num} / $stats_all{info_len}; 233 | my $repeat_rate_all = $stats_all{repeat} / $stats_all{info_len}; 234 | 235 | print "All\t$stats_all{len}\t$stats_all{info_len}\t$stats_all{gc_num}\t$gc_rate_all\t$stats_all{repeat}\t$repeat_rate_all\n"; 236 | } 237 | else { 238 | my @counts_all = (); 239 | for my $nt (@chars) 240 | { 241 | push @counts_all, $stats_all{$nt}; 242 | } 243 | 244 | my $counts_all = join "\t", @counts_all; 245 | 246 | print "All\t$stats_all{len}\t$counts_all\n"; 247 | } 248 | } 249 | 250 | 251 | print STDERR "# " . (scalar localtime()) . "\n" if ($verbosity > 0); 252 | 253 | 254 | 255 | 256 | 257 | ######################### Sub ######################### 258 | 259 | 260 | 261 | =head2 262 | 263 | About : Determin ancestors of each loci due to comparison of different markers 264 | Usage : compare_markers(\%variants, $vcf_file); 265 | Args : A hash to save all compare results 266 | Vcf file needs to be processed 267 | Returns : Null 268 | 269 | =cut 270 | sub count_base 271 | { 272 | 273 | } 274 | -------------------------------------------------------------------------------- /plot-scatter_with_lines.R: -------------------------------------------------------------------------------- 1 | #******************************************************** 2 | # ggplot-windows-count.R -- Plot number of varaints in each window along each chromosomes using ggplot. 3 | # 4 | # 5 | # Author: Nowind 6 | # Created: 2013-07-10 7 | # Updated: 2016-01-03 8 | # Version: 1.1.2 9 | # 10 | # Change logs: 11 | # Version 1.0.0 14/11/28: The initial version. 12 | # Version 1.1.0 15/12/30: Updated: change input/output file handle. 
13 | # Version 1.1.1 16/01/02: Updated: baseline is now read in from commandline. 14 | # Version 1.1.2 16/01/03: Updated: change some colors. 15 | #********************************************************* 16 | 17 | 18 | library(lsr) 19 | library(reshape2) 20 | library(ggplot2) 21 | library(grid) 22 | 23 | 24 | #********************************************************* 25 | ## plot distributions of markers 26 | infile <- file("stdin") 27 | baseline <- as.numeric(commandArgs(TRUE)[1]) 28 | outfile <- commandArgs(TRUE)[2] 29 | 30 | 31 | dat <- as.data.frame(read.table(infile, header=T, sep="\t")) 32 | 33 | 34 | 35 | ggplot(dat, aes(x=bin, y=val_mean)) + 36 | geom_errorbar(data=subset(dat, group == "group1"), aes(ymin=val_mean-val_sem, ymax=val_mean+val_sem), color= "#E41A1C", width=.3, size=0.5) + 37 | geom_line(data=subset(dat, group == "group1"), color= "#E41A1C", size = 0.5) + 38 | geom_point(data=subset(dat, group == "group1"), color= "#E41A1C", size=2, shape=21, fill="white") + 39 | #geom_errorbar(data=subset(dat, group == "group2"), aes(ymin=val_mean-val_sem, ymax=val_mean+val_sem), color= "#07F900", width=.1, size=0.5) + 40 | #geom_line(data=subset(dat, group == "group2"), color= "#07F900", size = 0.5) + 41 | #geom_point(data=subset(dat, group == "group2"), color= "#07F900", size=3, shape=21, fill="white") + 42 | xlab("xlab") + ylab("ylab") + 43 | scale_y_continuous(breaks=seq(0.01, 0.02, 0.001)) + 44 | scale_x_continuous(breaks=seq(0, 20, 2)) + 45 | geom_hline(yintercept=baseline, linetype=5, size=0.5, color="#689EC9") + 46 | theme_bw() + 47 | theme(panel.border = element_blank(), 48 | panel.background = element_blank(), 49 | panel.grid.major = element_blank(), 50 | panel.grid.minor = element_blank(), 51 | axis.line = element_line(colour = "grey")) + 52 | theme(axis.text=element_text(size=8), 53 | axis.title=element_text(size=8)) + 54 | theme(legend.justification=c(1,1), 55 | legend.position=c(1,1)) 56 | 57 | 58 | ggsave(file=outfile, plot=last_plot(), width=120, height=100, units="mm", dpi=600, bg='transparent') 59 | 60 | #********************************************************* 61 | 62 | 63 | ###print(summary(dat)) 64 | 65 | # The errorbars overlapped, so use position_dodge to move them horizontally 66 | pd <- position_dodge(0.1) # move them .05 to the left and right 67 | 68 | 69 | 70 | warnings() 71 | 72 | -------------------------------------------------------------------------------- /reference_align.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # reference_align.pl -- Align sequences to a reference sequence. 4 | # 5 | # 6 | # Author: Nowind 7 | # Created: 2012-02-21 8 | # Updated: 2016-04-19 9 | # Version: 1.1.1 10 | # 11 | # Change logs: 12 | # Version 1.0.0 12/12/31: The initial version. 13 | # Version 1.0.1 13/07/02: Update usage infos. 14 | # Version 1.0.2 13/07/04: Correct id name for consensus sequence. 15 | # Version 1.0.3 13/09/07: Add option "--consensus" to output consensus sequence. 16 | # Version 1.1.0 16/03/17: Updated: 1) add support for choose muscle as an alternative aligner; 17 | # 2) add options to set parameters. 18 | # Version 1.1.1 16/04/19: Updated: 1) add more comments; 2) remove some useless codes. 
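reference_align.pl builds a consensus by a per-column majority vote over sequences aligned to the reference (see step 3 in the sub below); the following toy sketch, using made-up pre-aligned sequences of equal length, shows only that voting step and is not part of the script:

```perl
#!/usr/bin/perl
# Toy per-column majority vote over pre-aligned sequences.
# Gaps ('-') do not vote; a column with no votes falls back to the
# first (reference) sequence.
use strict;
use warnings;

my @aligned = ("ATG-A", "ATGCA", "ATGC-");    # invented alignment
my @ref     = split //, $aligned[0];
my @cns;

for my $j (0 .. $#ref) {
    my %vote;
    for my $seq (@aligned[1 .. $#aligned]) {
        my $nt = substr($seq, $j, 1);
        $vote{$nt}++ unless $nt eq '-';
    }
    my ($best) = sort { $vote{$b} <=> $vote{$a} } keys %vote;
    push @cns, defined($best) ? $best : $ref[$j];
}

print join('', @cns), "\n";    # prints "ATGCA" for the data above
```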
19 | 20 | 21 | use strict; 22 | 23 | use Data::Dumper; 24 | use Getopt::Long; 25 | 26 | use MyPerl::FileIO qw(:all); 27 | use MyPerl::Align; 28 | 29 | ##################### Main #################### 30 | 31 | 32 | my $CMDLINE = "perl $0 @ARGV"; 33 | my $VERSION = '1.1.1'; 34 | my $HEADER = "# $CMDLINE\n# Version: $VERSION\n"; 35 | 36 | my $SOURCE = (scalar localtime()) . " Version: $VERSION"; 37 | 38 | my $aligner = 'clustalw2'; 39 | my $maxiters = 3; 40 | my ($input, $output, $params, $out_cns); 41 | GetOptions( 42 | "input=s" => \$input, 43 | "output=s" => \$output, 44 | 45 | "aligner=s" => \$aligner, 46 | "params=s" => \$params, 47 | "maxiters=i" => \$maxiters, 48 | 49 | "consensus" => \$out_cns, 50 | ); 51 | 52 | my $show_help = ($CMDLINE =~ /\-help/) ? 0 : 1; 53 | 54 | unless( $input && $show_help ) { 55 | print < 73 | input file contains at least two sequences in fasta 74 | format, the first sequence appeared in this file will 75 | be used as the reference sequence, required 76 | 77 | -o, --output 78 | output filename, output extracted sequences in fasta 79 | format, default to STDOUT 80 | 81 | -c, --consensus 82 | output consensus sequence 83 | 84 | -a, --aligner 85 | choose aligner, 'clustalw2' or 'muscle', [default: clustalw2] 86 | 87 | -p, --params 88 | change parameters for specified aligner 89 | default: '-gapopen=15 -gapext=6.66' [clustalw2] 90 | '-quiet' [muscle] 91 | 92 | -m, --maxiters 93 | maximum number of iterations for muscle [default: 3] 94 | 95 | EOF 96 | 97 | exit(1); 98 | } 99 | 100 | 101 | 102 | 103 | $|++; 104 | 105 | if ($output) { 106 | open (STDOUT, "> $output") || die $!; 107 | } 108 | 109 | unless($params) { 110 | if ($aligner eq 'clustalw2') { 111 | $params = '-gapopen=15 -gapext=6.66'; 112 | } 113 | if ($aligner eq 'muscle') { 114 | $params = '-quiet'; 115 | } 116 | } 117 | 118 | 119 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 120 | 121 | 122 | print STDERR ">> Start align sequences in $input ... "; 123 | pairwise_align($input); 124 | print STDERR "done!\n"; 125 | 126 | print STDERR "# " . (scalar localtime()) . 
"\n"; 127 | 128 | 129 | ######################### Sub ######################### 130 | 131 | 132 | =head2 pairwise_align 133 | 134 | About : Align each sequence to a reference sequence 135 | Usage : pairwise_align($fasta_file); 136 | Args : Sequence file in fasta format 137 | Returns : Null 138 | 139 | =cut 140 | sub pairwise_align 141 | { 142 | my ($in) = @_; 143 | 144 | my @SEQs = (); 145 | 146 | my @ids = parse_fasta_SEQs(\@SEQs, $in); 147 | 148 | ###print format_fasta_SEQs($ids[0], \$SEQs[0]); 149 | 150 | ## step1: generate a longest gapped reference sequence 151 | my $tmp_ref = $SEQs[0]; 152 | 153 | for (my $i=1; $i<@ids; $i++) 154 | { 155 | my $aln = MyPerl::Align->new(prog => $aligner, 156 | type => 'DNA', 157 | params => $params, 158 | maxiters => $maxiters); 159 | 160 | my $rh_aln_seqs = $aln->align_seqs($tmp_ref, $SEQs[$i]); 161 | 162 | $tmp_ref = $rh_aln_seqs->[0]; 163 | 164 | ###print format_fasta_SEQs($ids[0], \$rh_aln_seqs->[0]); 165 | ###print format_fasta_SEQs($ids[$i], \$rh_aln_seqs->[1]); 166 | } 167 | 168 | print format_fasta_SEQs($ids[0], \$tmp_ref); 169 | 170 | ## step2: re-align all sequences to the longest reference sequence 171 | my %aln_nts = (); 172 | 173 | for (my $i=1; $i<@ids; $i++) 174 | { 175 | my $aln = MyPerl::Align->new(prog => $aligner, 176 | type => 'DNA', 177 | params => $params, 178 | maxiters => $maxiters); 179 | 180 | my $rh_aln_seqs = $aln->align_seqs($tmp_ref, $SEQs[$i]); 181 | 182 | my @nts = split //, $rh_aln_seqs->[1]; 183 | 184 | ## count the frequncy of each nucleotide in position j 185 | for (my $j=0; $j<@nts; $j++) 186 | { 187 | next if ($nts[$j] eq '-'); 188 | $aln_nts{$j}->{$nts[$j]}++; 189 | } 190 | 191 | ###print format_fasta_SEQs($ids[0], \$rh_aln_seqs->[0]); 192 | ###print format_fasta_SEQs($ids[$i], \$rh_aln_seqs->[1]); 193 | ###print "###\n"; 194 | } 195 | 196 | ## step3: choose the nucleotide with highest frequncy in each position to build 197 | ## a consensus sequence, if absent, use reference instead 198 | my @ref_nts = split //, $tmp_ref; 199 | my @cns_nts = (); 200 | for (my $j=0; $j<(length $tmp_ref); $j++) 201 | { 202 | my $major_nt = $ref_nts[$j]; 203 | if( $aln_nts{$j} ) { 204 | $major_nt = (sort {$aln_nts{$j}->{$a} <=> $aln_nts{$j}->{$b}} (keys %{$aln_nts{$j}}))[-1]; 205 | } 206 | 207 | push @cns_nts, $major_nt; 208 | } 209 | 210 | my $cns_seq = join '', @cns_nts; 211 | 212 | 213 | if ($out_cns) { 214 | print format_fasta_SEQs("$ids[0]-cns", \$cns_seq); 215 | } 216 | 217 | 218 | ## step4: re-align each sequence to the consensus sequence 219 | for (my $i=1; $i<@ids; $i++) 220 | { 221 | my $aln = MyPerl::Align->new(prog => $aligner, 222 | type => 'DNA', 223 | params => $params, 224 | maxiters => $maxiters); 225 | 226 | my $rh_aln_seqs = $aln->align_seqs($cns_seq, $SEQs[$i]); 227 | 228 | ###print format_fasta_SEQs($ids[0], \$rh_aln_seqs->[0]); 229 | print format_fasta_SEQs($ids[$i], \$rh_aln_seqs->[1]); 230 | ###print "###\n"; 231 | } 232 | } 233 | 234 | 235 | -------------------------------------------------------------------------------- /rename_fasta.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # rename_fasta.pl -- rename sequences ids in fasta format file. 4 | # 5 | # Author: Nowind 6 | # Created: 2012-02-21 7 | # Updated: 2014-08-25 8 | # Version: 1.0.1 9 | # 10 | # Change logs: 11 | # Version 1.0.0 13/05/30: The initial version. 12 | # Version 1.0.1 14/08/25: Add support for paired ids in ref_list. 
13 | 14 | 15 | 16 | 17 | use strict; 18 | 19 | use Data::Dumper; 20 | use Getopt::Long; 21 | 22 | 23 | use MyPerl::FileIO qw(:all); 24 | 25 | ##################### Main #################### 26 | 27 | 28 | my $CMDLINE = "perl $0 @ARGV"; 29 | my $VERSION = '1.0.1'; 30 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 31 | 32 | my $SOURCE = (scalar localtime()) . " Version: $VERSION"; 33 | 34 | 35 | my $word_wrap = 0; 36 | my ($fasta_file, $ref_list, $output, $show_help); 37 | GetOptions( 38 | "i|fasta=s" => \$fasta_file, 39 | 40 | "refer=s" => \$ref_list, 41 | 42 | "output=s" => \$output, 43 | 44 | "word-wrap=i" => \$word_wrap, 45 | 46 | "help|?" => \$show_help, 47 | ); 48 | 49 | unless( $fasta_file && $ref_list && !$show_help ) { 50 | print < 60 | sequence file to be sorted in fasta format, required 61 | 62 | -r, --refer 63 | rename sequence name by related ids listed in this file 64 | 65 | -o, --output 66 | output fasta file, default to STDOUT 67 | 68 | -w, --word-wrap 69 | maximum length of sequence to write per line, default each sequence 70 | per line 71 | 72 | -?, --help 73 | show this help message 74 | 75 | EOF 76 | 77 | exit(1); 78 | } 79 | 80 | 81 | 82 | 83 | $|++; 84 | 85 | 86 | 87 | if ($output) { 88 | open (STDOUT, "> $output") || die $!; 89 | } 90 | 91 | 92 | 93 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 94 | 95 | 96 | 97 | my @new_ids = (); 98 | my %new_ids = (); 99 | if ($ref_list) { 100 | print STDERR ">> Start parsing $ref_list ... "; 101 | my $fh = getInputFilehandle($ref_list); 102 | while (<$fh>) 103 | { 104 | next if (/#/ || /^\s+$/); 105 | 106 | my ($id, $new_id) = (split /\s+/); 107 | 108 | if ($new_id) { 109 | $new_ids{$id} = $new_id; 110 | } 111 | else { 112 | push @new_ids, $id; 113 | } 114 | } 115 | print STDERR "done!\n"; 116 | } 117 | 118 | 119 | print STDERR ">> Start reading $fasta_file ... "; 120 | my @SEQs = (); 121 | my @ids = parse_fasta_SEQs(\@SEQs, $fasta_file); 122 | print STDERR "done!\n"; 123 | 124 | 125 | 126 | 127 | print STDERR ">> Start generating renamed file ... "; 128 | for (my $i=0; $i<@ids; $i++) 129 | { 130 | my $id = $ids[$i]; 131 | 132 | unless($SEQs[$i]) { 133 | print STDERR "Error: $fasta_file: no records found for $id!\n"; exit(2); 134 | } 135 | 136 | if (@new_ids > 0) { 137 | print format_fasta_SEQs($new_ids[$i], \$SEQs[$i], $word_wrap); 138 | } 139 | else { 140 | print format_fasta_SEQs($new_ids{$id}, \$SEQs[$i], $word_wrap); 141 | } 142 | } 143 | print STDERR "done!\n"; 144 | 145 | 146 | print STDERR "# " . (scalar localtime()) . "\n"; 147 | 148 | 149 | ######################### Sub ######################### 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /sam2fastq.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # sam2fastq.pl -- Convert sam format to fastq format. 4 | # 5 | # Author: Nowind 6 | # Created: 2012-02-21 7 | # Updated: 2016-03-02 8 | # Version: 1.1.0 9 | # 10 | # Change logs: 11 | # Version 1.0.0 14/01/21: The initial version. 12 | # Version 1.1.0 16/03/02: Updated: remove duplicated records with same id. 
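#
# Example invocation (a minimal sketch; filenames are hypothetical):
#
#   perl sam2fastq.pl -i aln.sam -o extracted
#
# Paired reads are written to extracted_1.fq and extracted_2.fq, with each
# read id emitted only once per mate.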
13 | 14 | 15 | 16 | use strict; 17 | 18 | use Data::Dumper; 19 | use Getopt::Long; 20 | 21 | use MyPerl::FileIO qw(:all); 22 | 23 | ##################### Main #################### 24 | 25 | 26 | my $CMDLINE = "perl $0 @ARGV"; 27 | my $VERSION = '1.1.0'; 28 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 29 | 30 | my $SOURCE = (scalar localtime()) . " Version: $VERSION"; 31 | 32 | my ($input, $output, $no_rc, $use_rg_id); 33 | GetOptions( 34 | "I|input=s" => \$input, 35 | 36 | "output=s" => \$output, 37 | 38 | "no-rc" => \$no_rc, 39 | "use-rg" => \$use_rg_id, 40 | ); 41 | 42 | my $show_help = ($CMDLINE =~ /\-help/) ? 0 : 1; 43 | 44 | unless( $input && $show_help ) { 45 | print < 56 | input file of query positions, required 57 | 58 | -o, --output 59 | output prefix, required 60 | 61 | -n, --no-rc 62 | do not reverse complement sequence with negtive strand 63 | -u, --use-rg 64 | add read group id to extracted records 65 | 66 | EOF 67 | 68 | exit(1); 69 | } 70 | 71 | 72 | 73 | 74 | $|++; 75 | 76 | 77 | 78 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 79 | 80 | 81 | 82 | 83 | print STDERR ">> Start converting $input ... "; 84 | sam2fastq($input, $output); 85 | print STDERR "done!\n"; 86 | 87 | 88 | 89 | print STDERR "# " . (scalar localtime()) . "\n"; 90 | 91 | 92 | ######################### Sub ######################### 93 | 94 | 95 | =head2 sam2fastq 96 | 97 | About : Convert sam file to fastq files. 98 | Usage : sam2fastq($sam_file, $out_prefix); 99 | Args : File in SAM format; 100 | Prefix of output files. 101 | Returns : Null 102 | 103 | =cut 104 | sub sam2fastq 105 | { 106 | my ($in, $out_prefix) = @_; 107 | 108 | my %read_ids = (); 109 | 110 | open (FQ1, "> $out_prefix" . "_1.fq") || die $!; 111 | open (FQ2, "> $out_prefix" . "_2.fq") || die $!; 112 | 113 | my $fh = getInputFilehandle($input); 114 | while (<$fh>) 115 | { 116 | next if (/^@/ || /^\s+$/); ## skip header 117 | 118 | chomp(my $record = $_); 119 | 120 | my ($QNAME, $FLAG, $RNAME, $POS, $MAPQ, $CIGAR, 121 | $MRNM, $NPOS, $TLEN, $SEQ, $QUAL, @OPT) = (split /\s+/, $record); 122 | 123 | my $rg_id = ($record =~ /RG:Z:(.*?)\s+/); 124 | 125 | $QNAME =~ s/\/\d$//; 126 | 127 | my $read_id = $use_rg_id ? "$rg_id:$QNAME" : $QNAME; 128 | 129 | ## re-reverse bases and qualities of reads with negative strand 130 | ## flag set 131 | if (($FLAG & 16) && !$no_rc) { 132 | $SEQ =~ tr/ATGCatgc/TACGtacg/; 133 | $SEQ = reverse $SEQ; 134 | 135 | $QUAL = reverse $QUAL; 136 | } 137 | 138 | my $pair_id = 0; 139 | if ($FLAG & 64) { ## first in pair 140 | $pair_id = 1; 141 | 142 | next if (exists($read_ids{"$read_id:$pair_id"})); 143 | 144 | print FQ1 "\@$read_id/1\n$SEQ\n\+\n$QUAL\n"; 145 | } 146 | elsif ($FLAG & 128) { ## second in pair 147 | $pair_id = 2; 148 | 149 | next if (exists($read_ids{"$read_id:$pair_id"})); 150 | 151 | print FQ2 "\@$read_id/2\n$SEQ\n\+\n$QUAL\n"; 152 | } 153 | 154 | $read_ids{"$read_id:$pair_id"}++; 155 | } 156 | 157 | close FQ1; 158 | close FQ2; 159 | } 160 | 161 | 162 | -------------------------------------------------------------------------------- /site_perl/MyPerl/Align.pm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # Align.pm -- Sequences alignment operations. 4 | # 5 | # Author: Nowind 6 | # Created: 2010-10-09 7 | # Updated: 2016-09-14 8 | # Version: 1.2.2 9 | # 10 | # Change logs: 11 | # Version 1.0.0 10/10/09: The initial version. 12 | # Version 1.1.0 12/12/31: Rearrange full constructions. 
13 | # Version 1.1.1 12/01/02: Add option "parameters" to set additional program specified 14 | # parameters. 15 | # Version 1.1.2 12/03/30: Use File::Temp to handle temporary files safely in order to 16 | # aviod collisions while two processes access same temporary file. 17 | # Version 1.1.3 16/03/17: Bug fixed: could not write result while muscle is specified without 18 | # "--quiet" option. 19 | # Updated: Add module "Data::Dumper". 20 | # Version 1.2.0 16/09/04: Updated: remove dependency on Bioperl. 21 | # Version 1.2.1 16/09/13: Updated: append the stop codon while recovering the original DNA 22 | # sequences through aligned proteins. 23 | # Version 1.2.2 16/09/14: Bug fixed: failed to generate tmp results while align with muscle. 24 | 25 | 26 | =head1 NAME 27 | 28 | MyPerl::Align - Local perl module for alignment operations 29 | 30 | 31 | =head1 SYNOPSIS 32 | 33 | use MyPerl::Align; 34 | 35 | ## creat a new align object 36 | my $aln = MyPerl::Align->new( 37 | input => "input.fasta", 38 | output => "output.fasta", 39 | prog => "clustalw2", 40 | type => "DNA", 41 | ); 42 | 43 | ## start alignment 44 | $aln->start; 45 | 46 | ## directly align sequences 47 | my $rh_aln_seqs = $aln->align_seqs($seq1, $seq2, ...) 48 | 49 | =head1 DESCRIPTION 50 | 51 | Local perl module invoke clustalw2 and muscle to align sequences in fasta format 52 | 53 | =cut 54 | 55 | package MyPerl::Align; 56 | 57 | use strict; 58 | 59 | require Exporter; 60 | 61 | use File::Temp; 62 | use Data::Dumper; 63 | 64 | use MyPerl::FileIO qw(:all); 65 | use MyPerl::Convert qw(:all); 66 | 67 | ## 68 | ## Global Constants and Variables 69 | ## 70 | use vars qw( 71 | @ISA 72 | %EXPORT_TAGS 73 | @EXPORT_OK 74 | @EXPORT 75 | ); 76 | 77 | @ISA = qw(Exporter); 78 | 79 | %EXPORT_TAGS = ( 80 | 'all' => [ 81 | qw( 82 | ) 83 | ] 84 | ); 85 | 86 | @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); 87 | @EXPORT = qw(); 88 | 89 | 90 | $MyPerl::FileIO::VERSION = '1.2.2'; 91 | 92 | 93 | =head1 METHODS 94 | 95 | =head2 new 96 | 97 | About : Create an object 98 | Usage : my $aln = MyPerl::Align->new(%settings); 99 | Args : Using defined settings about: 100 | input: input file 101 | output: output file 102 | prog: align program, clustalw2 or muscle 103 | type: sequence type, DNA or PROTEIN 104 | dir: output directory, default to current directory 105 | maxiters: max iterations(only valid while using muscle) 106 | log: log file 107 | params: additional parameters 108 | Returns : An object to run the alignment 109 | 110 | =cut 111 | sub new 112 | { 113 | my ($class, %user_set) = @_; 114 | 115 | my $default_set = { 116 | 'input' => undef, 117 | 'output' => undef, 118 | 'prog' => 'clustalw2', 119 | 'type' => 'DNA', 120 | 'dir' => '.', 121 | 'maxiters' => 3, 122 | 'log' => '', 123 | }; 124 | 125 | for (keys %{$default_set}) 126 | { 127 | unless (defined $user_set{$_}) { 128 | $user_set{$_} = $default_set->{$_}; 129 | } 130 | } 131 | 132 | unless( -d $user_set{dir} ) { mkdir $user_set{dir} || die $!; } 133 | 134 | return bless \%user_set, $class; 135 | } 136 | 137 | 138 | =head2 start 139 | 140 | About : Run alignment 141 | Usage : my $obj = $aln->start; 142 | Args : Null 143 | Returns : Null 144 | 145 | =cut 146 | sub start 147 | { 148 | my ($settings) = @_; 149 | 150 | my $aln_in = $settings->{input}; 151 | my $aln_out = $settings->{output}; 152 | my $rh_ids = $settings->{id}; 153 | my $rh_seqs = $settings->{seq}; 154 | my $prog = $settings->{prog}; 155 | my $type = $settings->{type}; 156 | my $dir = $settings->{dir}; 157 | my $maxiters = 
$settings->{maxiters}; 158 | my $log = $settings->{log}; 159 | my $params = $settings->{params}; 160 | 161 | my $run_clustaw2 = "clustalw2 -infile=$aln_in -type=$type " 162 | . "-align -outfile=$aln_out -output=fasta " 163 | . "-outorder=input"; 164 | 165 | my $run_muscle = "muscle -in $aln_in -out $aln_out " 166 | . "-maxiters $maxiters -quiet"; 167 | 168 | my $cmdline = ($prog eq 'clustalw2') ? $run_clustaw2 : $run_muscle; 169 | $cmdline = "$cmdline $params " if $params; 170 | 171 | if ($log) { 172 | $cmdline .= " >>$log 2>&1"; 173 | } 174 | elsif ($prog eq 'clustalw2') { 175 | $cmdline .= " >/dev/null 2>&1"; 176 | } 177 | else { 178 | $cmdline .= " 2>/dev/null"; 179 | } 180 | ###print STDERR "$cmdline\n";exit; 181 | 182 | system "$cmdline"; 183 | 184 | return 0; 185 | } 186 | 187 | 188 | =head2 align_seqs 189 | 190 | About : Directly align sequences 191 | Usage : my $ra_aln_seqs = $aln->align_seqs($seq1, $seq2, ...); 192 | Args : Sequences need to be aligned, require at least two sequences 193 | Returns : Reference to an array contains aligned sequences 194 | 195 | =cut 196 | sub align_seqs 197 | { 198 | my ($obj, @seqs) = @_; 199 | 200 | my $tmp_dir = File::Temp->newdir(); 201 | my $tmp_in_fh = File::Temp->new( DIR => $tmp_dir ); 202 | my $tmp_in = $tmp_in_fh->filename; 203 | my $tmp_ot = $tmp_in . "_aln"; 204 | 205 | for (my $i=0; $i<@seqs; $i++) 206 | { 207 | print $tmp_in_fh ">$i\n$seqs[$i]\n"; 208 | } 209 | 210 | $obj->{input} = $tmp_in; 211 | $obj->{output} = $tmp_ot; 212 | 213 | my $flag = $obj->start($obj); 214 | 215 | return -1 if ($flag); 216 | 217 | my @aln_seqs = (); 218 | parse_fasta_SEQs(\@aln_seqs, $tmp_ot); 219 | ###print STDERR Dumper($tmp_ot); 220 | return \@aln_seqs; 221 | } 222 | 223 | 224 | 225 | =head2 align_codons 226 | 227 | About : Codon-guided sequences alignments 228 | Usage : my $ra_aln_seqs = $aln->align_seqs($seq1, $seq2, ...); 229 | Args : Sequences need to be aligned, require at least two sequences 230 | Returns : Reference to an array contains aligned sequences 231 | 232 | =cut 233 | sub align_codons 234 | { 235 | my ($obj, @seqs) = @_; 236 | 237 | my $tmp_dir = File::Temp->newdir(); 238 | my $tmp_in_fh = File::Temp->new( DIR => $tmp_dir ); 239 | my $tmp_in = $tmp_in_fh->filename; 240 | my $tmp_ot = $tmp_in . "_aln"; 241 | 242 | my %original_nts = (); 243 | for (my $i=0; $i<@seqs; $i++) 244 | { 245 | @{$original_nts{$i}} = split //, $seqs[$i]; 246 | 247 | my $prot = Translate($seqs[$i]); 248 | print $tmp_in_fh ">$i\n$prot\n"; 249 | } 250 | 251 | $obj->{input} = $tmp_in; 252 | $obj->{output} = $tmp_ot; 253 | $obj->{type} = 'PROTEIN'; 254 | 255 | my $flag = $obj->start($obj); 256 | 257 | return -1 if ($flag); 258 | 259 | my @aln_prots = (); 260 | parse_fasta_SEQs(\@aln_prots, $tmp_ot); 261 | 262 | my @aln_seqs = (); 263 | 264 | my %aligned_aas = (); 265 | for (my $i=0; $i<@aln_prots; $i++) 266 | { 267 | @{$aligned_aas{$i}} = split //, $aln_prots[$i]; 268 | 269 | my $rs_aln_seq = recover_codon2nt(\@{$original_nts{$i}}, $aligned_aas{$i}); 270 | 271 | push @aln_seqs, $$rs_aln_seq; 272 | } 273 | 274 | return \@aln_seqs; 275 | } 276 | 277 | 278 | =head2 recover_codon2nt 279 | 280 | About : Recover Nucleotide Sequence 281 | Usage : my $recovered_seqs = recover_codon2nt($nt, $aa); 282 | Args : Array reference to original nucleotides; 283 | Array reference to proteins. 284 | Returns : Reference to recovered sequences. 
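 Example  : an illustrative call with hypothetical inputs:

                my @nts = split //, 'ATGGCC';                 ## original coding sequence
                my @aas = ('M', '-', 'A');                    ## aligned protein with one gap
                my $rs_seq = recover_codon2nt(\@nts, \@aas);
                ## ${$rs_seq} is now 'ATG---GCC'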
285 | 286 | =cut 287 | sub recover_codon2nt 288 | { 289 | my ($ra_nt, $ra_aa) = @_; 290 | 291 | my $seq = ''; 292 | 293 | for (my $i=0; $i<=$#{$ra_aa};$i++) 294 | { 295 | if ($ra_aa->[$i] eq '-') { 296 | $seq .= '-' x 3; 297 | } else { 298 | $seq .= shift @{$ra_nt}; 299 | $seq .= shift @{$ra_nt}; 300 | $seq .= shift @{$ra_nt}; 301 | } 302 | } 303 | 304 | if (@{$ra_nt} > 0) { 305 | $seq .= join '', @{$ra_nt}; 306 | } 307 | 308 | return \$seq; 309 | } 310 | 311 | 1; 312 | 313 | 314 | =head1 VERSION 315 | 316 | 1.2.2 317 | 318 | =head1 AUTHOR 319 | 320 | Nowind, noitulove9891@gmail.com 321 | 322 | =head1 COPYRIGHT 323 | 324 | Copyright (c) Nowind's Area. All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. 325 | 326 | 327 | =cut -------------------------------------------------------------------------------- /site_perl/MyPerl/Compare.pm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # Compare.pm -- Sequences comparation 4 | # 5 | # Author: Nowind 6 | # Created: 2010-10-09 7 | # Updated: 2012-11-04 8 | # Version: 1.1.0 9 | # 10 | # Change logs: 11 | # Version 1.1.0 12/11/04: Add function check_codon_TsTv; remove dependency on bioperl. 12 | 13 | 14 | 15 | 16 | 17 | package MyPerl::Compare; 18 | 19 | use MyPerl::Convert; 20 | 21 | @ISA = qw(Exporter); 22 | 23 | @EXPORT = qw(Clip_Seq_By_Ref Compare_Codon Count_Indel 24 | Format_Aln_Sequence Count_Diff_Num check_codon_TsTv); 25 | 26 | 27 | $VERSION = '1.1.0'; 28 | 29 | 30 | ## 31 | ## Compare codons in two aligned sequences 32 | ## 33 | sub Compare_Codon 34 | { 35 | my ($ref, $cmp) = @_; 36 | 37 | my @cmp = (); 38 | my $pos = 0; 39 | 40 | while ((length $ref) >3) 41 | { 42 | ++$pos; 43 | 44 | my $code1 = substr($ref, 0, 3, ''); 45 | my $code2 = substr($cmp, 0, 3, ''); 46 | 47 | if ($code1 ne $code2) { 48 | my $p1 = Codon2AA($code1); 49 | my $p2 = Codon2AA($code2); 50 | 51 | if ($p1 ne -1 && $p2 ne -1) { 52 | if ($p1 eq $p2) { 53 | push @cmp, "$pos\t$code1->$code2\tSyn"; 54 | } 55 | else { 56 | push @cmp, "$pos\t$code1->$code2\tNon"; 57 | } 58 | } 59 | } 60 | } 61 | 62 | return \@cmp; 63 | } 64 | 65 | ## 66 | ## Find indel in sequences 67 | ## 68 | sub Count_Indel 69 | { 70 | my ($seq) = shift; 71 | 72 | my @indel = (); 73 | 74 | while ($seq =~ m/\-+/g) 75 | { 76 | my $pos = pos $seq; 77 | my $len = length $&; 78 | 79 | my $start = $pos - $len + 1; 80 | push @indel, "$start\t$len"; 81 | } 82 | 83 | return \@indel; 84 | } 85 | 86 | ## 87 | ## Use one of the sequences as the reference, and get rid of all '-' in the reference 88 | ## 89 | sub Clip_Seq_By_Ref 90 | { 91 | my ($rseq1, $rseq2, $limit) = @_; 92 | 93 | while ($$rseq1 =~ m/\-+/g) 94 | { 95 | my $pos = pos($$rseq1); 96 | my $len = length $&; 97 | 98 | next if ($limit && $len <= $limit); 99 | 100 | substr($$rseq1, $pos-$len, $len, ''); 101 | substr($$rseq2, $pos-$len, $len, ''); 102 | } 103 | } 104 | 105 | ## 106 | ## Count the difference number of bases 107 | ## 108 | sub Count_Diff_Num 109 | { 110 | my ($rseq1, $rseq2) = @_; 111 | 112 | my @nt1 = split //, $$rseq1; 113 | my @nt2 = split //, $$rseq2; 114 | 115 | my $aln_length = 0; 116 | my $diff_num = 0; 117 | 118 | for (my $i=0; $i<=$#nt1; $i++) 119 | { 120 | my $cmp1 = $nt1[$i]; 121 | my $cmp2 = $nt2[$i]; 122 | 123 | if ($cmp1 ne '-' && $cmp2 ne '-') { 124 | $aln_length++; 125 | 126 | if($cmp1 ne $cmp2) { 127 | $diff_num++; 128 | } 129 | } 130 | 131 | } 132 | 133 | return ($diff_num, $aln_length); 
134 | } 135 | 136 | ## Get rid of '-' at the head after alignment 137 | sub Format_Aln_Sequence 138 | { 139 | my ($rseq1, $rseq2) = @_; 140 | 141 | my $pos1 = pos($$rseq1) if $$rseq1 =~ m/\w/g; 142 | my $pos2 = pos($$rseq2) if $$rseq2 =~ m/\w/g; 143 | 144 | my $pos = $pos1 >= $pos2 ? $pos1 : $pos2; 145 | 146 | return 1 if ($pos == 1); # Neither of sequence has '-' in head 147 | 148 | substr($$rseq1, 0, $pos-1, ''); 149 | substr($$rseq2, 0, $pos-1, ''); 150 | 151 | Format_Aln_Sequence($rseq1, $rseq2); 152 | } 153 | 154 | ## Return the shorter length 155 | sub cmp_seq_length 156 | { 157 | my ($gene1, $gene2, $cds_seq) = @_; 158 | 159 | my $length1 = length $cds_seq->{$gene1}->seq; 160 | my $length2 = length $cds_seq->{$gene2}->seq; 161 | 162 | my $short = $length1 >= $length2 ? $length2 : $length1; 163 | 164 | return $short; 165 | } 166 | 167 | 168 | sub check_codon_TsTv # Transition or transverion 169 | { 170 | my ($cmp1, $cmp2) = @_; 171 | 172 | my %Ts = ( 173 | 'A' => 'G', 174 | 'G' => 'A', 175 | 'T' => 'C', 176 | 'C' => 'T', 177 | ); 178 | 179 | my @cmp1 = split //, $cmp1; 180 | my @cmp2 = split //, $cmp2; 181 | 182 | my $diff = 0; 183 | my $type = ''; 184 | 185 | for my $n (0,1,2) 186 | { 187 | if ($cmp1[$n] ne $cmp2[$n]) { 188 | $diff++; 189 | 190 | return 'N' unless exists($Ts{$cmp1[$n]}); 191 | 192 | if ($Ts{$cmp1[$n]} ne $cmp2[$n]) { # Transversion 193 | $type = 'Tv'; 194 | } 195 | else { # Transition 196 | $type = 'Ts'; 197 | } 198 | } 199 | } 200 | 201 | return 'N' unless $diff == 1; # Direct Neighbour 202 | 203 | return $type; 204 | } 205 | 206 | 207 | 1; -------------------------------------------------------------------------------- /site_perl/MyPerl/Convert.pm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # Convert.pm -- Convert between nucleotide and amino acid codes 4 | # 5 | # Author: Nowind 6 | # Created: 2010-10-09 7 | # Updated: 2016-05-08 8 | # Version: 1.2.0 9 | # 10 | # Change logs: 11 | # Version 1.0.0 10/10/09: The initial version. 12 | # Version 1.1.0 14/06/14: Remove dependency on bioperl. 13 | # Version 1.2.0 16/05/08: Make it faster. 14 | 15 | 16 | package MyPerl::Convert; 17 | 18 | use strict; 19 | 20 | require Exporter; 21 | 22 | 23 | ## 24 | ## Global Constants and Variables 25 | ## 26 | use vars qw( 27 | @ISA 28 | %EXPORT_TAGS 29 | @EXPORT_OK 30 | @EXPORT 31 | ); 32 | 33 | @ISA = qw(Exporter); 34 | 35 | %EXPORT_TAGS = ( 36 | 'all' => [ 37 | qw( 38 | Codon2AA 39 | Translate 40 | ) 41 | ] 42 | ); 43 | 44 | @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); 45 | @EXPORT = qw(); 46 | 47 | 48 | $MyPerl::FileIO::VERSION = '1.2.0'; 49 | 50 | 51 | =head1 METHODS 52 | 53 | =head2 Translate 54 | 55 | About : Translate nucleotides to amino acid. 56 | Usage : my $aa_seq = Translate($nt_seq); 57 | Args : Nucleotdie sequences. 58 | Returns : Amino acid sequences. 
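 Example  : an illustrative call (hypothetical sequence):

                my $aa_seq = Translate('ATGGCTTGA');   ## returns 'MA*'
                ## codons absent from the standard table (e.g. containing N)
                ## are silently skipped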
59 | 60 | =cut 61 | sub Translate 62 | { 63 | my ($nt_seq) = shift; 64 | 65 | my %genetic_code = ( 66 | 'TCA' => 'S', # Serine 67 | 'TCC' => 'S', # Serine 68 | 'TCG' => 'S', # Serine 69 | 'TCT' => 'S', # Serine 70 | 'TTC' => 'F', # Phenylalanine 71 | 'TTT' => 'F', # Phenylalanine 72 | 'TTA' => 'L', # Leucine 73 | 'TTG' => 'L', # Leucine 74 | 'TAC' => 'Y', # Tyrosine 75 | 'TAT' => 'Y', # Tyrosine 76 | 'TAA' => '*', # Stop 77 | 'TAG' => '*', # Stop 78 | 'TGC' => 'C', # Cysteine 79 | 'TGT' => 'C', # Cysteine 80 | 'TGA' => '*', # Stop 81 | 'TGG' => 'W', # Tryptophan 82 | 'CTA' => 'L', # Leucine 83 | 'CTC' => 'L', # Leucine 84 | 'CTG' => 'L', # Leucine 85 | 'CTT' => 'L', # Leucine 86 | 'CCA' => 'P', # Proline 87 | 'CCC' => 'P', # Proline 88 | 'CCG' => 'P', # Proline 89 | 'CCT' => 'P', # Proline 90 | 'CAC' => 'H', # Histidine 91 | 'CAT' => 'H', # Histidine 92 | 'CAA' => 'Q', # Glutamine 93 | 'CAG' => 'Q', # Glutamine 94 | 'CGA' => 'R', # Arginine 95 | 'CGC' => 'R', # Arginine 96 | 'CGG' => 'R', # Arginine 97 | 'CGT' => 'R', # Arginine 98 | 'ATA' => 'I', # Isoleucine 99 | 'ATC' => 'I', # Isoleucine 100 | 'ATT' => 'I', # Isoleucine 101 | 'ATG' => 'M', # Methionine 102 | 'ACA' => 'T', # Threonine 103 | 'ACC' => 'T', # Threonine 104 | 'ACG' => 'T', # Threonine 105 | 'ACT' => 'T', # Threonine 106 | 'AAC' => 'N', # Asparagine 107 | 'AAT' => 'N', # Asparagine 108 | 'AAA' => 'K', # Lysine 109 | 'AAG' => 'K', # Lysine 110 | 'AGC' => 'S', # Serine 111 | 'AGT' => 'S', # Serine 112 | 'AGA' => 'R', # Arginine 113 | 'AGG' => 'R', # Arginine 114 | 'GTA' => 'V', # Valine 115 | 'GTC' => 'V', # Valine 116 | 'GTG' => 'V', # Valine 117 | 'GTT' => 'V', # Valine 118 | 'GCA' => 'A', # Alanine 119 | 'GCC' => 'A', # Alanine 120 | 'GCG' => 'A', # Alanine 121 | 'GCT' => 'A', # Alanine 122 | 'GAC' => 'D', # Aspartic Acid 123 | 'GAT' => 'D', # Aspartic Acid 124 | 'GAA' => 'E', # Glutamic Acid 125 | 'GAG' => 'E', # Glutamic Acid 126 | 'GGA' => 'G', # Glycine 127 | 'GGC' => 'G', # Glycine 128 | 'GGG' => 'G', # Glycine 129 | 'GGT' => 'G', # Glycine 130 | ); 131 | 132 | $nt_seq = uc($nt_seq); 133 | 134 | my @codons = unpack("(A3)*", $nt_seq); 135 | my @aminos = (); 136 | 137 | for (my $i=0; $i<@codons; $i++) 138 | { 139 | if ($genetic_code{$codons[$i]}) { 140 | push @aminos, $genetic_code{$codons[$i]}; 141 | } 142 | } 143 | 144 | my $aa_seq = join '', @aminos; 145 | 146 | return $aa_seq; 147 | } 148 | 149 | 150 | 151 | 152 | =head2 Codon2AA 153 | 154 | About : Translate nucleotides to amino acid. 155 | Usage : my $aa = Codon2AA($codon); 156 | Args : Nucleotdie codon. 157 | Returns : Amino acid. 
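 Example  : illustrative calls:

                my $aa = Codon2AA('atg');   ## returns 'M' (codons are uppercased first)
                my $na = Codon2AA('ANN');   ## returns 'X' for codons not in the table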
158 | 159 | =cut 160 | sub Codon2AA 161 | { 162 | my($codon) = @_; 163 | 164 | $codon = uc $codon; 165 | 166 | my %genetic_code = ( 167 | 'TCA' => 'S', # Serine 168 | 'TCC' => 'S', # Serine 169 | 'TCG' => 'S', # Serine 170 | 'TCT' => 'S', # Serine 171 | 'TTC' => 'F', # Phenylalanine 172 | 'TTT' => 'F', # Phenylalanine 173 | 'TTA' => 'L', # Leucine 174 | 'TTG' => 'L', # Leucine 175 | 'TAC' => 'Y', # Tyrosine 176 | 'TAT' => 'Y', # Tyrosine 177 | 'TAA' => '*', # Stop 178 | 'TAG' => '*', # Stop 179 | 'TGC' => 'C', # Cysteine 180 | 'TGT' => 'C', # Cysteine 181 | 'TGA' => '*', # Stop 182 | 'TGG' => 'W', # Tryptophan 183 | 'CTA' => 'L', # Leucine 184 | 'CTC' => 'L', # Leucine 185 | 'CTG' => 'L', # Leucine 186 | 'CTT' => 'L', # Leucine 187 | 'CCA' => 'P', # Proline 188 | 'CCC' => 'P', # Proline 189 | 'CCG' => 'P', # Proline 190 | 'CCT' => 'P', # Proline 191 | 'CAC' => 'H', # Histidine 192 | 'CAT' => 'H', # Histidine 193 | 'CAA' => 'Q', # Glutamine 194 | 'CAG' => 'Q', # Glutamine 195 | 'CGA' => 'R', # Arginine 196 | 'CGC' => 'R', # Arginine 197 | 'CGG' => 'R', # Arginine 198 | 'CGT' => 'R', # Arginine 199 | 'ATA' => 'I', # Isoleucine 200 | 'ATC' => 'I', # Isoleucine 201 | 'ATT' => 'I', # Isoleucine 202 | 'ATG' => 'M', # Methionine 203 | 'ACA' => 'T', # Threonine 204 | 'ACC' => 'T', # Threonine 205 | 'ACG' => 'T', # Threonine 206 | 'ACT' => 'T', # Threonine 207 | 'AAC' => 'N', # Asparagine 208 | 'AAT' => 'N', # Asparagine 209 | 'AAA' => 'K', # Lysine 210 | 'AAG' => 'K', # Lysine 211 | 'AGC' => 'S', # Serine 212 | 'AGT' => 'S', # Serine 213 | 'AGA' => 'R', # Arginine 214 | 'AGG' => 'R', # Arginine 215 | 'GTA' => 'V', # Valine 216 | 'GTC' => 'V', # Valine 217 | 'GTG' => 'V', # Valine 218 | 'GTT' => 'V', # Valine 219 | 'GCA' => 'A', # Alanine 220 | 'GCC' => 'A', # Alanine 221 | 'GCG' => 'A', # Alanine 222 | 'GCT' => 'A', # Alanine 223 | 'GAC' => 'D', # Aspartic Acid 224 | 'GAT' => 'D', # Aspartic Acid 225 | 'GAA' => 'E', # Glutamic Acid 226 | 'GAG' => 'E', # Glutamic Acid 227 | 'GGA' => 'G', # Glycine 228 | 'GGC' => 'G', # Glycine 229 | 'GGG' => 'G', # Glycine 230 | 'GGT' => 'G', # Glycine 231 | ); 232 | 233 | if( exists $genetic_code{$codon} ) { 234 | return $genetic_code{$codon}; 235 | } 236 | else{ 237 | return 'X'; 238 | } 239 | } 240 | 241 | 1; -------------------------------------------------------------------------------- /site_perl/MyPerl/FileIO.pm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # FileIO.pm -- Process of files 4 | # 5 | # Author: Nowind 6 | # Created: 2010-10-09 7 | # Updated: 2015-11-13 8 | # Version: 2.1.0 9 | # 10 | # Change logs: 11 | # Version 1.0.0 11/11/12: Add function ParseSNPfile, ParseVCFSNPs, ParseShoreSNPs, ParseOtherSNPs. 12 | # Version 1.1.0 11/11/13: Add function ParseMergedSNPs. 13 | # Version 1.2.0 11/11/13: Remove function ParseOtherSNPs. 14 | # Version 1.3.0 11/11/13: Remove function ParseSNPfile, ParseVCFSNPs, ParseShoreSNPs, ParseMergedSNPs. 15 | # Version 1.3.1 11/12/04: Change function Parse_GFF to ParseGFF3. 16 | # Version 1.3.2 11/12/14: Change the way in reading files in function Parse_Fasta. 17 | # Version 1.4.0 12/06/15: Add function getInputFilehandle. 18 | # Version 1.4.1 12/06/19: Add gzipped and bzipped file support in reading fasta files. 19 | # Version 1.4.2 12/07/03: Modify match patterns in function ParseGFF3; change some 20 | # data structures to reduce memory use. 21 | # Version 1.4.3 12/07/04: Modify data structures in function ParseGFF3. 
22 | # Version 1.4.4 12/07/28: Add @ids as return values in function Parse_Fasta. 23 | # Version 1.5.0 12/07/30: Remove dependency of "Bio::SeqIO"; remove function Get_Seq_Hash. 24 | # Version 1.5.1 12/08/09: Add existence checking in function ParseGFF3. 25 | # Version 1.5.2 12/08/14: Bug fixed in ParseGFF3 in match patterns. 26 | # Version 1.5.3 12/10/21: Modify match patterns in ParseGFF3. 27 | # Version 1.5.4 12/11/04: Add checking for gene type in ParseGFF3. 28 | # Version 1.5.5 12/11/12: Add support for reading from stdin in getInputFilehandle. 29 | # Version 1.5.6 12/12/20: Bug fixed in reading stdin. 30 | # Version 2.0.0 12/12/28: Change to a safer way for exporting functions; change serveral functions' 31 | # names: Format_Fasta -> format_fasta_SEQs, Parse_Fasta -> parse_fasta_SEQs, 32 | # ParseGFF3 -> parse_gff3_file; remove unused functions Get_Seq_Hash, 33 | # Get_cds_info and Parse_Variant; complete the documents. 34 | # Version 2.0.1 13/01/21: Add annotations of intergenic regions in parse_gff3_file(uncompleted*). 35 | # Version 2.0.2 15/01/06: Bug fixed: "Use of uninitialized value $id" while id descriptions contian 36 | # character ">". 37 | # Version 2.1.0 15/11/13: Updated: add function get_genome_length. 38 | 39 | 40 | 41 | =head1 NAME 42 | 43 | MyPerl::FileIO - Local perl module for file-related operations 44 | 45 | 46 | =head1 SYNOPSIS 47 | 48 | use MyPerl::FileIO qw(:all); 49 | 50 | my $fh = getInputFilehandle( $filename ); 51 | 52 | my $genome_size = get_genome_length(\%chrom_ids, \%chrom_lengths, $length_file, \@exclude_chroms); 53 | 54 | my @SEQ_IDs = parse_fasta_SEQs(\%SEQs, $filename); 55 | my @SEQ_IDs = parse_fasta_SEQs(\@SEQs, $filename, 'full'); 56 | 57 | my $formated_str = format_fasta_SEQs($id, \$seq, $word_wrap); 58 | 59 | parse_gff3_file(\%GFF_info, $filename); 60 | 61 | =head1 DESCRIPTION 62 | 63 | Local perl module used for reading, parsing and format files used in bioinformatics 64 | 65 | =cut 66 | 67 | package MyPerl::FileIO; 68 | 69 | use strict; 70 | 71 | require Exporter; 72 | 73 | ## 74 | ## Global Constants and Variables 75 | ## 76 | use vars qw( 77 | @ISA 78 | %EXPORT_TAGS 79 | @EXPORT_OK 80 | @EXPORT 81 | ); 82 | 83 | @ISA = qw(Exporter); 84 | 85 | %EXPORT_TAGS = ( 86 | 'all' => [ 87 | qw( 88 | getInputFilehandle 89 | get_genome_length 90 | format_fasta_SEQs 91 | parse_fasta_SEQs 92 | parse_gff3_file 93 | ) 94 | ] 95 | ); 96 | 97 | @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); 98 | @EXPORT = qw(); 99 | 100 | 101 | $MyPerl::FileIO::VERSION = '2.1.0'; 102 | 103 | 104 | =head1 METHODS 105 | 106 | =head2 getInputFilehandle 107 | 108 | About : Open and return filehandles 109 | Usage : my $fh = getInputFilehandle($filename); 110 | Args : Filename 111 | Returns : Filehandle to the opened file 112 | 113 | =cut 114 | sub getInputFilehandle 115 | { 116 | my ($in) = shift; 117 | 118 | my $expr = "-"; 119 | 120 | if (!defined $in || $in eq "-") { ## read from STDIN 121 | $expr = "-"; 122 | } 123 | elsif ($in =~ /\.tar\.gz$/) { ## read from a tar gzip ball 124 | $expr = "tar -zxf $in -O |"; 125 | } 126 | elsif ($in =~ /\.tar\.bz2$/) { ## read from a tar bzip2 ball 127 | $expr = "tar -jxf $in -O |"; 128 | } 129 | elsif ($in =~ /\.gz$/) { ## read from a gzip file 130 | $expr = "gzip -dc $in |"; 131 | } 132 | elsif ($in =~ /\.bz2$/) { ## read from a bzip2 file 133 | $expr = "bzip2 -dc $in |"; 134 | } 135 | elsif ($in =~ /\.zip$/) { ## read from a zip file 136 | $expr = "unzip -p $in |"; 137 | } 138 | else { 139 | $expr = "< $in"; 140 | } 141 | 142 | open (my $fh, 
$expr) || die $!; 143 | 144 | return $fh; 145 | } 146 | 147 | 148 | =head2 parse_fasta_SEQs 149 | 150 | About : Reading sequences from a file in fasta format 151 | Usage : my @SEQ_IDs = parse_fasta_SEQs(\%SEQs, $filename, $description); 152 | Args : Reference to a hash or array to hold all the sequences; 153 | Fasta filename; 154 | Descriptions used for parsing sequence IDs. 155 | Returns : Array of sequence IDs. 156 | 157 | =cut 158 | sub parse_fasta_SEQs 159 | { 160 | my ($rf_seq, $in, $desc) = @_; 161 | 162 | my $match_str = '^(.*?)\s+'; 163 | 164 | if ($desc) { 165 | if ($desc eq 'full') { 166 | $match_str = '^(.*?)\n' 167 | } 168 | elsif ($desc eq 'gbk') { 169 | $match_str = 'gb\|(.*?)\.'; 170 | } 171 | } 172 | 173 | my $fh = getInputFilehandle($in); 174 | my $fas = do { local $/; <$fh> }; 175 | 176 | my @fas = split /\n\>/, $fas; 177 | 178 | $fas[0] =~ s/^\>//; 179 | #shift @fas; 180 | 181 | my @ids = (); 182 | 183 | for my $str (@fas) 184 | { 185 | $str =~ /$match_str/; 186 | my $id = $1; 187 | 188 | $str =~ s/.*\n//; 189 | #$str =~ s/\>//; 190 | $str =~ s/\s+//g; 191 | 192 | if (ref($rf_seq) eq 'HASH') { ## read into a hash 193 | $rf_seq->{$id} = $str; 194 | } 195 | elsif (ref($rf_seq) eq 'ARRAY') { ## read into an array 196 | push @{$rf_seq}, $str; 197 | } 198 | 199 | push @ids, $id; 200 | } 201 | 202 | return (@ids); 203 | } 204 | 205 | 206 | =head2 format_fasta_SEQs 207 | 208 | About : Format sequence for output in fasta format 209 | Usage : my $formated_str = format_fasta_SEQs($id, \$seq, $word_wrap); 210 | Args : ID of the sequence; 211 | Reference to sequence; 212 | Maximum columns before line wraps. 213 | Returns : Formated string. 214 | 215 | =cut 216 | sub format_fasta_SEQs 217 | { 218 | my ($id, $rs_seq, $wrap) = @_; 219 | 220 | my $seq = $$rs_seq; 221 | my $str = ">$id\n"; 222 | 223 | if ($wrap) { 224 | while (length $seq > $wrap) 225 | { 226 | $str .= substr($seq, 0, $wrap, ''); 227 | $str .= "\n"; 228 | } 229 | } 230 | $str .= "$seq\n"; 231 | 232 | return $str; 233 | } 234 | 235 | 236 | 237 | =head2 parse_gff3_file 238 | 239 | About : Parsing annotation informations in gff3 format file 240 | Usage : parse_gff3_file(\%GFF_info, $filename); 241 | Args : Reference to a hash to hold all the annotation informations; 242 | Filename of the gff3 format file. 243 | Returns : Null 244 | 245 | =cut 246 | sub parse_gff3_file 247 | { 248 | my ($rh_GFF, $in) = @_; 249 | 250 | my $fh = getInputFilehandle($in); 251 | 252 | while (<$fh>) 253 | { 254 | next if (/#/ || /^\s+$/); 255 | 256 | chomp; 257 | 258 | my ($chr, $desc, $type) = (split /\s+/)[0..2]; 259 | 260 | if ( $type =~ /chromosome/ ) { ## chromosome 261 | m{ 262 | ^(.*?)\s+.*? # Chromosome ID 263 | chromosome\s+(\d+) # Start Position 264 | \s+(\d+) # End Position 265 | }x; 266 | 267 | next unless $1; 268 | 269 | my ($chr, $start, $end) = ($1, $2, $3); 270 | 271 | $rh_GFF->{$chr}->{pos} = "$start\-$end"; 272 | } 273 | elsif ( $type =~ /(.*?gene)/ ) { ## gene 274 | my $gene_type = $1; 275 | 276 | m{ 277 | ^(.*?)\s+.*? # Chromosome ID 278 | gene\s+(\d+) # Start Position 279 | \s+(\d+)\s+\. # End Position 280 | \s+(\-|\+)\s+\. 
# Strand 281 | \s+ID\=(.*?)\;.* # ID 282 | Note\=(.*?)(;|$) # Note 283 | }x; 284 | 285 | next unless $1; 286 | 287 | my ($chr, $start, $end, $Strand, $ID, $Note) = 288 | ($1, $2, $3, $4, $5, $6); 289 | 290 | $rh_GFF->{$chr}->{gene}->{$ID}->{type} = $gene_type; 291 | $rh_GFF->{$chr}->{gene}->{$ID}->{strand} = $Strand; 292 | $rh_GFF->{$chr}->{gene}->{$ID}->{pos} = "$start\-$end"; 293 | $rh_GFF->{$chr}->{gene}->{$ID}->{note} = $Note; 294 | } 295 | elsif ( $type =~ /(mRNA|pseudogenic\_transcript)/ ) { ## mRNA 296 | m{ 297 | ^(.*?)\s+.*? # Chromosome ID 298 | \s+(\d+) # Start Position 299 | \s+(\d+)\s+.* # End Position 300 | \s+(\-|\+)\s+\. # Strand 301 | \s+ID\=(.*?)\;.* # ID 302 | (Parent|Locus_id)\=(.*?)(;|$) # Parent 303 | }x; 304 | 305 | next unless $1; 306 | 307 | my ($chr, $start, $end, $Strand, $ID, $Parent) = 308 | ($1, $2, $3, $4, $5, $7); 309 | 310 | ###print "$chr\t$start\t$end\t$Strand\t$ID\t$Parent\n";exit; 311 | 312 | $rh_GFF->{$chr}->{mRNA}->{$ID}->{strand} = $Strand; 313 | $rh_GFF->{$chr}->{mRNA}->{$ID}->{pos} = "$start\-$end"; 314 | 315 | push @{$rh_GFF->{$chr}->{gene}->{$Parent}->{mRNA}}, $ID; 316 | } 317 | elsif ( $type =~ /(five|three)\_prime\_UTR/ ) { ## five prime UTR 318 | my $terminal = $1; 319 | 320 | m{ 321 | ^(.*?)\s+.*? # Chromosome ID 322 | UTR\s+(\d+) # Start Position 323 | \s+(\d+).*? # End Position 324 | Parent\=(.*)(;|$) # Parent 325 | }x; 326 | 327 | next unless $1; 328 | 329 | my ($chr, $start, $end, $Parent) = ($1, $2, $3, $4); 330 | push @{$rh_GFF->{$chr}->{mRNA}->{$Parent}->{$terminal}}, "$start\-$end"; 331 | } 332 | elsif ( $type =~ /(exon|pseudogenic\_exon)/ ) { ## exon 333 | m{ 334 | ^(.*?)\s+.*? # Chromosome ID 335 | exon\s+(\d+) # Start Position 336 | \s+(\d+).*? # End Position 337 | Parent\=(.*)(;|$) # Parent 338 | }x; 339 | 340 | next unless $1; 341 | 342 | my ($chr, $start, $end, $Parent) = ($1, $2, $3, $4); 343 | 344 | push @{$rh_GFF->{$chr}->{mRNA}->{$Parent}->{exon}}, "$start\-$end"; 345 | } 346 | elsif ( $type =~ /CDS/ ) { ## CDS 347 | m{ 348 | ^(.*?)\s+.*? # Chromosome ID 349 | CDS\s+(\d+) # Start Position 350 | \s+(\d+).*? # End Position 351 | Parent=(.*?)(,|$) # Parent 352 | }x; 353 | 354 | next unless $1; 355 | 356 | my ($chr, $start, $end, $Parent) = ($1, $2, $3, $4); 357 | push @{$rh_GFF->{$chr}->{mRNA}->{$Parent}->{CDS}}, "$start\-$end"; 358 | } 359 | 360 | 361 | ###last if $.>40; 362 | } 363 | 364 | #for my $chrom (sort keys %{$rh_GFF}) 365 | #{ 366 | # 367 | # my @genes = sort {(split /\-/, $rh_GFF->{$chrom}->{gene}->{$a}->{pos})[0] <=> 368 | # (split /\-/, $rh_GFF->{$chrom}->{gene}->{$b}->{pos})[0]} (keys %{$rh_GFF->{$chrom}->{gene}}); 369 | # 370 | # # intergenic region before the first gene 371 | # my ($gene_id, $gene_start, $gene_end) = (split /\-/, $genes[0]); 372 | # my $intgenic_id = $gene_id; 373 | # my $intgenic_start = 1; 374 | # my $intgenic_end = $gene_start - 1; 375 | # 376 | # print STDOUT "$chrom\t.\tintergenic\t$intgenic_start\t$intgenic_end\t." 377 | # . "\t+\t.\tID=$intgenic_id;Name=$intgenic_id;Note=N/A\n"; 378 | # 379 | # for (my $i=0; $i<$#genes; $i++) 380 | # { 381 | # my ($bef_id, $bef_start, $bef_end) = (split /\-/, $genes[$i]); 382 | # my ($aft_id, $aft_start, $aft_end) = (split /\-/, $genes[$i+1]); 383 | # 384 | # if ($bef_end < $aft_start) { 385 | # $intgenic_id = "$bef_id-$aft_id"; 386 | # $intgenic_start = $bef_end + 1; 387 | # $intgenic_end = $aft_start - 1; 388 | # 389 | # print STDOUT "$chrom\t.\tintergenic\t$intgenic_start\t$intgenic_end\t." 390 | # . 
"\t+\t.\tID=$intgenic_id;Name=$intgenic_id;Note=N/A\n"; 391 | # } 392 | # elsif ($aft_end < $bef_end) { 393 | # @genes = @genes[0..$i, ($i+2)..$#genes]; 394 | # $i--; 395 | # } 396 | # } 397 | # 398 | # # intergenic region after the last gene 399 | # ($gene_id, $gene_start, $gene_end) = (split /\-/, $genes[-1]); 400 | # $intgenic_id = $gene_id; 401 | # $intgenic_start = $gene_end + 1; 402 | # $intgenic_end = $chroms{length}->[$i];; 403 | # 404 | # print STDOUT "$chrom\t.\tintergenic\t$intgenic_start\t$intgenic_end\t." 405 | # . "\t+\t.\tID=$intgenic_id;Name=$intgenic_id;Note=N/A\n"; 406 | #} 407 | } 408 | 409 | 410 | =head2 get_genome_length 411 | 412 | About : Get length of each chromosomes. 413 | Usage : my $genome_size = get_genome_length(\@chrom_ids, \%chrom_lengths, $length_file, \@exclude_chroms); 414 | Args : Array reference to save all chromosome ids; 415 | Hash reference to save all chromosome lengths; 416 | Input file contain chromosome ids and lengthes; 417 | Chromosome ids to be ignored (pattern matches). 418 | Returns : Total size of all supplied chromosomes. 419 | 420 | =cut 421 | sub get_genome_length 422 | { 423 | my ($ra_chrom_ids, $rh_chrom_lens, $in, $ra_exclude_chroms) = @_; 424 | 425 | ## exclude unwanted chromosomes or scaffolds while simulating, all 426 | ## chromosomes with ids match strings specified here would be ignored 427 | my $exclude_str = ''; 428 | if ($ra_exclude_chroms && @{$ra_exclude_chroms} > 0) { 429 | $exclude_str = join '|', @{$ra_exclude_chroms}; 430 | } 431 | 432 | my $genome_size = 0; 433 | 434 | my $fh = getInputFilehandle($in); 435 | while (<$fh>) 436 | { 437 | next if (/^\#/ || /^\s+$/); 438 | 439 | my ($CHROM, $LENGTH) = (split /\s+/); 440 | 441 | if ($ra_exclude_chroms && @{$ra_exclude_chroms} > 0) { 442 | next if ($CHROM =~ /($exclude_str)/); 443 | } 444 | 445 | push @{$ra_chrom_ids}, $CHROM; 446 | $rh_chrom_lens->{$CHROM} = $LENGTH; 447 | 448 | $genome_size += $LENGTH; 449 | } 450 | 451 | return $genome_size; 452 | } 453 | 454 | 455 | 456 | 457 | 1; 458 | 459 | =head1 VERSION 460 | 461 | 2.1.0 462 | 463 | =head1 AUTHOR 464 | 465 | Nowind, noitulove9891@gmail.com 466 | 467 | =head1 COPYRIGHT 468 | 469 | Copyright (c) Nowind's Area. All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. 470 | 471 | 472 | =cut 473 | 474 | 475 | -------------------------------------------------------------------------------- /site_perl/MyPerl/README.md: -------------------------------------------------------------------------------- 1 | Hard to choose an appropriate name ... 2 | -------------------------------------------------------------------------------- /site_perl/MyPerl/Statistics.pm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # Statistics.pm -- do some statistic jobs 4 | # 5 | # Author: Nowind 6 | # Created: 2010-10-09 7 | # Updated: 2015-11-16 8 | # Version: 1.1.0 9 | # 10 | # Change logs: 11 | # Version 1.0.0 12/10/18: The initial version. 12 | # Version 1.1.0 15/11/16: Updated: Add function chi_squared_test. 
13 | 14 | 15 | 16 | 17 | use strict; 18 | use Carp qw< croak >; 19 | use List::Util qw< sum >; 20 | use Statistics::Distributions qw< chisqrprob >; 21 | 22 | 23 | package MyPerl::Statistics; 24 | 25 | 26 | sub new 27 | { 28 | my ($pkg, %data) = @_; 29 | 30 | bless { 31 | "numbers" => $data{numbers}, 32 | "observed" => $data{observed}, 33 | "expected" => $data{expected}, 34 | "correct" => $data{yates_correction}, 35 | }, $pkg; 36 | } 37 | 38 | 39 | sub count 40 | { 41 | my $obj = shift; 42 | 43 | return (scalar @{$obj->{numbers}}); 44 | } 45 | 46 | sub sum 47 | { 48 | my $obj = shift; 49 | 50 | my $sum = 0; 51 | for my $num (@{$obj->{numbers}}) 52 | { 53 | $sum += $num; 54 | } 55 | 56 | return $sum; 57 | } 58 | 59 | sub mean 60 | { 61 | my $obj = shift; 62 | 63 | my $num = scalar @{$obj->{numbers}}; 64 | my $sum = $obj->sum(); 65 | 66 | my $mean = $sum / $num; 67 | } 68 | 69 | 70 | sub sqsum 71 | { 72 | my $obj = shift; 73 | 74 | my $mean = $obj->mean(); 75 | 76 | my $sqsum = 0; 77 | for my $num (@{$obj->{numbers}}) 78 | { 79 | $sqsum += ( $num - $mean ) ** 2; 80 | } 81 | 82 | return $sqsum; 83 | } 84 | 85 | ## sample standard deviation 86 | sub stdev_s 87 | { 88 | my $obj = shift; 89 | 90 | my $num = scalar @{$obj->{numbers}}; 91 | 92 | return 0 unless( $num > 1 ); 93 | 94 | my $sqsum = $obj->sqsum(); 95 | 96 | $sqsum /= ( $num - 1 ); 97 | 98 | my $stdev = sqrt($sqsum); 99 | 100 | return $stdev; 101 | } 102 | 103 | sub stdev_p 104 | { 105 | my $obj = shift; 106 | 107 | my $num = scalar @{$obj->{numbers}}; 108 | 109 | my $sqsum = $obj->sqsum(); 110 | 111 | $sqsum /= ( $num ); 112 | 113 | my $stdev = sqrt($sqsum); 114 | 115 | return $stdev; 116 | } 117 | 118 | 119 | =head2 chi_squared_test 120 | 121 | About : Pearson's chi-square test. 122 | Source : http://stackoverflow.com/questions/21204733/a-better-chi-square-test-for-perl 123 | Usage : chi_squared_test("observed" => \@observed, 124 | "expected" => \@expected, 125 | "correct" => $yates_correction,); 126 | Args : Observed counts; 127 | Expected counts; 128 | Determine wether Yates' correction should be used. 129 | Returns : P-values. 130 | 131 | =cut 132 | sub chi_squared_test 133 | { 134 | my $obj = shift; 135 | 136 | return -2 unless(@{$obj->{observed}} == @{$obj->{expected}}); 137 | 138 | my $chi_squared = sum map { 139 | ( $obj->{observed}->[$_] - $obj->{expected}->[$_] )**2 / $obj->{expected}->[$_] 140 | } 0 .. $#{$obj->{observed}}; 141 | 142 | ## Yates's correction for continuity 143 | if ($obj->{correct}) { 144 | $chi_squared = sum map { 145 | ( abs($obj->{observed}->[$_] - $obj->{expected}->[$_]) - 0.5 )**2 / $obj->{expected}->[$_] 146 | } 0 .. $#{$obj->{observed}}; 147 | } 148 | 149 | my $degrees_of_freedom = @{$obj->{observed}} - 1; 150 | 151 | my $probability = chisqrprob( 152 | $degrees_of_freedom, 153 | $chi_squared 154 | ); 155 | 156 | return $probability; 157 | } 158 | 159 | 160 | 1; -------------------------------------------------------------------------------- /subtotal_stats.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # subtotal_stats.pl -- Subtotal stats by different rows. 4 | # 5 | # 6 | # Author: Nowind 7 | # Created: 2012-05-31 8 | # Updated: 2019-01-03 9 | # Version: 2.1.0 10 | # 11 | # Change logs: 12 | # Version 1.0.0 13/05/07: The initial version. 13 | # Version 1.0.1 13/05/09: Add option "--parts" to count values located in different 14 | # intervals. 15 | # Version 1.0.2 13/05/13: Add option "--keys" to specify output keys. 
16 | # Version 1.0.3 13/06/13: Add test for non-numeric values. 17 | # Version 1.0.4 14/06/10: Change "--parts" to "--percent" to use "percentile" function 18 | # instead of "frequency_distribution_ref" function. 19 | # Version 1.1.0 14/07/02: Add function to stat lines with multiple values. 20 | # Version 1.2.0 14/12/18: Add function to calculate standard errors. 21 | # Version 1.2.1 14/12/19: Change input rows parsed in function count_multi2. 22 | # Version 2.0.0 14/12/23: Remove redundant functions, rewrite and rearrange most codes. 23 | # Version 2.0.1 15/10/20: Remove unused option "--rows"; bug fixed while processing multiple 24 | # values; update explanation of some options. 25 | # Version 2.0.2 15/10/23: Bug fixed while no percentile value returned. 26 | # Version 2.0.3 15/11/12: Add median values in output results. 27 | # Version 2.0.4 17/05/27: Update: Add option "--skip-minus" to skip unwanted minus values. 28 | # Version 2.1.0 19/01/03: Update: Add option "--freq-cnt" to count frequency of string values. 29 | 30 | 31 | use strict; 32 | 33 | use Data::Dumper; 34 | use Getopt::Long; 35 | use File::Find::Rule; 36 | use File::Basename; 37 | use Statistics::Descriptive; 38 | use Statistics::PointEstimation; 39 | use Scalar::Util qw(looks_like_number); 40 | 41 | use MyPerl::FileIO qw(:all); 42 | 43 | ######################## Main ######################## 44 | 45 | 46 | my $CMDLINE = "perl $0 @ARGV"; 47 | my $VERSION = '2.1.0'; 48 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 49 | 50 | 51 | my %options = (); 52 | my ($output, $multi_values, @percentiles, $skip_minus); 53 | GetOptions( 54 | "input=s" => \$options{input_file}, 55 | 56 | "O|output=s" => \$output, 57 | 58 | "D|out-orders=s{,}" => \@{$options{out_orders}}, 59 | 60 | "key-type=s" => \$options{key_type}, 61 | "value-type=s" => \$options{value_type}, 62 | "freq-cnt" => \$options{out_value_freq}, 63 | 64 | "percent=f{,}" => \@percentiles, 65 | 66 | "multi-values" => \$multi_values, 67 | 68 | "skip-minus" => \$skip_minus, 69 | ); 70 | 71 | unless( $options{input_file} ) { 72 | print < 82 | input file with first row used as key, remain rows as values, there 83 | could be only one key, but multiple values can be specified by using 84 | "--multi-values" option, required 85 | 86 | -O, --output 87 | output filename, default to STDOUT 88 | 89 | --key-type 90 | manully specify the type of keys, "numeric" or "string", otherwise 91 | will be determined by automatic detection 92 | --value-type 93 | manully specify the type of values, "numeric" or "string", otherwise 94 | will be determined by automatic detection, note when value type is 95 | string, only occurence will be counted 96 | 97 | -p, --percent 98 | sort the data and returns the value that corresponds to those 99 | percentiles specified here, can have multiple values 100 | 101 | -m, --multi-values 102 | subtotal stats of lines with one master key with multiple values, the 103 | input file should start with an header line with content look like: 104 | 105 | #key value_id1 value_id2 value_id3 ... 106 | ex1 1000 1000 1000 ... 107 | ex1 1500 1010 500 ... 108 | ex1 1300 2000 3000 ... 109 | ex2 1050 1000 1000 ... 110 | ... 111 | 112 | then the actual key field would be "value_id1:ex1", "value_id2:ex1", 113 | "value_id1:ex2", etc. 
114 | 115 | -f, --freq-cnt 116 | count frequency of "string" values 117 | 118 | -D, --out-orders 119 | specify keys manually, the output will be sorted according to the 120 | order specified here, this option is used to control the output 121 | order of key fields (or master key fields while "--multi-values" option 122 | is specified) 123 | 124 | -s, --skip-minus 125 | skip minus numbers in values 126 | 127 | EOF 128 | 129 | exit(1); 130 | } 131 | 132 | $|++; 133 | 134 | if ($output) { 135 | open (STDOUT, "> $output") || die $!; 136 | } 137 | 138 | 139 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 140 | 141 | print STDOUT "$HEADER##" . (scalar localtime()) . "\n"; 142 | 143 | 144 | count_subtotal(\%options); 145 | 146 | 147 | 148 | print STDERR "# " . (scalar localtime()) . "\n"; 149 | 150 | ######################### Sub ######################### 151 | 152 | 153 | 154 | =head2 count_subtotal 155 | 156 | About : Count subtotals. 157 | Usage : count_subtotal($file); 158 | Args : Array reference to hold background blocks infos; 159 | File contains blocks infos. 160 | Returns : Null 161 | 162 | =cut 163 | sub count_subtotal 164 | { 165 | my ($opts) = @_; 166 | 167 | 168 | printf STDERR ">> Start parsing $opts->{input_file} ... "; 169 | my @titles = (); 170 | my %Stats = (); 171 | my $key_type = $options{key_type} ? $options{key_type} : 'string'; 172 | my $value_type = $options{value_type} ? $options{value_type} : 'string'; 173 | my $fh = getInputFilehandle($opts->{input_file}); 174 | while (<$fh>) 175 | { 176 | if (/^\#/) { 177 | @titles = (split /\s+/); 178 | } 179 | 180 | next if (/^\#/ || /^\s+$/); 181 | 182 | my ($key, @values) = (split /\s+/); 183 | 184 | ## 185 | ## check type of key and values 186 | ## 187 | if(!$options{key_type} && looks_like_number($key)) { 188 | $key_type = 'numeric'; 189 | } 190 | 191 | if(!$options{value_type} && looks_like_number($values[0])) { 192 | $value_type = 'numeric'; 193 | } 194 | 195 | for (my $i=0; $i<@values; $i++) 196 | { 197 | next if ($skip_minus && $values[$i] < 0); 198 | 199 | if ($multi_values) { 200 | push @{$Stats{$key}->{$i}}, $values[$i]; 201 | } 202 | else { 203 | push @{$Stats{$key}->{0}}, $values[$i]; 204 | } 205 | } 206 | } 207 | print STDERR "done!\n"; 208 | 209 | 210 | ## 211 | ## sort output by key values 212 | ## 213 | my @user_keys = (); 214 | if (@{$opts->{out_orders}} > 0) { 215 | @user_keys = @{$opts->{out_orders}}; 216 | } 217 | else { 218 | if ($key_type eq 'string') { 219 | @user_keys = sort {$a cmp $b} keys %Stats; 220 | } 221 | else { 222 | @user_keys = sort {$a <=> $b} keys %Stats; 223 | } 224 | } 225 | 226 | 227 | ## 228 | ## generate stats 229 | ## 230 | if ($value_type eq 'numeric') { 231 | if (@percentiles > 0) { 232 | my $parts = join "\t", @percentiles; 233 | print "#ID\tCount\tSum\tMean\tMedian\tMin\tMax\tStdev\tStd_err\tPercentile:$parts\n"; 234 | } 235 | else { 236 | print "#ID\tCount\tSum\tMean\tMedian\tMin\tMax\tStdev\tStd_err\n"; 237 | } 238 | } 239 | else { 240 | print "#ID\tCount(All)\tCount(NoDup)\n"; 241 | } 242 | 243 | 244 | print STDERR ">> Start generating results ... "; 245 | for my $sub_key (@user_keys) 246 | { 247 | unless($Stats{$sub_key}) { 248 | next; 249 | } 250 | 251 | for my $i (sort {$a <=> $b} keys %{$Stats{$sub_key}}) 252 | { 253 | my $out_id = $sub_key; 254 | my $out_stats = ''; 255 | 256 | if ($multi_values) { 257 | my $column_id = (@titles > 0) ? 
$titles[$i+1] : ($i+1); 258 | 259 | $out_id = "$column_id:$sub_key"; 260 | } 261 | 262 | if (($options{value_type} && $options{value_type} eq 'numeric') || 263 | (!$options{value_type} && looks_like_number($Stats{$sub_key}->{$i}->[0]))) { 264 | my $stat = Statistics::Descriptive::Full->new(); 265 | $stat->add_data(@{$Stats{$sub_key}->{$i}}); 266 | 267 | my $count = $stat->count(); 268 | my $sum = $stat->sum(); 269 | my $mean = $stat->mean(); 270 | my $median = $stat->median(); 271 | my $min = $stat->min(); 272 | my $max = $stat->max(); 273 | my $var = $stat->variance(); 274 | my $stdev = $stat->standard_deviation(); 275 | my $stderr = $stdev / sqrt($count); 276 | 277 | if (@percentiles > 0) { 278 | my @perc_counts = (); 279 | for (@percentiles) 280 | { 281 | my $perc_count = defined($stat->percentile($_)) ? $stat->percentile($_) : '-'; 282 | push @perc_counts, $perc_count; 283 | } 284 | 285 | my $perc_counts = join "\t", @perc_counts; 286 | 287 | $out_stats = "$count\t$sum\t$mean\t$median\t$min\t$max\t$stdev\t$stderr\t$perc_counts"; 288 | } 289 | else { 290 | $out_stats = "$count\t$sum\t$mean\t$median\t$min\t$max\t$stdev\t$stderr"; 291 | } 292 | } 293 | else { 294 | ## 295 | ## only count frequency of each string 296 | ## 297 | my %cnts = (); 298 | my %dups = (); 299 | for my $j (@{$Stats{$sub_key}->{$i}}) 300 | { 301 | $cnts{$j} ++; 302 | next if (exists $dups{$j}); 303 | $dups{$j} ++; 304 | } 305 | 306 | my $cnt_all = scalar @{$Stats{$sub_key}->{$i}}; 307 | my $cnt_nodup = scalar (keys %dups); 308 | 309 | $out_stats = "$cnt_all\t$cnt_nodup"; 310 | 311 | if ($options{out_value_freq}) { 312 | for my $j (sort keys %cnts) 313 | { 314 | print STDOUT "$out_id:$j\t$cnts{$j}\t$dups{$j}\n"; 315 | } 316 | 317 | $out_id .= ":All"; 318 | } 319 | } 320 | 321 | print STDOUT "$out_id\t$out_stats\n"; 322 | } 323 | } 324 | print STDERR "done!\n"; 325 | } 326 | -------------------------------------------------------------------------------- /vcf2tables.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # vcf2tables.pl -- convert vcf file to tabular file 4 | # 5 | # Author: Nowind 6 | # Created: 2012-02-21 7 | # Updated: 2016-09-28 8 | # Version: 1.2.1 9 | # 10 | # Change logs: 11 | # Version 1.0.0 16/07/30: The initial version. 12 | # Version 1.1.0 16/08/12: Updated: add option "--compact" to specify output format. 13 | # Version 1.2.0 16/09/21: Updated: add option "--depth" to generate overall allele depths. 14 | # Version 1.2.1 16/09/28: Updated: add square brackets around alleles. 15 | 16 | 17 | 18 | 19 | use strict; 20 | 21 | use Data::Dumper; 22 | use Getopt::Long; 23 | 24 | use MyPerl::FileIO qw(:all); 25 | 26 | ##################### Main #################### 27 | 28 | 29 | my $CMDLINE = "perl $0 @ARGV"; 30 | my $VERSION = '1.2.1'; 31 | my $HEADER = "##$CMDLINE\n##Version: $VERSION\n"; 32 | my $SOURCE = (scalar localtime()) . 
" Version: $VERSION"; 33 | 34 | my $min_out_depth = 1; 35 | my ($vcf_file, $output, @query_info_fields, @query_sample_fields, 36 | $show_sample_alleles, $count_allele_depth, $compact_out); 37 | GetOptions( 38 | "vcf=s" => \$vcf_file, 39 | "output=s" => \$output, 40 | 41 | "info=s{,}" => \@query_info_fields, 42 | "sample-info=s{,}" => \@query_sample_fields, 43 | 44 | "use-nucl" => \$show_sample_alleles, 45 | 46 | "depth-count" => \$count_allele_depth, 47 | "min-depth=i" => \$min_out_depth, 48 | 49 | "compact" => \$compact_out, 50 | ); 51 | 52 | unless( $vcf_file ) { 53 | print < 63 | input vcf file, required 64 | -o, --output 65 | output filename, default to STDOUT 66 | 67 | -i, --info 68 | overall INFO fields to output, can have multiple values 69 | -s, --sample-info 70 | sample infos to output, can have multiple values [default: GT] 71 | 72 | -u, --use-nucl 73 | use nucleotides instead of GT code (e.g. 0,1,...) for each sample, 74 | the REF and ALT fields will be omitted 75 | 76 | -c, --compact 77 | make results more compact by output all samples in a single row 78 | 79 | -d, --depth-count 80 | output allele depth, require "AD" field 81 | -m, --min-depth 82 | ignore alleles with total depth below this threshold, [default: 1] 83 | 84 | EOF 85 | 86 | exit(1); 87 | } 88 | 89 | $|++; 90 | 91 | if ($output) { 92 | open (STDOUT, "> $output") || die $!; 93 | } 94 | 95 | unless(@query_sample_fields > 0) { 96 | push @query_sample_fields, 'GT'; 97 | } 98 | 99 | print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n"; 100 | 101 | print STDERR ">> Start processing $vcf_file ... "; 102 | print STDOUT "##source=$SOURCE $CMDLINE\n"; 103 | processVCF($vcf_file); 104 | print STDERR "done!\n"; 105 | 106 | print STDERR "# " . (scalar localtime()) . "\n"; 107 | 108 | 109 | ######################### Sub ######################### 110 | 111 | sub processVCF 112 | { 113 | my ($in) = @_; 114 | 115 | my @Samples_ids = (); 116 | my $fh = getInputFilehandle($in); 117 | while (<$fh>) 118 | { 119 | if (/#CHROM/) { 120 | my @lines = (split /\s+/); 121 | for (my $i=9; $i <@lines; $i++) 122 | { 123 | push @Samples_ids, $lines[$i]; 124 | } 125 | 126 | my $out_sample_ids = join "\t", @Samples_ids; 127 | my $query_sample_header = join "\;", @query_sample_fields; 128 | 129 | 130 | if ($compact_out) { 131 | $out_sample_ids = "SAMPLES($query_sample_header)"; 132 | } 133 | 134 | my $out_header = ($show_sample_alleles) ? 
"#CHROM\tPOS" : "#CHROM\tPOS\tREF\tALT"; 135 | 136 | 137 | if (@query_info_fields > 0) { 138 | my $out_infos = join "\t", @query_info_fields; 139 | 140 | $out_header .= "\t$out_infos"; 141 | } 142 | 143 | if ($count_allele_depth) { 144 | $out_header .= "\tOverall_Allele_Depth"; 145 | } 146 | 147 | print STDOUT "$out_header\t$out_sample_ids\n"; 148 | } 149 | 150 | next if (/\#/ || /^\s+$/); 151 | 152 | my ($CHROM, $POS, $ID, $REF, $ALT, $QUAL, 153 | $FILTER, $INFO, $FORMAT, @SAMPLES) = (split /\s+/); 154 | 155 | 156 | ## 157 | ## get query overall infos 158 | ## 159 | my @infos = (split /\;/, $INFO); 160 | my %infos = (); 161 | for (my $i=0; $i<@infos; $i++) 162 | { 163 | if($infos[$i] =~ /(\w+)\=(.*)/) { 164 | $infos{$1} = $2; 165 | } 166 | } 167 | 168 | my @out_infos = (); 169 | 170 | for my $info_id (@query_info_fields) 171 | { 172 | if ($infos{$info_id}) { 173 | push @out_infos, $infos{$info_id}; 174 | } 175 | else { 176 | push @out_infos, '-'; 177 | } 178 | } 179 | 180 | 181 | ## 182 | ## get query sample infos 183 | ## 184 | my @vars = ($REF, (split /\,/, $ALT)); 185 | 186 | my @tags = (split /\:/, $FORMAT); 187 | my %tags = (); 188 | for (my $i=0; $i<@tags; $i++) { $tags{$tags[$i]} = $i; } 189 | 190 | my %total_allele_depths = (); 191 | 192 | my @out_samples = (); 193 | for (my $i=0; $i<@SAMPLES; $i++) 194 | { 195 | my @sample_infos = (split /\:/, $SAMPLES[$i]); 196 | 197 | my @out_sample_info = (); 198 | 199 | for (my $j=0; $j<@query_sample_fields; $j++) 200 | { 201 | if ($show_sample_alleles && $query_sample_fields[$j] eq 'GT') { 202 | ## use nucleotides instead of numbers 203 | if ($sample_infos[$tags{$query_sample_fields[$j]}] =~ /(\d)(\/|\|)(\d)/) { 204 | $sample_infos[$tags{$query_sample_fields[$j]}] = $vars[$1] . $2 . $vars[$3]; 205 | } 206 | } 207 | 208 | if (exists($tags{$query_sample_fields[$j]}) && $sample_infos[$tags{$query_sample_fields[$j]}]) { 209 | push @out_sample_info, $sample_infos[$tags{$query_sample_fields[$j]}]; 210 | } 211 | else { 212 | push @out_sample_info, '-'; 213 | } 214 | } 215 | 216 | 217 | ## count allele depths 218 | if ($count_allele_depth) { 219 | if ($tags{AD} && ($sample_infos[$tags{AD}]) && ($sample_infos[$tags{AD}] ne '.')) { 220 | my @depths = split /\,/, $sample_infos[$tags{AD}]; 221 | 222 | for (my $k=0; $k<@depths; $k++) 223 | { 224 | $total_allele_depths{$vars[$k]} += $depths[$k]; 225 | } 226 | } 227 | } 228 | 229 | unless(@out_sample_info > 0) { 230 | print STDERR Dumper(@out_sample_info);exit; 231 | } 232 | 233 | my $out_sample_info = join ':', @out_sample_info; 234 | 235 | if ($compact_out) { 236 | push @out_samples, "$Samples_ids[$i]($out_sample_info)"; 237 | } 238 | else { 239 | push @out_samples, $out_sample_info; 240 | } 241 | } 242 | 243 | my $out_samples = join "\t", @out_samples; 244 | 245 | if ($compact_out) { 246 | $out_samples = join ";", @out_samples; 247 | } 248 | 249 | 250 | 251 | ## 252 | ## count overall allele depth 253 | ## 254 | my @allele_depth_info = (); 255 | 256 | if ($count_allele_depth) { 257 | for my $nt (@vars) 258 | { 259 | if ($total_allele_depths{$nt} && $total_allele_depths{$nt} >= $min_out_depth) { 260 | push @allele_depth_info, "[$nt]:$total_allele_depths{$nt}"; 261 | } 262 | } 263 | } 264 | 265 | my $allele_depth_info = (@allele_depth_info > 0) ? (join ',', @allele_depth_info) : '-'; 266 | 267 | my $out_line = $show_sample_alleles ? 
"$CHROM\t$POS" : "$CHROM\t$POS\t$REF\t$ALT"; 268 | 269 | if (@query_info_fields > 0) { 270 | my $out_infos = join "\t", @out_infos; 271 | 272 | $out_line .= "\t$out_infos"; 273 | } 274 | 275 | if ($count_allele_depth) { 276 | $out_line .= "\t$allele_depth_info"; 277 | } 278 | 279 | print STDOUT "$out_line\t$out_samples\n"; 280 | } 281 | } 282 | --------------------------------------------------------------------------------