├── .gitignore
├── LICENSE
├── README.md
├── biology
    └── bio_plasmid_get_insert.pl
├── blast
    ├── auto_blast
    ├── auto_makeblastdb
    ├── blast_best_hit.py
    ├── blast_best_hit_outfmt6.py
    └── fasta_rename_head_before_blast.pl
├── enzyme
    ├── embossre.enz
    ├── enzs.list
    ├── restrict_check_digested_sequence_number.pl
    ├── restrict_choose_enzyme_for_identify_genomes.pl
    ├── restrict_with_T_tail.pl
    ├── restrict_with_far_away_digest_site.pl
    └── restrict_without_digest_site_in_sequences.pl
├── file_formats
    ├── add_annotations_to_myva.pl
    ├── bam2gff.py
    ├── extract_cds_from_glimmer_predict_result.pl
    ├── extract_features_from_genbank_file.py
    ├── extract_sequence_from_genbank_file.pl
    ├── genbank_filter.py
    ├── gff2fa.py
    ├── gff_frame_start_coverage.plot.R
    ├── gff_frame_start_coverage.py
    └── gff_intersect.py
├── for_education
    ├── Parsing grouped data in multi-line.pl
    ├── extract_cds_by_gff.pl
    ├── fasta_common_seqs2.pl
    ├── fasta_extract_sequence_by_id_file.pl
    ├── join_table.pl
    └── simple_statistics.pl
├── not_used
    ├── csv2tab
    ├── csv_join
    ├── csv_join_paired_lines.py
    ├── csv_split_paired_lines.py
    ├── fasta_seq_gc_content_plot.py
    └── fasta_seq_length_plot.py
├── plot
    ├── README.md
    ├── example
    │   ├── data.tsv
    │   ├── data.tsv.dist.png
    │   ├── data.txt.png
    │   ├── heatmap.png
    │   └── plot_barplot.png
    ├── plot_distribution.R
    └── plot_distribution.py
├── protein
    └── protein_batch_compute_pI.pl
├── sequence
    ├── README.md
    ├── fasta2tab
    ├── fasta_common_seqs.pl
    ├── fasta_extract_by_pattern.pl
    ├── fasta_extract_randomly.pl
    ├── fasta_gc_skew.plot.R
    ├── fasta_gc_skew.py
    ├── fasta_locate_motif.pl
    ├── fasta_remove_duplicates.pl
    ├── fasta_rename_duplicated_names.pl
    ├── fasta_reset_start_position_for_circular_genome.pl
    ├── fasta_sliding_window.pl
    ├── fasta_trim_aligned_fasta.pl
    ├── fastq2tab
    ├── fastq_extract_paired_reads.pl
    ├── fastx_mapping_with_bwa.pl
    ├── fastx_pwm.py
    ├── fastx_tm.py
    ├── fastx_translate.py
    ├── run_clustalo.pl
    ├── sample
    │   ├── gc_skew.png
    │   ├── seq.fa
    │   └── seq.fq.gz
    ├── seqcomp
    ├── seqrc
    ├── seqrev
    ├── tab2fasta
    └── tab2fastq
├── taxon
    └── taxon_fetch.py
└── util
    └── unzipGBK


/.gitignore:
--------------------------------------------------------------------------------
 1 | .directory
 2 | /blib/
 3 | /.build/
 4 | _build/
 5 | cover_db/
 6 | inc/
 7 | Build
 8 | !Build/
 9 | Build.bat
10 | .last_cover_stats
11 | /Makefile
12 | /Makefile.old
13 | /MANIFEST.bak
14 | /META.yml
15 | /META.json
16 | /MYMETA.*
17 | nytprof.out
18 | /pm_to_blib
19 | *.o
20 | *.bs
21 | /_eumm/ 
22 | .directory
23 | *.idea
24 | 
25 | 
26 | # Byte-compiled / optimized / DLL files
27 | __pycache__/
28 | *.py[cod]
29 | 
30 | # C extensions
31 | *.so
32 | 
33 | # Distribution / packaging
34 | .Python
35 | env/
36 | build/
37 | develop-eggs/
38 | dist/
39 | downloads/
40 | eggs/
41 | .eggs/
42 | lib/
43 | lib64/
44 | parts/
45 | sdist/
46 | var/
47 | *.egg-info/
48 | .installed.cfg
49 | *.egg
50 | 
51 | # PyInstaller
52 | #  Usually these files are written by a python script from a template
53 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
54 | *.manifest
55 | *.spec
56 | 
57 | # Installer logs
58 | pip-log.txt
59 | pip-delete-this-directory.txt
60 | 
61 | # Unit test / coverage reports
62 | htmlcov/
63 | .tox/
64 | .coverage
65 | .cache
66 | nosetests.xml
67 | coverage.xml
68 | 
69 | # Translations
70 | *.mo
71 | *.pot
72 | 
73 | # Django stuff:
74 | *.log
75 | 
76 | # Sphinx documentation
77 | docs/_build/
78 | 
79 | # PyBuilder
80 | target/
81 | 
82 | # History files
83 | .Rhistory
84 | .Rapp.history
85 | 
86 | # Example code in package build process
87 | *-Ex.R
88 | 
89 | # RStudio files
90 | .Rproj.user/
91 | 
92 | # produced vignettes
93 | vignettes/*.html
94 | vignettes/*.pdf
95 | 
96 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
97 | .httr-oauth


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013 Wei Shen (shenwei356@gmail.com)
2 | 
3 | The MIT License
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6 | 
7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8 | 
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Bio_scripts
 2 | ========
 3 | 
 4 | Practical, reusable scripts for bioinformatics.
 5 | 
 6 |     sequence        tools for FASTA/Q files: fasta2tab, tab2fasta,
 7 |                     fasta_extract_by_pattern, fasta_common_seqs,
 8 |                     fasta_locate_motif, fasta_remove_duplicates,
 9 |                     fastx_translate, fasta_gc_skew.plot ...
10 |     util            moved to https://github.com/shenwei356/datakit
11 |     plot            plot scripts: heatmap ...
12 | 
13 |     blast           NCBI BLAST+ wrappers
14 |     file_formats    genbank->gtf, bam2gff, gff2fa ...
15 |     enzyme          analysis of restriction enzymes
16 |     protein         Batch compute pI of protein
17 | 
18 |     biology         get insert from sanger sequencing result
19 | 
20 |     for_education   scripts with detailed comments, for education
21 |     not_used        older version of some scripts
22 | 
23 | 
24 | See README in subdirectories.
25 | 
26 | -------
27 | 
28 | Copyright (c) 2014-2015, Wei Shen (shenwei356@gmail.com)
29 | 
30 | 
31 | [MIT License](https://github.com/shenwei356/bio_scripts/blob/master/LICENSE)
32 | 


--------------------------------------------------------------------------------
/biology/bio_plasmid_get_insert.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | # Print the text found between two fixed flanking strings in a pair of sequence files.
 3 | use strict;
 4 | use File::Basename;
 5 | use BioUtil::Seq;
 6 | 
 7 | # M13
 8 | my $prefix = "AGCGGCCGCGAATTGCCCTT";    # must occur before the captured text
 9 | my $suffix = "AAGGGCAATTCGTTTAAACCT";    # must occur after the captured text
10 | # Show only the script's file name in the usage message.
11 | $0 = basename $0;
12 | my $usage = <<USAGE;
13 | 
14 | usage: $0 <forward seq> <reverse seq>
15 | 
16 | USAGE
17 | 
18 | die $usage unless @ARGV == 2;
19 | # Read the single sequence of each file; the second one is passed through revcom().
20 | my $seqf = get_the_one_seq( shift @ARGV );
21 | my $seqr = revcom (get_the_one_seq( shift @ARGV ) );
22 | # Extract the text between $prefix and $suffix from both sequences.
23 | my $sf =  extract_insert( $prefix, $suffix, $seqf );
24 | my $sr =  extract_insert( $prefix, $suffix, $seqr );
25 | # Both extractions must be identical; otherwise print both and abort.
26 | if ( $sf ne $sr ) {
27 |     print "forward: $sf\nreverse: $sr\n";
28 |     die "forward and reverse sequences are not equal!";
29 | }
30 | 
31 | print $sf, "\n";
32 | 
33 | # extract_insert( $prefix, $suffix, $seq ): return what the regex captures between
34 | # $prefix and $suffix (both interpolated as patterns, greedy match); die if no match.
35 | sub extract_insert {
36 |     my ( $prefix, $suffix, $seq ) = @_;
37 |     die "prefix and suffix do not match sequence!\n"
38 |         unless $seq =~ /$prefix(.+)$suffix/;
39 |     return $1;
40 | }
41 | # get_the_one_seq( $file ): return the only value of the hash reference produced by
42 | sub get_the_one_seq {    # read_sequence_from_fasta_file(); die unless it has one key.
43 |     my ($file) = @_;
44 |     my $seqs = read_sequence_from_fasta_file($file);
45 |     die "only one sequence should be in $file. Please check it.\n"
46 |         unless keys %$seqs == 1;
47 |     return ( values %$seqs )[0];
48 | }
49 | 


--------------------------------------------------------------------------------
/blast/auto_blast:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | # https://github.com/shenwei356/bio_scripts
 3 | #
 4 | # Runs a BLAST+ program once into an archive file (-outfmt 11, "<out>.asn"),
 5 | # then runs blast_formatter on that archive for every entry of -outfmt and
 6 | # writes each result to "<out>.outfmt<N>".
 7 | 
 8 | use strict;
 9 | use File::Basename;
10 | use BioUtil::Seq;
11 | use BioUtil::Util;
12 | 
13 | # Show only the script's file name in the usage message.
14 | $0 = basename $0;
15 | 
16 | # qq() instead of q() so that $0 is interpolated into the text.
17 | my $usage = qq(
18 | auto_blast -- just for typing fewer words..^_^
19 | 
20 | Usage: $0 [arguments]
21 | Options:   
22 |     -program             [blastn] 
23 |     -query
24 |     -db               
25 |     -outfmt              [11 0]
26 | 
27 |     -h                   Show this help information
28 | 
29 | Examples:
30 | 
31 |     auto_blast -query test.fa -db nt -outfmt 6
32 | 
33 | https://github.com/shenwei356/bio_scripts
34 | 
35 | );
36 | 
37 | my $opts    = {};
38 | # Default thread count: number of "processor" lines in /proc/cpuinfo.
39 | my $threads = `cat /proc/cpuinfo | grep processor |wc -l`;
40 | $threads =~ s/\n//g;
41 | my $outfmt7shenwei356
42 |     = '7 qseqid sseqid qlen slen length pident ppos qcovs qcovhsp'
43 |     . ' mismatch gapopen gaps sstrand qstart qend sstart send'
44 |     . ' evalue bitscore staxids salltitles';
45 | $$opts{-outfmt} = [ 0, $outfmt7shenwei356 ];
46 | 
47 | getopt( $opts, \@ARGV );
48 | 
49 | $$opts{-program}     = 'blastn' unless exists $$opts{-program};
50 | $$opts{-num_threads} = $threads unless exists $$opts{-num_threads};
51 | 
52 | # print "$_: $$opts{$_}\n" for sort keys %$opts;
53 | 
54 | die $usage unless exists $$opts{-query} and exists $$opts{-db};
55 | die "file not exists: $$opts{-query}\n" unless -e $$opts{-query};
56 | 
57 | # Default output prefix "<query>.<program>@<db basename>", query escaped for the shell.
58 | my $info = sprintf "%s.%s@%s", quotemeta $$opts{-query}, $$opts{-program},
59 |     basename( $$opts{-db} );
60 | $$opts{-out} = $info unless exists $$opts{-out};
61 | 
62 | my $cmd = "";
63 | 
64 | # Pass every option through to the BLAST program except those handled below.
65 | $cmd = $$opts{-program};
66 | for ( sort keys %$opts ) {
67 |     next
68 |         if $_ eq '-program'
69 |         or $_ eq '-outfmt'
70 |         or $_ eq '-out'
71 |         or $_ eq '-query';
72 | 
73 |     if ( ref $$opts{$_} eq ref [] ) {
74 |         # Use "." here: with a comma the joined values were never appended to $cmd.
75 |         $cmd .= " $_ " . join( " ", @{ $$opts{$_} } );
76 |     }
77 |     else {
78 |         $cmd .= " $_ $$opts{$_}";
79 |     }
80 | }
81 | my $file_outfmt11 = "$$opts{-out}.asn";
82 | $cmd .= sprintf " -query %s -outfmt 11 -out %s",
83 |     quotemeta $$opts{-query}, $file_outfmt11;
84 | 
85 | print STDERR "$cmd\n";
86 | my $fail = run($cmd);
87 | die "failed to run:$cmd\n" if $fail;
88 | 
89 | # Convert the archive once per -outfmt entry; the file suffix is the entry's first word.
90 | for ( @{ $$opts{-outfmt} } ) {
91 |     s/^\s+//g;
92 |     my $outfmt = ( split( /\s+/, $_ ) )[0];
93 |     $cmd
94 |         = "blast_formatter -archive $file_outfmt11 -outfmt \"$_\" > $$opts{-out}.outfmt$outfmt";
95 |     print STDERR "$cmd\n";
96 |     my $fail = run($cmd);
97 |     die "failed to run:$cmd\n" if $fail;
98 | }
99 | 


--------------------------------------------------------------------------------
/blast/auto_makeblastdb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | # https://github.com/shenwei356/bio_scripts
 3 | #
 4 | # Wrapper around makeblastdb; with -mask it first runs windowmasker twice and
 5 | # hands the resulting mask file to makeblastdb via -mask_data.
 6 | 
 7 | use strict;
 8 | use File::Basename;
 9 | use BioUtil::Seq;
10 | use BioUtil::Util;
11 | 
12 | # Show only the script's file name in the usage message.
13 | $0 = basename $0;
14 | 
15 | # qq() instead of q() so that $0 is interpolated into the text.
16 | my $usage = qq(
17 | auto_makeblastdb -- just for typing fewer words..^_^
18 | 
19 | Usage: $0 [arguments]
20 | Options:    
21 |     -in                  
22 |     -input_type          [fasta]
23 | 
24 |     -dbtype              [nucl]
25 |     -out
26 |     -title               
27 | 
28 |     -parse_seqids        [false]
29 | 
30 |     -mask                [false]
31 | 
32 |     -h                   Show this help information
33 | 
34 | Examples:
35 | 
36 |     auto_makeblastdb -in ab.fa -out ab
37 | 
38 | https://github.com/shenwei356/bio_scripts
39 | 
40 | );
41 | 
42 | my $opts = {};
43 | 
44 | getopt( $opts, \@ARGV );
45 | 
46 | die $usage unless exists $$opts{-in} and exists $$opts{-out};
47 | die "file not exists: $$opts{-in}\n" unless -e $$opts{-in};
48 | 
49 | $$opts{-title} = $$opts{-out} unless exists $$opts{-title};
50 | $$opts{-dbtype} = 'nucl' unless exists $$opts{-dbtype};
51 | $$opts{-input_type} = 'fasta' unless exists $$opts{-input_type};
52 | 
53 | my $cmd = '';
54 | my ( $file_mask_asnb, $file_mask_counts ) = (undef) x 2;
55 | if ( $$opts{-mask} ) {
56 |     # Pass 1: windowmasker -mk_counts writes "<out>_mask.counts".
57 |     print STDERR "Create masking information using windowmask...\n";
58 |     $file_mask_counts = "$$opts{-out}_mask.counts";
59 |     $file_mask_asnb   = "$$opts{-out}_mask.asnb";
60 |     $cmd
61 |         = sprintf
62 |         "windowmasker -in %s -infmt %s -mk_counts -out %s",
63 |         quotemeta $$opts{-in}, $$opts{-input_type}, $file_mask_counts;
64 |     $cmd .= " -parse_seqids" if $$opts{-parse_seqids};
65 |     my $fail = run($cmd);
66 |     die "failed to run:$cmd\n" if $fail;
67 | 
68 |     # Pass 2: windowmasker -ustat reads the counts and writes "<out>_mask.asnb".
69 |     # This step is still windowmasker, so the message no longer says "Makeblastdb".
70 |     print STDERR "Create mask data file using windowmasker...\n";
71 |     $cmd
72 |         = sprintf
73 |         "windowmasker -in %s -infmt %s -ustat %s -outfmt %s -out %s",
74 |         quotemeta $$opts{-in}, $$opts{-input_type}, $file_mask_counts,
75 |         "maskinfo_asn1_bin", $file_mask_asnb;
76 |     $cmd .= " -parse_seqids" if $$opts{-parse_seqids};
77 | 
78 |     # Reuse $fail; a second "my" in the same scope would mask the first one.
79 |     $fail = run($cmd);
80 |     die "failed to run:$cmd\n" if $fail;
81 | }
82 | 
83 | $cmd
84 |     = sprintf
85 |     "makeblastdb -in %s -input_type %s -dbtype %s -out %s -title %s",
86 |     quotemeta $$opts{-in}, $$opts{-input_type}, $$opts{-dbtype},
87 |     $$opts{-out}, $$opts{-title};
88 | $cmd .= " -parse_seqids" if $$opts{-parse_seqids};
89 | 
90 | if ( $$opts{-mask} ) {
91 |     $cmd .= " -mask_data $file_mask_asnb";
92 | }
93 | 
94 | print STDERR "$cmd\n";
95 | my $fail = run($cmd);
96 | die "failed to run:$cmd\n" if $fail;
97 | 


--------------------------------------------------------------------------------
/blast/blast_best_hit.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import os
 5 | import sys
 6 | 
 7 | if len(sys.argv) != 2:
 8 |     print "\nUsage: %s  <alignment file>\n" % os.path.basename(sys.argv[0])
 9 |     sys.exit(1)
10 | 
11 | blast          = sys.argv[1]
12 | 
13 | with open(blast, 'r') as fp:
14 |     init = ""
15 |     for line in fp:
16 |         if not line.startswith("#"):
17 |             item         = line.strip().split("\t")
18 |             if init != item[0]:
19 |                 print line.strip()
20 |                 init = item[0]
21 | 


--------------------------------------------------------------------------------
/blast/blast_best_hit_outfmt6.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | # from https://github.com/jameslz/benchmark
 4 | import sys
 5 | 
 6 | if len(sys.argv) != 4:
 7 |     print "\nUsage: %s  <alignment> <evalue> <bit_score>\n" % sys.argv[0]
 8 |     sys.exit(1)
 9 | 
10 | blast          = sys.argv[1]
11 | max_evalue     = float(sys.argv[2])
12 | min_bit_score  = float(sys.argv[3])
13 | 
14 | with open(blast, 'r') as fp:
15 |     init = ""
16 |     for line in fp:
17 |         if not line.startswith("#"):
18 |             item         = line.strip().split("\t")
19 |             evalue       = float(item[10])
20 |             bit_score    = float(item[11])
21 |             if init != item[0]:
22 |                 if evalue <= max_evalue and bit_score >= min_bit_score:
23 |                     print line.strip()
24 |                 init = item[0]
25 | 


--------------------------------------------------------------------------------
/blast/fasta_rename_head_before_blast.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | # Function: Replace illegal characters in the header lines of FASTA files before BLAST.
 4 | # Author  : Wei Shen <shenwei356#gmail.com> http://shenwei.me
 5 | # Date    : 2014-08-14
 6 | 
 7 | use strict;
 8 | use BioUtil::Seq;
 9 | 
10 | die "\nUsage: $0 fasta_file [fasta_file ...]\n\n"
11 |     unless @ARGV > 0;
12 | # NOTE(review): the class below lists a-z but not A-Z; confirm rename_fasta_header matches case-insensitively.
13 | while (@ARGV) {
14 |     my $file = shift @ARGV;
15 |     my $n    = rename_fasta_header( '[^a-z\d\s\-\_\(\)\[\]\|]', '_', $file,
16 |         "$file.rename.fa" );    # presumably the output path -- TODO confirm in BioUtil::Seq
17 |     print "$file: $n records renamed\n";
18 | }
19 | 


--------------------------------------------------------------------------------
/enzyme/embossre.enz:
--------------------------------------------------------------------------------
  1 | #  
  2 | # REBASE version 408                                              emboss_e.408
  3 | #  
  4 | #     =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  5 | #     REBASE, The Restriction Enzyme Database   http://rebase.neb.com
  6 | #     Copyright (c)  Dr. Richard J. Roberts, 2014.   All rights reserved.
  7 | #     =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
  8 | #  
  9 | # Rich Roberts                                                    Jul 30 2014
 10 | #  
 11 | # REBASE enzyme patterns for EMBOSS (embossre.enz)
 12 | #
 13 | # Format:
 14 | # 
 15 | # name<ws>pattern<ws>len<ws>ncuts<ws>blunt<ws>c1<ws>c2<ws>c3<ws>c4
 16 | #
 17 | # Where:
 18 | # name = name of enzyme
 19 | # pattern = recognition site
 20 | # len = length of pattern
 21 | # ncuts = number of cuts made by enzyme
 22 | #         Zero represents unknown
 23 | # blunt = true if blunt end cut, false if sticky
 24 | # c1 = First 5' cut
 25 | # c2 = First 3' cut
 26 | # c3 = Second 5' cut
 27 | # c4 = Second 3' cut
 28 | #
 29 | # Examples:
 30 | # AAC^TGG -> 6 2 1 3 3 0 0
 31 | # A^ACTGG -> 6 2 0 1 5 0 0
 32 | # AACTGG  -> 6 0 0 0 0 0 0
 33 | # AACTGG(-5/-1) -> 6 2 0 1 5 0 0
 34 | # (8/13)GACNNNNNNTCA(12/7) -> 12 4 0 -9 -14 24 19
 35 | #
 36 | # i.e. cuts are always to the right of the given
 37 | # residue and sequences are always with reference to
 38 | # the 5' strand.
 39 | # Sequences are numbered ... -3 -2 -1 1 2 3 ... with
 40 | # the first residue of the pattern at base number 1.
 41 | #
 42 | #
 43 | AanI	TTATAA	6	2	1	3	3	0	0
 44 | AarI	CACCTGC	7	2	0	11	15	0	0
 45 | AasI	GACNNNNNNGTC	12	2	0	7	5	0	0
 46 | AatII	GACGTC	6	2	0	5	1	0	0
 47 | AbaCIII	ctatcav	7	0	0	0	0	0	0
 48 | AbaSI	C	1	2	0	12	10	0	0
 49 | AbsI	CCTCGAGG	8	2	0	2	6	0	0
 50 | AccI	GTMKAC	6	2	0	2	4	0	0
 51 | AccII	CGCG	4	2	1	2	2	0	0
 52 | AccIII	TCCGGA	6	2	0	1	5	0	0
 53 | Acc16I	TGCGCA	6	2	1	3	3	0	0
 54 | Acc36I	ACCTGC	6	2	0	10	14	0	0
 55 | Acc65I	GGTACC	6	2	0	1	5	0	0
 56 | AccB1I	GGYRCC	6	2	0	1	5	0	0
 57 | AccB7I	CCANNNNNTGG	11	2	0	7	4	0	0
 58 | AccBSI	CCGCTC	6	2	1	3	3	0	0
 59 | AceIII	cagctc	6	2	0	13	17	0	0
 60 | AciI	CCGC	4	2	0	1	3	0	0
 61 | AclI	AACGTT	6	2	0	2	4	0	0
 62 | AclWI	GGATC	5	2	0	9	10	0	0
 63 | AcoI	YGGCCR	6	2	0	1	5	0	0
 64 | AcsI	RAATTY	6	2	0	1	5	0	0
 65 | AcuI	CTGAAG	6	2	0	22	20	0	0
 66 | AcvI	CACGTG	6	2	1	3	3	0	0
 67 | AcyI	GRCGYC	6	2	0	2	4	0	0
 68 | AdeI	CACNNNGTG	9	2	0	6	3	0	0
 69 | AfaI	GTAC	4	2	1	2	2	0	0
 70 | AfeI	AGCGCT	6	2	1	3	3	0	0
 71 | AfiI	CCNNNNNNNGG	11	2	0	7	4	0	0
 72 | AflII	CTTAAG	6	2	0	1	5	0	0
 73 | AflIII	ACRYGT	6	2	0	1	5	0	0
 74 | AgeI	ACCGGT	6	2	0	1	5	0	0
 75 | AgsI	TTSAA	5	2	0	3	2	0	0
 76 | AhaIII	tttaaa	6	2	1	3	3	0	0
 77 | AhdI	GACNNNNNGTC	11	2	0	6	5	0	0
 78 | AhlI	ACTAGT	6	2	0	1	5	0	0
 79 | AjiI	CACGTC	6	2	1	3	3	0	0
 80 | AjnI	CCWGG	5	2	0	-1	5	0	0
 81 | AjuI	GAANNNNNNNTTGG	14	4	0	-8	-13	25	20
 82 | AleI	CACNNNNGTG	10	2	1	5	5	0	0
 83 | AlfI	GCANNNNNNTGC	12	4	0	-11	-13	24	22
 84 | AloI	GAACNNNNNNTCC	13	4	0	-8	-13	25	20
 85 | AluI	AGCT	4	2	1	2	2	0	0
 86 | AluBI	AGCT	4	2	1	2	2	0	0
 87 | AlwI	GGATC	5	2	0	9	10	0	0
 88 | Alw21I	GWGCWC	6	2	0	5	1	0	0
 89 | Alw26I	GTCTC	5	2	0	6	10	0	0
 90 | Alw44I	GTGCAC	6	2	0	1	5	0	0
 91 | AlwFI	gaaaynnnnnrtg	13	0	0	0	0	0	0
 92 | AlwNI	CAGNNNCTG	9	2	0	6	3	0	0
 93 | Ama87I	CYCGRG	6	2	0	1	5	0	0
 94 | Aor13HI	TCCGGA	6	2	0	1	5	0	0
 95 | Aor51HI	AGCGCT	6	2	1	3	3	0	0
 96 | AoxI	ggcc	4	2	0	-1	4	0	0
 97 | ApaI	GGGCCC	6	2	0	5	1	0	0
 98 | ApaBI	gcannnnntgc	11	2	0	8	3	0	0
 99 | ApaLI	GTGCAC	6	2	0	1	5	0	0
100 | ApeKI	GCWGC	5	2	0	1	4	0	0
101 | ApoI	RAATTY	6	2	0	1	5	0	0
102 | ApyPI	atcgac	6	2	0	26	24	0	0
103 | AquII	gccgnac	7	2	0	27	25	0	0
104 | AquIII	gaggag	6	2	0	26	24	0	0
105 | AquIV	grggaag	7	2	0	26	24	0	0
106 | ArsI	GACNNNNNNTTYG	13	4	0	-9	-14	24	19
107 | AscI	GGCGCGCC	8	2	0	2	6	0	0
108 | AseI	ATTAAT	6	2	0	2	4	0	0
109 | Asi256I	gatc	4	2	0	1	3	0	0
110 | AsiGI	ACCGGT	6	2	0	1	5	0	0
111 | AsiSI	GCGATCGC	8	2	0	5	3	0	0
112 | Asp700I	GAANNNNTTC	10	2	1	5	5	0	0
113 | Asp718I	GGTACC	6	2	0	1	5	0	0
114 | AspA2I	CCTAGG	6	2	0	1	5	0	0
115 | AspBHI	yscns	5	2	0	13	17	0	0
116 | AspLEI	GCGC	4	2	0	3	1	0	0
117 | AspS9I	GGNCC	5	2	0	1	4	0	0
118 | AssI	AGTACT	6	2	1	3	3	0	0
119 | AsuI	ggncc	5	2	0	1	4	0	0
120 | AsuII	TTCGAA	6	2	0	2	4	0	0
121 | AsuC2I	CCSGG	5	2	0	2	3	0	0
122 | AsuHPI	GGTGA	5	2	0	13	12	0	0
123 | AsuNHI	GCTAGC	6	2	0	1	5	0	0
124 | AvaI	CYCGRG	6	2	0	1	5	0	0
125 | AvaII	GGWCC	5	2	0	1	4	0	0
126 | AvaIII	atgcat	6	0	0	0	0	0	0
127 | AvrII	CCTAGG	6	2	0	1	5	0	0
128 | AxyI	CCTNAGG	7	2	0	2	5	0	0
129 | BaeI	ACNNNNGTAYC	11	4	0	-11	-16	23	18
130 | BaeGI	GKGCMC	6	2	0	5	1	0	0
131 | BalI	TGGCCA	6	2	1	3	3	0	0
132 | BamHI	GGATCC	6	2	0	1	5	0	0
133 | BanI	GGYRCC	6	2	0	1	5	0	0
134 | BanII	GRGCYC	6	2	0	5	1	0	0
135 | BanLI	rtcagg	6	0	0	0	0	0	0
136 | BarI	GAAGNNNNNNTAC	13	4	0	-8	-13	25	20
137 | BasI	CCANNNNNTGG	11	2	0	7	4	0	0
138 | BauI	CACGAG	6	2	0	1	5	0	0
139 | Bbr7I	gaagac	6	2	0	13	17	0	0
140 | BbrPI	CACGTG	6	2	1	3	3	0	0
141 | BbsI	GAAGAC	6	2	0	8	12	0	0
142 | BbvI	GCAGC	5	2	0	13	17	0	0
143 | BbvII	gaagac	6	2	0	8	12	0	0
144 | Bbv12I	GWGCWC	6	2	0	5	1	0	0
145 | BbvCI	CCTCAGC	7	2	0	2	5	0	0
146 | BccI	CCATC	5	2	0	9	10	0	0
147 | Bce83I	cttgag	6	2	0	22	20	0	0
148 | BceAI	ACGGC	5	2	0	17	19	0	0
149 | R1.BceSIV	gcagc	5	4	0	-8	-6	14	16
150 | BcefI	acggc	5	2	0	17	18	0	0
151 | BcgI	CGANNNNNNTGC	12	4	0	-11	-13	24	22
152 | BciT130I	CCWGG	5	2	0	2	3	0	0
153 | BciVI	GTATCC	6	2	0	12	11	0	0
154 | BclI	TGATCA	6	2	0	1	5	0	0
155 | BcnI	CCSGG	5	2	0	2	3	0	0
156 | BcoDI	GTCTC	5	2	0	6	10	0	0
157 | BcuI	ACTAGT	6	2	0	1	5	0	0
158 | BdaI	tgannnnnntca	12	4	0	-11	-13	24	22
159 | BetI	wccggw	6	2	0	1	5	0	0
160 | BfaI	CTAG	4	2	0	1	3	0	0
161 | BfaSII	ganggag	7	0	0	0	0	0	0
162 | BfiI	actggg	6	2	0	11	10	0	0
163 | BfmI	CTRYAG	6	2	0	1	5	0	0
164 | BfoI	RGCGCY	6	2	0	5	1	0	0
165 | BfrI	CTTAAG	6	2	0	1	5	0	0
166 | BfuI	GTATCC	6	2	0	12	11	0	0
167 | BfuAI	ACCTGC	6	2	0	10	14	0	0
168 | BfuCI	GATC	4	2	0	-1	4	0	0
169 | BglI	GCCNNNNNGGC	11	2	0	7	4	0	0
170 | BglII	AGATCT	6	2	0	1	5	0	0
171 | BinI	ggatc	5	2	0	9	10	0	0
172 | BisI	GCNGC	5	2	0	2	3	0	0
173 | BlnI	CCTAGG	6	2	0	1	5	0	0
174 | BlpI	GCTNAGC	7	2	0	2	5	0	0
175 | BlsI	GCNGC	5	2	0	3	2	0	0
176 | BmcAI	AGTACT	6	2	1	3	3	0	0
177 | Bme18I	GGWCC	5	2	0	1	4	0	0
178 | Bme1390I	CCNGG	5	2	0	2	3	0	0
179 | BmeDI	c	1	2	0	3	1	0	0
180 | BmeRI	GACNNNNNGTC	11	2	0	6	5	0	0
181 | BmeT110I	CYCGRG	6	2	0	1	5	0	0
182 | BmgI	gkgccc	6	0	0	0	0	0	0
183 | BmgBI	CACGTC	6	2	1	3	3	0	0
184 | BmgT120I	GGNCC	5	2	0	1	4	0	0
185 | BmiI	GGNNCC	6	2	1	3	3	0	0
186 | BmrI	ACTGGG	6	2	0	11	10	0	0
187 | BmrFI	CCNGG	5	2	0	2	3	0	0
188 | BmsI	GCATC	5	2	0	10	14	0	0
189 | BmtI	GCTAGC	6	2	0	5	1	0	0
190 | BmuI	ACTGGG	6	2	0	11	10	0	0
191 | BoxI	GACNNNNGTC	10	2	1	5	5	0	0
192 | BpiI	GAAGAC	6	2	0	8	12	0	0
193 | BplI	GAGNNNNNCTC	11	4	0	-9	-14	24	19
194 | BpmI	CTGGAG	6	2	0	22	20	0	0
195 | Bpu10I	CCTNAGC	7	2	0	2	5	0	0
196 | Bpu14I	TTCGAA	6	2	0	2	4	0	0
197 | Bpu1102I	GCTNAGC	7	2	0	2	5	0	0
198 | BpuEI	CTTGAG	6	2	0	22	20	0	0
199 | BpuMI	CCSGG	5	2	0	2	3	0	0
200 | BpvUI	CGATCG	6	2	0	4	2	0	0
201 | BsaI	GGTCTC	6	2	0	7	11	0	0
202 | Bsa29I	ATCGAT	6	2	0	2	4	0	0
203 | BsaAI	YACGTR	6	2	1	3	3	0	0
204 | BsaBI	GATNNNNATC	10	2	1	5	5	0	0
205 | BsaHI	GRCGYC	6	2	0	2	4	0	0
206 | BsaJI	CCNNGG	6	2	0	1	5	0	0
207 | BsaWI	WCCGGW	6	2	0	1	5	0	0
208 | BsaXI	ACNNNNNCTCC	11	4	0	-10	-13	21	18
209 | BsbI	caacac	6	2	0	27	25	0	0
210 | Bsc4I	CCNNNNNNNGG	11	2	0	7	4	0	0
211 | BscAI	gcatc	5	2	0	9	11	0	0
212 | BscGI	cccgt	5	0	0	0	0	0	0
213 | Bse1I	ACTGG	5	2	0	6	4	0	0
214 | Bse8I	GATNNNNATC	10	2	1	5	5	0	0
215 | Bse21I	CCTNAGG	7	2	0	2	5	0	0
216 | Bse118I	RCCGGY	6	2	0	1	5	0	0
217 | BseAI	TCCGGA	6	2	0	1	5	0	0
218 | BseBI	CCWGG	5	2	0	2	3	0	0
219 | BseCI	ATCGAT	6	2	0	2	4	0	0
220 | BseDI	CCNNGG	6	2	0	1	5	0	0
221 | Bse3DI	GCAATG	6	2	0	8	6	0	0
222 | BseGI	GGATG	5	2	0	7	5	0	0
223 | BseJI	GATNNNNATC	10	2	1	5	5	0	0
224 | BseLI	CCNNNNNNNGG	11	2	0	7	4	0	0
225 | BseMI	GCAATG	6	2	0	8	6	0	0
226 | BseMII	CTCAG	5	2	0	15	13	0	0
227 | BseNI	ACTGG	5	2	0	6	4	0	0
228 | BsePI	GCGCGC	6	2	0	1	5	0	0
229 | BseRI	GAGGAG	6	2	0	16	14	0	0
230 | BseSI	GKGCMC	6	2	0	5	1	0	0
231 | BseXI	GCAGC	5	2	0	13	17	0	0
232 | BseX3I	CGGCCG	6	2	0	1	5	0	0
233 | BseYI	CCCAGC	6	2	0	1	5	0	0
234 | BsgI	GTGCAG	6	2	0	22	20	0	0
235 | Bsh1236I	CGCG	4	2	1	2	2	0	0
236 | Bsh1285I	CGRYCG	6	2	0	4	2	0	0
237 | BshFI	GGCC	4	2	1	2	2	0	0
238 | BshNI	GGYRCC	6	2	0	1	5	0	0
239 | BshTI	ACCGGT	6	2	0	1	5	0	0
240 | BshVI	ATCGAT	6	2	0	2	4	0	0
241 | BsiI	cacgag	6	2	0	1	5	0	0
242 | BsiEI	CGRYCG	6	2	0	4	2	0	0
243 | BsiHKAI	GWGCWC	6	2	0	5	1	0	0
244 | BsiHKCI	CYCGRG	6	2	0	1	5	0	0
245 | BsiSI	CCGG	4	2	0	1	3	0	0
246 | BsiWI	CGTACG	6	2	0	1	5	0	0
247 | BsiYI	ccnnnnnnngg	11	2	0	7	4	0	0
248 | BslI	CCNNNNNNNGG	11	2	0	7	4	0	0
249 | BslFI	GGGAC	5	2	0	15	19	0	0
250 | BsmI	GAATGC	6	2	0	7	5	0	0
251 | BsmAI	GTCTC	5	2	0	6	10	0	0
252 | BsmBI	CGTCTC	6	2	0	7	11	0	0
253 | BsmFI	GGGAC	5	2	0	15	19	0	0
254 | BsnI	GGCC	4	2	1	2	2	0	0
255 | Bso31I	GGTCTC	6	2	0	7	11	0	0
256 | BsoBI	CYCGRG	6	2	0	1	5	0	0
257 | Bsp13I	TCCGGA	6	2	0	1	5	0	0
258 | Bsp19I	CCATGG	6	2	0	1	5	0	0
259 | Bsp24I	gacnnnnnntgg	12	4	0	-9	-14	24	19
260 | Bsp68I	TCGCGA	6	2	1	3	3	0	0
261 | Bsp119I	TTCGAA	6	2	0	2	4	0	0
262 | Bsp120I	GGGCCC	6	2	0	1	5	0	0
263 | Bsp143I	GATC	4	2	0	-1	4	0	0
264 | Bsp1286I	GDGCHC	6	2	0	5	1	0	0
265 | Bsp1407I	TGTACA	6	2	0	1	5	0	0
266 | Bsp1720I	GCTNAGC	7	2	0	2	5	0	0
267 | BspACI	CCGC	4	2	0	1	3	0	0
268 | BspCNI	CTCAG	5	2	0	14	12	0	0
269 | BspDI	ATCGAT	6	2	0	2	4	0	0
270 | BspD6I	gactc	5	2	0	9	11	0	0
271 | BspEI	TCCGGA	6	2	0	1	5	0	0
272 | BspFNI	CGCG	4	2	1	2	2	0	0
273 | BspGI	ctggac	6	0	0	0	0	0	0
274 | BspHI	TCATGA	6	2	0	1	5	0	0
275 | BspLI	GGNNCC	6	2	1	3	3	0	0
276 | BspLU11I	acatgt	6	2	0	1	5	0	0
277 | BspMI	ACCTGC	6	2	0	10	14	0	0
278 | BspMII	tccgga	6	2	0	1	5	0	0
279 | BspNCI	ccaga	5	0	0	0	0	0	0
280 | BspOI	GCTAGC	6	2	0	5	1	0	0
281 | BspPI	GGATC	5	2	0	9	10	0	0
282 | BspQI	GCTCTTC	7	2	0	8	11	0	0
283 | BspTI	CTTAAG	6	2	0	1	5	0	0
284 | BspT104I	TTCGAA	6	2	0	2	4	0	0
285 | BspT107I	GGYRCC	6	2	0	1	5	0	0
286 | BsrI	ACTGG	5	2	0	6	4	0	0
287 | BsrBI	CCGCTC	6	2	1	3	3	0	0
288 | BsrDI	GCAATG	6	2	0	8	6	0	0
289 | BsrFI	RCCGGY	6	2	0	1	5	0	0
290 | BsrGI	TGTACA	6	2	0	1	5	0	0
291 | BsrSI	ACTGG	5	2	0	6	4	0	0
292 | BssAI	RCCGGY	6	2	0	1	5	0	0
293 | BssECI	CCNNGG	6	2	0	1	5	0	0
294 | BssHII	GCGCGC	6	2	0	1	5	0	0
295 | BssKI	CCNGG	5	2	0	-1	5	0	0
296 | BssMI	GATC	4	2	0	-1	4	0	0
297 | BssNI	GRCGYC	6	2	0	2	4	0	0
298 | BssNAI	GTATAC	6	2	1	3	3	0	0
299 | BssSI	CACGAG	6	2	0	1	5	0	0
300 | BssT1I	CCWWGG	6	2	0	1	5	0	0
301 | Bst6I	CTCTTC	6	2	0	7	10	0	0
302 | Bst1107I	GTATAC	6	2	1	3	3	0	0
303 | BstACI	GRCGYC	6	2	0	2	4	0	0
304 | BstAFI	CTTAAG	6	2	0	1	5	0	0
305 | BstAPI	GCANNNNNTGC	11	2	0	7	4	0	0
306 | BstAUI	TGTACA	6	2	0	1	5	0	0
307 | BstBI	TTCGAA	6	2	0	2	4	0	0
308 | Bst2BI	CACGAG	6	2	0	1	5	0	0
309 | BstBAI	YACGTR	6	2	1	3	3	0	0
310 | Bst4CI	ACNGT	5	2	0	3	2	0	0
311 | BstC8I	GCNNGC	6	2	1	3	3	0	0
312 | BstDEI	CTNAG	5	2	0	1	4	0	0
313 | BstDSI	CCRYGG	6	2	0	1	5	0	0
314 | BstEII	GGTNACC	7	2	0	1	6	0	0
315 | BstENI	CCTNNNNNAGG	11	2	0	5	6	0	0
316 | BstF5I	GGATG	5	2	0	7	5	0	0
317 | BstFNI	CGCG	4	2	1	2	2	0	0
318 | BstH2I	RGCGCY	6	2	0	5	1	0	0
319 | BstHHI	GCGC	4	2	0	3	1	0	0
320 | BstKTI	GATC	4	2	0	3	1	0	0
321 | BstMAI	GTCTC	5	2	0	6	10	0	0
322 | BstMBI	GATC	4	2	0	-1	4	0	0
323 | BstMCI	CGRYCG	6	2	0	4	2	0	0
324 | BstMWI	GCNNNNNNNGC	11	2	0	7	4	0	0
325 | BstNI	CCWGG	5	2	0	2	3	0	0
326 | BstNSI	RCATGY	6	2	0	5	1	0	0
327 | BstOI	CCWGG	5	2	0	2	3	0	0
328 | BstPI	GGTNACC	7	2	0	1	6	0	0
329 | BstPAI	GACNNNNGTC	10	2	1	5	5	0	0
330 | BstSCI	CCNGG	5	2	0	-1	5	0	0
331 | BstSFI	CTRYAG	6	2	0	1	5	0	0
332 | BstSLI	GKGCMC	6	2	0	5	1	0	0
333 | BstSNI	TACGTA	6	2	1	3	3	0	0
334 | BstUI	CGCG	4	2	1	2	2	0	0
335 | Bst2UI	CCWGG	5	2	0	2	3	0	0
336 | BstV1I	GCAGC	5	2	0	13	17	0	0
337 | BstV2I	GAAGAC	6	2	0	8	12	0	0
338 | BstXI	CCANNNNNNTGG	12	2	0	8	4	0	0
339 | BstX2I	RGATCY	6	2	0	1	5	0	0
340 | BstYI	RGATCY	6	2	0	1	5	0	0
341 | BstZI	CGGCCG	6	2	0	1	5	0	0
342 | BstZ17I	GTATAC	6	2	1	3	3	0	0
343 | BsuI	GTATCC	6	2	0	12	11	0	0
344 | Bsu15I	ATCGAT	6	2	0	2	4	0	0
345 | Bsu36I	CCTNAGG	7	2	0	2	5	0	0
346 | BsuRI	GGCC	4	2	1	2	2	0	0
347 | BtgI	CCRYGG	6	2	0	1	5	0	0
348 | BtgZI	GCGATG	6	2	0	16	20	0	0
349 | BthCI	gcngc	5	2	0	4	1	0	0
350 | BtrI	CACGTC	6	2	1	3	3	0	0
351 | BtsIMutI	CAGTG	5	2	0	7	5	0	0
352 | BtsI	GCAGTG	6	2	0	8	6	0	0
353 | BtsCI	GGATG	5	2	0	7	5	0	0
354 | BtuMI	TCGCGA	6	2	1	3	3	0	0
355 | BveI	ACCTGC	6	2	0	10	14	0	0
356 | Cac8I	GCNNGC	6	2	1	3	3	0	0
357 | CaiI	CAGNNNCTG	9	2	0	6	3	0	0
358 | CauII	ccsgg	5	2	0	2	3	0	0
359 | CchII	ggarga	6	2	0	17	15	0	0
360 | CchIII	cccaag	6	2	0	26	24	0	0
361 | CciI	TCATGA	6	2	0	1	5	0	0
362 | CciNI	GCGGCCGC	8	2	0	2	6	0	0
363 | CcrNAIII	cgaccag	7	0	0	0	0	0	0
364 | CdiI	catcg	5	2	1	4	4	0	0
365 | Cdi630V	caaaaa	6	0	0	0	0	0	0
366 | CdpI	gcggag	6	2	0	26	24	0	0
367 | CfoI	GCGC	4	2	0	3	1	0	0
368 | CfrI	yggccr	6	2	0	1	5	0	0
369 | Cfr9I	CCCGGG	6	2	0	1	5	0	0
370 | Cfr10I	RCCGGY	6	2	0	1	5	0	0
371 | Cfr13I	GGNCC	5	2	0	1	4	0	0
372 | Cfr42I	CCGCGG	6	2	0	4	2	0	0
373 | Cgl13032I	ggcgca	6	0	0	0	0	0	0
374 | Cgl13032II	acgabgg	7	0	0	0	0	0	0
375 | ChaI	gatc	4	2	0	4	-1	0	0
376 | CjeI	ccannnnnngt	11	4	0	-9	-15	26	20
377 | CjeFIII	gcaagg	6	0	0	0	0	0	0
378 | CjeFV	ggrca	5	0	0	0	0	0	0
379 | CjeNII	gagnnnnngt	10	0	0	0	0	0	0
380 | CjeNIII	gkaayg	6	2	0	25	23	0	0
381 | CjePI	ccannnnnnntc	12	4	0	-8	-14	26	20
382 | CjeP659IV	cacnnnnnnngaa	13	0	0	0	0	0	0
383 | CjuI	caynnnnnrtg	11	0	0	0	0	0	0
384 | CjuII	caynnnnnctc	11	0	0	0	0	0	0
385 | ClaI	ATCGAT	6	2	0	2	4	0	0
386 | CpoI	CGGWCCG	7	2	0	2	5	0	0
387 | CseI	GACGC	5	2	0	10	15	0	0
388 | CsiI	ACCWGGT	7	2	0	1	6	0	0
389 | CspI	CGGWCCG	7	2	0	2	5	0	0
390 | Csp6I	GTAC	4	2	0	1	3	0	0
391 | CspAI	ACCGGT	6	2	0	1	5	0	0
392 | CspCI	CAANNNNNGTGG	12	4	0	-12	-14	24	22
393 | CstMI	aaggag	6	2	0	26	24	0	0
394 | CviAII	CATG	4	2	0	1	3	0	0
395 | CviJI	RGCY	4	2	1	2	2	0	0
396 | CviKI-1	RGCY	4	2	1	2	2	0	0
397 | CviQI	GTAC	4	2	0	1	3	0	0
398 | CviRI	tgca	4	2	1	2	2	0	0
399 | DdeI	CTNAG	5	2	0	1	4	0	0
400 | DinI	GGCGCC	6	2	1	3	3	0	0
401 | DpnI	GATC	4	2	1	2	2	0	0
402 | DpnII	GATC	4	2	0	-1	4	0	0
403 | DraI	TTTAAA	6	2	1	3	3	0	0
404 | DraII	rggnccy	7	2	0	2	5	0	0
405 | DraIII	CACNNNGTG	9	2	0	6	3	0	0
406 | DraRI	caagnac	7	2	0	27	25	0	0
407 | DrdI	GACNNNNNNGTC	12	2	0	7	5	0	0
408 | DrdII	gaacca	6	0	0	0	0	0	0
409 | DriI	GACNNNNNGTC	11	2	0	6	5	0	0
410 | DsaI	ccrygg	6	2	0	1	5	0	0
411 | DseDI	GACNNNNNNGTC	12	2	0	7	5	0	0
412 | EaeI	YGGCCR	6	2	0	1	5	0	0
413 | EagI	CGGCCG	6	2	0	1	5	0	0
414 | Eam1104I	CTCTTC	6	2	0	7	10	0	0
415 | Eam1105I	GACNNNNNGTC	11	2	0	6	5	0	0
416 | EarI	CTCTTC	6	2	0	7	10	0	0
417 | EciI	GGCGGA	6	2	0	17	15	0	0
418 | Ecl136II	GAGCTC	6	2	1	3	3	0	0
419 | EclXI	CGGCCG	6	2	0	1	5	0	0
420 | Eco24I	GRGCYC	6	2	0	5	1	0	0
421 | Eco31I	GGTCTC	6	2	0	7	11	0	0
422 | Eco32I	GATATC	6	2	1	3	3	0	0
423 | Eco47I	GGWCC	5	2	0	1	4	0	0
424 | Eco47III	AGCGCT	6	2	1	3	3	0	0
425 | Eco52I	CGGCCG	6	2	0	1	5	0	0
426 | Eco57I	CTGAAG	6	2	0	22	20	0	0
427 | Eco72I	CACGTG	6	2	1	3	3	0	0
428 | Eco81I	CCTNAGG	7	2	0	2	5	0	0
429 | Eco88I	CYCGRG	6	2	0	1	5	0	0
430 | Eco91I	GGTNACC	7	2	0	1	6	0	0
431 | Eco105I	TACGTA	6	2	1	3	3	0	0
432 | Eco130I	CCWWGG	6	2	0	1	5	0	0
433 | Eco147I	AGGCCT	6	2	1	3	3	0	0
434 | EcoHI	ccsgg	5	2	0	-1	5	0	0
435 | EcoICRI	GAGCTC	6	2	1	3	3	0	0
436 | Eco57MI	ctgrag	6	2	0	22	20	0	0
437 | EcoNI	CCTNNNNNAGG	11	2	0	5	6	0	0
438 | EcoO65I	GGTNACC	7	2	0	1	6	0	0
439 | EcoO109I	RGGNCCY	7	2	0	2	5	0	0
440 | EcoRI	GAATTC	6	2	0	1	5	0	0
441 | EcoRII	CCWGG	5	2	0	-1	5	0	0
442 | EcoRV	GATATC	6	2	1	3	3	0	0
443 | EcoT14I	CCWWGG	6	2	0	1	5	0	0
444 | EcoT22I	ATGCAT	6	2	0	5	1	0	0
445 | EcoT38I	GRGCYC	6	2	0	5	1	0	0
446 | Eco53kI	GAGCTC	6	2	1	3	3	0	0
447 | EgeI	GGCGCC	6	2	1	3	3	0	0
448 | EheI	GGCGCC	6	2	1	3	3	0	0
449 | ErhI	CCWWGG	6	2	0	1	5	0	0
450 | EsaBC3I	tcga	4	2	1	2	2	0	0
451 | EsaSSI	gaccac	6	0	0	0	0	0	0
452 | EspI	gctnagc	7	2	0	2	5	0	0
453 | Esp3I	CGTCTC	6	2	0	7	11	0	0
454 | FaeI	CATG	4	2	0	4	-1	0	0
455 | FaiI	YATR	4	2	1	2	2	0	0
456 | FalI	AAGNNNNNCTT	11	4	0	-9	-14	24	19
457 | FaqI	GGGAC	5	2	0	15	19	0	0
458 | FatI	CATG	4	2	0	-1	4	0	0
459 | FauI	CCCGC	5	2	0	9	11	0	0
460 | FauNDI	CATATG	6	2	0	2	4	0	0
461 | FbaI	TGATCA	6	2	0	1	5	0	0
462 | FblI	GTMKAC	6	2	0	2	4	0	0
463 | FinI	gggac	5	0	0	0	0	0	0
464 | FmuI	ggncc	5	2	0	4	1	0	0
465 | FnuDII	cgcg	4	2	1	2	2	0	0
466 | Fnu4HI	GCNGC	5	2	0	2	3	0	0
467 | FokI	GGATG	5	2	0	14	18	0	0
468 | FriOI	GRGCYC	6	2	0	5	1	0	0
469 | FseI	GGCCGGCC	8	2	0	6	2	0	0
470 | FspI	TGCGCA	6	2	1	3	3	0	0
471 | FspAI	RTGCGCAY	8	2	1	4	4	0	0
472 | FspBI	CTAG	4	2	0	1	3	0	0
473 | FspEI	CC	2	2	0	14	18	0	0
474 | Fsp4HI	GCNGC	5	2	0	2	3	0	0
475 | GauT27I	cgcgcagg	8	0	0	0	0	0	0
476 | GdiII	cggccr	6	2	0	1	5	0	0
477 | GlaI	GCGC	4	2	1	2	2	0	0
478 | GluI	GCNGC	5	2	0	2	3	0	0
479 | GsaI	CCCAGC	6	2	0	5	1	0	0
480 | GsuI	CTGGAG	6	2	0	22	20	0	0
481 | HaeI	wggccw	6	2	1	3	3	0	0
482 | HaeII	RGCGCY	6	2	0	5	1	0	0
483 | HaeIII	GGCC	4	2	1	2	2	0	0
484 | HaeIV	gaynnnnnrtc	11	4	0	-8	-14	25	20
485 | HapII	CCGG	4	2	0	1	3	0	0
486 | HauII	tggcca	6	2	0	17	15	0	0
487 | HgaI	GACGC	5	2	0	10	15	0	0
488 | HgiAI	gwgcwc	6	2	0	5	1	0	0
489 | HgiCI	ggyrcc	6	2	0	1	5	0	0
490 | HgiEII	accnnnnnnggt	12	0	0	0	0	0	0
491 | HgiJII	grgcyc	6	2	0	5	1	0	0
492 | HhaI	GCGC	4	2	0	3	1	0	0
493 | Hin1I	GRCGYC	6	2	0	2	4	0	0
494 | Hin1II	CATG	4	2	0	4	-1	0	0
495 | Hin4I	gaynnnnnvtc	11	4	0	-9	-14	24	19
496 | Hin4II	ccttc	5	2	0	11	10	0	0
497 | Hin6I	GCGC	4	2	0	1	3	0	0
498 | HinP1I	GCGC	4	2	0	1	3	0	0
499 | HincII	GTYRAC	6	2	1	3	3	0	0
500 | HindII	GTYRAC	6	2	1	3	3	0	0
501 | HindIII	AAGCTT	6	2	0	1	5	0	0
502 | HinfI	GANTC	5	2	0	1	4	0	0
503 | HpaI	GTTAAC	6	2	1	3	3	0	0
504 | HpaII	CCGG	4	2	0	1	3	0	0
505 | HphI	GGTGA	5	2	0	13	12	0	0
506 | Hpy8I	GTNNAC	6	2	1	3	3	0	0
507 | Hpy99I	CGWCG	5	2	0	5	-1	0	0
508 | Hpy99XIII	gccta	5	0	0	0	0	0	0
509 | Hpy99XIV	ggwtaa	6	0	0	0	0	0	0
510 | Hpy99XIV-mut1	ggwcna	6	0	0	0	0	0	0
511 | Hpy99XXII	tcannnnnntrg	12	0	0	0	0	0	0
512 | Hpy166II	GTNNAC	6	2	1	3	3	0	0
513 | Hpy178III	tcnnga	6	2	0	2	4	0	0
514 | Hpy188I	TCNGA	5	2	0	3	2	0	0
515 | Hpy188III	TCNNGA	6	2	0	2	4	0	0
516 | HpyAV	CCTTC	5	2	0	11	10	0	0
517 | HpyAXIV	gcgta	5	0	0	0	0	0	0
518 | HpyAXVI-mut1	crttaa	6	0	0	0	0	0	0
519 | HpyAXVI-mut2	crtcna	6	0	0	0	0	0	0
520 | HpyCH4III	ACNGT	5	2	0	3	2	0	0
521 | HpyCH4IV	ACGT	4	2	0	1	3	0	0
522 | HpyCH4V	TGCA	4	2	1	2	2	0	0
523 | HpyF3I	CTNAG	5	2	0	1	4	0	0
524 | HpyF10VI	GCNNNNNNNGC	11	2	0	7	4	0	0
525 | HpySE526I	ACGT	4	2	0	1	3	0	0
526 | Hsp92I	GRCGYC	6	2	0	2	4	0	0
527 | Hsp92II	CATG	4	2	0	4	-1	0	0
528 | HspAI	GCGC	4	2	0	1	3	0	0
529 | Jma19592I	gtatnac	7	0	0	0	0	0	0
530 | KasI	GGCGCC	6	2	0	1	5	0	0
531 | KflI	GGGWCCC	7	2	0	2	5	0	0
532 | KpnI	GGTACC	6	2	0	5	1	0	0
533 | Kpn2I	TCCGGA	6	2	0	1	5	0	0
534 | KroI	GCCGGC	6	2	0	1	5	0	0
535 | KspI	CCGCGG	6	2	0	4	2	0	0
536 | Ksp22I	TGATCA	6	2	0	1	5	0	0
537 | Ksp632I	ctcttc	6	2	0	7	10	0	0
538 | KspAI	GTTAAC	6	2	1	3	3	0	0
539 | Kzo9I	GATC	4	2	0	-1	4	0	0
540 | LguI	GCTCTTC	7	2	0	8	11	0	0
541 | LpnI	rgcgcy	6	2	1	3	3	0	0
542 | LpnPI	CCDG	4	2	0	14	18	0	0
543 | Lsp1109I	GCAGC	5	2	0	13	17	0	0
544 | LweI	GCATC	5	2	0	10	14	0	0
545 | MabI	ACCWGGT	7	2	0	1	6	0	0
546 | MaeI	CTAG	4	2	0	1	3	0	0
547 | MaeII	ACGT	4	2	0	1	3	0	0
548 | MaeIII	GTNAC	5	2	0	-1	5	0	0
549 | MalI	GATC	4	2	1	2	2	0	0
550 | MaqI	crttgac	7	2	0	28	26	0	0
551 | MauBI	CGCGCGCG	8	2	0	2	6	0	0
552 | MbiI	CCGCTC	6	2	1	3	3	0	0
553 | MboI	GATC	4	2	0	-1	4	0	0
554 | MboII	GAAGA	5	2	0	13	12	0	0
555 | McaTI	gcgcgc	6	2	0	4	2	0	0
556 | McrI	cgrycg	6	2	0	4	2	0	0
557 | MfeI	CAATTG	6	2	0	1	5	0	0
558 | MflI	RGATCY	6	2	0	1	5	0	0
559 | MhlI	GDGCHC	6	2	0	5	1	0	0
560 | MjaIV	gtnnac	6	0	0	0	0	0	0
561 | MkaDII	gagaygt	7	0	0	0	0	0	0
562 | MlsI	TGGCCA	6	2	1	3	3	0	0
563 | MluI	ACGCGT	6	2	0	1	5	0	0
564 | MluCI	AATT	4	2	0	-1	4	0	0
565 | MluNI	TGGCCA	6	2	1	3	3	0	0
566 | MlyI	GAGTC	5	2	1	10	10	0	0
567 | Mly113I	GGCGCC	6	2	0	2	4	0	0
568 | MmeI	TCCRAC	6	2	0	26	24	0	0
569 | MnlI	CCTC	4	2	0	11	10	0	0
570 | Mox20I	TGGCCA	6	2	1	3	3	0	0
571 | Mph1103I	ATGCAT	6	2	0	5	1	0	0
572 | MreI	CGCCGGCG	8	2	0	2	6	0	0
573 | MroI	TCCGGA	6	2	0	1	5	0	0
574 | MroNI	GCCGGC	6	2	0	1	5	0	0
575 | MroXI	GAANNNNTTC	10	2	1	5	5	0	0
576 | MscI	TGGCCA	6	2	1	3	3	0	0
577 | MseI	TTAA	4	2	0	1	3	0	0
578 | MslI	CAYNNNNRTG	10	2	1	5	5	0	0
579 | MspI	CCGG	4	2	0	1	3	0	0
580 | Msp20I	TGGCCA	6	2	1	3	3	0	0
581 | MspA1I	CMGCKG	6	2	1	3	3	0	0
582 | MspCI	CTTAAG	6	2	0	1	5	0	0
583 | MspJI	CNNR	4	2	0	13	17	0	0
584 | MspR9I	CCNGG	5	2	0	2	3	0	0
585 | MssI	GTTTAAAC	8	2	1	4	4	0	0
586 | MstI	tgcgca	6	2	1	3	3	0	0
587 | MunI	CAATTG	6	2	0	1	5	0	0
588 | MvaI	CCWGG	5	2	0	2	3	0	0
589 | Mva1269I	GAATGC	6	2	0	7	5	0	0
590 | MvnI	CGCG	4	2	1	2	2	0	0
591 | MvrI	CGATCG	6	2	0	4	2	0	0
592 | MwoI	GCNNNNNNNGC	11	2	0	7	4	0	0
593 | NaeI	GCCGGC	6	2	1	3	3	0	0
594 | NarI	GGCGCC	6	2	0	2	4	0	0
595 | NciI	CCSGG	5	2	0	2	3	0	0
596 | NcoI	CCATGG	6	2	0	1	5	0	0
597 | NdeI	CATATG	6	2	0	2	4	0	0
598 | NdeII	GATC	4	2	0	-1	4	0	0
599 | NgoAVIII	gacnnnnntga	11	4	0	-13	-15	24	22
600 | NgoMIV	GCCGGC	6	2	0	1	5	0	0
601 | NhaXI	caagrag	7	0	0	0	0	0	0
602 | NheI	GCTAGC	6	2	0	1	5	0	0
603 | NlaIII	CATG	4	2	0	4	-1	0	0
604 | NlaIV	GGNNCC	6	2	1	3	3	0	0
605 | NlaCI	catcac	6	2	0	25	23	0	0
606 | Nli3877I	cycgrg	6	2	0	5	1	0	0
607 | NmeAIII	GCCGAG	6	2	0	27	25	0	0
608 | NmeDI	rccggy	6	4	0	-13	-8	13	18
609 | NmuCI	GTSAC	5	2	0	-1	5	0	0
610 | NotI	GCGGCCGC	8	2	0	2	6	0	0
611 | NruI	TCGCGA	6	2	1	3	3	0	0
612 | NsbI	TGCGCA	6	2	1	3	3	0	0
613 | NsiI	ATGCAT	6	2	0	5	1	0	0
614 | NspI	RCATGY	6	2	0	5	1	0	0
615 | NspV	TTCGAA	6	2	0	2	4	0	0
616 | NspBII	cmgckg	6	2	1	3	3	0	0
617 | OliI	CACNNNNGTG	10	2	1	5	5	0	0
618 | PabI	gtac	4	2	0	3	1	0	0
619 | PacI	TTAATTAA	8	2	0	5	3	0	0
620 | PaeI	GCATGC	6	2	0	5	1	0	0
621 | PaeR7I	CTCGAG	6	2	0	1	5	0	0
622 | PagI	TCATGA	6	2	0	1	5	0	0
623 | PalAI	GGCGCGCC	8	2	0	2	6	0	0
624 | PasI	CCCWGGG	7	2	0	2	5	0	0
625 | PauI	GCGCGC	6	2	0	1	5	0	0
626 | PceI	AGGCCT	6	2	1	3	3	0	0
627 | PciI	ACATGT	6	2	0	1	5	0	0
628 | PciSI	GCTCTTC	7	2	0	8	11	0	0
629 | PcsI	WCGNNNNNNNCGW	13	2	0	7	6	0	0
630 | PctI	GAATGC	6	2	0	7	5	0	0
631 | PdiI	GCCGGC	6	2	1	3	3	0	0
632 | Pdi8503III	ccggnag	7	0	0	0	0	0	0
633 | PdmI	GAANNNNTTC	10	2	1	5	5	0	0
634 | PenI	gcagt	5	0	0	0	0	0	0
635 | PfeI	GAWTC	5	2	0	1	4	0	0
636 | Pfl23II	CGTACG	6	2	0	1	5	0	0
637 | Pfl1108I	tcgtag	6	0	0	0	0	0	0
638 | PflFI	GACNNNGTC	9	2	0	4	5	0	0
639 | PflMI	CCANNNNNTGG	11	2	0	7	4	0	0
640 | PfoI	TCCNGGA	7	2	0	1	6	0	0
641 | PinAI	ACCGGT	6	2	0	1	5	0	0
642 | PlaDI	catcag	6	2	0	27	25	0	0
643 | PleI	GAGTC	5	2	0	9	10	0	0
644 | Ple19I	CGATCG	6	2	0	4	2	0	0
645 | PluTI	GGCGCC	6	2	0	5	1	0	0
646 | PmaCI	CACGTG	6	2	1	3	3	0	0
647 | PmeI	GTTTAAAC	8	2	1	4	4	0	0
648 | Pme5II	gacgag	6	0	0	0	0	0	0
649 | PmeS132I	gacgag	6	0	0	0	0	0	0
650 | PmlI	CACGTG	6	2	1	3	3	0	0
651 | PpiI	gaacnnnnnctc	12	4	0	-8	-13	25	20
652 | PpsI	GAGTC	5	2	0	9	10	0	0
653 | Ppu10I	atgcat	6	2	0	1	5	0	0
654 | Ppu21I	YACGTR	6	2	1	3	3	0	0
655 | PpuMI	RGGWCCY	7	2	0	2	5	0	0
656 | PscI	ACATGT	6	2	0	1	5	0	0
657 | PshAI	GACNNNNGTC	10	2	1	5	5	0	0
658 | PshBI	ATTAAT	6	2	0	2	4	0	0
659 | PsiI	TTATAA	6	2	1	3	3	0	0
660 | Psp03I	ggwcc	5	2	0	4	1	0	0
661 | Psp5II	RGGWCCY	7	2	0	2	5	0	0
662 | Psp6I	CCWGG	5	2	0	-1	5	0	0
663 | Psp1406I	AACGTT	6	2	0	2	4	0	0
664 | Psp124BI	GAGCTC	6	2	0	5	1	0	0
665 | PspCI	CACGTG	6	2	1	3	3	0	0
666 | PspEI	GGTNACC	7	2	0	1	6	0	0
667 | PspGI	CCWGG	5	2	0	-1	5	0	0
668 | PspLI	CGTACG	6	2	0	1	5	0	0
669 | PspN4I	GGNNCC	6	2	1	3	3	0	0
670 | PspOMI	GGGCCC	6	2	0	1	5	0	0
671 | PspOMII	cgcccar	7	2	0	27	25	0	0
672 | PspPI	GGNCC	5	2	0	1	4	0	0
673 | PspPPI	RGGWCCY	7	2	0	2	5	0	0
674 | PspPRI	ccycag	6	2	0	21	19	0	0
675 | PspXI	VCTCGAGB	8	2	0	2	6	0	0
676 | PsrI	GAACNNNNNNTAC	13	4	0	-8	-13	25	20
677 | PssI	rggnccy	7	2	0	5	2	0	0
678 | PstI	CTGCAG	6	2	0	5	1	0	0
679 | PstNI	CAGNNNCTG	9	2	0	6	3	0	0
680 | PsuI	RGATCY	6	2	0	1	5	0	0
681 | PsyI	GACNNNGTC	9	2	0	4	5	0	0
682 | PteI	GCGCGC	6	2	0	1	5	0	0
683 | PvuI	CGATCG	6	2	0	4	2	0	0
684 | PvuII	CAGCTG	6	2	1	3	3	0	0
685 | RceI	catcgac	7	2	0	27	25	0	0
686 | RdeGBI	ccgcag	6	0	0	0	0	0	0
687 | RdeGBII	acccag	6	2	0	26	24	0	0
688 | RdeGBIII	tgryca	6	4	0	-10	-12	17	15
689 | RflFIII	cgccag	6	0	0	0	0	0	0
690 | RgaI	GCGATCGC	8	2	0	5	3	0	0
691 | RigI	GGCCGGCC	8	2	0	6	2	0	0
692 | RlaI	vcw	3	0	0	0	0	0	0
693 | RleAI	cccaca	6	2	0	18	15	0	0
694 | RpaI	gtyggag	7	2	0	18	16	0	0
695 | RpaBI	cccgcag	7	2	0	27	25	0	0
696 | RpaB5I	cgrggac	7	2	0	27	25	0	0
697 | RpaTI	grtggag	7	0	0	0	0	0	0
698 | RruI	TCGCGA	6	2	1	3	3	0	0
699 | RsaI	GTAC	4	2	1	2	2	0	0
700 | RsaNI	GTAC	4	2	0	1	3	0	0
701 | RseI	CAYNNNNRTG	10	2	1	5	5	0	0
702 | RsrII	CGGWCCG	7	2	0	2	5	0	0
703 | Rsr2I	CGGWCCG	7	2	0	2	5	0	0
704 | SacI	GAGCTC	6	2	0	5	1	0	0
705 | SacII	CCGCGG	6	2	0	4	2	0	0
706 | SalI	GTCGAC	6	2	0	1	5	0	0
707 | SanDI	gggwccc	7	2	0	2	5	0	0
708 | SapI	GCTCTTC	7	2	0	8	11	0	0
709 | SaqAI	TTAA	4	2	0	1	3	0	0
710 | SatI	GCNGC	5	2	0	2	3	0	0
711 | SauI	cctnagg	7	2	0	2	5	0	0
712 | Sau96I	GGNCC	5	2	0	1	4	0	0
713 | Sau3AI	GATC	4	2	0	-1	4	0	0
714 | SbfI	CCTGCAGG	8	2	0	6	2	0	0
715 | ScaI	AGTACT	6	2	1	3	3	0	0
716 | SchI	GAGTC	5	2	1	10	10	0	0
717 | SciI	ctcgag	6	2	1	3	3	0	0
718 | ScrFI	CCNGG	5	2	0	2	3	0	0
719 | SdaI	CCTGCAGG	8	2	0	6	2	0	0
720 | SdeAI	cagrag	6	2	0	27	25	0	0
721 | SdeOSI	gacnnnnrtga	11	4	0	-12	-14	23	21
722 | SduI	GDGCHC	6	2	0	5	1	0	0
723 | SecI	ccnngg	6	2	0	1	5	0	0
724 | SelI	cgcg	4	2	0	-1	4	0	0
725 | Sen1736II	gatcag	6	0	0	0	0	0	0
726 | SenTFV	gatcag	6	0	0	0	0	0	0
727 | SetI	ASST	4	2	0	4	-1	0	0
728 | SexAI	ACCWGGT	7	2	0	1	6	0	0
729 | SfaAI	GCGATCGC	8	2	0	5	3	0	0
730 | SfaNI	GCATC	5	2	0	10	14	0	0
731 | SfcI	CTRYAG	6	2	0	1	5	0	0
732 | SfeI	ctryag	6	2	0	1	5	0	0
733 | SfiI	GGCCNNNNNGGCC	13	2	0	8	5	0	0
734 | SfoI	GGCGCC	6	2	1	3	3	0	0
735 | Sfr274I	CTCGAG	6	2	0	1	5	0	0
736 | Sfr303I	CCGCGG	6	2	0	4	2	0	0
737 | SfuI	TTCGAA	6	2	0	2	4	0	0
738 | SgeI	CNNG	4	2	0	13	17	0	0
739 | SgfI	GCGATCGC	8	2	0	5	3	0	0
740 | SgrAI	CRCCGGYG	8	2	0	2	6	0	0
741 | SgrBI	CCGCGG	6	2	0	4	2	0	0
742 | SgrDI	CGTCGACG	8	2	0	2	6	0	0
743 | SgrTI	ccds	4	2	0	14	18	0	0
744 | SgsI	GGCGCGCC	8	2	0	2	6	0	0
745 | SimI	gggtc	5	2	0	2	5	0	0
746 | SlaI	CTCGAG	6	2	0	1	5	0	0
747 | SmaI	CCCGGG	6	2	1	3	3	0	0
748 | SmiI	ATTTAAAT	8	2	1	4	4	0	0
749 | SmiMI	CAYNNNNRTG	10	2	1	5	5	0	0
750 | SmlI	CTYRAG	6	2	0	1	5	0	0
751 | SmoI	CTYRAG	6	2	0	1	5	0	0
752 | SnaI	gtatac	6	0	0	0	0	0	0
753 | SnaBI	TACGTA	6	2	1	3	3	0	0
754 | Sno506I	ggccgag	7	0	0	0	0	0	0
755 | SpeI	ACTAGT	6	2	0	1	5	0	0
756 | SphI	GCATGC	6	2	0	5	1	0	0
757 | SplI	cgtacg	6	2	0	1	5	0	0
758 | SpoDI	gcggrag	7	0	0	0	0	0	0
759 | SrfI	gcccgggc	8	2	1	4	4	0	0
760 | Sse9I	AATT	4	2	0	-1	4	0	0
761 | Sse232I	cgccggcg	8	2	0	2	6	0	0
762 | Sse8387I	CCTGCAGG	8	2	0	6	2	0	0
763 | Sse8647I	aggwcct	7	2	0	2	5	0	0
764 | SseBI	AGGCCT	6	2	1	3	3	0	0
765 | SsiI	CCGC	4	2	0	1	3	0	0
766 | SspI	AATATT	6	2	1	3	3	0	0
767 | SspDI	GGCGCC	6	2	0	1	5	0	0
768 | SspD5I	ggtga	5	2	1	13	13	0	0
769 | SstI	GAGCTC	6	2	0	5	1	0	0
770 | SstE37I	cgaagac	7	2	0	27	25	0	0
771 | Sth132I	cccg	4	2	0	8	12	0	0
772 | Sth302II	ccgg	4	2	1	2	2	0	0
773 | StrI	CTCGAG	6	2	0	1	5	0	0
774 | StsI	ggatg	5	2	0	15	19	0	0
775 | StuI	AGGCCT	6	2	1	3	3	0	0
776 | StyI	CCWWGG	6	2	0	1	5	0	0
777 | StyD4I	CCNGG	5	2	0	-1	5	0	0
778 | SwaI	ATTTAAAT	8	2	1	4	4	0	0
779 | TaaI	ACNGT	5	2	0	3	2	0	0
780 | TaiI	ACGT	4	2	0	4	-1	0	0
781 | TaqI	TCGA	4	2	0	1	3	0	0
782 | TaqII	GACCGA	6	2	0	17	15	0	0
783 | TasI	AATT	4	2	0	-1	4	0	0
784 | TatI	WGTACW	6	2	0	1	5	0	0
785 | TauI	GCSGC	5	2	0	4	1	0	0
786 | TdeDII	accagg	6	0	0	0	0	0	0
787 | TfiI	GAWTC	5	2	0	1	4	0	0
788 | Tru1I	TTAA	4	2	0	1	3	0	0
789 | Tru9I	TTAA	4	2	0	1	3	0	0
790 | TscAI	CASTG	5	2	0	7	-3	0	0
791 | TseI	GCWGC	5	2	0	1	4	0	0
792 | TseFI	GTSAC	5	2	0	-1	5	0	0
793 | TsoI	tarcca	6	2	0	17	15	0	0
794 | Tsp45I	GTSAC	5	2	0	-1	5	0	0
795 | Tsp4CI	acngt	5	2	0	3	2	0	0
796 | TspDTI	ATGAA	5	2	0	16	14	0	0
797 | TspEI	aatt	4	2	0	-1	4	0	0
798 | TspGWI	ACGGA	5	2	0	16	14	0	0
799 | TspMI	CCCGGG	6	2	0	1	5	0	0
800 | TspRI	CASTG	5	2	0	7	-3	0	0
801 | TssI	gagnnnctc	9	0	0	0	0	0	0
802 | TstI	cacnnnnnntcc	12	4	0	-9	-14	24	19
803 | TsuI	gcgac	5	0	0	0	0	0	0
804 | Tth111I	GACNNNGTC	9	2	0	4	5	0	0
805 | Tth111II	caarca	6	2	0	17	15	0	0
806 | UbaF9I	tacnnnnnrtgt	12	0	0	0	0	0	0
807 | UbaF11I	tcgta	5	0	0	0	0	0	0
808 | UbaF12I	ctacnnngtc	10	0	0	0	0	0	0
809 | UbaF13I	gagnnnnnnctgg	13	0	0	0	0	0	0
810 | UbaF14I	ccannnnntcg	11	0	0	0	0	0	0
811 | UbaPI	cgaacg	6	0	0	0	0	0	0
812 | UcoMSI	gagctc	6	4	0	-8	-6	11	13
813 | UnbI	ggncc	5	2	0	-1	5	0	0
814 | Van91I	CCANNNNNTGG	11	2	0	7	4	0	0
815 | Vha464I	CTTAAG	6	2	0	1	5	0	0
816 | VneI	GTGCAC	6	2	0	1	5	0	0
817 | VpaK11AI	ggwcc	5	2	0	-1	5	0	0
818 | VpaK11BI	GGWCC	5	2	0	1	4	0	0
819 | VspI	ATTAAT	6	2	0	2	4	0	0
820 | WviI	cacrag	6	2	0	27	25	0	0
821 | XagI	CCTNNNNNAGG	11	2	0	5	6	0	0
822 | XapI	RAATTY	6	2	0	1	5	0	0
823 | XbaI	TCTAGA	6	2	0	1	5	0	0
824 | XceI	RCATGY	6	2	0	5	1	0	0
825 | XcmI	CCANNNNNNNNNTGG	15	2	0	8	7	0	0
826 | XhoI	CTCGAG	6	2	0	1	5	0	0
827 | XhoII	RGATCY	6	2	0	1	5	0	0
828 | XmaI	CCCGGG	6	2	0	1	5	0	0
829 | XmaIII	cggccg	6	2	0	1	5	0	0
830 | XmaJI	CCTAGG	6	2	0	1	5	0	0
831 | XmiI	GTMKAC	6	2	0	2	4	0	0
832 | XmnI	GAANNNNTTC	10	2	1	5	5	0	0
833 | XspI	CTAG	4	2	0	1	3	0	0
834 | YkrI	c	1	2	0	11	10	0	0
835 | ZraI	GACGTC	6	2	1	3	3	0	0
836 | ZrmI	AGTACT	6	2	1	3	3	0	0
837 | Zsp2I	ATGCAT	6	2	0	5	1	0	0
838 | 


--------------------------------------------------------------------------------
/enzyme/enzs.list:
--------------------------------------------------------------------------------
 1 | AatII
 2 | ApaI
 3 | AscI
 4 | BamHI
 5 | BbsI
 6 | BglII
 7 | CpoI
 8 | EcoRI
 9 | EndoIV
10 | EndoV
11 | EndoVIII
12 | FspI
13 | HindIII
14 | KpnI
15 | NarI
16 | NcoI
17 | NdeI
18 | NheI
19 | NotI
20 | PstI
21 | PvuI
22 | SacI
23 | SalI
24 | Sau3AI(MboI)
25 | ScaI
26 | SnaBI
27 | SpeI
28 | SphI
29 | XbaI
30 | XhoI
31 | 


--------------------------------------------------------------------------------
/enzyme/restrict_check_digested_sequence_number.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | #
  3 | # Copyright 2014 Wei Shen (shenwei356#gmail.com). All rights reserved.
  4 | # Use of this source code is governed by a MIT-license
  5 | # that can be found in the LICENSE file.
  6 | # https://github.com/shenwei356/bio_scripts/
  7 | 
  8 | # embossre.enz
  9 | #   ftp://ftp.neb.com/pub/rebase/emboss_e.\d+
 10 | 
 11 | use strict;
 12 | use File::Basename;
 13 | use Getopt::Long;
 14 | 
 15 | use BioUtil::Misc;
 16 | use BioUtil::Seq;
 17 | use BioUtil::Util;
 18 | 
 19 | my $usage = sprintf "
 20 | Usage: %s [options]
 21 | 
 22 | Options:
 23 |     -e FILE    Enzymefile (from Rebase)
 24 |     -i FILE    Fasta file
 25 |     -l FILE    Enzyme list file
 26 |     -t INT     Threshold  [%d]
 27 | 
 28 | Example:
 29 |     
 30 |     %s -e embossre.enz -i test.fasta -t 10
 31 | 
 32 | See more: https://github.com/shenwei356/bio_scripts
 33 | ", basename($0), 1 << 30, basename($0);
 34 | 
 35 | my $help       = 0;
 36 | my $enzymefile = "";
 37 | my $seqfile    = "";
 38 | my $listfile   = "";
 39 | my $threshold  = 1 << 30;
 40 | 
 41 | GetOptions(
 42 |     'help|h' => \$help,
 43 |     'e=s'    => \$enzymefile,
 44 |     'i=s'    => \$seqfile,
 45 |     'l=s'    => \$listfile,
 46 |     't=i'    => \$threshold,
 47 | ) or die $usage;
 48 | 
 49 | die $usage if $help;
 50 | die $usage unless $enzymefile ne "" and $seqfile ne "";
 51 | die "threshold should > 0\n" unless $threshold > 0;
 52 | 
 53 | # ===============================================================
 54 | 
 55 | my $enzs     = parse_embossre($enzymefile);
 56 | my %subenzs  = ();
 57 | my %listhash = ();
 58 | 
 59 | if ( $listfile ne "" ) {
 60 |     my $list = get_column_data($listfile, 1);
 61 |     %listhash = map { $_ => 0 } @$list;
 62 |     for my $enz ( keys %$enzs ) {
 63 |         if ( exists $listhash{$enz} ) {
 64 |             $subenzs{$enz} = $$enzs{$enz};
 65 |         }
 66 |     }
 67 | }
 68 | else {
 69 |     %subenzs = %$enzs;
 70 | }
 71 | 
 72 | %listhash = ();
 73 | %listhash = map { $_ => 0 } keys %subenzs;
 74 | 
 75 | # show process
 76 | local $| = 1;
 77 | my $n = 0;
 78 | 
 79 | my $next_seq = FastaReader($seqfile);
 80 | while ( my $fa = &$next_seq() ) {
 81 |     my ( $header, $seq ) = @$fa;
 82 |     $seq = uc $seq;
 83 |     my $revcom = revcom($seq);
 84 | 
 85 |     for my $enz ( keys %subenzs ) {
 86 |         my $e       = $subenzs{$enz};
 87 |         my $pattern = $$e{pattern_regexp};
 88 | 
 89 |         # check enzyme digest site
 90 |         if ( $seq =~ /$pattern/ or $revcom =~ /$pattern/ ) {
 91 |             $listhash{$enz}++;
 92 |             if ( $listhash{$enz} >= $threshold ) {
 93 |                 delete $subenzs{$enz};
 94 |                 delete $listhash{$enz};
 95 |             }
 96 |         }
 97 |     }
 98 | 
 99 |     # show process
100 |     $n++;
101 |     print STDERR "\rcheck seq $n";
102 | }
103 | $| = 0;
104 | 
105 | print STDERR "\n";
106 | for ( sort { $listhash{$b} <=> $listhash{$a} } keys %listhash ) {
107 |     my $e       = $subenzs{$_};
108 |     my $pattern = $$e{pattern};
109 |     printf "%s\t%s\t%s\n", $_, $pattern, $listhash{$_};
110 | }
111 | 


--------------------------------------------------------------------------------
/enzyme/restrict_choose_enzyme_for_identify_genomes.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | # Function    : Run EMBOSS restrict, and parse restriction fragments
  3 | # Author      : Wei Shen
  4 | # Email       : shenwei356@gmail.com, http://shenwei.me
  5 | # Date        : 2014-12-18
  6 | # Last Update : 2015-01-31
  7 | 
  8 | use strict;
  9 | use File::Basename;
 10 | use Getopt::Long;
 11 | use Parallel::Runner;
 12 | 
 13 | use BioUtil::Misc;
 14 | use BioUtil::Seq;
 15 | use BioUtil::Util;
 16 | 
 17 | local $| = 1;
 18 | 
 19 | $0 = basename($0);
 20 | my $usage = <<USAGE;
 21 | restrict_fragments.pl
 22 | 
 23 |     Parsing restriction fragments for chosing a appropriate 
 24 |     enzyme to identify multi genomes
 25 | 
 26 | Usage: $0 [options]
 27 | 
 28 | Options:
 29 |     -e FILE    Enzymefile (from Rebase)
 30 |     -i FILE    Fasta file
 31 |     -l FILE    Enzyme list file
 32 |     -t INT     Thread number
 33 |     -linear    Linear genome
 34 | 
 35 | Example:
 36 | 
 37 | See more: https://github.com/shenwei356/bio_scripts
 38 | USAGE
 39 | 
 40 | my $help          = 0;
 41 | my $enzymefile    = "";
 42 | my $seqfile       = "";
 43 | my $listfile      = "";
 44 | my $threads       = 4;
 45 | my $linear_genome = 0;
 46 | 
 47 | GetOptions(
 48 |     'help|h' => \$help,
 49 |     'e=s'    => \$enzymefile,
 50 |     'i=s'    => \$seqfile,
 51 |     'l=s'    => \$listfile,
 52 |     't=i'    => \$threads,
 53 |     'linear' => \$linear_genome,
 54 | ) or die $usage;
 55 | 
 56 | die $usage if $help;
 57 | die $usage unless $enzymefile ne "" and $seqfile ne "";
 58 | 
 59 | #=====================[ run restrict ]====================
 60 | 
 61 | my $enzs     = parse_embossre($enzymefile);
 62 | my %subenzs  = ();
 63 | my %listhash = ();
 64 | 
 65 | my $dir = '';
 66 | if ( $listfile ne "" ) {
 67 |     my $list = get_column_data( $listfile, 1 );
 68 |     %listhash = map { $_ => 0 } @$list;
 69 |     for my $enz ( keys %$enzs ) {
 70 |         if ( exists $listhash{$enz} ) {
 71 |             $subenzs{$enz} = $$enzs{$enz};
 72 |         }
 73 |     }
 74 |     $dir = "re.$seqfile.digestedby.$listfile";
 75 | }
 76 | else {
 77 |     %subenzs = %$enzs;
 78 |     $dir     = "re.$seqfile.digestedby.$enzymefile";
 79 | }
 80 | 
 81 | %listhash = ();
 82 | %listhash = map { $_ => 0 } keys %subenzs;
 83 | 
 84 | # unless ( -e $dir and -d $dir ) {
 85 | rm_and_mkdir($dir);
 86 | my $runner = Parallel::Runner->new($threads);
 87 | for my $enz ( keys %subenzs ) {
 88 |     $runner->run(
 89 |         sub {
 90 |             run_emboss_restrict( $dir, $enz );
 91 |         }
 92 |     );
 93 | }
 94 | $runner->finish;
 95 | 
 96 | # }
 97 | 
 98 | sub run_emboss_restrict {
 99 |     my ( $dir, $enzyme ) = @_;
100 |     my $resultfile = "$dir/$seqfile.$enzyme.re";
101 |     return if -e $resultfile;
102 |     print STDERR "$enzyme\n";
103 |     my $cmd = "restrict -auto -solofragment -limit "
104 |         . "-sequence $seqfile -outfile $resultfile -enzymes $enzyme ";
105 |     $cmd .= " -plasmid " unless $linear_genome;
106 | 
107 |     my $fail = run($cmd);
108 |     die "failed to run:$cmd\n" if $fail;
109 | }
110 | 
111 | # ===========[ Parsing restriction fragments ]=============
112 | 
my @files = glob "$dir/*.re";

# $fragments->{enzyme}{sequence} = [ fragment sizes, largest first ]
# $stats->{enzyme}{nfrags}       = number of fragments over all sequences
my $fragments = {};
my $stats     = {};
for my $file (@files) {
    my ( $enzyme, $seq );

    # three-argument open: the file name is never interpreted as a mode
    open my $fh, "<", $file
        or die "fail to read enzyme file $file\n";
    while (<$fh>) {
        if (/^#\s+\-enzymes (.+)/) {    # enzyme name
            $enzyme = $1;
        }
        elsif (/^# Sequence: (.+)\s+from/) {    # sequence name
            $seq = $1;
        }
        elsif (/^# \t([\d\t]+)$/) {             # fragment size
            # A size line seen before both headers cannot be attributed;
            # skip it instead of filing it under an empty key.
            next unless defined $enzyme and defined $seq;
            push @{ $$fragments{$enzyme}{$seq} }, split( /\t/, $1 );
        }
    }
    close $fh;

    # A result file without an "-enzymes" header carries nothing usable.
    unless ( defined $enzyme ) {
        warn "no enzyme name found in $file, skipped\n";
        next;
    }

    # Sort each sequence's fragments (descending) and count them.
    my $n = 0;
    for my $seq ( keys %{ $$fragments{$enzyme} } ) {
        my @frags = sort { $b <=> $a } @{ $$fragments{$enzyme}{$seq} };

        # print "$enzyme\n  $seq\n  @frags\n";
        $$fragments{$enzyme}{$seq} = \@frags;
        $n += scalar @frags;
    }
    $$stats{$enzyme}{nfrags} = $n;
}

# ===========[ Output restriction fragments ]=============

# Name the report after the file that defined the enzyme selection, the
# same way the result directory is named; without -l this avoids the
# empty component in "<seq>.digestedby..frag".
my $selection = $listfile ne "" ? $listfile : $enzymefile;
my $outfile   = "$seqfile.digestedby.$selection.frag";

open my $out, ">", $outfile
    or die "fail to write file $outfile\n";

# Enzymes producing the fewest fragments come first.
for my $enzyme (
    sort { $$stats{$a}{nfrags} <=> $$stats{$b}{nfrags} }
    keys %$fragments
    )
{

    print $out "-" x 79, "\n", "$enzyme\n";
    for my $seq ( sort keys %{ $$fragments{$enzyme} } ) {
        my @frags = @{ $$fragments{$enzyme}{$seq} };
        print $out "$seq: @frags\n";
    }
}

# buffered write errors only surface at close
close $out
    or die "fail to write file $outfile\n";
171 | 


--------------------------------------------------------------------------------
/enzyme/restrict_with_T_tail.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # Copyright 2014 Wei Shen (shenwei356#gmail.com). All rights reserved.
 4 | # Use of this source code is governed by a MIT-license
 5 | # that can be found in the LICENSE file.
 6 | # https://github.com/shenwei356/bio_scripts/
 7 | 
 8 | use strict;
 9 | use BioUtil::Misc;
10 | 
die "usage: $0 embossre.enz\n"
    unless @ARGV == 1;

# Exactly one argument is accepted, so there is nothing further to shift.
my $file = shift @ARGV;

my $enzs = parse_embossre($file);

# Print, in name order, every enzyme that
#   - has two cuts,
#   - has c1 exactly one greater than c2, and
#   - has A or N (either case) at position c1 of its pattern.
for my $enz (sort keys %$enzs) {
    my $e = $$enzs{$enz};
    next unless $$e{cuts_number} == 2
        and $$e{c1} - $$e{c2} == 1
        # a c1 below 1 would make substr() count from the end of the
        # pattern and test an unrelated base
        and $$e{c1} >= 1
        and substr ($$e{pattern}, $$e{c1} - 1, 1) =~ /[aN]/i;
    print "$enz\n";
}
26 | 
27 | # there's no enzyme meeting this condition
28 | 


--------------------------------------------------------------------------------
/enzyme/restrict_with_far_away_digest_site.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # Copyright 2014 Wei Shen (shenwei356#gmail.com). All rights reserved.
 4 | # Use of this source code is governed by a MIT-license
 5 | # that can be found in the LICENSE file.
 6 | # https://github.com/shenwei356/bio_scripts/
 7 | 
 8 | use strict;
 9 | use BioUtil::Misc;
10 | 
# Two arguments are required: the enzyme file and the minimum distance d.
if ( @ARGV != 2 ) {
    die "usage: $0 embossre.enz d\n";
}
my ( $file, $d ) = splice @ARGV, 0, 2;

my $enzs = parse_embossre($file);

# Print, in name order, each enzyme with two cuts whose c1 exceeds its
# `length` field by at least d.
foreach my $name ( sort keys %$enzs ) {
    my $info = $enzs->{$name};
    next if $info->{cuts_number} != 2;
    next if $info->{c1} - $info->{length} < $d;
    print "$name\n";
}
25 | 


--------------------------------------------------------------------------------
/enzyme/restrict_without_digest_site_in_sequences.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # Copyright 2014 Wei Shen (shenwei356#gmail.com). All rights reserved.
 4 | # Use of this source code is governed by a MIT-license
 5 | # that can be found in the LICENSE file.
 6 | # https://github.com/shenwei356/bio_scripts/
 7 | 
 8 | # embossre.enz
 9 | #   ftp://ftp.neb.com/pub/rebase/
10 | 
11 | use strict;
12 | use File::Basename;
13 | use BioUtil::Misc;
14 | use BioUtil::Seq;
15 | use BioUtil::Util;
16 | 
my $usage = sprintf "
Usage: %s <embossre.enz> <fasta file> [enzyme list file]

", basename($0);
die $usage if @ARGV != 2 and @ARGV != 3;

my ( $enzymefile, $seqfile ) = splice @ARGV, 0, 2;

my $enzs    = parse_embossre($enzymefile);
my %subenzs = ();

# Optional third argument: restrict the candidates to the listed enzymes.
my $listfile = shift @ARGV;
if ( !defined $listfile ) {
    %subenzs = %$enzs;
}
else {
    my $list     = get_list_from_file($listfile);
    my %listhash = map { $_ => 0 } @$list;
    $subenzs{$_} = $enzs->{$_} for grep { exists $listhash{$_} } keys %$enzs;
}

# show process
local $| = 1;
my $n    = 0;
my $sum  = scalar keys %subenzs;
my $left = $sum;

my $next_seq = FastaReader($seqfile);
while ( my $record = $next_seq->() ) {
    my ( $header, $seq ) = @$record;
    $seq = uc $seq;
    my $revcom = revcom($seq);

    # An enzyme whose pattern occurs on either strand is no longer a
    # candidate; collect those first, then drop them in one go.
    my @hit = grep {
        my $re = $subenzs{$_}{pattern_regexp};
        $seq =~ /$re/ or $revcom =~ /$re/
    } keys %subenzs;
    delete @subenzs{@hit};

    # show process
    $n++;
    $left = scalar keys %subenzs;
    print STDERR "\rcheck seq $n, candidate: $left / $sum";
}
$| = 0;

print STDERR "\n";

# Enzymes never matched in any sequence, in name order.
for my $name ( sort keys %subenzs ) {
    print "$name\n";
}
73 | 


--------------------------------------------------------------------------------
/file_formats/add_annotations_to_myva.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | # Function    : To add annotations to COG aa file
  3 | # Author      : Wei Shen
  4 | # Email       : shenwei356@gmail.com
  5 | # Date        : 2011-04-08, cost 2 hour.
  6 | # Last Update : 2011-04-08
  7 | 
  8 | # Annotations are in following files downloaded from
  9 | # ftp://ftp.ncbi.nlm.nih.gov/pub/COG/COG
 10 | # 
 11 | # FILE       DATA(* means important)
 12 | # *fun.txt    function_id(one letter) -> function
 13 | # *myva      protein_id*, aa sequence*
 14 | # *myva=gb   protein_id  -> GI numbers*
 15 | # *whog      Organism(three letters)*, protein_id, 
 16 | #            function_id, cog_id, protein*
 17 | # *org.txt   Organism(three letters) -> detail
 18 | #
 19 | # Output format
 20 | # > <protein_id>_<protein>_<function>_<Organism>_<GI numbers>
 21 | # aa sequence
 22 | #
 23 | # Therefore, five files with sign * will be used
 24 | #
 25 | # Attention:
 26 | # 1. NOT all protein_ids from file myva=gb could be found in file whog
 27 | # 2. NOT all protein_ids from file myva    could be found in file whog
 28 | 
 29 | use strict;
 30 | 
 31 | # parse file fun
 32 | my $fun = &parse_file_fun('fun.txt');
 33 | 
 34 | 
 35 | # parse file myva=gb
 36 | my $pro_gi = &parse_file_myva_gb('myva=gb');
 37 | 
 38 | 
 39 | # parse file org
 40 | my $org = &parse_file_org('org.txt');
 41 | 
 42 | 
 43 | # parse file whog
 44 | my $whog = &parse_file_whog('whog');
 45 | 
 46 | # my @keys = keys %$whog;
 47 | # print "$_\n" unless $_ ~~ @keys for keys %$pro_gi;
 48 | # the result showed that not all protein_ids from file myva=gb
 49 | # are in file whog
 50 | 
 51 | # parse file myva and add annotation
 52 | my $file = 'myva';
 53 | my $out_file = "$file.full_annotation.txt";
 54 | my ($head, $seq, $pro_id_trim);
 55 | 
 56 | open IN, $file or die "File $file failed to open sequence.\n";
 57 | $/ = '>';<IN>;
 58 | open OUT, ">", $out_file or die "File $out_file failed to open sequence.\n";
 59 | 
 60 | while ( <IN> ) {
 61 |     s/\r?\n>//;
 62 |     ( $head, $seq ) = split /\r?\n/, $_, 2;
 63 |     ## > <protein_id>_<protein>_<function>_<Organism>_<GI numbers>
 64 |     $pro_id_trim = $head;
 65 |     $pro_id_trim = $1 if $head =~ /(.+)\_\d+/;    # for gi
 66 |     $head = $head
 67 |             . " __pro__". $$whog{$head}{protein}
 68 |             . "__fun_". $$whog{$head}{fun_id}. "_". $$fun{$$whog{$head}{fun_id}}
 69 |             . "__org__". $$org{$$whog{$head}{org_id}}{organism}
 70 |             . "__gi__".  $$pro_gi{$pro_id_trim};
 71 |     print OUT ">$head\n$seq\n";
 72 | }
 73 | close IN;
 74 | close OUT;
 75 | 
 76 | #====================================================================
 77 | # out put data structure:
 78 | # $hash_ref = {fun_id => function}
 79 | sub parse_file_fun($){
 80 |     my ($file) = @_;
 81 |     my $fun = {};
 82 |     
 83 |     open IN, $file or die "File $file failed to open\n";
 84 |     while (<IN>) {
 85 |         next unless /\[(\w)\] (.+) $/;
 86 |         $$fun{$1} = $2;
 87 |     }
 88 |     close IN;
 89 |     # print scalar keys %$fun;
 90 |     return $fun;
 91 | }
 92 | 
 93 | # out put data structure:
 94 | # $hash_ref = {protein_id => gi}
 95 | sub parse_file_myva_gb($){
 96 |     my ($file) = @_;
 97 |     my $pro_gi = {};
 98 |     
 99 |     open IN, $file or die "File $file failed to open\n";
100 |     while (<IN>) {
101 |         next unless /^(.+)\s+(.+)$/;
102 |         $$pro_gi{$1} = $2;
103 |     }
104 |     close IN;
105 |     # print scalar keys %$pro_gi;
106 |     return $pro_gi;
107 | }
108 | 
# Parse the organism table.
#
# Argument: path of the file to read.
# Returns:  hash ref {org_id => {kindom => ..., organism => ...}}; org_id
#           is the leading three word characters of a line, followed by a
#           numeric column (ignored), one token stored under "kindom" and
#           the rest of the line stored under "organism".
# Dies when the file cannot be opened.
sub parse_file_org($){
    my ($file) = @_;
    my $org = {};

    # read line by line whatever the caller has done to $/
    local $/ = "\n";

    # lexical handle and three-argument open instead of the global IN
    open my $in, '<', $file or die "File $file failed to open: $!\n";
    while ( my $line = <$in> ) {
        # strip the line ending first so a CR never ends up in the name
        $line =~ s/\r?\n\z//;
        next unless $line =~ /^(\w{3})\s+\d+\s+(.+?)\s+(.+)$/;
        $$org{$1}{kindom}   = $2;
        $$org{$1}{organism} = $3;
    }
    close $in;
    # print scalar keys %$org;
    return $org;
}
125 | 
# Parse the whog file into a per-protein annotation table.
#
# Argument: path of the file to read.
# Returns:  hash ref {protein_id => {org_id => org_id, cog_id => cog_id,
#                                    fun_id => fun_id, protein => protein} }
# Dies when the file cannot be opened.
#
# Three kinds of lines are recognised:
#   "[F] COGxxxx description"   starts a new COG; F may be several letters
#   "  Org:  id id ..."         protein ids of one organism
#   "        id id ..."         continuation of the previous organism line
# Every protein id receives the most recently seen COG and organism values.
sub parse_file_whog($){
    my ($file) = @_;
    my $whog = {};

    my ($fun_id, $cog_id, $protein, $org_id);

    # read line by line whatever the caller has done to $/
    local $/ = "\n";

    # lexical handle and three-argument open instead of the global IN
    open my $in, '<', $file or die "File $file failed to open: $!\n";
    while ( my $line = <$in> ) {
        # strip the line ending so a CR never ends up in a captured field
        $line =~ s/\r?\n\z//;

        my $ids;
        if ( $line =~ /^\[(\w+)\] (\w+) (.+)$/ ) {
            # \w+ rather than \w: a header with a multi-letter function id
            # used to be ignored, so the proteins after it inherited the
            # previous COG's annotation.
            ( $fun_id, $cog_id, $protein ) = ( $1, $2, $3 );
            next;
        }
        elsif ( $line =~ /^\s+(\w{3})\:\s+(.+)$/ ) {
            ( $org_id, $ids ) = ( $1, $2 );
        }
        elsif ( $line =~ /        (.+)/ ) {
            $ids = $1;
        }
        else {
            # separator lines ("_______") and anything else carry no ids
            next;
        }

        # split ' ' skips leading whitespace, so a deeply indented
        # continuation line cannot create an empty-string protein id
        for my $protein_id ( split ' ', $ids ) {
            $$whog{$protein_id}{fun_id}  = $fun_id;
            $$whog{$protein_id}{cog_id}  = $cog_id;
            $$whog{$protein_id}{protein} = $protein;
            $$whog{$protein_id}{org_id}  = $org_id;
        }
    }
    close $in;
    # print scalar keys %$whog;
    # print "$_\t". ($$whog{'PH0109_1'}{$_}). "\n" for keys %{$$whog{'PH0109_1'}};
    return $whog;
}
170 | 


--------------------------------------------------------------------------------
/file_formats/bam2gff.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # https://github.com/shenwei356/bio_scripts
 4 | 
 5 | import argparse
 6 | import sys
 7 | from collections import Counter, defaultdict
 8 | 
 9 | import pysam
10 | 
parser = argparse.ArgumentParser(
    description="bam2gff. Extracting the locations of properly mapping paired (single) ends to GFF format.",
    epilog="https://github.com/shenwei356/bio_scripts")

parser.add_argument('bamfile', type=str, help='bam file')
parser.add_argument('-c', '--cache-size', type=int, default=1000, help='cache size [1000]')
parser.add_argument('-m', '--match-proportion', type=float, default=0.75,
                    help='minimum match proportion to define properly paired ends [0.75]')
parser.add_argument('-se', '--single-end', action='store_true', help='single read mapping result')

parser.add_argument("-v", "--verbose", help='verbosely print information',
                    action="count", default=0)

args = parser.parse_args()


def _write_gff(ref, feature, start, end, strand, name):
    """Write one GFF line; `start` is 0-based and is printed 1-based."""
    columns = [ref, 'bam2gff.py', feature, str(start + 1), str(end), '.', strand, '.', name]
    sys.stdout.write('\t'.join(columns) + "\n")


# query name -> "<pos>_<pos>" key -> {'read1': {...}, 'read2': {...}}
pairs = defaultdict(lambda: defaultdict(dict))
stats = Counter()
samfile = pysam.AlignmentFile(args.bamfile, "rb")
for read in samfile.fetch():
    if args.single_end:
        aligned = read.reference_length
        # Reject reads with no aligned length or with too short an alignment.
        if not aligned or aligned < read.query_length * args.match_proportion:
            stats['bad match'] += 1
            continue
        ref = samfile.getrname(read.reference_id)
        strand = '-' if read.is_reverse else '+'
        _write_gff(ref, 'single_ends', read.reference_start, read.reference_end, strand,
                   read.query_name)
        continue

    if not read.is_proper_pair or read.is_secondary:
        continue

    if read.reference_length < read.query_length * args.match_proportion:
        stats['bad match'] += 1
        continue

    # Both mates build the same key from the sorted pair of start positions.
    positions = sorted([read.reference_start, read.next_reference_start])
    key = '_'.join(str(pos) for pos in positions)
    mates = pairs[read.query_name][key]
    mates['read1' if read.is_read1 else 'read2'] = {
        'start': read.reference_start,
        'end': read.reference_end,
        'ref': samfile.getrname(read.reference_id),
        'reverse': read.is_reverse,
    }

    if 'read1' not in mates or 'read2' not in mates:
        continue

    read1, read2 = mates['read1'], mates['read2']
    # The orientation of read1 decides the strand of the whole fragment.
    if read1['reverse']:
        strand, start, end = '-', read2['start'], read1['end']
    else:
        strand, start, end = '+', read1['start'], read2['end']

    _write_gff(read1['ref'], 'paired_ends', start, end, strand, read.query_name)

    stats['paired'] += 1

    del pairs[read.query_name][key]

samfile.close()

# Queries that still hold a key never saw their second mate.
for sites in pairs.values():
    if sites:
        stats['unpaired'] += 1

sys.stderr.write('{} summary: {}\n'.format(args.bamfile, stats))
79 | 


--------------------------------------------------------------------------------
/file_formats/extract_cds_from_glimmer_predict_result.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | # https://github.com/shenwei356/bio_scripts
 3 | 
 4 | use strict;
 5 | use File::Basename;
 6 | use BioUtil::Seq;
 7 | use BioUtil::Util;
 8 | 
 9 | $0 = basename($0);
10 | my $usage = qq(
11 |     usage: $0 <glimmer .predict file> <genome file> [gff]
12 | 
13 | );
14 | die $usage unless @ARGV == 2 or @ARGV == 3;
15 | my $prfile  = shift @ARGV;
16 | my $seqfile = shift @ARGV;
17 | my $gtf     = shift @ARGV;
18 | 
19 | my $genome = ( values %{ read_sequence_from_fasta_file($seqfile) } )[0];
20 | 
21 | my @data = ();
22 | my ( $genome, $name, $a, $b, $frame, $seq );
23 | open my $fh, '<', $prfile or die "fail to open file: $prfile\n";
24 | while (<$fh>) {
25 |     s/\r?\n//g;
26 |     $genome = $1 if /^>(.+)/;
27 |     @data = split /\s+/, $_;
28 |     next unless scalar(@data) == 5;
29 |     ( $name, $a, $b, $frame ) = @data;
30 |     next unless $a =~ /^\d+$/;
31 |     if ($a > $b) {
32 |         my $tmp = $a;
33 |         $a = $b;
34 |         $b = $tmp;
35 |     }
36 | 
37 |     if ( $gtf eq 'gff' ) {
38 |         my $strand = $frame > 0 ? '+' : '-';
39 |         printf "%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n",
40 |              $genome, 'glimmer', 'CDS', $a, $b, '.', $strand, '.', $name;
41 |     }
42 |     else {
43 |         if ( $frame > 0 ) {
44 |             $seq = substr( $genome, $a - 1, ( $b - $a + 1 ) );
45 |         }
46 |         else {
47 |             $seq = revcom( substr( $genome, $b - 1, ( $a - $b + 1 ) ) );
48 |         }
49 |         print ">${name}_${a}..${b}..$frame\n", format_seq($seq);
50 |     }
51 | 
52 | }
53 | close $fh;
54 | 


--------------------------------------------------------------------------------
/file_formats/extract_features_from_genbank_file.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # https://github.com/shenwei356/bio_scripts
  3 | 
  4 | import sys
  5 | import argparse
  6 | import gzip
  7 | 
  8 | from Bio import SeqIO
  9 | from Bio.Seq import Seq
 10 | from Bio.SeqRecord import SeqRecord
 11 | 
 12 | 
 13 | def parse_args():
 14 |     parser = argparse.ArgumentParser(
 15 |         description="Extract features from Genbank file",
 16 |         epilog="https://github.com/shenwei356/bio_scripts")
 17 | 
 18 |     parser.add_argument('gbkfile', type=str, help='Genbank file')
 19 |     parser.add_argument(
 20 |         '-t',
 21 |         '--type',
 22 |         type=str,
 23 |         default='CDS',
 24 |         help='Feature type (CDS tRNA). Multiple values should be separated by comma. "." for any types.')
 25 |     outfmt_choices = ['fasta', 'gtf', 'gff']
 26 |     parser.add_argument('-f',
 27 |                         '--outfmt',
 28 |                         type=str,
 29 |                         default='fasta',
 30 |                         help='Out format, fasta or gtf')
 31 | 
 32 |     parser.add_argument('-p',
 33 |                         '--peptide',
 34 |                         action="store_true",
 35 |                         help='Translate the nucleotides to peptides')
 36 |     parser.add_argument(
 37 |         '--table',
 38 |         type=int,
 39 |         help='Genetic code table (detail: http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi ) [1]')
 40 | 
 41 |     args = parser.parse_args()
 42 | 
 43 |     if args.outfmt not in outfmt_choices:
 44 |         sys.stderr.write('[ERROR] -f | --outfmt should be in {}\n'.format(outfmt_choices))
 45 |         sys.exit(1)
 46 | 
 47 |     if args.table:
 48 |         args.peptide = True
 49 | 
 50 |     return args
 51 | 
 52 | 
 53 | if __name__ == '__main__':
 54 |     args = parse_args()
 55 | 
 56 |     types = set(args.type.lower().split(','))
 57 |     with gzip.open(args.gbkfile) if args.gbkfile.endswith('.gz') else open(args.gbkfile) as fh:
 58 |         records = SeqIO.parse(fh, "genbank")
 59 |         for record in records:
 60 |             for f in record.features:
 61 |                 if '.' not in types and f.type.lower() not in types:
 62 |                     continue
 63 | 
 64 |                 start, end = f.location.start, f.location.end
 65 |                 strand = '+' if f.strand > 0 else '-'
 66 | 
 67 |                 qualifiers = f.qualifiers
 68 |                 if 'product' in qualifiers:
 69 |                     product = qualifiers['product'][0]
 70 |                 else:
 71 |                     product = ''
 72 | 
 73 |                 if 'note' in qualifiers:
 74 |                     note = qualifiers['note'][0]
 75 |                 else:
 76 |                     note = ''
 77 | 
 78 |                 if 'gene' in qualifiers:
 79 |                     gene_id = qualifiers['gene'][0]
 80 |                 elif 'locus_tag' in qualifiers:
 81 |                     gene_id = qualifiers['locus_tag'][0]
 82 |                 else:
 83 |                     gene_id = ''
 84 | 
 85 |                 if args.outfmt == 'fasta':
 86 |                     seq = None
 87 |                     if args.peptide:
 88 |                         if args.table:
 89 |                             transl_table = args.table
 90 |                         elif 'transl_table' in qualifiers:
 91 |                             transl_table = qualifiers['transl_table']
 92 |                         else:
 93 |                             sys.stderr.write('[WARNING] neither translate table given or found in features. set 1\n')
 94 |                             transl_table = 1
 95 | 
 96 |                         if 'translation' in qualifiers:
 97 |                             seq = Seq(qualifiers['translation'][0])
 98 |                         else:
 99 |                             seq = record.seq[start:end].translate(table=transl_table)
100 |                     else:
101 |                         seq = record.seq[start:end]
102 | 
103 |                     SeqIO.write(
104 |                         [SeqRecord(seq,
105 |                                    id='{}_{}..{}..{}'.format(record.id, start + 1, end, strand), description=product)],
106 |                         sys.stdout,
107 |                         "fasta")
108 | 
109 |                 elif args.outfmt == 'gtf':
110 |                     frame = int(qualifiers['codon_start'][0]) - 1 if 'codon_start' in qualifiers else 0
111 | 
112 |                     transcript_id = gene_id
113 | 
114 |                     attribute = 'gene_id "{}"; transcript_id "{}"'.format(gene_id, transcript_id)
115 | 
116 |                     if 'protein_id' in f.qualifiers:
117 |                         attribute += '; protein_id "{}"'.format(qualifiers['protein_id'][0])
118 | 
119 |                     if 'db_xref' in qualifiers:
120 |                         for ext in qualifiers['db_xref']:
121 |                             attribute += '; db_xref "{}"'.format(ext)
122 | 
123 |                     if 'note' in f.qualifiers:
124 |                         attribute += '; note "{}"'.format(qualifiers['note'][0])
125 | 
126 |                     attribute += '; product "{}"; '.format(product)
127 | 
128 |                     sys.stdout.write('\t'.join(
129 |                         [record.id, 'genbank', f.type, str(start + 1), str(end), '.', strand, str(frame), attribute]) + "\n")
130 | 
131 |                 elif args.outfmt == 'gff':
132 |                     if 'codon_start' in qualifiers:
133 |                         frame = int(qualifiers['codon_start'][0]) - 1
134 |                     else:
135 |                         frame = 0
136 |                     sys.stdout.write('\t'.join(
137 |                         [record.id, 'genbank', f.type, str(start + 1),
138 |                          str(end), '.', strand, str(frame),
139 |                          "{},{}".format(gene_id, product)]) + "\n")
140 | 


--------------------------------------------------------------------------------
/file_formats/extract_sequence_from_genbank_file.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | # Author      : Wei Shen
 3 | # Email       : shenwei356@gmail.com
 4 | # Date        : 2011-07-20
 5 | # Last Update : 2011-07-20
 6 | use strict;
 7 | use File::Basename;
 8 | 
 9 | $0 = basename($0);
10 | die "Usage: $0 gb_file\n" unless @ARGV == 1;
11 | my $file = shift;
12 | 
13 | my ($definition, $version, $gi, $seq);
14 | 
15 | open IN, $file or die "failed to open file: $file\n";
16 | $/ = "\n//";
17 | while (<IN>) {
18 |     next unless /DEFINITION  (.+)\./;
19 |     $definition = $1;
20 |     #print "$definition\n";
21 |     next unless /VERSION     (.+)  GI\:(.+)\r?\n/;
22 |     $version = $1;
23 |     $gi      = $2;
24 |     #print "$version, $gi\n";
25 |     $seq     = substr($_, index($_, 'ORIGIN') + 6);
26 |     $seq     =~ s/\/\/.*//s;
27 |     $seq     =~ s/\s+//g;
28 |     $seq     =~ s/\d+//g;
29 |     #print "$seq\n";
30 |     #print length($seq),"\n";
31 |     print ">gi|$gi|gb|$version| $definition\n".(format_seq($seq, 60))."\n";
32 | }
33 | $/ = "\n";
34 | close IN;
35 | 
36 | 
# Wrap sequence $s into lines of at most $n characters.
#
# Returns the lines joined by "\n" with no trailing newline; the caller adds
# the final one. (Previously a sequence whose length was an exact multiple of
# $n came back with a trailing "\n", which produced an empty line in the
# output.) Dies if $n is smaller than 1.
sub format_seq($$){
    my ($s, $n) = @_;
    $n = int $n;
    die "format_seq: line width must be >= 1\n" if $n < 1;

    # Successive chunks of up to $n characters; /s lets "." match anything.
    my @lines = $s =~ /.{1,$n}/gs;
    return join "\n", @lines;
}
48 | 


--------------------------------------------------------------------------------
/file_formats/genbank_filter.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from __future__ import print_function
 3 | import re
 4 | import sys
 5 | import argparse
 6 | 
 7 | parser = argparse.ArgumentParser(
 8 |     description='filter gene records by regular expression from genbank file',
 9 |     epilog="https://github.com/shenwei356/bio_scripts")
10 | parser.add_argument('gbfile', type=str, help='genbank file')
11 | parser.add_argument('pattern',
12 |                     type=str,
13 |                     help='pattern (regular expression) [.]')
14 | args = parser.parse_args()
15 | 
16 | with open(args.gbfile) as fh:
17 |     tmp = ''
18 |     for line in fh:
19 |         if line.startswith('     gene '):
20 |             if tmp == '':
21 |                 tmp = line
22 |             else:
23 |                 if re.search(args.pattern, tmp):
24 |                     sys.stdout.write(tmp)
25 |                 tmp = line
26 |         elif line != '':
27 |             tmp += line
28 | if re.search(args.pattern, tmp):
29 |     sys.stdout.write(tmp)
30 | 


--------------------------------------------------------------------------------
/file_formats/gff2fa.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # https://github.com/shenwei356/bio_scripts
  4 | 
  5 | from __future__ import print_function
  6 | 
  7 | import argparse
  8 | import gzip
  9 | import sys
 10 | from collections import defaultdict
 11 | 
 12 | from Bio import SeqIO
 13 | from Bio.SeqRecord import SeqRecord
 14 | 
 15 | parser = argparse.ArgumentParser(description="extract_cds_by_gff")
 16 | parser.add_argument('-t',
 17 |                     '--type',
 18 |                     type=str,
 19 |                     default='CDS',
 20 |                     help='gene type. "." for any types. [CDS]')
 21 | parser.add_argument('-us',
 22 |                     '--up-stream',
 23 |                     type=int,
 24 |                     default=0,
 25 |                     help='up stream length [0]')
 26 | parser.add_argument('-ds',
 27 |                     '--down-stream',
 28 |                     type=int,
 29 |                     default=0,
 30 |                     help='down stream length [0]')
 31 | parser.add_argument('-j',
 32 |                     '--just',
 33 |                     action="store_true",
 34 |                     help='only output up and down stream')
 35 | parser.add_argument('gff_file', type=str, help='gff file')
 36 | parser.add_argument('fasta_file', type=str, help='fasta file')
 37 | args = parser.parse_args()
 38 | if not (args.up_stream >= 0 and args.down_stream >= 0):
 39 |     print('value of --up-stream and --down-stream should be >= 0',
 40 |           file=sys.stderr)
 41 |     sys.exit(1)
 42 | if args.just:
 43 |     if args.up_stream and args.down_stream or not (args.up_stream or
 44 |                                                        args.down_stream):
 45 |         print(
 46 |             'when using option --just, ONE of --up-stream and --down-stream should given',
 47 |             file=sys.stderr)
 48 |         sys.exit(1)
 49 | 
 50 | 
 51 | def read_gff_file(file):
 52 |     genes = defaultdict(list)
 53 |     with open(file, 'rt') as fh:
 54 |         for row in fh:
 55 |             data = row.strip().split('\t')
 56 |             if len(data) < 9:
 57 |                 continue
 58 |             name = data[0]
 59 |             gene = dict()
 60 |             gene['type'], gene['start'], gene['end'], gene['strand'], gene[
 61 |                 'product'
 62 |             ] = data[2], int(data[3]), int(
 63 |                 data[4]), data[6], data[8]
 64 |             genes[name].append(gene)
 65 | 
 66 |     return genes
 67 | 
 68 | 
 69 | genes = read_gff_file(args.gff_file)
 70 | 
 71 | fh = gzip.open(args.fasta_file,
 72 |                'rt') if args.fasta_file.endswith('.gz') else open(
 73 |                    args.fasta_file, 'r')
 74 | for record in SeqIO.parse(fh, 'fasta'):
 75 |     name, genome = record.id, record.seq
 76 |     genomesize = len(genome)
 77 |     if name not in genes:
 78 |         continue
 79 | 
 80 |     for gene in genes[name]:
 81 |         if args.type != '.' and gene['type'].lower() != args.type.lower():
 82 |             continue
 83 |         seq = ''
 84 |         flag = ''
 85 |         if gene['strand'] == '+':
 86 |             if args.just:
 87 |                 if args.up_stream:
 88 |                     s = gene['start'] - args.up_stream - 1
 89 |                     e = gene['start'] - 1
 90 |                     flag = 'jus..{}'.format(args.up_stream)
 91 |                 else:
 92 |                     s = gene['end']
 93 |                     e = gene['end'] + args.down_stream
 94 |                     flag = 'jds..{}'.format(args.down_stream)
 95 |             else:
 96 |                 s = gene['start'] - args.up_stream - 1
 97 |                 s = 0 if s < 0 else s
 98 |                 e = gene['end'] + args.down_stream
 99 |                 if args.up_stream:
100 |                     flag = 'us..{}'.format(args.up_stream)
101 |                 else:
102 |                     flag = 'ds..{}'.format(args.down_stream)
103 | 
104 |             s = 0 if s < 0 else s
105 |             end = genomesize - 1 if e > genomesize - 1 else e
106 |             seq = genome[s:e]
107 |         else:
108 |             if args.just:
109 |                 if args.up_stream:
110 |                     s = gene['end']
111 |                     e = gene['end'] + args.up_stream
112 |                     flag = 'jus..{}'.format(args.up_stream)
113 |                 else:
114 |                     s = gene['start'] - args.down_stream - 1
115 |                     e = gene['start'] - 1
116 |                     flag = 'jds..{}'.format(args.down_stream)
117 |             else:
118 |                 s = gene['start'] - args.down_stream - 1
119 |                 s = 0 if s < 0 else s
120 |                 e = gene['end'] + args.up_stream
121 |                 if args.up_stream:
122 |                     flag = 'us..{}'.format(args.up_stream)
123 |                 else:
124 |                     flag = 'ds..{}'.format(args.down_stream)
125 | 
126 |             s = 0 if s < 0 else s
127 |             end = genomesize - 1 if e > genomesize - 1 else e
128 |             seq = genome[s:e].reverse_complement()
129 | 
130 |         if args.up_stream or args.down_stream:
131 |             id = '{}_{}..{}..{}_{}'.format(name, gene['start'], gene['end'],
132 |                                            gene['strand'], flag)
133 |         else:
134 |             id = '{}_{}..{}..{}'.format(name, gene['start'], gene['end'],
135 |                                         gene['strand'])
136 |         SeqIO.write(
137 |             SeqRecord(seq,
138 |                       id=id,
139 |                       description=gene['product']),
140 |             sys.stdout,
141 |             'fasta')
142 | fh.close()
143 | 


--------------------------------------------------------------------------------
/file_formats/gff_frame_start_coverage.plot.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | library(dplyr)
 3 | library(ggplot2)
 4 | library(reshape2)
 5 | 
 6 | args <- commandArgs(TRUE)
 7 | if (length(args) != 3) {
 8 |   write("\nusage: gff_frame_start_coverage.plot.R infile out.png title\n", stderr())
 9 |   quit(status = 1)
10 | }
11 | 
12 | df <- read.csv(args[1], sep = "\t")
13 | windows <- df['end'] - df['start']
14 | window <- windows[1,1] + 1
15 | 
16 | if (window == 1000) {
17 |   ylabel = paste("Counts/", 1, "kb", sep='')
18 | } else if (window > 1000) {
19 |   ylabel = paste("Counts/", window/1000, "kb", sep='')
20 | } else {
21 |   ylabel = paste("Counts/", window, "bp", sep='')
22 | }
23 | 
24 | df <- select(df, X.chr, strand, cnt_f0, cnt_f1, cnt_f2)
25 | 
26 | df_m <- melt(df, id.vars = c("X.chr", "strand"))
27 | 
28 | p <- ggplot(df_m, aes(variable, value, fill = strand)) +
29 |   geom_violin(adjust=1, position = position_dodge(width = 0.75)) +
30 |   scale_x_discrete(labels = c('0','1','2')) +
31 |   xlab('Frame') +
32 |   ylab(ylabel) +
33 |   ggtitle(args[3]) + 
34 |   facet_grid(. ~ X.chr) +
35 |   theme_bw() +
36 |   theme(
37 |     panel.border = element_blank(),
38 |     panel.grid.major = element_blank(),
39 |     panel.grid.minor = element_blank(),
40 |     axis.line = element_line(colour = "black"),
41 |     legend.key = element_blank(),
42 |     strip.background = element_rect(
43 |       colour = "white", fill = "white",
44 |       size = 0.2
45 |     )
46 |   )
47 | 
48 | ggsave(p, file = args[2], width = 8, height = 4)
49 | 


--------------------------------------------------------------------------------
/file_formats/gff_frame_start_coverage.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # https://github.com/shenwei356/bio_scripts
  4 | # Author     : Wei Shen
  5 | # Contact    : shenwei356@gmail.com
  6 | # LastUpdate : 2015-07-17
  7 | 
  8 | from __future__ import division, print_function
  9 | 
 10 | import argparse
 11 | import gzip
 12 | import os
 13 | import pickle
 14 | import sys
 15 | from collections import Counter, defaultdict
 16 | 
 17 | import numpy as np
 18 | 
 19 | parser = argparse.ArgumentParser(description="gff frame start coverage",
 20 |                                  epilog="https://github.com/shenwei356/bio_scripts")
 21 | 
 22 | parser.add_argument('genome_size_file', type=str, help='genome size file. two fields (chr and size) per line. ')
 23 | parser.add_argument('gff_file', type=str, help='gff/gtf file')
 24 | parser.add_argument('-w', '--window', type=int, default=1000, help='windows size [1000]')
 25 | parser.add_argument('-s', '--step', type=int, default=30, help='step size [30]')
 26 | 
 27 | args = parser.parse_args()
 28 | 
 29 | # read genome size file
 30 | sys.stderr.write('read genome size\n')
 31 | genomesizes = defaultdict(int)
 32 | with gzip.open(args.genome_size_file) if args.genome_size_file.endswith('.gz') else open(args.genome_size_file) as fh:
 33 |     for line in fh:
 34 |         if line.isspace() or line[0] == '#':
 35 |             continue
 36 |         data = line.rstrip().split()
 37 |         if len(data) < 2:
 38 |             sys.stderr.write('number of columns < 2! {}'.format(line))
 39 |             continue
 40 |         chr, size = data[0], data[1]
 41 |         genomesizes[chr] = int(size)
 42 | 
 43 | # read gff file
 44 | sys.stderr.write('read gff file\n')
 45 | coverages = defaultdict(dict)
 46 | file_cov_pickle = '{}.cov.pickle'.format(args.gff_file)
 47 | if not (os.path.exists(file_cov_pickle) and os.path.getsize(file_cov_pickle) > 0):
 48 |     with gzip.open(args.gff_file) if args.gff_file.endswith('.gz') else open(args.gff_file) as fh:
 49 |         chr = ''
 50 |         for line in fh:
 51 |             if line.isspace() or line[0] == '#':
 52 |                 continue
 53 | 
 54 |             data = line.rstrip().split('\t')
 55 |             if len(data) != 9:
 56 |                 sys.stderr.write('number of columns != 9: {}'.format(line))
 57 | 
 58 |             g, start, end, strand = data[0], int(data[3]), int(data[4]), data[6]
 59 |             if g != chr:
 60 |                 chr = g
 61 |                 coverages[chr]['+'] = np.zeros(genomesizes[chr] + 1, dtype=np.uint32)
 62 |                 coverages[chr]['-'] = np.zeros(genomesizes[chr] + 1, dtype=np.uint32)
 63 |                 sys.stderr.write('read chr {}\n'.format(chr))
 64 |             if strand == '+':
 65 |                 coverages[chr][strand][start] += 1
 66 |                 # print(chr, strand, start)
 67 |             else:
 68 |                 coverages[chr][strand][end] += 1
 69 |     with open(file_cov_pickle, 'wb') as fh:
 70 |         pickle.dump(coverages, fh, pickle.HIGHEST_PROTOCOL)
 71 | else:
 72 |     with open(file_cov_pickle, 'rb') as fh:
 73 |         coverages = pickle.load(fh)
 74 | 
 75 | 
 76 | def mean_coverage(data):
 77 |     return round(sum((c for j, c in data)) / len(data), 2) if len(data) > 0 else 0
 78 | 
 79 | # counting
 80 | sys.stderr.write('statistics...\n')
 81 | sys.stdout.write('#{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format('chr', 'strand', 'start', 'end', 'cnt',
 82 |                                                                         'cnt_f0', 'cnt_f1', 'cnt_f2',
 83 |                                                                         'a_cov_f0', 'a_cov_f1', 'a_cov_f2'))
 84 | for chr in sorted(coverages.keys()):
 85 |     for strand in ['+', '-']:
 86 |         coverage = coverages[chr][strand]
 87 |         # for i, c in enumerate(coverage):
 88 |         #     sys.stdout.write('{}\t{}\t{}\t{}\n'.format(chr, strand, i, c))
 89 |         _end = genomesizes[chr] - args.window + 1 if genomesizes[chr] > args.window else 1
 90 |         # print(chr, strand, genomesizes[chr], _end)
 91 |         for i in np.arange(1, _end + 1, args.step, dtype=np.uint32):
 92 |             data = [(j, coverage[j]) for j in np.arange(i, i + args.window) if coverage[j] > 0]
 93 |             data_f0 = [(j, c) for j, c in data if j % 3 == 1]
 94 |             data_f1 = [(j, c) for j, c in data if j % 3 == 2]
 95 |             data_f2 = [(j, c) for j, c in data if j % 3 == 0]
 96 |             sys.stdout.write(
 97 |                 '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(chr, strand, i, i + args.window - 1, len(data),
 98 |                                                                       len(data_f0), len(data_f1), len(data_f2),
 99 |                                                                       mean_coverage(data_f0), mean_coverage(data_f1),
100 |                                                                       mean_coverage(data_f2)))
101 | 


--------------------------------------------------------------------------------
/file_formats/gff_intersect.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # https://github.com/shenwei356/bio_scripts
  4 | # Author     : Wei Shen
  5 | # Contact    : shenwei356@gmail.com
  6 | # LastUpdate : 2015-06-26
  7 | 
  8 | from __future__ import print_function, division
  9 | import argparse
 10 | import os
 11 | import shutil
 12 | import sys
 13 | import gzip
 14 | from collections import defaultdict, Counter
 15 | from bx.intervals.intersection import Intersecter, Interval
 16 | 
 17 | parser = argparse.ArgumentParser(description="gff intersect",
 18 |                                  epilog="https://github.com/shenwei356/bio_scripts")
 19 | 
 20 | parser.add_argument('query', type=str, help='query gff file')
 21 | parser.add_argument('subject', type=str, help='subject gff file')
 22 | parser.add_argument('-e', '--embeded', action='store_true',
 23 |                     help='see what genes (query) contained in specific regions (subject)')
 24 | parser.add_argument('-c', '--cover', action='store_true',
 25 |                     help='see what genes (query) containing specific regions (subject)')
 26 | parser.add_argument('-s', '--split', action='store_true',
 27 |                     help='split results into multiple files')
 28 | parser.add_argument('-o', '--split-dir', type=str,
 29 |                     help='directory for split results')
 30 | parser.add_argument('-eu', '--extend-upstream', type=int, default=0,
 31 |                     help='extend N bases in the upstream [0]')
 32 | parser.add_argument('-ed', '--extend-downstream', type=int, default=0,
 33 |                     help='extend N bases in the downstream [0]')
 34 | 
 35 | args = parser.parse_args()
 36 | 
 37 | if args.extend_upstream and args.extend_upstream <= 0:
 38 |     sys.stderr.write('value of option --extend-upstream should be greater than 0\n')
 39 |     sys.exit(1)
 40 | 
 41 | if args.extend_downstream and args.extend_downstream <= 0:
 42 |     sys.stderr.write('value of option --extend-downstream should be greater than 0\n')
 43 |     sys.exit(1)
 44 | 
 45 | if args.cover and args.embeded:
 46 |     sys.stderr.write('only one of option -e/--embeded and -c/--cover allowed\n')
 47 |     sys.exit(1)
 48 | 
 49 | sys.stderr.write('building tree from {}\n'.format(args.subject))
 50 | trees = dict()
 51 | with gzip.open(args.subject) if args.subject.endswith('.gz') else open(args.subject) as fh:
 52 |     genome = ''
 53 |     for line in fh:
 54 |         if line.isspace() or line[0] == '#':
 55 |             continue
 56 | 
 57 |         data = line.rstrip().split('\t')
 58 |         if len(data) != 9:
 59 |             sys.stderr.write('number of columns != 9: {}'.format(line))
 60 | 
 61 |         g, start, end, strand = data[0], int(data[3]), int(data[4]), data[6]
 62 |         if g != genome:
 63 |             genome = g
 64 |             trees[genome] = Intersecter()
 65 | 
 66 |         if strand == '+':
 67 |             start -= args.extend_upstream
 68 |             end += args.extend_downstream
 69 |         else:
 70 |             start -= args.extend_downstream
 71 |             end += args.extend_upstream
 72 | 
 73 |         if not args.embeded and strand == '-':  # complement strand
 74 |             start, end = -end, -start
 75 |         trees[genome].add_interval(Interval(start, end, value=data))
 76 | 
 77 | if args.split:
 78 |     if args.split_dir is None:
 79 |         outdir = '{}.intersect@{}'.format(os.path.normpath(os.path.basename(args.query)),
 80 |                                       os.path.normpath(os.path.basename(args.subject)))
 81 |     else:
 82 |         outdir = args.split_dir
 83 | 
 84 |     if os.path.exists(outdir):
 85 |         shutil.rmtree(outdir)
 86 |     os.makedirs(outdir)
 87 | 
 88 | sys.stderr.write('querying\n')
 89 | with gzip.open(args.query) if args.query.endswith('.gz') else open(args.query) as fh:
 90 |     for line in fh:
 91 |         if line.isspace() or line[0] == '#':
 92 |             continue
 93 |         data = line.rstrip().split('\t')
 94 |         if len(data) != 9:
 95 |             sys.stderr.write('number of columns != 9: {}'.format(line))
 96 | 
 97 |         genome, start, end, strand, product = data[0], int(data[3]), int(data[4]), data[6], data[8]
 98 | 
 99 |         if genome not in trees:
100 |             continue
101 | 
102 |         overlaps = trees[genome].find(start, end)
103 |         if len(overlaps) == 0:
104 |             continue
105 | 
106 |         overlap_data, stats = list(), Counter()
107 |         for x in overlaps:
108 |             s, e = x.start, x.end
109 |             if args.embeded:
110 |                 strand2 = '.'
111 |             elif s > 0:
112 |                 strand2 = '+'
113 |             else:  # complement strand
114 |                 s, e = -x.end, -x.start
115 |                 strand2 = '-'
116 | 
117 |             overlap, t = 0, ''
118 |             if s <= start:
119 |                 if e >= end:
120 |                     #   start ======== end
121 |                     #     s ------------- e
122 |                     overlap = end - start + 1
123 |                     t = 'embed'
124 |                     if args.cover:
125 |                         continue
126 |                 else:
127 |                     #  start ======== end
128 |                     #   s ------ e
129 |                     if args.embeded or args.cover:
130 |                         continue
131 |                     overlap = e - start + 1
132 |                     t = 'overlap.downstream' if strand == '+' else 'overlap.upstream'
133 |             else:
134 |                 if e >= end:
135 |                     #   start ======== end
136 |                     #           s ------ e
137 |                     if args.embeded or args.cover:
138 |                         continue
139 |                     overlap = end - s + 1
140 |                     t = 'overlap.upstream' if strand == '+' else 'overlap.downstream'
141 |                 else:
142 |                     #   start ======== end
143 |                     #          s --- e
144 |                     if args.embeded:
145 |                         continue
146 |                     overlap = e - s + 1
147 |                     t = 'cover'
148 | 
149 |             if args.embeded or args.cover:
150 |                 frame = '.'
151 |             elif strand == '+':
152 |                 frame = abs(s - start) % 3
153 |             else:
154 |                 frame = abs(e - end) % 3
155 | 
156 |             stats[t] += 1
157 |             if args.embeded or args.cover:
158 |                 overlap_data.append(x.value)
159 |             else:
160 |                 overlap_data.append([str(i) for i in
161 |                                      [data[0], s, e, strand2, overlap, round(100 * overlap / (end - start + 1), 1), t, frame,
162 |                                       x.value[-1]]])
163 |         if len(overlap_data) == 0:
164 |             continue
165 | 
166 |         if args.split:
167 |             fh_out = open(os.path.join(outdir, '{}_{}..{}..{}_{}.gff'.format(genome,
168 |                         start, end, strand, product.replace('/', '_').replace('"', ''))), 'wt')
169 |             fh_out.write('# {}'.format(line))
170 |         else:
171 |             fh_out = sys.stdout
172 |             fh_out.write('>{}'.format(line))
173 | 
174 |         if args.embeded or args.cover:
175 |             sorted_overlap_data = sorted(overlap_data, key=lambda o: (o[0], o[1]))
176 |         else:
177 |             fh_out.write('# summary: {}\n'.format(stats))
178 |             fh_out.write(
179 |                 '\t'.join(['chr', 'start', 'end', 'strand', 'overlap', 'overlap%', 'type', 'frame', 'attribute']) + '\n')
180 |             sorted_overlap_data = sorted(overlap_data, key=lambda o: (o[6], o[7], -float(o[5])))
181 | 
182 |         for overlap in sorted_overlap_data:
183 |             fh_out.write('\t'.join(overlap) + '\n')
184 | 
185 |         if args.split:
186 |             fh_out.close()
187 | 


--------------------------------------------------------------------------------
/for_education/Parsing grouped data in multi-line.pl:
--------------------------------------------------------------------------------
 1 | # https://github.com/shenwei356
 2 | # 
# This script illustrates how to parse grouped data that spans multiple lines,
# as shown below. The string in the first column is the group ID, and a group
# may have more than one record, one per line.
 6 | # 
 7 | #     g1 2 3
 8 | #     g1 2 5
 9 | #     g2 2 3
10 | #     g2 2 5
11 | #     g3 2 3
12 | #     g3 2 5
13 | #
14 | # Outline
15 | # 
# A flag “last_id” is used to tell first / same / new group apart (see code below).
17 | # 
18 | # For different situation,
19 | # 
#     1. First record. Initialize the container for the current group ( id ),
#        and add this record to it. last_id = id .
#     2. Same group. Add this record to the container of the current group ( id ).
#     3. New group. Do something with the previous group ( last_id ). Initialize
#        the container for the current group ( id ), and add this record to it.
#        last_id = id .
#     4. Last group. Do something with the last group ( last_id ) at the end
#        of file (EOF).
26 | #
27 | # Extension
28 | #
29 | # In previous case, the marker for a new record is a different id. In other cases,
30 | # parsing fasta file for example, the marker is the character “>”.
31 | #
32 | use strict;
33 | 
my $data    = {};    # group id => array ref holding that group's records
my $id      = "";    # group id parsed from the current line
my $last_id = "";    # group id of the previous record; "" until one was seen
my $record  = "";

while ( defined( my $line = <DATA> ) ) {

    # The group id is everything before the first run of whitespace;
    # lines that do not have such an id are skipped.
    next unless $line =~ /^(.+?)\s+/;
    $id     = $1;
    $record = $line;    # here the record is simply the whole line

    # Same group as the previous record: just collect the record.
    if ( $last_id ne "" and $id eq $last_id ) {
        push @{ $$data{$id} }, $record;
        next;
    }

    # A different id means the previous group (if there was one) is complete.
    &dosomthing( $$data{$last_id} ) if $last_id ne "";

    # Start a fresh container for this group, holding the current record.
    $$data{$id} = [$record];
    $last_id = $id;
}

# No "new group" line follows the last group, so it is handled after the loop.
&dosomthing( $$data{$id} );
69 | 
# Print all records of one group, each prefixed with a single space,
# then terminate the group with a newline.
#   $group: array reference holding the records of the group
sub dosomthing {
    my $group = shift;
    print " $_" foreach @$group;
    print "\n";
}
77 | 
78 | # example data
79 | __DATA__
80 | g1 2 3
81 | g1 2 5
82 | g2 2 3
83 | g2 2 5
84 | g3 2 3
85 | g3 2 5


--------------------------------------------------------------------------------
/for_education/extract_cds_by_gff.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | use strict;
  4 | use File::Basename;
  5 | use Getopt::Long;
  6 | use BioUtil::Seq;
  7 | use BioUtil::Util;
  8 | use Data::Dumper;
  9 | 
 10 | $0 = basename($0);
 11 | my $usage = qq(
 12 | Usage: $0 [options] gff_file fasta_file
 13 | Options:
 14 |     -t,   --type         gene type (CDS or mRNA) [CDS]
 15 |     -us,  --up-stream    up stream length [0]
 16 |     -ds,  --down-stream  down stream length [0]
 17 |     -h,   --help         show this usage
 18 | 
 19 | );
 20 | 
 21 | my $argv = {};
 22 | $$argv{type}        = 'CDS';
 23 | $$argv{up_stream}   = 0;
 24 | $$argv{down_stream} = 0;
 25 | 
 26 | GetOptions(
 27 |     'help|h'           => \$$argv{help},
 28 |     'type|t=s'         => \$$argv{type},
 29 |     'up-stream|us=s'   => \$$argv{up_stream},
 30 |     'down-stream|ds=s' => \$$argv{down_stream},
 31 | );
 32 | 
 33 | die $usage if $$argv{help};
 34 | die $usage if scalar(@ARGV) != 2;
 35 | 
 36 | check_positive_integer( $$argv{up_stream} + 1 );
 37 | check_positive_integer( $$argv{down_stream} + 1 );
 38 | 
 39 | my ( $gff_file, $fasta_file ) = @ARGV;
 40 | 
 41 | my $genes = read_gff_file($gff_file);
 42 | 
 43 | # print Dumper($genes);
 44 | 
 45 | my $next_seq = FastaReader($fasta_file);
 46 | while ( my $fa = &$next_seq() ) {
 47 |     my ( $name, $genome ) = @$fa;
 48 |     next if not exists $$genes{$name};
 49 | 
 50 |     for my $gene ( @{ $$genes{$name} } ) {
 51 |         next if lc $$gene{type} ne lc $$argv{type};    # specific type
 52 |         my $seq = '';
 53 | 
 54 |         if ( $$gene{strand} eq '+' ) {
 55 |             my $s = $$gene{start} - $$argv{up_stream} - 1;
 56 |             $s = 0 if $s < 0;
 57 |             $seq = substr(
 58 |                 $genome, $s,
 59 |                 $$gene{end}
 60 |                     - $$gene{start}
 61 |                     + $$argv{down_stream} + 1
 62 | 
 63 |             );
 64 |         }
 65 |         else {
 66 |             my $s = $$gene{start} - $$argv{down_stream} - 1;
 67 |             $s = 0 if $s < 0;
 68 |             $seq = revcom(
 69 |                 substr(
 70 |                     $genome, $s,
 71 |                     $$gene{end} - $$gene{start} + $$argv{up_stream} + 1
 72 |                 )
 73 |             );
 74 |         }
 75 |         printf( ">%s_%d..%d..%s\n%s",
 76 |             $name, $$gene{start}, $$gene{end}, $$gene{strand},
 77 |             format_seq($seq) );
 78 |     }
 79 | 
 80 | }
 81 | 
 82 | sub read_gff_file {
 83 |     my ($file) = @_;
 84 |     my $genes = {};
 85 |     open( my $fh, "<", $file ) or die "fail to open file: $file\n";
 86 |     while (<$fh>) {
 87 |         my @data = split( /\s+/, $_ );
 88 |         next unless scalar(@data) >= 9;
 89 |         my $name = $data[0];
 90 |         my $gene = {};
 91 |         ( $$gene{type}, $$gene{start}, $$gene{end}, $$gene{strand} )
 92 |             = ( $data[2], $data[3], $data[4], $data[6] );
 93 |         if ( not exists $$genes{$name} ) {
 94 |             $$genes{$name} = [];
 95 |         }
 96 |         push @{ $$genes{$name} }, $gene;
 97 | 
 98 |     }
 99 |     close($fh);
100 |     return $genes;
101 | }
102 | 
103 | 


--------------------------------------------------------------------------------
/for_education/fasta_common_seqs2.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | # Copyright 2013 Wei Shen (shenwei356#gmail.com). All rights reserved.
  3 | # Use of this source code is governed by a MIT-license
  4 | # that can be found in the LICENSE file.
  5 | use File::Basename;
  6 | use Getopt::Long;
  7 | use Digest::MD5;
  8 | use strict;
  9 | 
 10 | $0 = basename($0);
 11 | my $usage = <<"USAGE";
 12 | ===============================================================================
 13 | Function: Find common sequences in fasta files. Version 2.
 14 |           Features:
 15 |               1) Comparing by name or sequence are both supported.
 16 |               2) No files number limit.
 17 |               3) Very low RAM usage. (Lower than Version 1).
 18 |           Note that:
 19 |               1) Records with different names may have same sequences.
 20 |               2) Case of sequence letters or name may be different.
 21 |               3) Duplicated records may exist in a fasta file.
 22 | Contact : Wei Shen <shenwei356#gmail.com>
 23 | Date    : 2013-12-05
 24 | Update  : 2014-08-14
 25 | Site    : https://github.com/shenwei356/bio_scripts
 26 | 
 27 | Usage   : $0 [-s] [-i] fastafile fastafile2 [fastafile3 ...]
 28 | Options :
 29 |    -s Comparing by sequence.
 30 |    -i Ignore case.
 31 | ===============================================================================
 32 | 
 33 | USAGE
 34 | 
 35 | our $by_seq      = 0;
 36 | our $ignore_case = 0;
 37 | GetOptions(
 38 |     "s" => \$by_seq,
 39 |     "i" => \$ignore_case,
 40 | ) or die $usage;
 41 | 
 42 | # at least two files;
 43 | die "$usage\n>= 2 sequence file needed.\n" unless @ARGV >= 2;
 44 | 
 45 | our $counts = {};
 46 | our $names  = {};
 47 | 
 48 | our ( $file, $has_head, $last_head, $head, $head0, $seq_len, $seq_md5 );
 49 | our $md5;
 50 | $md5 = Digest::MD5->new if $by_seq;
 51 | 
 52 | # check files
 53 | for $file (@ARGV) {
 54 |     die "File ($file) does not exists.\n" unless -e $file;
 55 | }
 56 | 
 57 | for $file (@ARGV) {
 58 |     open IN, "<", $file
 59 |         or die "Fail to open file: $file!\n";
 60 | 
 61 |     $has_head = 0;
 62 |     $seq_len  = 0;
 63 |     $md5->reset if $by_seq;
 64 | 
 65 |     while (<IN>) {
 66 |         s/\r?\n//;
 67 |         if (/^\s*>/) {    # fasta head
 68 |             s/>\s*//;
 69 |             s/\s+$//;
 70 | 
 71 |             recording();
 72 | 
 73 |             $seq_len  = 0;
 74 |             $has_head = 1;
 75 |         }
 76 |         elsif ( $has_head == 1 ) {    # sequence          
 77 |             next if $_ eq "";
 78 | 
 79 |             $seq_len += length $_;
 80 | 
 81 |             next unless $by_seq;
 82 |             tr/A-Z/a-z/ if $ignore_case;
 83 |             $md5->add($_);
 84 |         }
 85 |     }
 86 |     close IN;
 87 | 
 88 |     # do not forget the last record
 89 |     recording() if $seq_len > 0;
 90 | }
 91 | 
 92 | sub recording {
 93 |     $head0     = $last_head;    # orgin sequence name
 94 |     $last_head = $_;            # store this head for next turn;
 95 | 
 96 |     $head = $head0;
 97 |     $head = lc $head if $ignore_case;
 98 |     if ($by_seq) {
 99 |         $seq_md5 = $md5->hexdigest;
100 |         $md5->reset;
101 | 
102 |         # ingore sequence records without sequence.
103 |         return if $seq_len == 0;
104 | 
105 |         # count sequences with md5 $seq_md5 in $file
106 |         $$counts{$seq_md5}{$file}++;
107 | 
108 |         # record the origin sequence name.
109 |         $$names{$seq_md5}{$file} = $head0;
110 |     }
111 |     else {
112 |         # ingore sequence records without head
113 |         return if $head eq '';
114 | 
115 |         # count sequences with name $head in $file
116 |         $$counts{$head}{$file}++;
117 |         $$names{$head}{$file} = $head0;
118 |     }
119 | }
120 | 
# find common sequences
my $file_num = @ARGV;

# The common records are exported from the first file.
$file = $ARGV[0];
my $names_ok = {};

# A key is common when it has been counted in every input file.
my @shared = grep { keys( %{ $counts->{$_} } ) == $file_num } keys %$counts;

# Map the record's name in the first file to its number of occurrences there.
$names_ok->{ $names->{$_}{$file} } = $counts->{$_}{$file} for @shared;
136 | 
# print common sequences
# Second pass over the first file: print every record whose name was marked
# as common, together with its sequence lines.
my $is_target = 0;
open my $in, "<", $file
    or die "Fail to open file: $file!\n";
while (<$in>) {
    if (/^\s*>/) {
        s/>\s*//;
        s/\s+$//;

        # Every header starts a new record, so stop printing first. This used
        # to happen only after the empty-name check, which made the sequence
        # of a nameless record appear as part of the previously printed one.
        $is_target = 0;
        next if $_ eq '';

        $head = $_;
        if ( exists $$names_ok{$head} and $$names_ok{$head} > 0 ) {
            print ">$head\n";
            $is_target = 1;

            # just export one record for duplicated records.
            $$names_ok{$head} = 0;
        }
    }
    elsif ( $is_target == 1 ) {
        print $_;
    }
}
close $in;
162 | 


--------------------------------------------------------------------------------
/for_education/fasta_extract_sequence_by_id_file.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | # Function: Given a id file, extracting records in another file.
 4 | #           It works well for super big file.
 5 | # Author  : Wei Shen <shenwei356#gmail.com> http://shenwei.me
 6 | # Date    : 2013-08-01
 7 | # Update  : 2014-11-14
# Document: http://blog.shenwei.me/extract_records_by_id_file/
 9 | 
10 | use strict;
11 | use File::Basename;
12 | use BioUtil::Util;
13 | 
# Use only the script's file name (not its path) in the usage message.
$0 = basename($0);
my $usage = <<USAGE;

Usage: $0 <id_file> <seq_file> <out_file>

USAGE

@ARGV == 3 or die $usage;

# Take the three file names off @ARGV, in order.
my ( $id_file, $seq_file, $out_file ) = splice @ARGV, 0, 3;
26 | 
#-------------[ read ids ]-------------

# A hash (fast lookup) stores every id together with its number of hits.
my %ids_hash;

open my $id_fh, "<", $id_file
    or die "Failed to open file $id_file.\n";
while (<$id_fh>) {
    s/\r?\n//;    # remove the carriage return \r and the newline \n
    next if /^\s*$/;

    # Trim surrounding whitespace. The /g flag is required: without it only
    # the first alternative that matches is removed, so an id with leading
    # AND trailing blanks kept its trailing blanks and never matched.
    s/^\s+|\s+$//g;

    # Adapt the id extraction to your data !!!!!!
    # next unless /gi\|(\d+)/; # for ids like gi|12313|
    next unless /(.+)/;    # the whole line is the id

    $ids_hash{$1} = 0;     # add the id to the hash
}
close $id_fh;

# show number of ids
my $id_count = keys %ids_hash;
print "\nRead $id_count ids.\n\n";
51 | 
#-------------[ searching ]-------------

# Counters for the progress display, which is handy when the target file is
# very large.
my $count = 0;    # number of records processed so far
my $hits  = 0;    # number of records that matched an id
local $| = 1;     # flush output after every print so the progress shows at once

open my $out, ">", $out_file
    or die "Failed to open file $out_file.\n";

my $next_seq = FastaReader($seq_file);
while ( my $fa = $next_seq->() ) {
    my ( $head, $seq ) = @$fa;

    ++$count;

    $seq =~ s/\s+//g;

    # Adapt the id extraction to your data !!!!!!!!!!!!!!!!!!!!!
    # Extract the id from the record:
    # next unless $head =~ /gi\|(\d+)\|/;  # for ids like gi|12313|
    # next unless $head =~ /(.+?)_/;       # the author's test case, do not copy
    # Here the whole header line is the id; records without one are skipped.
    my ($id) = $head =~ /(.+)/ or next;

    # look the id up in %ids_hash
    if ( exists $ids_hash{$id} ) {
        print {$out} ">$head\n$seq\n";

        # If you are sure that at most one record matches each id, the id
        # could be deleted from the hash here to speed up later lookups:
        # delete $ids_hash{$id};

        # record hit number of a id
        $ids_hash{$id}++;
        ++$hits;
    }
    print "\rProcessing ${count} th record. hits: $hits";
}
close $out;

# report the ids that did not match any record
my @unmatched   = grep { $ids_hash{$_} == 0 } keys %ids_hash;
my $n_unmatched = scalar @unmatched;
print "\n\n$n_unmatched ids did not match any record in $seq_file:\n";
print "@unmatched\n";
97 | 
98 | 


--------------------------------------------------------------------------------
/for_education/join_table.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | use strict;
 3 | use File::Basename;
 4 | use Data::Dumper;
 5 | 
 6 | $0 = basename($0);
 7 | my $usage = <<USAGE;
 8 | Usage: $0 <tsv1> <index1> <tsv2> <index2>
 9 | 
10 | USAGE
11 | die($usage) unless scalar(@ARGV) == 4;
12 | my ( $tsv1, $index1, $tsv2, $index2 ) = @ARGV;
13 | 
# Load a tab-separated file into a hash keyed by one of its columns.
#
#   $path:   path of the TSV file
#   $column: 1-based number of the key column (1 when undefined)
#
# Returns a hash reference mapping the key to the whole (chomped) line; when
# a key occurs several times the last line wins. Dies if the file cannot be
# opened or if a line has fewer columns than $column.
sub tsv2map ($$) {
    my ( $path, $column ) = @_;
    $column = 1 unless defined($column);

    my %line_of;
    open( my $in, "<", $path ) or die("failed to open file: $path\n");
    while ( my $line = <$in> ) {
        chomp $line;
        my @cells = split( /\t/, $line );
        die "number of column in file ($path) < index ($column).\n"
            if @cells < $column;

        $line_of{ $cells[ $column - 1 ] } = $line;    # key => whole line
    }
    close $in;

    return \%line_of;
}
33 | 
# Index the second table by its key column.
my $data_tsv2 = tsv2map( $tsv2, $index2 );

# print Dumper($data_tsv2);
# result:
# $VAR1 = {
#           '123' => '123 onetwothree',
#           'str' => 'str string',
#           '245' => '245 twofourfive'
#         };

# Parse tsv1 and print every line of it, followed by the matching line of
# tsv2 (separated by a TAB) when its key exists there.
open( my $in, "<", $tsv1 ) or die("failed to open file: $tsv1\n");
while ( my $row = <$in> ) {
    chomp $row;
    my @cells = split( /\t/, $row );
    die "number of column in file ($tsv1) < index ($index1).\n"
        if @cells < $index1;

    my $key = $cells[ $index1 - 1 ];    # get the key

    # append the tsv2 line only when the key existed in tsv2
    my $extra = exists $data_tsv2->{$key} ? "\t$data_tsv2->{$key}" : "";
    print $row, $extra, "\n";
}
close $in;
62 | 


--------------------------------------------------------------------------------
/for_education/simple_statistics.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | # Copyright 2014 Wei Shen (shenwei356#gmail.com). All rights reserved.
 4 | # Use of this source code is governed by a MIT-license
 5 | # that can be found in the LICENSE file.
 6 | # https://github.com/shenwei356
 7 | 
 8 | use strict;
 9 | 
my $usage = <<USAGE;

usage: simple_statistics <infile> <column>

<infile> is a plain text file. each column should be seperated by TAB(\\t)
<column> is the column number of the table.

USAGE

@ARGV == 2 or die $usage;

# Take the input file and the column number off @ARGV, in order.
my ( $file, $column ) = splice @ARGV, 0, 2;

my $data = get_column_data( $file, $column );

# count, minimum and maximum are printed as integers
printf "#.\t%d\n",   scalar @$data;
printf "min.\t%d\n", min($data);
printf "max.\t%d\n", max($data);

# mean and standard deviation are printed with two decimals
my @moments = mean_and_stdev($data);

printf "mean.\t%.2f\n",  $moments[0];
printf "stdev.\t%.2f\n", $moments[1];

34 | 
35 | 
# Collect the values of one column of a tab-separated file.
#
#   $file:   path of the input file
#   $column: 1-based column number; anything that is not a positive integer
#            triggers a warning and falls back to column 1
#
# Returns an array reference with one value per non-empty line. Dies if the
# file cannot be opened or if a line has fewer columns than requested.
sub get_column_data {
    my ( $file, $column ) = @_;

    if ( $column !~ /^\d+$/ or $column <= 0 ) {
        warn
            "column number ($column) should be an integer and greater than 0.\n";
        $column = 1;
    }

    open my $in, "<", $file or die "failed to open file: $file\n";
    my @values;
    while ( my $line = <$in> ) {
        $line =~ s/\r?\n//;    # strip the line ending (LF or CRLF)
        my @cells = split /\t/, $line;
        my $n     = @cells;
        next if $n == 0;       # nothing on this line

        die
            "number of columns of this line ($n) is less than given column number ($column)\n"
            if $column > $n;

        push @values, $cells[ $column - 1 ];
    }
    close $in;

    return \@values;
}
65 | 
# You can also use a module instead of the subroutines below:
# use List::Util qw/max min sum/;
68 | 
# Return the largest value of a list of numbers.
#
#   $list: array reference of numbers
#
# Returns undef for an empty list. The list is only read: the previous
# implementation took its start value with shift, which removed the first
# element from the caller's array and corrupted every later computation on
# the same data.
sub max {
    my ($list) = @_;
    my $max = $$list[0];
    for (@$list) {
        $max = $_ if $_ > $max;
    }
    return $max;
}
77 | 
# Return the smallest value of a list of numbers.
#
#   $list: array reference of numbers
#
# Returns undef for an empty list. The list is only read: the previous
# implementation took its start value with shift, which removed the first
# element from the caller's array and corrupted every later computation on
# the same data.
sub min {
    my ($list) = @_;
    my $min = $$list[0];
    for (@$list) {
        $min = $_ if $_ < $min;
    }
    return $min;
}
86 | 
# Compute the mean and the population standard deviation of a list.
#
#   $list: array reference of numbers
#
# Returns the list ( mean, stdev ); ( 0, 0 ) for an empty list. The variance
# is computed as E[x^2] - E[x]^2.
sub mean_and_stdev($) {
    my ($list) = @_;
    my $n = @$list;
    return ( 0, 0 ) if $n == 0;

    my ( $sum, $sum_square ) = ( 0, 0 );
    for (@$list) {
        $sum        += $_;
        $sum_square += $_ * $_;
    }

    my $mean     = $sum / $n;
    my $variance = $sum_square / $n - $mean * $mean;

    # Floating-point rounding can leave a tiny negative variance for (almost)
    # constant data, and sqrt dies on negative input, so clamp it at zero.
    $variance = 0 if $variance < 0;

    return ( $mean, sqrt $variance );
}
99 | 


--------------------------------------------------------------------------------
/not_used/csv2tab:
--------------------------------------------------------------------------------
#!/bin/sh
# https://github.com/shenwei356
# Convert CSV input to tab-separated output: awk splits every line on commas
# and on the double quotes next to them (or at the line ends) and joins the
# resulting fields with TABs.
# "$@" is quoted so that each file name is passed to awk as one argument,
# even when it contains spaces or glob characters.
awk -F'^"|","|,"|",|,|"$' '{ out=$1; for(i=2;i<=NF;i++){out=out"\t"$i}; print out}' "$@"
4 | 


--------------------------------------------------------------------------------
/not_used/csv_join:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | # https://github.com/shenwei356/bio_scripts
  4 | # Author     : Wei Shen
  5 | # Contact    : shenwei356@gmail.com
  6 | # LastUpdate : 2015-08-13
  7 | 
  8 | from __future__ import print_function
  9 | 
 10 | import argparse
 11 | import csv
 12 | import logging
 13 | import sys
 14 | 
 15 | 
 16 | def parse_key_index(key):
 17 |     if ',' in key:
 18 |         return [int(i) for i in key.split(',')]
 19 |     else:
 20 |         return [int(key)]
 21 | 
 22 | 
 23 | def parse_args():
 24 |     parser = argparse.ArgumentParser(description="Merge csvfile2 to csvfile1. Multiple keys supported.",
 25 |                                      epilog="https://github.com/shenwei356/bio_scripts")
 26 | 
 27 |     parser.add_argument('csvfile1', type=str, help='CSV file 1')
 28 |     parser.add_argument("key1", type=str,
 29 |                         help='Column number of key in csvfile1. Multiple values shoud be separated by comma.')
 30 |     parser.add_argument('csvfile2', type=str, help='CSV file 2')
 31 |     parser.add_argument("key2", type=str,
 32 |                         help='Column number of key in csvfile2. Multiple values shoud be separated by comma.')
 33 | 
 34 |     parser.add_argument("-f1", type=str, default=",",
 35 |                         help='Field separator in csvfile1 [,]')
 36 |     parser.add_argument("-q1", type=str, default='"',
 37 |                         help='Quote char in csvfile1 ["]')
 38 |     parser.add_argument("-f2", type=str, default=",",
 39 |                         help='Field separator in csvfile2 [,]')
 40 |     parser.add_argument("-q2", type=str, default='"',
 41 |                         help='Quote char in csvfile2 ["]')
 42 |     parser.add_argument("-of", type=str, default=",",
 43 |                         help='Field separator in output [,]')
 44 | 
 45 |     parser.add_argument("-t1", action='store_true',
 46 |                         help='csvfile1 is table file. Quote char is "\\t"')
 47 |     parser.add_argument("-t2", action='store_true',
 48 |                         help='csvfile1 is table file. Quote char is "\\t"')
 49 |     parser.add_argument("-to", action='store_true',
 50 |                         help='Output quote char is "\\t"')
 51 |     parser.add_argument("-t", action='store_true',
 52 |                         help='Abbreviation for "-t1 -t2 -to"')
 53 | 
 54 |     parser.add_argument("-k", "--keep-unmatched", action='store_true',
 55 |                         help='Keep rows in CSV file1 not matching row in file2"')
 56 | 
 57 |     args = parser.parse_args()
 58 | 
 59 |     if args.t:
 60 |         args.f1, args.f2, args.of = '\t', '\t', '\t'
 61 |     else:
 62 |         if args.t1:
 63 |             args.f1 = '\t'
 64 |         if args.t2:
 65 |             args.f2 = '\t'
 66 |         if args.to:
 67 |             args.of = '\t'
 68 | 
 69 |     return args
 70 | 
 71 | 
 72 | def read_csv_file(file, key_index, fs, qc):
 73 |     data = dict()
 74 | 
 75 |     with open(file) as fh:
 76 |         reader = csv.reader(fh, delimiter=fs, quotechar=qc)
 77 |         for row in reader:
 78 |             ncolumn = len(row)
 79 |             if ncolumn == 0:
 80 |                 continue
 81 | 
 82 |             key = list()
 83 |             for k in parse_key_index(key_index):
 84 |                 if ncolumn < k:
 85 |                     logging.error(
 86 |                         "key ({}) is beyond number of column ({})".format(k, ncolumn))
 87 |                     sys.exit(1)
 88 |                 key.append(row[k - 1].strip())
 89 |             key = '_'.join(key)
 90 | 
 91 |             data[key] = row
 92 | 
 93 |     return data
 94 | 
 95 | 
 96 | if __name__ == '__main__':
 97 |     logging.basicConfig(level=logging.DEBUG, format="[%(levelname)s] %(message)s")
 98 | 
 99 |     args = parse_args()
100 | 
101 |     data = read_csv_file(args.csvfile2, args.key2, args.f2, args.q2)
102 | 
103 |     file, fs, qc, key_index = args.csvfile1, args.f1, args.q1, args.key1
104 | 
105 |     writer = csv.writer(sys.stdout, delimiter=args.of, quotechar=qc, quoting=csv.QUOTE_MINIMAL)
106 |     with open(file) as fh:
107 |         reader = csv.reader(fh, delimiter=fs, quotechar=qc)
108 |         for row in reader:
109 |             ncolumn = len(row)
110 |             if ncolumn == 0:
111 |                 continue
112 | 
113 |             key = list()
114 |             for k in parse_key_index(key_index):
115 |                 if ncolumn < k:
116 |                     logging.error(
117 |                         "key ({}) is beyond number of column ({})".format(k, ncolumn))
118 |                     sys.exit(1)
119 |                 key.append(row[k - 1].strip())
120 |             key = '_'.join(key)
121 | 
122 |             if key in data:
123 |                 writer.writerow(row + data[key])
124 |             elif args.keep_unmatched:
125 |                 writer.writerow(row)
126 | 


--------------------------------------------------------------------------------
/not_used/csv_join_paired_lines.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # https://github.com/shenwei356/bio_scripts
 3 | # Author     : Wei Shen
 4 | # Contact    : shenwei356@gmail.com
 5 | # LastUpdate : 2015-02-04
 6 | 
 7 | import argparse
 8 | import csv
 9 | import logging
10 | import sys
11 | import re
12 | 
13 | # ===================================[ args ]=================================
14 | 
# Command-line interface: two input files, an optional output file and the
# CSV dialect options.
parser = argparse.ArgumentParser(description="Join paired lines from two files into one file")

parser.add_argument("-v", "--verbose", help='Verbosely print information',
                    action="count", default=0)

parser.add_argument('infile1', type=argparse.FileType('r'),
                    help='Input file 1')
parser.add_argument('infile2', type=argparse.FileType('r'),
                    help='Input file 2')
# nargs='?' yields a single file object (or the default). With nargs='*' a
# given output file arrived as a list, which csv.writer cannot write to.
parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'),
                    default=sys.stdout, help='Output file')

parser.add_argument("-k", '--key', type=int, default=1,
                    help='Column number of key in csvfile')
parser.add_argument("-H", "--ignoretitle", help="Ignore title",
                    action="store_true")
parser.add_argument("-F", '--fs', type=str, default="\t",
                    help='Field separator [\\t]')
parser.add_argument("-Q", '--qc', type=str, default='"',
                    help='Quote char["]')

args = parser.parse_args()

# logging level: -v selects INFO, -vv (or more) DEBUG, otherwise WARN
if args.verbose >= 2:
    logginglevel = logging.DEBUG
elif args.verbose == 1:
    logginglevel = logging.INFO
else:
    logginglevel = logging.WARN
logging.basicConfig(level=logginglevel,
                    format="[%(levelname)s] %(message)s")

logging.info("Column number of key in csvfile: {}".format(args.key))
49 | 
50 | # ===================================[ read csv ]=============================
51 | 
52 | 
53 | 
def get_key_from_row(nrow, row):
    """Return the stripped value of the key column of a row.

    nrow: number of columns in the row.
    row: the row, a list of strings.

    The key column is the module-level option ``args.key`` (1-based). When it
    is beyond the row an error is logged and the program exits with status 1;
    a value below 1 is reset to 1 in ``args``.
    """
    if nrow < args.key:
        logging.error(
            "-k ({}) is beyond number of column ({})".format(args.key, nrow))
        sys.exit(1)

    args.key = max(args.key, 1)
    return row[args.key - 1].strip()
63 | 
64 | 
# zip() would stop silently at the end of the shorter input and drop the
# remaining records of the longer one; zip_longest() lets us detect that.
from itertools import zip_longest

reader1 = csv.reader(iter(args.infile1.readline, ''), delimiter=args.fs, quotechar=args.qc)
reader2 = csv.reader(iter(args.infile2.readline, ''), delimiter=args.fs, quotechar=args.qc)

writer = csv.writer(args.outfile, delimiter=args.fs, quotechar=args.qc, quoting=csv.QUOTE_MINIMAL)

# Walk both inputs in lockstep and interleave them: for every pair of rows
# with the same key, write the row of file 1 followed by the row of file 2.
once = True
for row1, row2 in zip_longest(reader1, reader2):
    if args.ignoretitle and once:  # Ignore title
        once = False
        continue

    if row1 is None or row2 is None:
        # One input is exhausted (zip_longest fills with None).  Trailing
        # blank lines are harmless, but a real record has no partner.
        leftover = row1 or row2
        if leftover:
            logging.error("unpaired record remain: {}".format(leftover))
            sys.exit(1)
        continue

    nrow1, nrow2 = len(row1), len(row2)
    if nrow1 == 0 or nrow2 == 0:  # csv.reader yields [] for a blank line
        continue
    if nrow1 != nrow2:
        logging.error("unpaired column number: {} vs {}".format(nrow1, nrow2))
        sys.exit(1)

    key1, key2 = get_key_from_row(nrow1, row1), get_key_from_row(nrow2, row2)

    if key1 != key2:
        logging.error("keys do not match: {} vs {}".format(key1, key2))
        sys.exit(1)

    writer.writerow(row1)
    writer.writerow(row2)
91 | 


--------------------------------------------------------------------------------
/not_used/csv_split_paired_lines.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # https://github.com/shenwei356/bio_scripts
  3 | # Author     : Wei Shen
  4 | # Contact    : shenwei356@gmail.com
  5 | # LastUpdate : 2015-02-04
  6 | 
  7 | import argparse
  8 | import csv
  9 | import logging
 10 | import sys
 11 | import re
 12 | 
 13 | # ===================================[ args ]=================================
 14 | 
 15 | parser = argparse.ArgumentParser(description="Split paired lines into two files")
 16 | 
 17 | parser.add_argument('csvfile', nargs='*', type=argparse.FileType('r'),
 18 |                     default=sys.stdin, help='Input file(s)')
 19 | parser.add_argument("-v", "--verbose", help='Verbosely print information',
 20 |                     action="count", default=0)
 21 | 
 22 | parser.add_argument('outfile1', type=argparse.FileType('w'),
 23 |                     default="out_1.tab", help='Output file 1')
 24 | parser.add_argument('outfile2', type=argparse.FileType('w'),
 25 |                     default="out_2.tab", help='Output file 2')
 26 | 
 27 | parser.add_argument("-k", '--key', type=int, default=1,
 28 |                     help='Column number of key in csvfile')
 29 | parser.add_argument("-H", "--ignoretitle", help="Ignore title",
 30 |                     action="store_true")
 31 | parser.add_argument("-F", '--fs', type=str, default="\t",
 32 |                     help='Field separator [\\t]')
 33 | parser.add_argument("-Q", '--qc', type=str, default='"',
 34 |                     help='Quote char["]')
 35 | 
 36 | args = parser.parse_args()
 37 | 
 38 | # logging level
 39 | if args.verbose >= 2:
 40 |     logginglevel = logging.DEBUG
 41 | elif args.verbose == 1:
 42 |     logginglevel = logging.INFO
 43 | else:
 44 |     logginglevel = logging.WARN
 45 | logging.basicConfig(level=logginglevel,
 46 |                     format="[%(levelname)s] %(message)s")
 47 | 
 48 | logging.info("Column number of key in csvfile: {}".format(args.key))
 49 | 
 50 | # ===================================[ read csv ]=============================
 51 | 
 52 | writer1 = csv.writer(args.outfile1, delimiter=args.fs, quotechar=args.qc, quoting=csv.QUOTE_MINIMAL)
 53 | writer2 = csv.writer(args.outfile2, delimiter=args.fs, quotechar=args.qc, quoting=csv.QUOTE_MINIMAL)
 54 | 
 55 | cnt, sum = 0, 0
 56 | stdinflag = False
 57 | 
 58 | # If "iter(sys.stdin.readline, '')" in the flowing for-loop, first line
 59 | # of stdin will be missing
 60 | if args.csvfile is sys.stdin:
 61 |     logging.info("read data from STDIN")
 62 |     stdinflag = True
 63 |     args.csvfile = [iter(sys.stdin.readline, '')]
 64 | 
 65 | 
 66 | def get_key_from_row(nrow, row):
 67 |     if nrow < args.key:
 68 |         logging.error(
 69 |             "-k ({}) is beyond number of column ({})".format(args.key, nrow))
 70 |         sys.exit(1)
 71 |     elif args.key < 1:
 72 |         args.key = 1
 73 |     key = row[args.key - 1].strip()
 74 |     return key
 75 | 
 76 | 
 77 | key0, row0, flag = '', '', True
 78 | 
 79 | for f in args.csvfile:
 80 |     if not stdinflag:
 81 |         logging.info("read data from file")
 82 |         f = iter(f.readline, '')
 83 |     reader = csv.reader(f, delimiter=args.fs, quotechar=args.qc)
 84 | 
 85 |     once = True
 86 |     for row in reader:
 87 |         if args.ignoretitle and once:  # Ignore title
 88 |             once = False
 89 |             continue
 90 | 
 91 |         nrow = len(row)
 92 |         if nrow == 0:
 93 |             continue
 94 | 
 95 |         sum += 1
 96 |         key = get_key_from_row(nrow, row)
 97 | 
 98 |         if key0 == '':
 99 |             key0, row0 = key, row
100 |             continue
101 | 
102 |         if flag:
103 |             if key0 != key:
104 |                 logging.error("unpaired key: line {} {} vs line {} {} ".format(sum - 1, row0, sum, row))
105 |                 sys.exit(1)
106 |             else:
107 |                 writer1.writerow(row0)
108 |                 writer2.writerow(row)
109 | 
110 |         flag = not flag
111 |         key0, row0 = key, row
112 | 
113 | if flag:
114 |     logging.error("unpaired record remain: {}".format(row0))
115 |     sys.exit(1)


--------------------------------------------------------------------------------
/not_used/fasta_seq_gc_content_plot.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from __future__ import print_function
 3 | 
 4 | import sys
 5 | import os
 6 | 
 7 | from Bio import SeqIO
 8 | from Bio.SeqUtils import GC
 9 | 
10 | import seaborn as sns
11 | import matplotlib as mpl
12 | import matplotlib.pyplot as plt
13 | 
usage = """
Usage: fasta_seq_gc_content_plot.py fastafile [fastafile...]
"""

if len(sys.argv) <= 1:
    print(usage)
    sys.exit(0)

# GC content of every sequence of every input file (feeds the plot).
gc = []

for file in sys.argv[1:]:
    if not os.path.exists(file):
        # A missing input is an error: report on stderr with a non-zero status.
        print("file not exists: %s" % file, file=sys.stderr)
        sys.exit(1)

    # Per-file table "<sequence id><TAB><GC content>" written next to the input.
    with open(file + ".gc", 'w') as fh:
        for seq in SeqIO.parse(file, "fasta"):
            gccontent = GC(seq.seq)
            gc.append(gccontent)
            # %.2f, not %d: %d would cut the value down to its integer part.
            fh.write("%s\t%.2f\n" % (seq.id, gccontent))

# One distribution plot over all inputs; it is named after the LAST input
# file, because `file` still holds the final loop value here.
mpl.rc("figure", figsize=(8, 4))
sns.distplot(gc)
plt.savefig(file + ".gc.png")
38 | 


--------------------------------------------------------------------------------
/not_used/fasta_seq_length_plot.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from __future__ import print_function
 3 | 
 4 | import sys
 5 | import os
 6 | 
 7 | from Bio import SeqIO
 8 | 
 9 | import seaborn as sns
10 | import matplotlib as mpl
11 | import matplotlib.pyplot as plt
12 | 
usage = """
Usage: fasta_seq_length_plot.py fastafile [fastafile...]
"""

if len(sys.argv) <= 1:
    print(usage)
    sys.exit(0)

# Length of every sequence of every input file (feeds the plot).
lengths = []

for file in sys.argv[1:]:
    if not os.path.exists(file):
        # A missing input is an error: report on stderr with a non-zero status.
        print("file not exists: %s" % file, file=sys.stderr)
        sys.exit(1)

    # Per-file table "<sequence id><TAB><length>" written next to the input.
    with open(file + ".len", 'w') as fh:
        for seq in SeqIO.parse(file, "fasta"):
            seq_len = len(seq)
            lengths.append(seq_len)
            fh.write("%s\t%d\n" % (seq.id, seq_len))

# One distribution plot over all inputs; it is named after the LAST input
# file, because `file` still holds the final loop value here.
mpl.rc("figure", figsize=(8, 4))
sns.distplot(lengths)
plt.savefig(file + ".len.png")
37 | 


--------------------------------------------------------------------------------
/plot/README.md:
--------------------------------------------------------------------------------
  1 | # Plot utilities 
  2 | 	
  3 | ## plot_barplot.R
  4 | 
  5 | source is not open right now
  6 | 
  7 | Sample output:
  8 | 
  9 | ![Sample output](example/plot_barplot.png)
 10 | 	
 11 | ## plot_distribution.R
 12 | 
 13 | Plot distribution
 14 | 
 15 | Usage
 16 | 
 17 |     usage: ./plot_distribution.R [-h] [-bw BINWIDTH] [--xlab XLAB] [--ylab YLAB]
 18 |                                  [--width WIDTH] [--height HEIGHT] [-t title]
 19 |                                  infile outfile
 20 |     
 21 |     Plot distribution.Infile should be a tsv file of two columns (group and "value")
 22 |     
 23 |     positional arguments:
 24 |       infile                infile
 25 |       outfile               outfile
 26 |     
 27 |     optional arguments:
 28 |       -h, --help            show this help message and exit
 29 |       -bw BINWIDTH, --binwidth BINWIDTH
 30 |                             binwidth
 31 |       --xlab XLAB           xlabel
 32 |       --ylab YLAB           ylabel
 33 |       --width WIDTH         output image width [20]
 34 |       --height HEIGHT       output image height [5]
 35 |       -t title, --title title
 36 |                             title
 37 |     
 38 | Sample output:
 39 | 
 40 | ![Sample output](example/data.tsv.dist.png)
 41 | 
 42 | 
 43 | ## plot_distribution.py
 44 | 
 45 | Distribution plot using seaborn
 46 | 
 47 | Example: distribution of sequence length 
 48 | 
 49 | 	cat ../sequence/seq.fa | fasta2tab -l | cut -f 3 |  \
 50 | 		plot_distribution.py -t "Distribution of sequence length" -x "sequence length" -o pic.png
 51 | 
 52 | Sample output:
 53 | 
 54 | ![Sample output](example/data.txt.png)
 55 | 
 56 | 
 57 | ## plot_heatmap.R
 58 | 
 59 | Plot heatmap with *pheatmap*
 60 | 
 61 | Usage
 62 | 
 63 |     usage: ./plot_heatmap.R [-h] [-H] [-F field_seperator] [-a] [-al]
 64 |                             [--title title] [-s scale] [-ncr] [-ncc] [-c color]
 65 |                             [-nrc] [--width width] [--height height]
 66 |                             [-thr treeheight_row] [-thc treeheight_col]
 67 |                             [-fo fontsize] [-fr fontsize_row] [-fc fontsize_col]
 68 |                             infile outname
 69 |     
 70 |     Plot heatmap. Infile should be a csv/tsv file with header containing column names. Annotation for row is also supported, please put them in the last column.
 71 |     
 72 |     positional arguments:
 73 |       infile                infile (tsv, with head
 74 |       outname               outname
 75 |     
 76 |     optional arguments:
 77 |       -h, --help            show this help message and exit
 78 |       -H, --header          header
 79 |       -F field_seperator, --field-seperator field_seperator
 80 |                             field seperator
 81 |       -a, --with-annot      add annotation_row from the last column
 82 |       -al, --with-annot-legend
 83 |                             show annotation_row_legend
 84 |       --title title         title
 85 |       -s scale, --scale scale
 86 |                             scale. row | column | none [row]
 87 |       -ncr, --not-cluster-rows
 88 |                             do not cluster_rows
 89 |       -ncc, --not-cluster-cols
 90 |                             do not cluster_cols
 91 |       -c color, --color color
 92 |                             sequential palettes names [RdYlBu]
 93 |       -nrc, --not-reverse-color-order
 94 |                             do not reverse color order
 95 |       --width width         output image width
 96 |       --height height       output image height
 97 |       -thr treeheight_row, --treeheight_row treeheight_row
 98 |                             treeheight_row
 99 |       -thc treeheight_col, --treeheight_col treeheight_col
100 |                             treeheight_col
101 |       -fo fontsize, --fontsize fontsize
102 |                             fontsize
103 |       -fr fontsize_row, --fontsize_row fontsize_row
104 |                             fontsize_row
105 |       -fc fontsize_col, --fontsize_col fontsize_col
106 |                             fontsize_col
107 |                             
108 | Sample output:
109 | 
110 | ![Sample output](example/heatmap.png)


--------------------------------------------------------------------------------
/plot/example/data.tsv:
--------------------------------------------------------------------------------
  1 | group	value
  2 | A	-2.478041594
  3 | A	-1.670924814
  4 | A	0.302085867
  5 | A	1.723464735
  6 | A	1.732239018
  7 | A	0.397676057
  8 | A	-1.550346069
  9 | A	-0.950674475
 10 | A	1.681924819
 11 | A	0.971857536
 12 | A	-0.866578709
 13 | A	-0.002960986
 14 | A	-1.099090501
 15 | A	-0.387710361
 16 | A	0.833717474
 17 | A	0.446464809
 18 | A	0.041753848
 19 | A	-1.663461487
 20 | A	-0.89552437
 21 | A	-0.506462862
 22 | A	-1.593121423
 23 | A	-0.331644759
 24 | A	-0.876270979
 25 | A	1.704732645
 26 | A	0.350942092
 27 | A	-0.650102515
 28 | A	-0.20037638
 29 | A	-1.470960109
 30 | A	-0.779294956
 31 | A	-0.182033116
 32 | A	-2.161884173
 33 | A	0.852318015
 34 | A	-0.642384841
 35 | A	0.435884613
 36 | A	-0.156577243
 37 | A	0.744276341
 38 | A	1.307949666
 39 | A	0.425873497
 40 | A	0.444357135
 41 | A	-0.561607921
 42 | A	0.548522524
 43 | A	0.552736143
 44 | A	1.04787135
 45 | A	0.735733792
 46 | A	0.500453334
 47 | A	-1.126314462
 48 | A	-0.707289961
 49 | A	1.23401702
 50 | A	-1.003678165
 51 | A	0.415567648
 52 | A	0.962199827
 53 | A	0.103141335
 54 | A	-0.836026032
 55 | A	-0.098418515
 56 | A	1.563558927
 57 | A	1.54544268
 58 | A	-0.029325997
 59 | A	-1.401309572
 60 | A	1.521911545
 61 | A	-1.892086994
 62 | A	1.64618857
 63 | A	1.236972495
 64 | A	0.269366887
 65 | A	-0.607749077
 66 | A	-1.314902995
 67 | A	-0.910285157
 68 | A	-0.019836224
 69 | A	-0.591987119
 70 | A	0.123428499
 71 | A	-0.348583796
 72 | A	1.337303538
 73 | A	0.826278844
 74 | A	-0.375063867
 75 | A	0.656301925
 76 | A	0.090945182
 77 | A	-0.809690651
 78 | A	-1.334256525
 79 | A	0.544701029
 80 | A	-0.39094792
 81 | A	-0.861895104
 82 | A	1.292683052
 83 | A	-1.17919095
 84 | A	-1.774046316
 85 | A	1.08309747
 86 | A	-0.73170333
 87 | A	0.246381319
 88 | A	-1.045812696
 89 | A	0.520570011
 90 | A	-0.998067032
 91 | A	-0.819041117
 92 | A	0.299772419
 93 | A	-0.932132226
 94 | A	0.874542401
 95 | A	1.089482407
 96 | A	1.598371819
 97 | A	0.268834238
 98 | A	0.338377536
 99 | A	-0.5965213
100 | A	0.43380957
101 | A	1.240334583
102 | A	0.383542102
103 | A	0.392334889
104 | A	1.582287577
105 | A	0.934345306
106 | A	0.185303317
107 | A	-0.408425632
108 | A	1.320292991
109 | A	0.04157766
110 | A	-1.416251282
111 | A	1.746497661
112 | A	0.331006288
113 | A	1.403564316
114 | A	1.301531005
115 | A	-0.299222217
116 | A	-0.105714073
117 | A	1.785036895
118 | A	-0.134575051
119 | A	-1.042480709
120 | A	1.78300135
121 | A	-0.438683097
122 | A	-1.795526974
123 | A	0.150512279
124 | A	0.981735884
125 | A	2.473467286
126 | A	-0.132078898
127 | A	0.654606396
128 | A	-1.759474484
129 | A	1.066364322
130 | A	-0.169913279
131 | A	-1.018935367
132 | A	1.081703818
133 | A	-0.848156005
134 | A	-0.845524301
135 | A	1.053516424
136 | A	0.153402096
137 | A	0.221617988
138 | A	0.101040281
139 | A	1.328408995
140 | A	0.498088679
141 | A	1.318025052
142 | A	-0.814440626
143 | A	-1.068784756
144 | A	-0.198829267
145 | A	-1.366966838
146 | A	-0.362010626
147 | A	-0.62462386
148 | A	-1.362625316
149 | A	-0.997938157
150 | A	1.268118962
151 | A	-0.019648713
152 | A	-0.695817179
153 | A	-0.192844372
154 | A	-0.637827077
155 | A	-0.723913683
156 | A	-0.991369846
157 | A	0.586160041
158 | A	2.726372112
159 | A	-0.577510955
160 | A	-0.990340728
161 | A	0.076611801
162 | A	0.064507967
163 | A	-2.53536432
164 | A	-1.464073017
165 | A	-1.47193269
166 | A	1.172569386
167 | A	-0.189016092
168 | A	-0.274093583
169 | A	0.90876957
170 | A	-0.131689974
171 | A	-0.847336909
172 | A	-1.119121979
173 | A	0.551980938
174 | A	-0.575261175
175 | A	0.002730048
176 | A	0.940787285
177 | A	-0.439276259
178 | A	0.134038804
179 | A	1.018411703
180 | A	-1.137956506
181 | A	-0.157175605
182 | A	-0.315387133
183 | A	0.075165076
184 | A	1.345233144
185 | A	0.830714846
186 | A	-0.873349342
187 | A	-0.284379877
188 | A	-0.580637572
189 | A	0.876275425
190 | A	0.294259868
191 | A	-1.107709451
192 | A	-1.064997494
193 | A	0.182133669
194 | A	0.284590814
195 | A	-0.831312758
196 | A	0.781795593
197 | A	-0.202621437
198 | A	-0.839671657
199 | A	-0.546233119
200 | A	-0.887818316
201 | B	-6.40E-001
202 | B	-1.12E+000
203 | B	3.23E-001
204 | B	4.10E-001
205 | B	5.93E-001
206 | B	1.06E+000
207 | B	1.98E+000
208 | B	9.21E-001
209 | B	1.46E+000
210 | B	-4.07E-001
211 | B	1.98E+000
212 | B	3.56E-001
213 | B	3.38E-001
214 | B	2.40E+000
215 | B	-4.92E-001
216 | B	1.25E+000
217 | B	-4.66E-001
218 | B	-3.06E-001
219 | B	-7.69E-001
220 | B	2.11E-001
221 | B	-3.48E-001
222 | B	-1.95E+000
223 | B	6.90E-002
224 | B	1.12E+000
225 | B	1.62E+000
226 | B	-9.32E-001
227 | B	1.39E+000
228 | B	7.02E-001
229 | B	9.40E-001
230 | B	2.38E+000
231 | B	-3.74E-001
232 | B	2.14E+000
233 | B	1.35E+000
234 | B	2.38E+000
235 | B	6.77E-001
236 | B	1.56E+000
237 | B	1.84E+000
238 | B	1.13E+000
239 | B	1.88E-001
240 | B	-2.48E-001
241 | B	5.65E-001
242 | B	3.00E+000
243 | B	5.57E-001
244 | B	1.47E+000
245 | B	8.15E-001
246 | B	1.45E+000
247 | B	-1.24E+000
248 | B	-5.90E-001
249 | B	4.49E-001
250 | B	-5.34E-001
251 | B	2.34E+000
252 | B	2.81E+000
253 | B	-1.05E+000
254 | B	1.37E+000
255 | B	1.15E+000
256 | B	4.73E-001
257 | B	2.33E+000
258 | B	2.65E+000
259 | B	7.09E-001
260 | B	-2.81E-001
261 | B	7.41E-001
262 | B	8.63E-001
263 | B	-1.60E-001
264 | B	5.80E-001
265 | B	-2.73E-001
266 | B	1.34E+000
267 | B	2.66E+000
268 | B	-1.18E+000
269 | B	5.74E-001
270 | B	3.83E-001
271 | B	3.55E+000
272 | B	1.56E+000
273 | B	8.55E-001
274 | B	-1.34E+000
275 | B	9.92E-001
276 | B	3.70E-001
277 | B	2.79E-001
278 | B	6.87E-001
279 | B	-5.85E-001
280 | B	1.96E+000
281 | B	-1.56E+000
282 | B	4.47E-005
283 | B	5.39E-002
284 | B	-1.14E+000
285 | B	2.67E+000
286 | B	1.14E+000
287 | B	2.73E+000
288 | B	2.18E+000
289 | B	1.99E+000
290 | B	1.49E+000
291 | B	1.61E+000
292 | B	-4.01E-001
293 | B	9.41E-001
294 | B	1.31E+000
295 | B	1.44E-001
296 | B	2.09E+000
297 | B	1.55E+000
298 | B	-1.64E-001
299 | B	4.08E-001
300 | B	-2.13E-001
301 | B	-9.80E-001
302 | B	9.76E-001
303 | B	-1.81E-001
304 | B	5.84E-001
305 | B	8.30E-001
306 | B	1.14E+000
307 | B	1.49E+000
308 | B	1.87E+000
309 | B	7.76E-001
310 | B	-1.01E+000
311 | B	-1.63E+000
312 | B	2.14E+000
313 | B	7.27E-001
314 | B	1.86E+000
315 | B	-1.38E+000
316 | B	-1.97E-001
317 | B	4.70E-001
318 | B	1.95E-001
319 | B	-7.66E-002
320 | B	1.86E+000
321 | B	3.13E+000
322 | B	2.46E+000
323 | B	-9.66E-002
324 | B	1.96E+000
325 | B	1.43E+000
326 | B	7.85E-001
327 | B	1.16E+000
328 | B	1.10E+000
329 | B	2.53E-001
330 | B	-1.55E-001
331 | B	5.81E-002
332 | B	7.55E-001
333 | B	1.42E+000
334 | B	2.24E+000
335 | B	1.04E+000
336 | B	1.49E-001
337 | B	2.61E+000
338 | B	1.58E+000
339 | B	1.64E+000
340 | B	1.20E+000
341 | B	-7.85E-002
342 | B	6.80E-001
343 | B	3.56E-002
344 | B	1.17E-001
345 | B	8.46E-001
346 | B	9.02E-001
347 | B	1.37E+000
348 | B	-4.92E-001
349 | B	1.17E+000
350 | B	2.90E+000
351 | B	2.81E+000
352 | B	8.50E-001
353 | B	1.18E+000
354 | B	-5.11E-001
355 | B	2.93E+000
356 | B	-4.87E-004
357 | B	4.52E-001
358 | B	1.00E+000
359 | B	1.00E+000
360 | B	2.85E+000
361 | B	8.16E-001
362 | B	1.32E+000
363 | B	1.37E+000
364 | B	4.03E-001
365 | B	3.60E-001
366 | B	4.25E-002
367 | B	2.58E-002
368 | B	8.25E-001
369 | B	1.22E+000
370 | B	1.05E-001
371 | B	-7.12E-003
372 | B	1.16E+000
373 | B	1.38E+000
374 | B	-2.63E-001
375 | B	1.23E+000
376 | B	6.94E-001
377 | B	2.12E+000
378 | B	1.38E+000
379 | B	-3.36E-001
380 | B	4.35E-001
381 | B	2.46E+000
382 | B	1.96E+000
383 | B	1.70E+000
384 | B	2.08E+000
385 | B	2.15E+000
386 | B	2.15E+000
387 | B	1.25E+000
388 | B	1.92E-001
389 | B	-1.20E+000
390 | B	8.32E-001
391 | B	1.05E+000
392 | B	2.93E-001
393 | B	2.88E-001
394 | B	6.69E-001
395 | B	2.48E+000
396 | B	1.38E+000
397 | B	2.10E-001
398 | B	3.42E-001
399 | B	6.19E-001
400 | B	-2.54E-001
401 | 


--------------------------------------------------------------------------------
/plot/example/data.tsv.dist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/bio_scripts/703cec8d21903516346e2aae4d77d23385c30905/plot/example/data.tsv.dist.png


--------------------------------------------------------------------------------
/plot/example/data.txt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/bio_scripts/703cec8d21903516346e2aae4d77d23385c30905/plot/example/data.txt.png


--------------------------------------------------------------------------------
/plot/example/heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/bio_scripts/703cec8d21903516346e2aae4d77d23385c30905/plot/example/heatmap.png


--------------------------------------------------------------------------------
/plot/example/plot_barplot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/bio_scripts/703cec8d21903516346e2aae4d77d23385c30905/plot/example/plot_barplot.png


--------------------------------------------------------------------------------
/plot/plot_distribution.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # https://github.com/shenwei356/bio_scripts
 3 | 
 4 | library(methods)
 5 | library(proto)
 6 | library(argparse)
 7 | library(ggplot2)
 8 | library(reshape2)
 9 | library(scales)
10 | 
11 | #-----------------------------------------------------------------------------
12 | 
# Help text shown by -h. sep = " " keeps the two sentences apart
# (with sep = "" they were glued together as "distribution.Infile").
description <- paste(
  "Plot distribution.",
  "Infile should be a tsv file of two columns (group and \"value\")", sep = " "
)

parser <-
  ArgumentParser(description = description,
                 formatter_class = "argparse.RawTextHelpFormatter")

#-----------------------------------------------------------------------------

parser$add_argument("infile", type = "character",
                    help = "infile")
parser$add_argument("outfile", type = "character",
                    help = "outfile")

parser$add_argument(
  "-bw", "--binwidth", type = "double",
  default = 0.1, help = "binwidth"
)

parser$add_argument("--xlab", type = "character", default = "Value",
                    help = "xlabel")
parser$add_argument("--ylab", type = "character", default = "Density",
                    help = "ylabel")
# The bracketed values in the help text now match the real defaults.
parser$add_argument("--width", type = "integer", default = 6,
                    help = "output image width [6]")
parser$add_argument("--height", type = "integer", default = 3,
                    help = "output image height [3]")

parser$add_argument(
  "-t", "--title", metavar = "title", type = "character",
  default = "", help = "title"
)

#-----------------------------------------------------------------------------

args <- parser$parse_args()

#-----------------------------------------------------------------------------

# The plot below reads the columns named "group" and "value" from the table.
df <- read.csv(args$infile, sep = "\t")

# Per-group density-scaled histogram with a density curve drawn on top.
p <- ggplot(df, aes(x = value, fill = group, colour = group)) +
  geom_histogram(
    aes(y = ..density..), alpha = .3, position = "identity", binwidth = args$binwidth
  ) +
  geom_density(alpha = .2) +
  ylab(args$ylab) +
  xlab(args$xlab) +
  ggtitle(args$title) +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black"),
    legend.key = element_blank(),
    # legend.position = "none",
    legend.title = element_blank()
  )

ggsave(
  p, file = args$outfile, width = args$width, height = args$height
)


--------------------------------------------------------------------------------
/plot/plot_distribution.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | from __future__ import print_function
 4 | 
 5 | import argparse
 6 | import re
 7 | import sys
 8 | 
 9 | import matplotlib as mpl
10 | import matplotlib.pyplot as plt
11 | import seaborn as sns
12 | 
parser = argparse.ArgumentParser(description='Plot distribution',
                                 epilog="https://github.com/shenwei356/bio_scripts")

parser.add_argument('-i', '--infile', nargs='?', type=argparse.FileType('r'),
                    default=sys.stdin, help='Input file')

parser.add_argument('-o', '--outfile', nargs='?', type=str,
                    default='dist.png', help='Output file')

parser.add_argument('--width', type=int, default=8, help='Figure width')
parser.add_argument('--height', type=int, default=6, help='Figure height')
parser.add_argument('--x_lim', type=str, help='x_lim. format: "1,100"')
parser.add_argument('--y_lim', type=str, help='y_lim. format: "1,100"')
parser.add_argument('--bins', type=int, default=0, help='bins, 0 for None')

parser.add_argument(
    '-t', '--title', type=str, default='Distribution Plot', help='Figure Title')
parser.add_argument(
    '-x', '--xlabel', type=str, default='Value', help='Figure X label')
parser.add_argument(
    '-y', '--ylabel', type=str, default='Frequency', help='Figure Y label')

args = parser.parse_args()

# Axis limits must look like "<number>,<number>".  Raw strings keep the
# regex escapes (\d, \.) from being read as string escapes.
if args.y_lim and not re.match(r'^[\d\.]+,[\d\.]+$', args.y_lim):
    print("Invalid option value for --y_lim. Example: --y_lim 1,100 ", file=sys.stderr)
    sys.exit(1)
if args.x_lim and not re.match(r'^[\d\.]+,[\d\.]+$', args.x_lim):
    print("Invalid option value for --x_lim. Example: --x_lim 1,100 ", file=sys.stderr)
    sys.exit(1)

# One number per input line; blank lines (e.g. a trailing newline) are skipped
# instead of crashing float().
data = []
for line in args.infile:
    line = line.strip()
    if not line:
        continue
    data.append(float(line))

mpl.rc("figure", figsize=(args.width, args.height))

# --bins 0 means "no explicit bin count": pass None to the plotting call.
if args.bins == 0:
    args.bins = None

figure = sns.distplot(data, bins=args.bins)

figure.set_title(args.title)
figure.set_xlabel(args.xlabel)
figure.set_ylabel(args.ylabel)

if args.x_lim:
    figure.set_xlim([float(x) for x in args.x_lim.split(',')])
if args.y_lim:
    figure.set_ylim([float(y) for y in args.y_lim.split(',')])

plt.savefig(args.outfile)
65 | 


--------------------------------------------------------------------------------
/protein/protein_batch_compute_pI.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | # Function: Batch compute pI (isoelectric point) and Mw (molecular weight)
 4 | #           via submiting sequences to Compute pI/Mw tool at ExPASy.
 5 | # Author  : Wei Shen <shenwei356#gmail.com> http://shenwei.me
 6 | # Date    : 2013-10-16
 7 | # Update  : 2014-07-29
 8 | 
 9 | use strict;
10 | use BioUtil::Seq;
11 | 
my $usage = <<"USAGE";

Function: Batch compute pI (isoelectric point) and Mw (molecular weight)
          via submiting sequences to Compute pI/Mw tool at ExPASy
 Contact: Wei Shen <shenwei356#gmail.com>
   Usage: $0 amino_acid_fasta_file
    
USAGE

# Exactly one argument is expected: the amino acid FASTA file.
@ARGV == 1 or die $usage;
my $aa_file = shift @ARGV;

# Iterator over the FASTA records; each call returns a [header, sequence]
# array reference that the loop below unpacks.
my $next_seq = FastaReader($aa_file);

# Closure that submits one sequence and returns (success, pI, Mw).
my $PI = &compute_pi();

# Results are printed to STDOUT and also saved next to the input file.
my $out_file = "$aa_file.result.txt";
open( my $out_fh, ">", $out_file )
    or die "fail to write file $out_file\n";

while ( my $fa = $next_seq->() ) {
    my ( $header, $seq ) = @$fa;

    my ( $success, $pi, $mw ) = $PI->($seq);
    if ( !$success ) {

        # on failure $pi carries the status_line of the response
        print
            "$pi. Please check whether the amino acid sequence contains illegal characters.\r\n";
        next;
    }

    my $record = "$header\t$pi\t$mw\r\n";
    print $record;
    print {$out_fh} $record;
}

close $out_fh;
51 | 
# Compute pI/Mw via submiting sequence to Compute pI/Mw tool at ExPASy.
#
# See more: http://web.expasy.org/compute_pi/
#
# compute_pi() takes no arguments and returns a closure. The closure takes a
# protein sequence and an optional resolution ("average", the default, or
# "monoisotopic"), posts them to the tool and returns a list:
#
#    ( 1, $pi, $mw )   on success
#    ( 0, $message )   on failure; $message is the HTTP status line, or a
#                      note that no pI/Mw was found in the returned page
#
# Example:
#
#    my @proteins = qw/AYYAYYAYAYAY ACACAGACG ---/;
#    my $PI = &compute_pi();
#    my ( $success, $pi, $mw );
#    for my $protein (@proteins) {
#        ( $success, $pi, $mw ) = &$PI($protein, "average");
#        # ( $success, $pi, $mw ) = &$PI($protein, "monoisotopic");
#        unless ($success) {
#            print "$pi\n";    # here $pi is the failure message
#            next;
#        }
#        print "($pi, $mw)\n";
#    }
sub compute_pi() {
    use LWP::UserAgent;

    # One user agent is shared by all calls of the returned closure.
    my $ua  = LWP::UserAgent->new;
    my $url = "http://web.expasy.org/cgi-bin/compute_pi/pi_tool";

    return sub($$) {
        my ( $protein, $resolution ) = @_;
        $resolution = "average" unless defined $resolution; # or  monoisotopic
        my $formdata = [
            protein    => $protein,
            resolution => $resolution,
            file       => ""
        ];

        my $res = $ua->post( $url, $formdata );

        # 0 means failed
        return ( 0, $res->status_line )
            unless $res->is_success;

        # Only report success when the result line was really found;
        # otherwise the caller would get "success" without any values.
        if ( $res->content =~ /Theoretical pI\/Mw: ([\d\.]+)\s\/\s([\d\.]+)/ ) {

            # 1 means success
            return ( 1, $1, $2 );
        }

        return ( 0, "No theoretical pI/Mw found in the response" );
    };
}
99 | 


--------------------------------------------------------------------------------
/sequence/README.md:
--------------------------------------------------------------------------------
  1 | # Manipulation on FASTA/Q format file
  2 | 
  3 | I recommend my toolkit [SeqKit](https://github.com/shenwei356/seqkit),
  4 | a cross-platform and efficient toolkit for FASTA/Q file manipulation,
  5 | which integrates most of the functions provided by these scripts.
  6 | 
  7 | ## FASTA
  8 | 
  9 | ### fasta2tab and tab2fasta
 10 | 
 11 | [*fasta2tab*](https://github.com/shenwei356/bio_scripts/blob/master/sequence/fasta2tab) 
 12 | and [*tab2fasta*](https://github.com/shenwei356/bio_scripts/blob/master/sequence/tab2fasta)
 13 | are used in pairs. *fasta2tab* transforms the FASTA format into a two-column table:
 14 | the first column is the header and the second is the sequence.
 15 | It can also compute the reverse complement sequence and remove gaps.
 16 | Sequence length and GC content can be output as additional columns,
 17 | which can be used for filtering and sorting. *tab2fasta* just transforms the
 18 | table back to FASTA format. Combined with shell tools like awk and sed,
 19 | they make it easy to filter and sort FASTA files.
 20 | 
 21 | #### Examples
 22 | 
 23 | ##### 1. sort fasta by sequence length
 24 | 
 25 | ```
 26 | cat seq.fa | fasta2tab -t -l | sort -r -t"`echo -e '\t'`" -n -k3,3 \
 27 | | tab2fasta -l 70 > seq.sorted.fa
 28 | ```
 29 | 
 30 | ##### 2. extract sub sequence
 31 | 
 32 | ```
 33 | fasta2tab -t -sub 3,10 -rc seq.fa | tab2fasta
 34 | ```
 35 | 
 36 | ##### 3. extract sequence longer than 1000 bp
 37 | 
 38 | ```
 39 | cat seq.fa | fasta2tab -t -l | awk -F'\t' '$3 >= 1000' | tab2fasta -l 70
 40 | ```
 41 | 
 42 | ##### 4. extract aligned sequence of which the original sequence is longer than 1000 bp
 43 | 
 44 | ```
 45 | cat seq.fa | fasta2tab -l2 | awk -F'\t' '$3 >= 1000' | tab2fasta -l 70
 46 | ```
 47 | 
 48 | ##### 5. reverse complement sequence, uppercase, and trim gaps
 49 | 
 50 | ```
 51 | zcat seq.fa.gz | fasta2tab -uc -rc -t | tab2fasta
 52 | ```
 53 | 
 54 | ### fasta_extract_by_pattern.pl
 55 | 
 56 | [fasta_extract_by_pattern.pl](https://github.com/shenwei356/bio_scripts/blob/master/sequence/fasta_extract_by_pattern.pl) 
 57 | can extract FASTA sequences by header or sequence; both exact matching and regular
 58 | expression matching are supported. The query patterns can be read from a file,
 59 | and the match can easily be inverted. Most importantly, it can read from STDIN.  
 60 | 
 61 | Combining fasta2tab and tab2fasta with [*csv_grep*](https://github.com/shenwei356/bio_scripts/blob/master/util/csv_grep)
 62 | achieves the same result.
 63 | 
 64 | #### Examples
 65 | 
 66 | ##### 1. sequences WITH "bacteria" in header
 67 | 
 68 | ```
 69 | fasta_extract_by_pattern.pl -r -p Bacteria *.fa > result.fa
 70 | ```
 71 | 
 72 | ##### 2. sequences WITHOUT “bacteria” in header
 73 | 
 74 | ```
 75 | fasta_extract_by_pattern.pl -r -n -p Bacteria seq1.fa seq2.fa > result.fa
 76 | ```
 77 | 
 78 | ##### 3. sequences with TTSAA (AgsI digest site) in SEQUENCE.  Base S stands for C or G.
 79 | 
 80 | ```
 81 | fasta_extract_by_pattern.pl -r -s -p 'TT[CG]AA' seq.fa > result.fa
 82 | ```
 83 | 
 84 | ##### 4. sequences (read from STDIN ) with header that matches any patterns in list file
 85 | 
 86 | ```
 87 | zcat seq.fa.gz | fasta_extract_by_pattern.pl -pf name_list.txt > result.fa
 88 | ```
 89 | 
 90 | ### fasta_common_seqs.pl
 91 | 
 92 | [fasta_common_seqs.pl](https://github.com/shenwei356/bio_scripts/blob/master/sequence/fasta_common_seqs.pl)
 93 | is used to find common sequences in multiple files. It supports comparing by header or sequence. 
 94 | By storing only the MD5 value of each sequence, it keeps memory usage low. It can also be
 95 | used to remove duplicated records, by finding the common sequences between a
 96 | file and its copy or soft link.
 97 | 
 98 | ### fasta_remove_duplicates.pl
 99 | 
100 | [fasta_remove_duplicates.pl](https://github.com/shenwei356/bio_scripts/blob/master/sequence/fasta_remove_duplicates.pl)
101 | could remove duplicated records from file or STDIN, by both sequence and header.
102 | 
103 | ### fasta_locate_motif.pl
104 | 
105 | [fasta_locate_motif.pl](https://github.com/shenwei356/bio_scripts/blob/master/sequence/fasta_locate_motif.pl)
106 | can locate restriction enzyme recognition sites or other motifs.
107 | 
108 | ### fasta_gc_skew.py and fasta_gc_skew.plot.R
109 | 
110 | Sample out:
111 | 
112 | ![GC Skew](sample/gc_skew.png)
113 | 
114 | ## FASTQ
115 | 
116 | ### fastq2tab and tab2fastq
117 | 
118 | [*fastq2tab*](https://github.com/shenwei356/bio_scripts/blob/master/sequence/fastq2tab) and [*tab2fastq*](https://github.com/shenwei356/bio_scripts/blob/master/sequence/tab2fastq) are similar to fasta2tab and tab2fasta. They can be used to filter FASTQ records with the help of [*csv_grep*](https://github.com/shenwei356/bio_scripts/blob/master/util/csv_grep).
119 | 
120 | Example: removing contaminating reads
121 | 
122 |     zcat reads.fq.gz                                \
123 |        | fastq2tab                                  \
124 |        | csv_grep -t -pf <(cat idlist) -i -d        \
125 |        | tab2fastq                                  \
126 |        | gzip -c                                    \
127 |        > reads2.fq.gz
128 | 


--------------------------------------------------------------------------------
/sequence/fasta2tab:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | # https://github.com/shenwei356/bio_scripts
  3 | 
  4 | use strict;
  5 | use Getopt::Long;
  6 | use BioUtil::Seq;
  7 | use BioUtil::Util;
  8 | 
  9 | my $usage = q(
 10 | fasta2tab - transform the fasta fromat to two-column table
 11 | 
 12 | Usage: fasta2tab [options] [fastafiles...]
 13 | Options:
 14 |     -r,   --reverse             Reverse sequence
 15 |     -c,   --complement          Complement sequence
 16 |     -rc,  --reversecomplement   Reversecomplement
 17 |     -sub, --subseq  INT,INT     Substring of sequence, 1-based
 18 |                                 Examples:
 19 |                                     seq         ACGAGACGTA
 20 |                                     index       1234567890
 21 | 
 22 |                                      option      subseq
 23 |                                     --------------------
 24 |                                     -sub  2,7    CGAGAC
 25 |                                     -sub  2,2    C
 26 |                                     -sub   ,7   ACGAGAC
 27 |                                     -sub  2,     CGAGACGTA
 28 |                                     -sub -3,           GTA
 29 |                                     -sub -3,-2         GT
 30 |                                     -sub   ,-3  ACGAGACG
 31 | 
 32 |     -t,   --trim                Trim non-Latin alphabet
 33 |     -lc,  --lowercase           Lowercase
 34 |     -uc,  --uppercase           Uppercase
 35 | 
 36 |     -l,   --length              Ouput sequence length at another column
 37 |     -l2,  --length2             Ouput number of latin-letter in sequence
 38 |                                 at another column
 39 |     -bc,  --bc STRING[,STRING]  Ouput base content
 40 |                                 Examples:
 41 |                                     'GC' :  G+C content,
 42 |                                     'G,C':  G and C, in two column
 43 |     -gc,  --gc                  Ouput GC content at another column
 44 | 
 45 |     -h,   --help                Show this help information
 46 | 
 47 | Examples:
 48 | 
 49 |     1. sort fasta by sequnece length
 50 |        cat seq.fa | fasta2tab -t -l | sort -r -t"`echo -e '\t'`" -n -k3,3 \
 51 |          |  tab2fasta -l 70 > seq.sorted.fa
 52 | 
 53 |     2. extract sub sequence
 54 |        fasta2tab -t -sub 3,10 -rc  seq.fa | tab2fasta
 55 | 
 56 |     3. extract sequence longer than 1000 bp
 57 |        cat seq.fa | fasta2tab -t -l | awk -F'\t' '$3 >= 1000' | tab2fasta -l 70
 58 | 
 59 |     4. extract aligned sequence of which the original sequence is longer than 1000 bp
 60 |        cat seq.fa | fasta2tab   -l2 | awk -F'\t' '$3 >= 1000' | tab2fasta -l 70
 61 | 
 62 |     5. reverse complement sequence, uppercase, and trim gaps
 63 |        zcat seq.fa.gz | fasta2tab -uc -rc -t | tab2fasta
 64 | 
 65 | This script is usually used in pair with tab2fasta.
 66 | https://github.com/shenwei356/bio_scripts
 67 | 
 68 | );
 69 | 
 70 | my $para = {};
 71 | GetOptions(
 72 |     'help|h' => \$$para{help},
 73 | 
 74 |     'reverse|r'            => \$$para{rev},
 75 |     'complement|c'         => \$$para{comp},
 76 |     'reversecomplement|rc' => \$$para{rc},
 77 |     'subseq|sub=s'         => \$$para{sub},
 78 | 
 79 |     'trim|t'       => \$$para{trim},
 80 |     'lowercase|lc' => \$$para{lc},
 81 |     'uppercase|uc' => \$$para{uc},
 82 | 
 83 |     'length|l'   => \$$para{len},
 84 |     'length2|l2' => \$$para{len2},
 85 |     'bc=s'       => \$$para{bc},
 86 |     'gc'         => \$$para{gc},
 87 | ) or die $usage;
 88 | 
 89 | die $usage if $$para{help};
 90 | if ( $$para{sub} ) {
 91 |     die qq(
 92 | parameter of -sub not correct.
 93 | 
 94 | examples:
 95 |     seq         ACGAGACGTA
 96 |     index       1234567890
 97 | 
 98 |      option      subseq
 99 |     --------------------
100 |     -sub  2,7    CGAGAC
101 |     -sub  2,2    C
102 |     -sub   ,7   ACGAGAC
103 |     -sub  2,     CGAGACGTA
104 |     -sub -3,           GTA
105 |     -sub -3,-2         GT
106 |     -sub   ,-3  ACGAGACG
107 | 
108 | ) unless $$para{sub} =~ /^(-?\d*),(-?\d*)$/;
109 |     die "warning: end ($2) should be >= start ($1)\n" if $2 ne '' and $1 ne '' and $2 < $1 ;
110 | }
111 | 
my @files = file_list_from_argv(@ARGV);

# The options never change while records are streamed, so decode the
# comma-separated ones once instead of once per record.
my ( $sub_start, $sub_end );
( $sub_start, $sub_end ) = split /,/, $para->{sub} if $para->{sub};
my @bases;
@bases = split /,/, $para->{bc} if $para->{bc};

for my $file (@files) {
    my $reader = FastaReader($file);
    while ( my $record = $reader->() ) {
        my ( $header, $seq ) = @$record;

        # a literal tab in the header would corrupt the table layout
        $header =~ s/\t/__tab__/g;

        $seq =~ s/[^a-zA-Z]+//g if $para->{trim};

        if ( $para->{sub} ) {

            # 1-based bounds; an empty bound means "from the first base" /
            # "to the last base", a negative bound counts from the end.
            my ( $start, $end ) = ( $sub_start, $sub_end );
            $start
                = $start eq '' ? 1
                : $start < 0   ? $start + 1
                :                $start;
            $end
                = $end eq '' ? 1 + length($seq)
                : $end < 0   ? $end + 1
                :              $end;
            $seq = substr $seq, $start - 1, $end - $start + 1;
        }

        # -rc takes precedence over the separate -c / -r switches
        if ( $para->{rc} ) {
            $seq = revcom($seq);
        }
        else {
            $seq = complement($seq)    if $para->{comp};
            $seq = scalar reverse $seq if $para->{rev};
        }

        # -lc wins when both case switches are given
        if    ( $para->{lc} ) { $seq = lc $seq }
        elsif ( $para->{uc} ) { $seq = uc $seq }

        # mandatory columns first, optional columns appended in fixed order
        my @columns = ( $header, $seq );

        push @columns, length $seq if $para->{len};

        if ( $para->{len2} ) {
            if ( $para->{trim} ) {

                # already stripped down to letters above
                push @columns, length $seq;
            }
            else {
                ( my $letters = $seq ) =~ s/[^a-zA-Z]+//g;
                push @columns, length $letters;
            }
        }

        # -gc overrides -bc
        if ( $para->{gc} ) {
            push @columns, base_content( 'gc', $seq );
        }
        elsif ( $para->{bc} ) {
            push @columns, base_content( $_, $seq ) for @bases;
        }

        print join( "\t", @columns ), "\n";
    }
}
182 | 


--------------------------------------------------------------------------------
/sequence/fasta_common_seqs.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | # Copyright 2014 Wei Shen (shenwei356#gmail.com). All rights reserved.
  3 | # Use of this source code is governed by a MIT-license
  4 | # that can be found in the LICENSE file.
  5 | use strict;
  6 | use File::Basename;
  7 | use Getopt::Long;
  8 | use Digest::MD5 'md5_hex';
  9 | use BioUtil::Seq;
 10 | 
 11 | local $| = 1;
 12 | $0 = basename($0);
 13 | my $usage = <<"USAGE";
 14 | ===============================================================================
 15 | Function: Find common sequences in fasta files.
 16 |           Features:
 17 |               1) Comparing by name or sequence are both supported.
 18 |               2) No files number limit.
 19 |               3) Low RAM usage.
 20 |           Note that:
 21 |               1) Records with different names may have same sequences.
 22 |               2) Case of sequence letters or name may be different.
 23 |               3) Duplicated records may exist in a fasta file.
 24 | Contact : Wei Shen <shenwei356#gmail.com>
 25 | Date    : 2013-11-07
 26 | Update  : 2014-08-14
 27 | Site    : https://github.com/shenwei356/bio_scripts
 28 | 
 29 | Usage   : $0 [-s] [-i] fastafile fastafile2 [fastafile3 ...]
 30 | Options :
 31 |    -s   Comparing by sequence.
 32 |    -i   Ignore case.
 33 |    -l   Output line length. [70]
 34 | ===============================================================================
 35 | 
 36 | USAGE
 37 | 
 38 | my $by_seq      = 0;
 39 | my $ignore_case = 0;
 40 | my $linelength  = 70;
 41 | GetOptions(
 42 |     "s"   => \$by_seq,
 43 |     "i"   => \$ignore_case,
 44 |     'l=i' => \$linelength,
 45 | ) or die $usage;
 46 | 
 47 | # at least two files;
 48 | die "$usage\n>= 2 sequence file needed.\n" unless @ARGV >= 2;
 49 | 
 50 | my $counts = {};
 51 | my $names  = {};
 52 | 
 53 | my ( $file, $next_seq, $head, $head0, $seq, $seq_md5 );
 54 | 
 55 | for $file (@ARGV) {
 56 |     print STDERR "\nparsing $file...\n";
 57 |     my $n = 0;
 58 |     $next_seq = FastaReader($file);
 59 |     while ( my $fa = &$next_seq() ) {
 60 |         ( $head, $seq ) = @$fa;
 61 |         print STDERR "\r", ++$n;
 62 |         $head0 = $head;                     # orgin sequence name
 63 |         $head = lc $head if $ignore_case;
 64 | 
 65 |         if ($by_seq) {
 66 |             $seq =~ tr/A-Z/a-z/ if $ignore_case;
 67 |             $seq_md5 = md5_hex($seq);
 68 | 
 69 |             # count sequences with md5 $seq_md5 in $file
 70 |             $$counts{$seq_md5}{$file}++;    #
 71 |                                             # record the origin sequence name.
 72 |             $$names{$seq_md5}{$file} = $head0;
 73 |         }
 74 |         else {
 75 |             # count sequences with name $head in $file
 76 |             $$counts{$head}{$file}++;
 77 |             $$names{$head}{$file} = $head0;
 78 |         }
 79 |     }
 80 | }
 81 | 
 82 | # output common sequences
 83 | print STDERR "\nchecking...\n";
 84 | my $file_num = scalar @ARGV;
 85 | $file = $ARGV[0];    # extract sequences from the first file.
 86 | my $names_ok = {};
 87 | for my $key ( keys %$counts ) {
 88 | 
 89 |     # all files have a same record
 90 |     next unless ( scalar keys %{ $$counts{$key} } ) == $file_num;
 91 | 
 92 |     $$names_ok{ $$names{$key}{$file} }
 93 |         = $$counts{$key}{$file};    # save to a hash.
 94 | }
 95 | 
 96 | print STDERR "extracting...\n";
 97 | my $n = 0;
 98 | $next_seq = FastaReader($file);
 99 | while ( my $fa = &$next_seq() ) {
100 |     ( $head, $seq ) = @$fa;
101 | 
102 |     if ( exists $$names_ok{$head} and $$names_ok{$head} > 0 ) {
103 |         print STDERR "\rhit: ", ++$n;
104 |         print ">$head\n", format_seq( $seq, $linelength );
105 | 
106 |         # just export one record for duplicated records.
107 |         $$names_ok{$head} = 0;
108 |     }
109 | }
110 | print STDERR "\n";
111 | 


--------------------------------------------------------------------------------
/sequence/fasta_extract_by_pattern.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | # https://github.com/shenwei356/bio_scripts
  3 | 
  4 | use strict;
  5 | 
  6 | use Getopt::Long;
  7 | use File::Basename;
  8 | use BioUtil::Seq;
  9 | use BioUtil::Util;
 10 | 
 11 | $0 = basename($0);
 12 | my $usage = <<USAGE;
 13 | 
 14 | Extract fasta sequences by header (list file) or regular expression (list file)
 15 | 
 16 | Version: 2015.02.06
 17 | Usage: $0 [options] [fastafiles...]
 18 | Options:
 19 |     
 20 |     -p,  --pattern STRING      Search pattern
 21 |     -pf, --patternfile FILE    Pattern list file (use first column)
 22 |     -r,  --useregexp           Use regular expression, case ignored
 23 |     -d,  --speedup             Delete matched pattern, if you know what it means
 24 |     -n,  --not                 Invert match, extract sequences NOT match the pattern
 25 |     -s,  --byseq               Match by sequence 
 26 |     -h,  --help                Show this help information
 27 | 
 28 | Examples:
 29 | 
 30 |     1) sequences WITH "bacteria" in header
 31 |         $0 -r -p Bacteria *.fa > result.fa
 32 |     2) sequences WITHOUT "bacteria" in header
 33 |         $0 -r -n -p Bacteria seq1.fa seq2.fa > result.fa
 34 |     3) sequences with TTSAA (AgsI digest site) in SEQUENCE. 
 35 |        Base S stands for C or G.
 36 |         $0 -r -s -p 'TT[C|G]AA' seq.fa > result.fa
 37 |     4) sequences (read from STDIN ) with header that matches any patterns
 38 |        in list file
 39 |         zcat seq.fa.gz | $0 -pf name_list.txt > result.fa
 40 | 
 41 | https://github.com/shenwei356/bio_scripts
 42 | 
 43 | USAGE
 44 | 
 45 | my $para = {};
 46 | GetOptions(
 47 |     'help|h'           => \$$para{help},
 48 |     'useregexp|r'      => \$$para{useregexp},
 49 |     'speedup|d'        => \$$para{speedup},
 50 |     'not|n'            => \$$para{not},
 51 |     'pattern|p=s'      => \$$para{pattern},
 52 |     'patternfile|pf=s' => \$$para{patternfile},
 53 |     'byseq|s'          => \$$para{byseq},
 54 | ) or die $usage;
 55 | die $usage if $$para{help};
 56 | 
 57 | # get patterns
 58 | my $patterns = {};
 59 | $$patterns{$$para{pattern}} = 1 if $$para{pattern};
 60 | if ( $$para{patternfile} ){
 61 |     $$patterns{$_} = 1 for @{ get_column_data( $$para{patternfile}, 1 ) };
 62 | }
 63 | die "no patterns given. Type \"$0 -h\" for help.\n" if keys %$patterns == 0;
 64 | 
 65 | # get the file list
 66 | my @files = file_list_from_argv(@ARGV);
 67 | 
 68 | my $not_trim = 1;
 69 | $not_trim = 0 if $$para{byseq};
 70 | 
 71 | my ( $sum, $n ) = ( 0, 0 );
 72 | 
 73 | for my $file (@files) {
 74 | 
 75 |     my $next_seq = FastaReader( $file, $not_trim );
 76 |     while ( my $fa = &$next_seq() ) {
 77 |         my ( $header, $seq ) = @$fa;
 78 |         $sum++;
 79 | 
 80 |         # matching object, by header or sequence
 81 |         my $object = $header;
 82 |         if ( $$para{byseq} ) {
 83 |             $object = $seq;
 84 |         }
 85 | 
 86 |         my $hit = undef;
 87 |         if ( $$para{useregexp} ) {    # use regular expression
 88 |             for my $p (keys %$patterns) {
 89 |                 if ( $object =~ /$p/i ) {
 90 |                     $hit = 1;
 91 |                     delete $$patterns{$p} if $$para{speedup};
 92 |                     last;
 93 |                 }
 94 |             }
 95 |         }
 96 |         else {                        # compare with full header | sequence
 97 |             if ( exists $$patterns{$object} ) {
 98 |                 $hit = 1;
 99 |             }
100 |         }
101 | 
102 |         if ( $$para{not} ) {          # NOT
103 |             next if $hit;
104 |         }
105 |         else {
106 |             next unless $hit;
107 |         }
108 | 
109 |         $n++;
110 |         if ( $$para{byseq} ) {
111 |             print ">$header\n", format_seq($seq);
112 |         }
113 |         else {
114 |             print ">$header\n$seq";
115 |         }
116 |     }
117 | }
118 | 
119 | print STDERR "\rHits: $n / $sum\n";
120 | 


--------------------------------------------------------------------------------
/sequence/fasta_extract_randomly.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | # https://github.com/shenwei356/bio_scripts
 3 | 
 4 | use strict;
 5 | 
 6 | use File::Basename;
 7 | use BioUtil::Seq;
 8 | use BioUtil::Util;
 9 | 
10 | $0 = basename($0);
11 | my $usage = <<USAGE;
12 | 
13 | Randomly extract fasta sequences by a given proportion
14 | 
15 | Examples: 
16 | 
17 |     1) $0 0.1 seq.fa
18 |     2) $0 0.1 seq.fa seq2.fa   # multi files supported
19 |     3) $0 0.1 seq*.fa          # glob expression
20 |     4) cat seq.fa | $0 0.1     # read from STDIN
21 | 
22 | https://github.com/shenwei356/bio_scripts
23 | 
24 | USAGE
die $usage unless @ARGV >= 1;

# The first argument is the sampling probability. It must be one well-formed
# decimal number ("0.1", "1", ".5"): the looser /^[\d\.]+$/ also accepted
# strings such as "0.1.5", which Perl silently numifies to 0.1.
my $p = shift @ARGV;
die "Probability should between 0 and 1\n"
    unless $p =~ /^(?:\d+\.?\d*|\.\d+)$/
    and $p > 0
    and $p <= 1;

srand();

my @files = file_list_from_argv(@ARGV);

my $n = 0;
for my $file (@files) {

    # the sequence text is printed back exactly as the reader returns it
    my $next_seq = FastaReader( $file, 1 );
    while ( my $fa = &$next_seq() ) {
        my ( $header, $seq ) = @$fa;

        # keep each record independently with probability $p
        next unless rand() < $p;
        $n++;
        print ">$header\n$seq";
    }
}

print STDERR "sum: $n\n";
50 | 


--------------------------------------------------------------------------------
/sequence/fasta_gc_skew.plot.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | # https://github.com/shenwei356/bio_scripts
  3 | library(methods)
  4 | library(proto)
  5 | library(dplyr)
  6 | library(tidyr)
  7 | library(argparse)
  8 | library(ggplot2)
  9 | library(scales)
 10 | library(ggthemes)
 11 | library(tidyr)
 12 | library(swr)
 13 | 
 14 | 
 15 | parser <-
 16 |   ArgumentParser(description = "Plot GC and GC Skew with the result produced by fasta_gc_skew.py",
 17 |                  formatter_class = "argparse.RawTextHelpFormatter")
 18 | 
 19 | parser$add_argument("infile", type = "character",
 20 |                     help = "gcskew file produced by fasta_gc_skew.py")
 21 | parser$add_argument("outfile", type = "character",
 22 |                     help = "outfile")
 23 | parser$add_argument(
 24 |   "-xi",
 25 |   "--x-interval",
 26 |   type = "integer",
 27 |   default = 1000000,
 28 |   help = "x axix interval [1,000,000]"
 29 | )
 30 | parser$add_argument("-n",
 31 |                     type = "integer",
 32 |                     default = 10,
 33 |                     help = "divide the normalized accum_gcskew by n so it looks better [10]")
 34 | parser$add_argument(
 35 |   "--width",
 36 |   metavar = "width",
 37 |   type = "integer",
 38 |   default = 20,
 39 |   help = "output image width [20]"
 40 | )
 41 | parser$add_argument(
 42 |   "--height",
 43 |   metavar = "height",
 44 |   type = "integer",
 45 |   default = 5,
 46 |   help = "output image height [5]"
 47 | )
 48 | parser$add_argument(
 49 |   "-g",
 50 |   "--gc-content",
 51 |   action = "store_true",
 52 |   dest = "gc_content",
 53 |   help = "only plot GC Content"
 54 | )
 55 | parser$add_argument("-s",
 56 |                     "--gc-skew",
 57 |                     action = "store_true",
 58 |                     dest = "gc_skew",
 59 |                     help = "only plot GC Skew")
 60 | parser$add_argument(
 61 |   "-t",
 62 |   "--title",
 63 |   metavar = "title",
 64 |   type = "character",
 65 |   default = "GC Content/GC Skew",
 66 |   help = "title"
 67 | )
 68 | 
 69 | args <- parser$parse_args()
 70 | 
 71 | if (args$title == "") {
 72 |   args$title = NULL
 73 | }
 74 | 
 75 | df <- read.csv(args$infile, sep = "\t")
 76 | df['accum_gcskew'] = df['accum_gcskew'] / max(df['accum_gcskew']) / args$n
 77 | 
 78 | if (args$gc_content && !args$gc_skew) {
 79 |   df['gcskew'] = NULL
 80 |   df['accum_gcskew'] = NULL
 81 | }
 82 | if (!args$gc_content && args$gc_skew) {
 83 |   df['gc'] = NULL
 84 | }
 85 | 
 86 | df_m <- df %>% gather(group, value, -chr, -loc)
 87 | 
 88 | p <- ggplot(df_m) +
 89 |   geom_line(aes(loc, value, color = group)) +
 90 |   geom_hline(aes(yintercept = 0), linetype = 2) +
 91 |   scale_size(range = c(0.1)) +
 92 |   scale_colour_wsj() +
 93 |   facet_grid(chr ~ .) +
 94 |   ylab(NULL) +
 95 |   xlab("Position (bp)") +
 96 |   scale_x_continuous(breaks = seq(0, max(df$loc), by = args$x_interval),
 97 |                      labels = comma) +
 98 |   ggtitle(args$title) +
 99 |   shenwei356.theme() +
100 |   theme(legend.position = "top")
101 | 
102 | ggsave(
103 |   p,
104 |   file = args$outfile,
105 |   width = args$width,
106 |   height = args$height
107 | )
108 | 


--------------------------------------------------------------------------------
/sequence/fasta_gc_skew.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # https://github.com/shenwei356/bio_scripts
 4 | from __future__ import division
 5 | 
 6 | import argparse
 7 | import sys
 8 | 
 9 | import numpy as np
10 | from Bio import SeqIO
11 | 
12 | 
def parse_args():
    """Parse the command line and return the resulting namespace.

    Attributes of the returned namespace: ``infile`` (path of the FASTA
    file), ``window`` and ``step`` (integers, default 10000 and 200) and
    ``circular`` (bool, set by ``-c``).
    """
    cli = argparse.ArgumentParser(
        description="GC Skew",
        epilog="https://github.com/shenwei356/bio_scripts",
    )

    cli.add_argument('infile', type=str, help='fasta file')
    cli.add_argument('-w', '--window', type=int, default=10000,
                     help='window size [10000]')
    cli.add_argument('-s', '--step', type=int, default=200,
                     help='step size [200]')
    cli.add_argument('-c', '--circular', action='store_true',
                     help='circular genome')

    return cli.parse_args()
24 | 
25 | 
def GC_Skew(seq, window=10000, step=200, circular=False):
    """Compute GC content and GC skew in sliding windows along *seq*.

    Windows start at 0, ``step``, ``2 * step``, ...  For a linear sequence
    the last window starts no later than ``len(seq) - window``; for a
    circular one windows start up to ``len(seq) - step`` and wrap around
    the origin.

    Args:
        seq: the sequence; anything whose ``str()`` is the plain sequence
            text (a ``str`` or a Biopython ``Seq``).
        window: window length in bases, must be positive.
        step: distance between consecutive window starts, must be positive.
        circular: treat the sequence as circular.

    Returns:
        A pair ``(GC, skew)`` of 1-D float arrays with one entry per window:
        ``GC`` is ``(G + C) / window_length`` and ``skew`` is
        ``(G - C) / (G + C)``.  A window without any G or C has skew 0, and
        a linear sequence shorter than ``window`` is measured over its real
        length.

    Raises:
        ValueError: if ``window`` or ``step`` is not positive.
    """
    if window <= 0 or step <= 0:
        raise ValueError('window and step must be positive integers')

    # Work on a plain string so slicing and counting behave the same for
    # every input type.
    seq = str(seq)
    length = len(seq)

    if circular:
        end = length - step if length > step else 0
        if length:
            # Append enough copies of the sequence that every window can be
            # read as one plain slice, even when window > length.
            repeats = -(-window // length)  # ceil(window / length)
            seq = seq + (seq * repeats)[:window]
    else:
        end = length - window if length > window else 0

    starts = range(0, end + 1, step)
    GC, skew = np.zeros(len(starts)), np.zeros(len(starts))
    for cnt, i in enumerate(starts):
        s = seq[i:i + window]
        g = s.count('g') + s.count('G')
        c = s.count('c') + s.count('C')
        # The arrays are zero-initialised, so empty windows and windows
        # without G/C simply keep 0 instead of dividing by zero.
        if s:
            GC[cnt] = (g + c) / len(s)
        if g + c:
            skew[cnt] = (g - c) / (g + c)
    return GC, skew
44 | 
45 | 
if __name__ == '__main__':
    args = parse_args()

    # One output row per window: sequence id, 1-based window start, GC
    # content, GC skew and the running sum of the skew.
    header = '\t'.join(['chr', 'loc', 'gc', 'gcskew', 'accum_gcskew']) + '\n'
    row = '{}\t{}\t{:.4f}\t{:.4f}\t{:.4f}\n'

    with open(args.infile) as fh:
        sys.stdout.write(header)
        for record in SeqIO.parse(fh, 'fasta'):
            sys.stderr.write('compute gcskew: {}\n'.format(record.id))
            GC, gcskew = GC_Skew(record.seq, window=args.window,
                                 step=args.step, circular=args.circular)
            acc = 0
            for i, (gc, skew) in enumerate(zip(GC, gcskew)):
                acc += skew
                sys.stdout.write(
                    row.format(record.id, i * args.step + 1, gc, skew, acc))
59 | 


--------------------------------------------------------------------------------
/sequence/fasta_locate_motif.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | # https://github.com/shenwei356/bio_scripts
  3 | 
  4 | use strict;
  5 | use Getopt::Long;
  6 | use File::Basename;
  7 | use BioUtil::Seq;
  8 | 
  9 | $0 = basename($0);
 10 | my $usage = <<USAGE;
 11 | 
 12 | $0 - locating motif in genomes. 
 13 | 
 14 | Motifs could be EITHER plain sequence containing "ACTGN" OR regular
 15 | expression like "A[TU]G(?:.{3})+?[TU](?:AG|AA|GA)" for ORFs. 
 16 | Degenerate bases like "RYMM.." are also supported by option -d/
 17 | 
 18 | Usage: $0 <motif fasta file> <subject fasta file>
 19 | Options:
 20 | 
 21 |     -d, --degenerate   Motif contains egenerate base
 22 |     -h, --help         Show this help information
 23 | 
 24 | Attention: In default, motifs are treated as regular expression.
 25 |            When option -d given, regular expression may be wrong. 
 26 |            For example: "\\w" -> "\\[AT]". In this case you can use "\\.+?"
 27 | 
 28 | USAGE
 29 | 
 30 | my $args = {};
 31 | GetOptions(
 32 |     'help|h'       => \$$args{help},
 33 |     'degenerate|d' => \$$args{degenerate},
 34 | ) or die $usage;
 35 | die $usage if $$args{help};
 36 | die $usage unless @ARGV == 2;
 37 | 
 38 | my $queries = read_sequence_from_fasta_file( shift @ARGV );
 39 | 
 40 | my $next_seq = FastaReader( shift @ARGV );
 41 | 
 42 | print "subject\tquery\tstart\tend\tstrand\tmatched\n";
 43 | while ( my $fa = &$next_seq() ) {
 44 |     my ( $header, $seq ) = @$fa;
 45 | 
 46 |     for my $qname ( sort keys %$queries ) {
 47 |         my $qseq = $$queries{$qname};
 48 | 
 49 |         my $qseq_r = $qseq;
 50 |         $qseq_r = degenerate_seq_to_regexp($qseq_r) if $$args{degenerate};
 51 | 
 52 |         my $matches = match_regexp( $qseq_r, $seq );
 53 |         for my $match (@$matches) {
 54 |             my ( $start, $end, $matched ) = @$match;
 55 |             $start += 1;
 56 |             $end   += 1;
 57 |             print "$header\t$qname\t$start\t$end\t+\t$matched\n";
 58 |         }
 59 | 
 60 |         my $qseq_r = revcom($qseq);
 61 |         $qseq_r = degenerate_seq_to_regexp($qseq_r) if $$args{degenerate};
 62 |         my $matches = match_regexp( $qseq_r, $seq );
 63 |         for my $match (@$matches) {
 64 |             my ( $start, $end, $matched ) = @$match;
 65 |             $start += 1;
 66 |             $end   += 1;
 67 |             print "$header\t$qname\t$start\t$end\t-\t"
 68 |                 . revcom($matched) . "\n";
 69 |         }
 70 |     }
 71 | }
 72 | 
 73 | =head2 degenerate_seq_to_regexp
 74 | 
 75 | Translate degenerate sequence to regular expression.
 76 | 
 77 | =cut
 78 | 
 79 | sub degenerate_seq_to_regexp {
 80 |     my ($seq) = @_;
 81 |     my %bases = (
 82 |         'A' => 'A',
 83 |         'T' => 'T',
 84 |         'U' => 'U',
 85 |         'C' => 'C',
 86 |         'G' => 'G',
 87 |         'R' => '[AG]',
 88 |         'Y' => '[CT]',
 89 |         'M' => '[AC]',
 90 |         'K' => '[GT]',
 91 |         'S' => '[CG]',
 92 |         'W' => '[AT]',
 93 |         'H' => '[ACT]',
 94 |         'B' => '[CGT]',
 95 |         'V' => '[ACG]',
 96 |         'D' => '[AGT]',
 97 |         'N' => '[ACGT]',
 98 |     );
 99 |     return join '', map { exists $bases{$_} ? $bases{$_} : $_ }
100 |         split //, uc $seq;
101 | }
102 | 
=head2 match_regexp

Find all sites matching the regular expression, overlapping ones included.
Matching is case-insensitive.

Arguments: the regular expression and the subject string.

Returns a reference to a list of C<[start, end, matched_string]> triples;
C<start> and C<end> are 0-based and inclusive.

See https://github.com/shenwei356/bio_scripts/blob/master/sequence/fasta_locate_motif.pl

=cut

sub match_regexp {
    my ( $r, $s ) = @_;
    my @matched = ();
    my $length  = length $s;
    while ( $s =~ /($r)/ig ) {
        my $end   = pos $s;               # offset just past the match
        my $start = $end - length $1;

        # return start, end, matched string
        # start and end are 0-based
        push @matched, [ $start, $end - 1, $1 ];

        # An empty match at the very end of the string cannot be followed by
        # anything. Stop here: pos() cannot move past the end, so the same
        # empty match would otherwise be found forever.
        last if $start >= $length;

        # Resume one character after the start of this match, so that
        # overlapping matches are reported as well.
        pos($s) = $start + 1;
    }
    return \@matched;
}
125 | 


--------------------------------------------------------------------------------
/sequence/fasta_remove_duplicates.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | # https://github.com/shenwei356/bio_scripts
 3 | 
 4 | use strict;
 5 | use File::Basename;
 6 | use Getopt::Long;
 7 | use Digest::MD5 'md5_hex';
 8 | use BioUtil::Seq;
 9 | use BioUtil::Util;
10 | 
11 | local $| = 1;
12 | $0 = basename($0);
13 | my $usage = <<USAGE;
14 | 
15 | Remove duplicated fasta records
16 | 
17 | Usage: $0 [options] [fastafiles...]
18 | Options:
19 | 
20 |    -n   Comparing by header.
21 |    -s   Comparing by sequence.
22 |    -i   Ignore case.
23 |    -l   Output line length. [70]
24 | 
25 |    -h   Show this help information.
26 | 
27 | Examples:
28 | 
29 |     fasta_remove_duplicates.pl -s -i seq1.fa seq2.fa > uniq.fa
30 |     fasta_remove_duplicates.pl -n seq*.fa > uniq.fa
31 |     zcat seq.fa.gz | fasta_remove_duplicates.pl -s -i > uniq.fa
32 | 
33 |     # remove records same header and seqs
34 |     fasta_remove_duplicates.pl -s -n -i seq1.fa > uniq.fa
35 | 
36 | https://github.com/shenwei356/bio_scripts
37 | 
38 | USAGE
39 | 
my $help        = 0;
my $by_head     = 0;
my $by_seq      = 0;
my $ignore_case = 0;
my $linelength  = 70;
GetOptions(
    'help|h' => \$help,
    "n"      => \$by_head,
    "s"      => \$by_seq,
    "i"      => \$ignore_case,
    'l=i'    => \$linelength,
) or die $usage;

die $usage if $help;
if ( $linelength <= 0 ) {
    die sprintf "value of -l (%d) should be greatter than 0\n", $linelength;
}

# Without -n or -s there is nothing to compare: every record would get the
# same (empty) key and all but the first would be dropped as "duplicates".
die "$usage\nat least one of -n and -s should be given.\n"
    unless $by_head or $by_seq;

# get the file list
my @files = file_list_from_argv(@ARGV);

# MD5 digests of the keys seen so far; storing digests instead of the keys
# themselves keeps memory usage low for long sequences.
my %seen;
my ( $sum, $n ) = ( 0, 0 );    # records kept / records removed
for my $file (@files) {
    my $next_seq = FastaReader($file);
    while ( my $fa = &$next_seq() ) {
        my ( $header, $seq ) = @$fa;

        # The comparison key. When both parts are used they are joined with
        # a newline, which cannot occur inside a header, so that e.g.
        # ("ab", "c") and ("a", "bc") stay distinct.
        my $target
            = ( $by_head and $by_seq ) ? $header . "\n" . $seq
            : $by_seq                  ? $seq
            :                            $header;
        $target = lc $target if $ignore_case;

        my $md5 = md5_hex($target);
        if ( $seen{$md5} ) {    # duplicates
            $n++;
        }
        else {
            $seen{$md5} = 1;
            $sum++;
            print ">$header\n", format_seq( $seq, $linelength );
        }
        print STDERR "\rremove: $n; remain: $sum";
    }
}

print STDERR "\n";
95 | 


--------------------------------------------------------------------------------
/sequence/fasta_rename_duplicated_names.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | # https://github.com/shenwei356/bio_scripts
 3 | 
 4 | use strict;
 5 | use File::Basename;
 6 | use Getopt::Long;
 7 | use BioUtil::Seq;
 8 | use BioUtil::Util;
 9 | 
10 | $0 = basename($0);
11 | my $usage = <<USAGE;
12 | 
13 | Remove duplicated fasta names
14 | 
15 | Usage: $0 [options] [fastafiles...]
16 | Options:
17 | 
18 |    -l   Output line length. should be >= 0, 0 for no formating [70]
19 |    -h   Show this help information.
20 | https://github.com/shenwei356/bio_scripts
21 | 
22 | USAGE
23 | 
my $help       = 0;
my $linelength = 70;    # -l: output line width, 0 disables wrapping
GetOptions(
    'help|h' => \$help,
    'l=i'    => \$linelength,
) or die $usage;

die $usage if $help;
if ( $linelength < 0 ) {
    die sprintf "value of -l (%d) should be greatter or equal to 0\n",
      $linelength;
}

# get the file list
my @files = file_list_from_argv(@ARGV);

my %count = ();    # per input header: last suffix number handed out
my %used  = ();    # every header that has already been written
for my $file (@files) {
    my $next_seq = FastaReader($file);
    while ( my $fa = &$next_seq() ) {
        my ( $header, $seq ) = @$fa;

        my $name = $header;
        if ( exists $used{$name} ) {

            # The first occurrence counts as number 1, so the first rename
            # gets the suffix "r2".  Keep counting until the generated name
            # is itself unused: a record may literally be called "<name> r2".
            $count{$header} = 1 unless exists $count{$header};
            do {
                $count{$header}++;
                $name = "$header r$count{$header}";
            } while ( exists $used{$name} );
        }
        $used{$name} = 1;

        if ( $linelength > 0 ) {
            print ">$name\n", format_seq( $seq, $linelength );
        }
        else {
            print ">$name\n", $seq, "\n";
        }
    }
}
61 | 


--------------------------------------------------------------------------------
/sequence/fasta_reset_start_position_for_circular_genome.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | use strict;
 3 | 
 4 | my $usage = <<USAGE;
 5 | 
 6 | Function: Reset start position for circular genome.
 7 |    Usage: reset_start_position_for_circular_genome <fasta file> <new start>
 8 |  Example: 
 9 |     1. Set the 100th base as the new start position
10 |         reset_start_position_for_circular_genome seq.fa 100
11 | 
12 | Author: Wei Shen <shenwei356#gmail.com> <http://shenwei.me>
13 | Change history:
14 |     - 2014-04-30 rewrite.
15 |     - 2011 first edition.
16 | 
17 | USAGE
18 | 
die $usage unless @ARGV == 2;

my ( $infile, $newstart ) = @ARGV;

die "newstart should be integer greater than 0, you input $newstart.\n"
    unless $newstart =~ /^\d+$/ and $newstart > 0;

# Rotate every record of the input so that base number $newstart comes first.
my $buffer = '';
open my $in, '<', $infile or die "fail to open sequence file $infile!\n";
{
    # read one fasta record at a time; restored automatically at block end
    local $/ = '>';
    <$in>;    # discard whatever precedes the first '>'

    while ( my $record = <$in> ) {
        $record =~ s/>$//;
        my ( $head, $seq ) = split /\r?\n/, $record, 2;
        $seq = '' unless defined $seq;
        $seq =~ s/\s+//g;

        # A start beyond the end cannot be honoured; refuse instead of
        # writing an unrotated sequence under a header that claims a move.
        die sprintf
            "newstart (%d) is larger than the length (%d) of sequence: %s\n",
            $newstart, length($seq), $head
            if $newstart > length $seq;

        my $newseq
            = substr( $seq, $newstart - 1 ) . substr( $seq, 0, $newstart - 1 );

        $buffer .= ">$head (start position move to $newstart)\n"
            . format_seq( $newseq, 70 ) . "\n";
    }
}
close $in;

# Insert ".newstartN" before the file extension.  The extension must not
# contain a dot or a path separator, so a dot in a directory name is never
# mistaken for the start of the extension.
my $outfile = "$infile.newstart$newstart.fa";
if ( $infile =~ /^(.+)\.([^.\/\\]+)$/ ) {
    $outfile = "$1.newstart$newstart.$2";
}
open my $out, ">", $outfile or die "failed to open file $outfile\n";
print {$out} $buffer;
close $out or die "failed to write file $outfile\n";
54 | 
# Wrap a sequence string into lines of at most $n characters.
#
#   $s  sequence string
#   $n  maximum line length; a value <= 0 returns the string unwrapped
#
# Returns the lines joined by "\n" WITHOUT a trailing newline, so a caller
# that appends its own "\n" never produces an empty line, even when the
# length of $s is an exact multiple of $n.
sub format_seq($$) {
    my ( $s, $n ) = @_;
    $s = '' unless defined $s;
    return $s if $n <= 0 or $s eq '';

    my @lines = $s =~ /(.{1,$n})/gs;
    return join "\n", @lines;
}
66 | 


--------------------------------------------------------------------------------
/sequence/fasta_sliding_window.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use File::Basename;
 5 | use BioUtil::Seq;
 6 | use BioUtil::Util;
 7 | 
 8 | $0 = basename $0;
 9 | die "\nusage: $0 <seq_file> <w.size start> <w.size end> <w.size step> <slid step>\n\n"
10 |     unless @ARGV == 5;
11 | 
12 | my ( $file_query, $win_start, $win_end, $win_step, $step ) = @ARGV;
13 | check_positive_integer($win_start);
14 | check_positive_integer($win_end);
15 | check_positive_integer($win_step);
16 | check_positive_integer($step);
17 | 
18 | die "win_start should not be larger han win_end\n"
19 |     unless $win_end >= $win_start;
20 | 
21 | my $next_seq = FastaReader($file_query);
22 | while ( my $fa = &$next_seq() ) {
23 |     my ( $header, $seq ) = @$fa;
24 |     my $len_seq = length $seq;
25 | 
26 |     for ( my $win = $win_start; $win <= $win_end; $win += $win_step ) {
27 |         my $end = $len_seq - $win < 0 ? 0 : $len_seq - $win;
28 |         for ( my $i = 0; $i <= $end; $i += $step ) {
29 |             my $s = substr( $seq, $i, $win );
30 |             printf ">%s_window(%d,%d)\n%s\n", $header, $i+1, $win, $s;
31 |         }
32 |     }
33 | }
34 | 


--------------------------------------------------------------------------------
/sequence/fasta_trim_aligned_fasta.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | # https://github.com/shenwei356/bio_scripts
  3 | use strict;
  4 | use Getopt::Long;
  5 | use File::Temp qw/ tempfile/;
  6 | use BioUtil::Seq;
  7 | use BioUtil::Util;
  8 | 
  9 | local $| = 1;
 10 | 
 11 | my @GAPS = ( '-', '.' );
 12 | my $tmpfile_prefix = "fasta_trim_aligned_fasta_tmpfile_";
 13 | 
 14 | my $usage = <<USAGE;
 15 | 
 16 | Remove common gaps in aligned fasta sequences.
 17 | 
 18 | Reading from STDIN is supported. But in this case, sequences will be saved to
 19 | disk temporarily. Because this script reads sequences twice to reduce memery 
 20 | usage.
 21 | 
 22 | Usage: $0 [options] [aligned fasta file...]
 23 | Options:
 24 |     -h,  --help                Show this help information
 25 |     -g,  --gaps                Gap symbols [-.]
 26 |     -l,  --linelength          Line length
 27 | 
 28 | https://github.com/shenwei356/bio_scripts
 29 | 
 30 | USAGE
 31 | 
 32 | my $para = {};
 33 | $$para{linelength} = 70;
 34 | 
 35 | GetOptions(
 36 |     'help|h'         => \$$para{help},
 37 |     'gaps|g=s'       => \$$para{gaps},
 38 |     'linelength|l=i' => \$$para{linelength},
 39 | ) or die $usage;
 40 | 
 41 | die $usage if $$para{help};
 42 | 
 43 | # gap symbols
 44 | my %GAPSMAP = ();
 45 | if ( $$para{gaps} ) {
 46 |     @GAPS = split //, $$para{gaps};
 47 | }
 48 | $GAPSMAP{$_} = 1 for @GAPS;
 49 | 
 50 | my $use_stdin = 0;
 51 | my ( $tmp_file_fh, $tmp_file ) = (undef) x 2;
 52 | 
 53 | my @files = ();
 54 | for my $file (@ARGV) {
 55 |     for my $f ( glob $file ) {
 56 |         push @files, $f;
 57 |     }
 58 | }
 59 | if ( @files == 0 ) {
 60 |     push @files, 'STDIN';
 61 |     ( $tmp_file_fh, $tmp_file )
 62 |         = tempfile( $tmpfile_prefix . "XXXXXX", DIR => ".", SUFFIX => '.fa' );
 63 | 
 64 |     $use_stdin = 1;
 65 | }
 66 | 
 67 | print STDERR "sequences from STDIN is saved in $tmp_file\n" if $use_stdin;
 68 | print STDERR "check...\n";
 69 | 
 70 | my $gaploc  = {};    # store the gap location
 71 | my $do_once = 1;
 72 | my ( $header, $seq, $len, $i, $base ) = (undef) x 5;
 73 | my ( $sum, $n ) = (0) x 2;
 74 | for my $file (@files) {
 75 |     my $next_seq = FastaReader($file);
 76 |     while ( my $fa = &$next_seq() ) {
 77 |         ( $header, $seq ) = @$fa;
 78 |         $sum++;
 79 |         print STDERR "\rcount: $sum";
 80 |         if ($do_once) {
 81 |             $len = length $seq;
 82 |             $$gaploc{$_} = 1 for 0 .. ( $len - 1 );
 83 |             $do_once = 0;
 84 |         }
 85 | 
 86 |         for $i ( 0 .. ( $len - 1 ) ) {
 87 |             $base = substr $seq, $i, 1;
 88 |             if ( $GAPSMAP{$base} != 1 ) {    # it's not a gap!
 89 |                 delete $$gaploc{$i};
 90 |             }
 91 |         }
 92 | 
 93 |         if ( scalar keys %$gaploc == 0 ) {
 94 |             close $tmp_file_fh if $use_stdin;
 95 |             remove_tmpfile()   if $use_stdin;
 96 |             die "\nno gap to trim\n";
 97 |         }
 98 | 
 99 |         print $tmp_file_fh ">$header\n$seq\n" if $use_stdin;
100 |     }
101 | }
102 | 
# Second pass: write every sequence with the gap-only columns removed.
if ($use_stdin) {
    close $tmp_file_fh;
}

my @index = keys %{$gaploc};

printf STDERR "\n%d gaps to trim\n", scalar @index;
print STDERR "\nextract sequences...\n";

# sequences read from STDIN are re-read from the temporary copy
if ($use_stdin) {
    @files = ($tmp_file);
}

foreach my $file (@files) {
    my $next_seq = FastaReader($file);
    while ( my $fa = $next_seq->() ) {
        ( $header, $seq ) = @{$fa};
        $n++;
        print STDERR "\r$n / $sum";
        print ">$header\n",
            format_seq(
            delete_string_elements_by_indexes( \$seq, \@index ),
            $para->{linelength}
            );
    }
}

print STDERR "\n";

if ($use_stdin) {
    remove_tmpfile();
}
127 | 
# Delete the temporary copy of STDIN created by this run.
# Only the file named in $tmp_file is removed; other files that happen to
# share the prefix (e.g. from another run in the same directory) are left
# alone.  Dies if the file exists but cannot be removed.
sub remove_tmpfile {
    print STDERR "\nremove temporary files\n";
    return unless defined $tmp_file and -e $tmp_file;
    unlink $tmp_file or die "fail to remove $tmp_file\n";
}
134 | 


--------------------------------------------------------------------------------
/sequence/fastq2tab:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | # https://github.com/shenwei356/bio_scripts
 3 | 
 4 | use strict;
 5 | use Getopt::Long;
 6 | 
 7 | my $usage = q(
 8 | fastq2tab - transform the fastq fromat to four-column table
 9 | 
10 | Usage: fastq2tab [options] [fastafiles...]
11 | Options:    
12 |     -s,   --split             reads id only. no index info
13 | 
14 | This script is usually used in pair with tab2fastq.
15 | https://github.com/shenwei356/bio_scripts
16 | 
17 | );
18 | 
my $args = {};
GetOptions(
    'help|h' => \$$args{help},

    'split|s' => \$$args{split},
) or die $usage;
die $usage if $$args{help};

# Every fastq record is four lines; they are written as one tab-separated row.
while ( my $record = <> ) {
    $record =~ s/\r?\n$//;
    next if $record eq '';    # tolerate blank lines between/after records
    $record =~ s/^\@//;

    if ( $$args{split} ) {

        # keep only the part of the header before the first space
        $record = ( split / /, $record )[0];
    }

    for ( 1 .. 3 ) {
        my $line = <>;

        # input ended in the middle of a record
        die "truncated fastq record: $record\n" unless defined $line;
        $line =~ s/\r?\n$//;
        $record .= "\t$line";
    }

    print "$record\n";
}
44 | 


--------------------------------------------------------------------------------
/sequence/fastq_extract_paired_reads.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | # make sure the reads in the two fastq files have the same order!
  4 | use strict;
  5 | use Parallel::Runner;
  6 | use File::Basename;
  7 | 
  8 | $0 = basename($0);
  9 | die "usage: $0 <read.1.fq> <read.2.fq>\n"
 10 |     unless @ARGV == 2;
 11 | 
 12 | my $fqfile1 = shift @ARGV;
 13 | my $fqfile2 = shift @ARGV;
 14 | 
 15 | # ===========================================================
 16 | 
 17 | print "read $fqfile1\n";
 18 | my $headers1 = get_headers($fqfile1);
 19 | 
 20 | print "read $fqfile2\n";
 21 | my $headers2 = get_headers($fqfile2);
 22 | 
 23 | # ===========================================================
 24 | 
 25 | print "find common IDs: ";
 26 | my $headers = {};
 27 | for my $header ( keys %$headers1 ) {
 28 |     next unless exists $$headers2{$header};
 29 |     $$headers{$header} = 1;
 30 | }
 31 | my $n = keys %$headers;
 32 | print "$n\n";
 33 | 
 34 | die "sadly, no paired reads found\n" if $n == 0;
 35 | 
 36 | # ===========================================================
 37 | 
 38 | my $runner = Parallel::Runner->new(2);
 39 | 
 40 | print "extract $fqfile1\n";
 41 | $runner->run( sub { extract( $headers, $fqfile1 ); } );
 42 | 
 43 | print "extract $fqfile2\n";
 44 | $runner->run( sub { extract( $headers, $fqfile2 ); } );
 45 | 
 46 | $runner->finish;
 47 | 
 48 | # ===========================================================
 49 | 
 50 | sub extract {
 51 |     my ( $headers, $fqfile ) = @_;
 52 | 
 53 |     my $fqfileout = $fqfile;
 54 |     $fqfileout =~ s/\.(fq|fastq)$//i;
 55 |     $fqfileout .= ".pe.fq";
 56 |     open my $fh, ">", $fqfileout or die "fail to wrtie file: $fqfileout\n";
 57 | 
 58 |     my $next_seq = FastqReader($fqfile);
 59 |     my $id = '';
 60 |     while ( my $fq = &$next_seq() ) {
 61 |         my ( $head, $seq, $qual ) = @$fq;
 62 |         $id = (split / /, $head )[0];
 63 |         if ($id =~ /(.+)\/\d$/){
 64 |             $id = $1;
 65 |         }
 66 |         next unless exists $$headers{ $id };
 67 |         print $fh "\@$head\n$seq\n+\n$qual\n";
 68 |     }
 69 | }
 70 | 
 71 | sub get_headers {
 72 |     my ($fqfile) = @_;
 73 |     my $headers = {};
 74 | 
 75 |     my $next_seq = FastqReader($fqfile);
 76 |     my $id = '';
 77 |     while ( my $fq = &$next_seq() ) {
 78 |         my ( $head, $seq, $qual ) = @$fq;
 79 |         $id = (split / /, $head )[0];
 80 |         if ($id =~ /(.+)\/\d$/){
 81 |             $id = $1;
 82 |         }
 83 |         $$headers{$id} = '1';
 84 |     }
 85 | 
 86 |     return $headers;
 87 | }
 88 | 
 89 | sub FastqReader {
 90 |     my ($file) = @_;
 91 | 
 92 |     my ( $open_flg, $finished ) = ( 0, 0 );
 93 |     my ( $fh, $head, $seq, $qual ) = (undef) x 4;
 94 | 
 95 |     if ( $file =~ /^STDIN$/i ) {    # from stdin
 96 |         $fh = *STDIN;
 97 |     }
 98 |     elsif ( ref $file eq '' or ref $file eq 'SCALAR' ) {    # from file
 99 |         open $fh, '<', $file or die "fail to open file: $file!\n";
100 |         $open_flg = 1;
101 |     }
102 |     else {    # glob, i.e. given file handler
103 |         $fh = $file;
104 |     }
105 | 
106 |     return sub {
107 |         return if $finished;
108 | 
109 |         while (<$fh>) {
110 |             if ( substr( $_, 0, 1 ) ne '@' ) {
111 |                 die "bad fq file\n";
112 |             }
113 |             
114 |             $head = $_;
115 |             $head =~ s/\r?\n$//;
116 |             substr( $head, 0, 1, '' );
117 | 
118 |             $seq = <$fh>;
119 |             $seq =~ s/\r?\n$//;
120 | 
121 |             <$fh>;
122 |             
123 |             $qual = <$fh;
124 |             $qual =~ s/\r?\n$//;
125 | 
126 |             return [ $head, $seq, $qual ];
127 |         }
128 | 
129 |         close $fh if $open_flg;
130 |         $finished = 1;
131 |         return;
132 |     };
133 | }
134 | 


--------------------------------------------------------------------------------
/sequence/fastx_mapping_with_bwa.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use File::Basename;
 5 | use BioUtil::Util;
 6 | 
 7 | $0 = basename ($0);
 8 | die qq(
 9 |     usage: $0 <threads> <refseq> <outprefix> <fasta/q> [<fasta/q>]
10 |     when two fastx file given, they are treated as paired end reads
11 | 
12 | )
13 |    unless @ARGV == 5 or @ARGV == 4;
14 | 
15 | my $threads = shift @ARGV;
16 | my $refseq  = shift @ARGV;
17 | my $prefix  = shift @ARGV;
18 | my $read    = shift @ARGV;
19 | my $read2   = shift @ARGV;
20 | 
21 | check_positive_integer($threads);
22 | 
23 | # build index
24 | my @suffix      = qw/.amb .ann .bwt .pac .sa/;
25 | my $index_built = 1;
26 | for (@suffix) {
27 |     $index_built = 0 unless -e "$refseq$_";
28 | }
29 | run("bwa index $refseq") unless $index_built;
30 | run("samtools faidx $refseq") unless -e "$refseq.fai";
31 | 
32 | # =================[ mapping ]===================
33 | 
34 | print "mapping\n";
35 | if ($read2){
36 |     run("bwa mem -t $threads -M -a $refseq $read $read2 > $prefix.sam");
37 | }else{
38 | run("bwa mem -t $threads -M -a $refseq $read > $prefix.sam");
39 |     }
40 | 
41 | # =================[ mapping ]===================
42 | 
43 | print "sam -> bam\n";
44 | run("samtools view  -bS $prefix.sam > $prefix.bam");
45 | 
46 | print "sort bam\n";
47 | run("samtools sort $prefix.bam $prefix.sorted");
48 | 
49 | print "index bam\n";
50 | run("samtools index $prefix.sorted.bam");
51 | 
52 | print "flagstat\n";
53 | run("samtools flagstat $prefix.sorted.bam > $prefix.sorted.bam.flagstat");
54 | 
55 | run("rm $prefix.bam $prefix.sam");


--------------------------------------------------------------------------------
/sequence/fastx_pwm.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # https://github.com/shenwei356/bio_scripts
 4 | from __future__ import print_function
 5 | 
 6 | import argparse
 7 | import gzip
 8 | import logging
 9 | import os
10 | import re
11 | import sys
12 | 
13 | from Bio import SeqIO, motifs
14 | from Bio.Seq import Seq
15 | 
16 | 
def parse_args():
    """Parse the command line of the PWM script.

    Returns the parsed ``argparse.Namespace``.  Writes a message to stderr
    and exits with status 1 when neither ``--stdin`` nor ``-i/--infile``
    is given.
    """
    parser = argparse.ArgumentParser(
        description="Position Weight Matrices of sequence")

    parser.add_argument(
        "-v", "--verbose", action="count", default=0,
        help='verbosely print information')

    # the two input sources exclude each other
    source = parser.add_mutually_exclusive_group()
    source.add_argument(
        "--stdin", action="store_true",
        help='read from stdin, one sequence per line')
    source.add_argument(
        '-i', '--infile', type=str,
        help='file name should like this: infile.[fasta|fa|fastq|fq][.gz]')

    parsed = parser.parse_args()
    if parsed.stdin or parsed.infile:
        return parsed

    sys.stderr.write("option --stdin or -i should be given\n")
    sys.exit(1)
35 | 
36 | 
def seq_iter(file):
    """Yield the sequences of *file*, or of stdin when *file* is empty.

    With a file name, the format is taken from its suffix
    (``fasta|fa|fastq|fq``, optionally followed by ``.gz``, matched
    case-insensitively) and the records are parsed with ``SeqIO.parse``;
    a gzip suffix makes the file be opened with ``gzip.open``.  Without a
    file name, every line of stdin is yielded as one ``Seq``.

    Writes a message to stderr and exits with status 1 when the suffix is
    not recognised.
    """
    if not file:
        for line in sys.stdin:
            yield Seq(line.strip())
        return

    found = re.search(r'(?i)(fasta|fa|fastq|fq)(.gz)?$', file)
    if not found:
        sys.stderr.write("invalid file name suffix.\nfile name should like this: infile.[fasfa|fa|fastq|fq][.gz]\n")
        sys.exit(1)

    seq_format, is_gz = found.groups()
    # The suffix is matched case-insensitively, so normalise it before it is
    # used as a format name ("FA" and "Fastq" must map like "fa"/"fastq").
    seq_format = seq_format.lower()
    seq_format = {'fa': 'fasta', 'fq': 'fastq'}.get(seq_format, seq_format)

    handle = gzip.open(file, 'rt') if is_gz else open(file, 'r')
    # `with` closes the file even when the generator is not run to the end
    with handle as fh:
        for record in SeqIO.parse(fh, seq_format):
            yield record.seq
56 | 
57 | 
if __name__ == '__main__':
    options = parse_args()

    # sequences containing 'N' are left out of the motif
    usable = []
    for candidate in seq_iter(options.infile):
        if 'N' in candidate:
            continue
        usable.append(candidate)

    motif = motifs.create(usable)
    print(motif.pwm)
    # print(m.pssm)
    # m.weblogo("motif.png")
66 | 


--------------------------------------------------------------------------------
/sequence/fastx_tm.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # https://github.com/shenwei356/bio_scripts
 4 | from __future__ import print_function
 5 | 
 6 | import argparse
 7 | import gzip
 8 | import logging
 9 | import os
10 | import re
11 | import sys
12 | 
13 | from Bio import SeqIO
14 | from Bio.Seq import Seq
15 | from Bio.SeqRecord import SeqRecord
16 | from Bio.SeqUtils import MeltingTemp as mt
17 | 
18 | 
def parse_args():
    """Parse and validate the command line of the melting-temperature script.

    Returns the parsed ``argparse.Namespace``.  Writes a message to stderr
    and exits with status 1 when no input source is given, when
    ``-f/--format`` is neither ``fasta`` nor ``fastq``, or when ``--stdin``
    is used without ``-f/--format``.
    """
    parser = argparse.ArgumentParser(description="Compute DNA MeltingTemp")

    # the two input sources exclude each other
    source = parser.add_mutually_exclusive_group()
    source.add_argument(
        "--stdin", action="store_true",
        help='read from stdin, one sequence per line')
    source.add_argument(
        '-i', '--infile', type=str,
        help='file name should like this: infile.[fasta|fa|fastq|fq][.gz]')
    parser.add_argument(
        '-f', '--format', type=str,  # default='fasta',
        help='seqence format: fasta |fastq  [fasta]')

    parsed = parser.parse_args()

    # first failing check decides the message, in the order listed above
    problem = None
    if not (parsed.stdin or parsed.infile):
        problem = "option --stdin or -i should be given\n"
    elif parsed.format and parsed.format not in ('fasta', 'fastq'):
        problem = "option -f | --format should be 'fasta' or 'fastq'\n"
    elif parsed.stdin and not parsed.format:
        problem = "option -f | --format should be given when --stdin is set.\n"

    if problem is not None:
        sys.stderr.write(problem)
        sys.exit(1)

    return parsed
41 | 
42 | 
if __name__ == '__main__':
    args = parse_args()

    file, seq_format = args.infile, args.format
    if file:
        if not seq_format:
            # no -f given: take the format from the file name suffix
            found = re.search(r'(?i)(fasta|fa|fastq|fq)(.gz)?$', file)
            if not found:
                print("invalid file name suffix.\nfile name should like this: infile.[fasfa|fa|fastq|fq][.gz]",
                      file=sys.stderr)
                sys.exit(1)
            # the suffix is matched case-insensitively, so normalise it
            # before it is used as a format name
            seq_format = found.group(1).lower()
            seq_format = {'fa': 'fasta', 'fq': 'fastq'}.get(seq_format, seq_format)

        # same case-insensitivity as the suffix regex above
        is_gz = file.lower().endswith('.gz')
        fh = gzip.open(file, 'rt') if is_gz else open(file, 'r')
    else:
        fh = sys.stdin

    try:
        sys.stdout.write('{}\t{}\t{}\t{}\n'.format('seq_id', 'Tm_Wallace', 'Tm_GC', 'Tm_NN'))
        for seq in SeqIO.parse(fh, seq_format):
            sys.stdout.write('{}\t{:0.2f}\t{:0.2f}\t{:0.2f}\n'.format(seq.id, mt.Tm_Wallace(seq.seq), mt.Tm_GC(seq.seq), mt.Tm_NN(seq.seq)))
    finally:
        # only close what this script opened
        if fh is not sys.stdin:
            fh.close()
70 | 


--------------------------------------------------------------------------------
/sequence/fastx_translate.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | # https://github.com/shenwei356/bio_scripts
 4 | from __future__ import print_function
 5 | 
 6 | import argparse
 7 | import gzip
 8 | import logging
 9 | import os
10 | import re
11 | import sys
12 | 
13 | from Bio import SeqIO
14 | from Bio.Seq import Seq
15 | from Bio.SeqRecord import SeqRecord
16 | 
17 | 
def parse_args():
    """Parse and validate the command line of the translation script.

    Returns the parsed ``argparse.Namespace``.  Writes a message to stderr
    and exits with status 1 when no input source is given, when
    ``-f/--format`` is neither ``fasta`` nor ``fastq``, or when ``--stdin``
    is used without ``-f/--format``.
    """
    parser = argparse.ArgumentParser(description="Translate DNA to peptide")

    parser.add_argument(
        "-v", "--verbose", action="count", default=0,
        help='verbosely print information')

    # the two input sources exclude each other
    source = parser.add_mutually_exclusive_group()
    source.add_argument(
        "--stdin", action="store_true",
        help='read from stdin, one sequence per line')
    source.add_argument(
        '-i', '--infile', type=str,
        help='file name should like this: infile.[fasta|fa|fastq|fq][.gz]')
    parser.add_argument(
        '-f', '--format', type=str,  # default='fasta',
        help='seqence format: fasta |fastq  [fasta]')
    parser.add_argument(
        '-t', '--table', type=int, default=1,
        help='genetic code table (detail: http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi ) [1]')

    parsed = parser.parse_args()

    # first failing check decides the message, in the order listed above
    problem = None
    if not (parsed.stdin or parsed.infile):
        problem = "option --stdin or -i should be given\n"
    elif parsed.format and parsed.format not in ('fasta', 'fastq'):
        problem = "option -f | --format should be 'fasta' or 'fastq'\n"
    elif parsed.stdin and not parsed.format:
        problem = "option -f | --format should be given when --stdin is set.\n"

    if problem is not None:
        sys.stderr.write(problem)
        sys.exit(1)

    return parsed
46 | 
47 | 
if __name__ == '__main__':
    args = parse_args()

    file, seq_format = args.infile, args.format
    if file:
        if not seq_format:
            # no -f given: take the format from the file name suffix
            found = re.search(r'(?i)(fasta|fa|fastq|fq)(.gz)?$', file)
            if not found:
                print("invalid file name suffix.\nfile name should like this: infile.[fasfa|fa|fastq|fq][.gz]",
                      file=sys.stderr)
                sys.exit(1)
            # the suffix is matched case-insensitively, so normalise it
            # before it is used as a format name
            seq_format = found.group(1).lower()
            seq_format = {'fa': 'fasta', 'fq': 'fastq'}.get(seq_format, seq_format)

        # same case-insensitivity as the suffix regex above
        is_gz = file.lower().endswith('.gz')
        fh = gzip.open(file, 'rt') if is_gz else open(file, 'r')
    else:
        fh = sys.stdin

    try:
        # each record is translated with the chosen genetic code table and
        # written to stdout as fasta, keeping its id and description
        for seq in SeqIO.parse(fh, seq_format):
            SeqIO.write([SeqRecord(seq.seq.translate(table=args.table), id=seq.id, description=seq.description)], sys.stdout, 'fasta')
    finally:
        # only close what this script opened
        if fh is not sys.stdin:
            fh.close()
74 | 


--------------------------------------------------------------------------------
/sequence/run_clustalo.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use File::Basename;
 5 | use BioUtil::Util;
 6 | 
 7 | $0 = basename($0);
 8 | my $usage = <<USAGE;
 9 | 
10 | Usage: $0 <threads number> <fastafile> [fastafile...]
11 | 
12 | https://github.com/shenwei356/bio_scripts
13 | 
14 | USAGE
15 | 
die $usage unless @ARGV >= 2;

my $threads = shift @ARGV;

# reject a non-numeric thread count before it is pasted into the command
# line (same check as fastx_mapping_with_bwa.pl)
check_positive_integer($threads);

# align every input file; the result is written next to it as
# "<file>.align.fa", overwriting an existing file (--force)
for my $file (@ARGV) {
    my $fileout = "$file.align.fa";
    my $cmd     = "clustalo -i $file -o $fileout --force --outfmt fasta --threads=$threads";
    my $fail = run($cmd);
    die "failed to run:$cmd\n" if $fail;
}
26 | 


--------------------------------------------------------------------------------
/sequence/sample/gc_skew.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/bio_scripts/703cec8d21903516346e2aae4d77d23385c30905/sequence/sample/gc_skew.png


--------------------------------------------------------------------------------
/sequence/sample/seq.fa:
--------------------------------------------------------------------------------
 1 | >1234 gene=0001
 2 | actgatcat-gtagagag
 3 | tagatcagagtc
 4 | >seq2
 5 | atcgatcgaa
 6 | >seq3
 7 | atcgatcgaa
 8 | >123 gene=00011
 9 | acccccctct-ttcgg-tatgct-gata-tgatgatgtacg
10 | -tatgct-gata-tgatgtac
11 | acccccctct-ttcgg-tatgct-tgatgtac
12 | acccccctct-ttcgg-tatgct-tgatgtac
13 | acccccctct-ttcgg-tatgct-tgatgtac
14 | acccccctct-ttcgg-tatgct-tgatgtac
15 | acccccctct-ttcgg-tatgct-tgatgtac
16 | acccccctct-ttcgg-tatgct-tgatgtac
17 | acccccctct-ttcgg-tatgct-tgatgtac
18 | acccccctct-ttcgg-tatgct-tgatgtac
19 | acccccctct-ttcgg-tatgct-
20 | acccccctct-ttcgg-tatgct-gata-tgatgatgtacg 
21 | 


--------------------------------------------------------------------------------
/sequence/sample/seq.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shenwei356/bio_scripts/703cec8d21903516346e2aae4d77d23385c30905/sequence/sample/seq.fq.gz


--------------------------------------------------------------------------------
/sequence/seqcomp:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | use strict;
3 | 
# Print the complement of the sequence given as first argument.
die "usage: seqcomp <sequence>\n" unless @ARGV;

my $seq = shift @ARGV;

# complement every IUPAC nucleotide code, keeping the letter case
$seq =~ tr/ACGTRYMKSWBDHVNacgtrymkswbdhvn/TGCAYRKMSWVHDBNtgcayrkmswvhdbn/;
print "$seq\n";
7 | 


--------------------------------------------------------------------------------
/sequence/seqrc:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | use strict;
3 | 
# Print the reverse complement of the sequence given as first argument.
die "usage: seqrc <sequence>\n" unless @ARGV;

my $seq = shift @ARGV;
$seq = reverse $seq;

# complement every IUPAC nucleotide code, keeping the letter case
$seq =~ tr/ACGTRYMKSWBDHVNacgtrymkswbdhvn/TGCAYRKMSWVHDBNtgcayrkmswvhdbn/;
print "$seq\n";
8 | 


--------------------------------------------------------------------------------
/sequence/seqrev:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | use strict;
3 | 
# Print the sequence given as first argument in reverse order.
die "usage: seqrev <sequence>\n" unless @ARGV;

my $seq = shift @ARGV;
$seq = reverse $seq;
print "$seq\n";
7 | 


--------------------------------------------------------------------------------
/sequence/tab2fasta:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | # https://github.com/shenwei356/bio_scripts
 3 | 
 4 | use strict;
 5 | use Getopt::Long;
 6 | use BioUtil::Util;
 7 | use BioUtil::Seq;
 8 | 
 9 | my $usage = q(
10 | tab2fasta - transfrom column table to fasta fromat
11 | 
12 | Usage: $0 [options] [tablefile...]
13 | 
14 | Options:
15 | 
16 |     -l,  --linelength          Output line length
17 |     -h,  --help                Show this help information
18 | 
19 | This script is usually used in pair with fasta2tab.
20 | https://github.com/shenwei356/bio_scripts
21 | 
22 | );
23 | 
my $para = {};
GetOptions(
    'help|h'         => \$para->{help},
    'linelength|l=i' => \$para->{linelength},
) or die $usage;

die $usage if $para->{help};

my @files = file_list_from_argv(@ARGV);

foreach my $file (@files) {

    # 'STDIN' stands for standard input, anything else is a file name
    my $is_stdin = $file eq 'STDIN' ? 1 : 0;
    my $fh;
    if ($is_stdin) {
        $fh = *STDIN;
    }
    else {
        open $fh, "<", $file
            or die "fail to open file: $file\n";
    }

    while ( my $row = <$fh> ) {
        $row =~ s/\r?\n//g;
        $row =~ s/^\s+|\s+$//g;

        # skip blank lines and annotation lines
        next if $row eq '' or $row =~ /^#/;

        # column 1 is the header, column 2 the sequence; the rest is ignored
        my ( $header, $seq ) = split /\t/, $row;

        # restore tabs that were escaped in the header
        $header =~ s/__tab__/\t/g;

        if ( $para->{linelength} ) {
            print ">$header\n", format_seq( $seq, $para->{linelength} );
        }
        else {
            print ">$header\n$seq\n";
        }
    }

    close $fh unless $is_stdin;
}
70 | 


--------------------------------------------------------------------------------
/sequence/tab2fastq:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | # https://github.com/shenwei356/bio_scripts
3 | 
4 | use strict;
5 | 
# Turn every tab-separated row back into a fastq record: '@' is put in
# front of the first column and each column goes on its own line.
while ( my $row = <> ) {
    $row =~ s/\r?\n$//;
    next if $row =~ /^\s*$/;    # a blank line is not a record

    # limit -1 keeps trailing empty columns
    print '@', join( "\n", split( /\t/, $row, -1 ) ), "\n";
}
9 | 


--------------------------------------------------------------------------------
/taxon/taxon_fetch.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # https://github.com/shenwei356/bio_scripts/
  3 | '''
  4 | fetch taxon information by species name or taxid.
  5 | 
  6 | Take home message:
  7 | 
  8 |     1. using cache to avoid repeatly search
  9 |     2. object of Entrez.read(Entrez.efetch()) could be treated as list,
 10 |        but it could not be rightly pickled. Using Json is also not OK.
 11 |        The right way is cache the xml text.
 12 | 
 13 |            search = Entrez.efetch(id=taxid, db="taxonomy", retmode="xml")
 14 |            # data = Entrez.read(search)
 15 |            ##  read and parse xml
 16 |            data_xml = search.read()
 17 |            data = list(Entrez.parse(StringIO(data_xml)))
 18 |     3. pickle file was fragile. a flag file could be used to detect whether
 19 |        data is rightly dumped.
 20 |     4. using multi-threads to accelerate fetching.
 21 | 
 22 | '''
 23 | 
 24 | from __future__ import print_function
 25 | import sys
 26 | import argparse
 27 | import os
 28 | import re
 29 | import shutil
 30 | import pickle
 31 | from StringIO import StringIO
 32 | from multiprocessing import Pool
 33 | from Bio import Entrez
 34 | 
 35 | parser = argparse.ArgumentParser(
 36 |     description=
 37 |     "fetch taxon information by species name or taxid. Cache used to avoid repeatly search",
 38 |     epilog="https://github.com/shenwei356/bio_scripts/")
 39 | 
 40 | parser.add_argument('infile', help='species name/taxid list')
 41 | parser.add_argument('-n',
 42 |                     '--by-name',
 43 |                     action='store_true',
 44 |                     help='search by species name')
 45 | parser.add_argument('-t',
 46 |                     '--threads',
 47 |                     type=int,
 48 |                     default=4,
 49 |                     help='threads number, default:4')
 50 | 
 51 | default_cache_path = os.path.join(
 52 |     os.path.expanduser("~"), '.taxon', 'taxon_map.pickle')
 53 | parser.add_argument(
 54 |     '-c',
 55 |     '--cache-file',
 56 |     type=str,
 57 |     default=default_cache_path,
 58 |     help='taxon_map cache file, default: {}'.format(default_cache_path))
 59 | parser.add_argument('-d',
 60 |                     '--delete-cache-file',
 61 |                     action='store_true',
 62 |                     help='delete cache file')
 63 | 
 64 | args = parser.parse_args()
 65 | 
 66 | # ================[ caching feteched data ]==================
 67 | cache = dict()
 68 | 
 69 | # a flag file to check if the pickle file is ok, its existance means not ok
 70 | flag_file = '{}.close-by-accident'.format(args.cache_file)
 71 | 
 72 | if args.delete_cache_file:
 73 |     if os.path.exists(args.cache_file):
 74 |         os.unlink(args.cache_file)
 75 |     if os.path.exists(flag_file):
 76 |         os.unlink(flag_file)
 77 | 
 78 | # read cache if available
 79 | if os.path.exists(args.cache_file):
 80 |     sys.stderr.write('[INFO] read taxon_map cache from file: {}\n'.format(
 81 |         args.cache_file))
 82 | 
 83 |     if not os.path.exists(flag_file):
 84 |         cache = pickle.load(open(args.cache_file, 'rb'))
 85 |     else:
 86 |         sys.stderr.write(
 87 |             '[INFO] it seems that last run failed. delete cache file.\n')
 88 |         os.unlink(flag_file)
 89 |     # cache = pickle.load(open(args.cache_file, 'rb'))
 90 | else:
 91 |     sys.stderr.write('[INFO] create new taxon_map cache file: {}\n'.format(
 92 |         args.cache_file))
 93 | 
 94 |     cache_dir = os.path.dirname(args.cache_file)
 95 |     if not os.path.exists(cache_dir):
 96 |         os.mkdir(cache_dir)
 97 | 
 98 | cache_fh = open(args.cache_file, 'wb')
 99 | 
100 | open(flag_file, 'w').close()
101 | 
102 | 
103 | # ================[ fetching method ]==================
def get_tax_id(species):
    """Look up the taxonomy ID for a species name via Entrez esearch.

    Args:
        species: species name to search for in the "taxonomy" database.

    Returns:
        The first entry of the ``IdList`` in the search result.

    Raises:
        IndexError: if the search returned no IDs.
    """
    # Strip first: once spaces have become "+", strip() has nothing to trim.
    term = species.strip().replace(" ", "+")

    search = Entrez.esearch(term=term, db="taxonomy", retmode="xml")
    try:
        record = Entrez.read(search)
    finally:
        search.close()

    id_list = record['IdList']
    if not id_list:
        raise IndexError(
            'no taxonomy id found for species: {}'.format(species))
    return id_list[0]
111 | 
112 | 
113 | # ================[ fetching method ]==================
def get_tax_data(taxid):
    """Fetch the taxonomy record(s) for a numeric taxid via Entrez efetch.

    Args:
        taxid: taxonomy ID given as a string of digits.

    Returns:
        A tuple ``(records, xml_text)``: the parsed records as a list and
        the raw XML text they were parsed from. The text is returned too
        because it is what gets cached.

    Raises:
        SystemExit: with status 1 if ``taxid`` is not made of digits only.
    """
    if not re.search(r'^\d+$', taxid):
        sys.stderr.write(
            '[ERROR] do you use species name as query? you may use flag: -n\n')
        # The flag file is deliberately left in place: the cache file was
        # truncated when it was opened for writing, so the next run must
        # treat it as unusable instead of trying to unpickle it.
        sys.exit(1)  # non-zero: this is an error exit

    search = Entrez.efetch(id=taxid, db="taxonomy", retmode="xml")

    # return Entrez.read(search) would be enough if nothing were pickled;
    # the raw XML is kept so that it can be stored in the cache.
    try:
        data_xml = search.read()
    finally:
        search.close()
    return list(Entrez.parse(StringIO(data_xml))), data_xml
128 | 
129 | 
130 | # ================[ fetching and outputing ]==================
def fetch_taxon(query):
    """Print one tab-separated taxon line for ``query``.

    The raw XML for ``query`` is taken from ``cache`` when present.
    Otherwise it is fetched (a species name is first resolved to a taxid
    when ``args.by_name`` is set) and stored in ``cache`` under the query
    itself and under its counterpart key (the resolved taxid, or the
    record's scientific name).

    Printed columns: taxid, name, division, GenBank common name (empty
    string when absent) and lineage.
    """
    if query not in cache:
        sys.stderr.write('[INFO] new query: {}\n'.format(query))

        if args.by_name:
            resolved_id = get_tax_id(query)
            records, xml_text = get_tax_data(resolved_id)
            alias = resolved_id
        else:
            records, xml_text = get_tax_data(query)
            alias = records[0]['ScientificName']

        # store the XML text (not the parsed object) so it can be pickled
        cache[alias] = xml_text
        cache[query] = xml_text
    else:
        sys.stderr.write('[INFO] cached query: {}\n'.format(query))
        records = list(Entrez.parse(StringIO(cache[query])))

    # output
    record = records[0]
    lineage = record['Lineage']
    division = record['Division']
    tax_id = record['TaxId']

    other_names = record['OtherNames'] if 'OtherNames' in record else {}
    if 'GenbankCommonName' in other_names:
        common_name = other_names['GenbankCommonName']
    else:
        common_name = ''

    scientific_name = record['ScientificName']

    if args.by_name:
        columns = [tax_id, query, division, common_name, lineage]
    else:
        columns = [query, scientific_name, division, common_name, lineage]
    print('\t'.join(columns))
170 | 
# ================[ read query list ]==================
Entrez.email = "tmp@gmail.com"

# One query per line of the input file: surrounding whitespace is trimmed
# and lines that end up empty are dropped.
with open(args.infile) as fh:
    trimmed_lines = (line.strip() for line in fh)
    species_list = [entry for entry in trimmed_lines if entry]
181 | 
# ================[ fetching ]==================
# Queries are processed one by one in this process, so every result lands
# in the module-level `cache` dict that is dumped below. No worker pool is
# created: it was never handed any work and was never closed or joined.
# An explicit loop is used instead of a bare map() call, which is lazy on
# Python 3 and would not run fetch_taxon at all.
try:
    for query in species_list:
        fetch_taxon(query)
finally:
    # ================[ caching ]==================
    # Dump whatever was fetched even if a query failed, so finished
    # lookups are not lost. The flag file is only removed after the dump
    # and close succeeded; if either raises, the flag stays behind and
    # marks the cache file as unusable.
    pickle.dump(cache, cache_fh, -1)
    cache_fh.close()
    if os.path.exists(flag_file):
        os.unlink(flag_file)
191 | 


--------------------------------------------------------------------------------
/util/unzipGBK:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import os
 5 | import sys
 6 | import zipfile
 7 | 
 8 | for file in sys.argv[1:]:    
 9 |     print "Processing File " + file
10 | 
11 |     file=zipfile.ZipFile(file,"r");
12 |     for name in file.namelist():
13 |         utf8name=name.decode('gbk')
14 |         print "Extracting " + utf8name
15 |         pathname = os.path.dirname(utf8name)
16 |         if not os.path.exists(pathname) and pathname!= "":
17 |             os.makedirs(pathname)
18 |         data = file.read(name)
19 |         if not os.path.exists(utf8name):
20 |             fo = open(utf8name, "w")
21 |             fo.write(data)
22 |             fo.close
23 |     file.close() 
24 | 


--------------------------------------------------------------------------------