├── .gitignore ├── LICENSE ├── README.md ├── biology └── bio_plasmid_get_insert.pl ├── blast ├── auto_blast ├── auto_makeblastdb ├── blast_best_hit.py ├── blast_best_hit_outfmt6.py └── fasta_rename_head_before_blast.pl ├── enzyme ├── embossre.enz ├── enzs.list ├── restrict_check_digested_sequence_number.pl ├── restrict_choose_enzyme_for_identify_genomes.pl ├── restrict_with_T_tail.pl ├── restrict_with_far_away_digest_site.pl └── restrict_without_digest_site_in_sequences.pl ├── file_formats ├── add_annotations_to_myva.pl ├── bam2gff.py ├── extract_cds_from_glimmer_predict_result.pl ├── extract_features_from_genbank_file.py ├── extract_sequence_from_genbank_file.pl ├── genbank_filter.py ├── gff2fa.py ├── gff_frame_start_coverage.plot.R ├── gff_frame_start_coverage.py └── gff_intersect.py ├── for_education ├── Parsing grouped data in multi-line.pl ├── extract_cds_by_gff.pl ├── fasta_common_seqs2.pl ├── fasta_extract_sequence_by_id_file.pl ├── join_table.pl └── simple_statistics.pl ├── not_used ├── csv2tab ├── csv_join ├── csv_join_paired_lines.py ├── csv_split_paired_lines.py ├── fasta_seq_gc_content_plot.py └── fasta_seq_length_plot.py ├── plot ├── README.md ├── example │ ├── data.tsv │ ├── data.tsv.dist.png │ ├── data.txt.png │ ├── heatmap.png │ └── plot_barplot.png ├── plot_distribution.R └── plot_distribution.py ├── protein └── protein_batch_compute_pI.pl ├── sequence ├── README.md ├── fasta2tab ├── fasta_common_seqs.pl ├── fasta_extract_by_pattern.pl ├── fasta_extract_randomly.pl ├── fasta_gc_skew.plot.R ├── fasta_gc_skew.py ├── fasta_locate_motif.pl ├── fasta_remove_duplicates.pl ├── fasta_rename_duplicated_names.pl ├── fasta_reset_start_position_for_circular_genome.pl ├── fasta_sliding_window.pl ├── fasta_trim_aligned_fasta.pl ├── fastq2tab ├── fastq_extract_paired_reads.pl ├── fastx_mapping_with_bwa.pl ├── fastx_pwm.py ├── fastx_tm.py ├── fastx_translate.py ├── run_clustalo.pl ├── sample │ ├── gc_skew.png │ ├── seq.fa │ └── seq.fq.gz ├── seqcomp ├── seqrc ├── 
seqrev ├── tab2fasta └── tab2fastq ├── taxon └── taxon_fetch.py └── util └── unzipGBK /.gitignore: -------------------------------------------------------------------------------- 1 | .directory 2 | /blib/ 3 | /.build/ 4 | _build/ 5 | cover_db/ 6 | inc/ 7 | Build 8 | !Build/ 9 | Build.bat 10 | .last_cover_stats 11 | /Makefile 12 | /Makefile.old 13 | /MANIFEST.bak 14 | /META.yml 15 | /META.json 16 | /MYMETA.* 17 | nytprof.out 18 | /pm_to_blib 19 | *.o 20 | *.bs 21 | /_eumm/ 22 | .directory 23 | *.idea 24 | 25 | 26 | # Byte-compiled / optimized / DLL files 27 | __pycache__/ 28 | *.py[cod] 29 | 30 | # C extensions 31 | *.so 32 | 33 | # Distribution / packaging 34 | .Python 35 | env/ 36 | build/ 37 | develop-eggs/ 38 | dist/ 39 | downloads/ 40 | eggs/ 41 | .eggs/ 42 | lib/ 43 | lib64/ 44 | parts/ 45 | sdist/ 46 | var/ 47 | *.egg-info/ 48 | .installed.cfg 49 | *.egg 50 | 51 | # PyInstaller 52 | # Usually these files are written by a python script from a template 53 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
54 | *.manifest 55 | *.spec 56 | 57 | # Installer logs 58 | pip-log.txt 59 | pip-delete-this-directory.txt 60 | 61 | # Unit test / coverage reports 62 | htmlcov/ 63 | .tox/ 64 | .coverage 65 | .cache 66 | nosetests.xml 67 | coverage.xml 68 | 69 | # Translations 70 | *.mo 71 | *.pot 72 | 73 | # Django stuff: 74 | *.log 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # History files 83 | .Rhistory 84 | .Rapp.history 85 | 86 | # Example code in package build process 87 | *-Ex.R 88 | 89 | # RStudio files 90 | .Rproj.user/ 91 | 92 | # produced vignettes 93 | vignettes/*.html 94 | vignettes/*.pdf 95 | 96 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 97 | .httr-oauth -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 Wei Shen (shenwei356@gmail.com) 2 | 3 | The MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Bio_scripts 2 | ======== 3 | 4 | Practical, reusable scripts for bioinformatics. 5 | 6 | sequence tools for FASTA/Q files: fasta2tab, tab2fasta, 7 | fasta_extract_by_pattern, fasta_common_seqs, 8 | fasta_locate_motif, fasta_remove_duplicates, 9 | fastx_translate, fasta_gc_skew.plot ... 10 | util moved to https://github.com/shenwei356/datakit 11 | plot plot scripts: heatmap ... 12 | 13 | blast NCBI BLAST+ wrappers 14 | file_formats genbank->gtf, bam2gff, gff2fa ... 15 | enzyme analysis of restriction enzymes 16 | protein Batch compute pI of protein 17 | 18 | biology get insert from Sanger sequencing results 19 | 20 | for_education scripts with detailed comments, for education 21 | not_used older versions of some scripts 22 | 23 | 24 | See the README in each subdirectory. 
25 | 26 | ------- 27 | 28 | Copyright (c) 2014-2015, Wei Shen (shenwei356@gmail.com) 29 | 30 | 31 | [MIT License](https://github.com/shenwei356/bio_scripts/blob/master/LICENSE) 32 | -------------------------------------------------------------------------------- /biology/bio_plasmid_get_insert.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use File::Basename; 5 | use BioUtil::Seq; 6 | 7 | # M13 8 | my $prefix = "AGCGGCCGCGAATTGCCCTT"; 9 | my $suffix = "AAGGGCAATTCGTTTAAACCT"; 10 | 11 | $0 = basename $0; 12 | my $usage = < 15 | 16 | USAGE 17 | 18 | die $usage unless @ARGV == 2; 19 | 20 | my $seqf = get_the_one_seq( shift @ARGV ); 21 | my $seqr = revcom (get_the_one_seq( shift @ARGV ) ); 22 | 23 | my $sf = extract_insert( $prefix, $suffix, $seqf ); 24 | my $sr = extract_insert( $prefix, $suffix, $seqr ); 25 | 26 | if ( $sf ne $sr ) { 27 | print "forward: $sf\nreverse: $sr\n"; 28 | die "forward and reverse sequences are not equal!"; 29 | } 30 | 31 | print $sf, "\n"; 32 | 33 | 34 | 35 | sub extract_insert { 36 | my ( $prefix, $suffix, $seq ) = @_; 37 | die "prefix and suffix do not match sequence!\n" 38 | unless $seq =~ /$prefix(.+)$suffix/; 39 | return $1; 40 | } 41 | 42 | sub get_the_one_seq { 43 | my ($file) = @_; 44 | my $seqs = read_sequence_from_fasta_file($file); 45 | die "only one sequence should be in $file. 
Please check it.\n" 46 | unless keys %$seqs == 1; 47 | return ( values %$seqs )[0]; 48 | } 49 | -------------------------------------------------------------------------------- /blast/auto_blast: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # https://github.com/shenwei356/bio_scripts 3 | 4 | use strict; 5 | use File::Basename; 6 | use BioUtil::Seq; 7 | use BioUtil::Util; 8 | 9 | my $usage = q( 10 | auto_blast -- just for typing fewer words..^_^ 11 | 12 | Usage: $0 [arguments] 13 | Options: 14 | -program [blastn] 15 | -query 16 | -db 17 | -outfmt [11 0] 18 | 19 | -h Show this help information 20 | 21 | Examples: 22 | 23 | auto_blast -query test.fa -db nt -outfmt 6 24 | 25 | https://github.com/shenwei356/bio_scripts 26 | 27 | ); 28 | 29 | my $opts = {}; 30 | my $threads = `cat /proc/cpuinfo | grep processor |wc -l`; 31 | $threads =~ s/\n//g; 32 | my $outfmt7shenwei356 33 | = '7 qseqid sseqid qlen slen length pident ppos qcovs qcovhsp' 34 | . ' mismatch gapopen gaps sstrand qstart qend sstart send' 35 | . 
' evalue bitscore staxids salltitles'; 36 | $$opts{-outfmt} = [ 0, $outfmt7shenwei356 ]; 37 | 38 | getopt( $opts, \@ARGV ); 39 | 40 | $$opts{-program} = 'blastn' unless exists $$opts{-program}; 41 | $$opts{-num_threads} = $threads unless exists $$opts{-num_threads}; 42 | 43 | # print "$_: $$opts{$_}\n" for sort keys %$opts; 44 | 45 | die $usage unless exists $$opts{-query} and exists $$opts{-db}; 46 | die "file not exists: $$opts{-query}\n" unless -e $$opts{-query}; 47 | 48 | my $info = sprintf "%s.%s@%s", quotemeta $$opts{-query}, $$opts{-program}, 49 | basename( $$opts{-db} ); 50 | $$opts{-out} = $info unless exists $$opts{-out}; 51 | 52 | my $cmd = ""; 53 | 54 | $cmd = $$opts{-program}; 55 | for ( sort keys %$opts ) { 56 | next 57 | if $_ eq '-program' 58 | or $_ eq '-outfmt' 59 | or $_ eq '-out' 60 | or $_ eq '-query'; 61 | 62 | if ( ref $$opts{$_} eq ref [] ) { 63 | $cmd .= " $_ ", join " ", @{ $$opts{$_} }; 64 | } 65 | else { 66 | $cmd .= " $_ $$opts{$_}"; 67 | } 68 | } 69 | my $file_outfmt11 = "$$opts{-out}.asn"; 70 | $cmd .= sprintf " -query %s -outfmt 11 -out %s", 71 | quotemeta $$opts{-query}, $file_outfmt11; 72 | 73 | print STDERR "$cmd\n"; 74 | my $fail = run($cmd); 75 | die "failed to run:$cmd\n" if $fail; 76 | 77 | for ( @{ $$opts{-outfmt} } ) { 78 | s/^\s+//g; 79 | my $outfmt = ( split( /\s+/, $_ ) )[0]; 80 | $cmd 81 | = "blast_formatter -archive $file_outfmt11 -outfmt \"$_\" > $$opts{-out}.outfmt$outfmt"; 82 | print STDERR "$cmd\n"; 83 | my $fail = run($cmd); 84 | die "failed to run:$cmd\n" if $fail; 85 | } 86 | -------------------------------------------------------------------------------- /blast/auto_makeblastdb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # https://github.com/shenwei356/bio_scripts 3 | 4 | use strict; 5 | use File::Basename; 6 | use BioUtil::Seq; 7 | use BioUtil::Util; 8 | 9 | my $usage = q( 10 | auto_makeblastdb -- just for typing fewer words..^_^ 11 | 12 | Usage: 
$0 [arguments] 13 | Options: 14 | -in 15 | -input_type [fasta] 16 | 17 | -dbtype [nucl] 18 | -out 19 | -title 20 | 21 | -parse_seqids [false] 22 | 23 | -mask [false] 24 | 25 | -h Show this help information 26 | 27 | Examples: 28 | 29 | auto_makeblastdb -in ab.fa -out ab 30 | 31 | https://github.com/shenwei356/bio_scripts 32 | 33 | ); 34 | 35 | my $opts = {}; 36 | 37 | getopt( $opts, \@ARGV ); 38 | 39 | die $usage unless exists $$opts{-in} and exists $$opts{-out}; 40 | die "file not exists: $$opts{-in}\n" unless -e $$opts{-in}; 41 | 42 | $$opts{-title} = $$opts{-out} unless exists $$opts{-title}; 43 | $$opts{-dbtype} = 'nucl' unless exists $$opts{-dbtype}; 44 | $$opts{-input_type} = 'fasta' unless exists $$opts{-input_type}; 45 | 46 | my $cmd = ''; 47 | my ( $file_mask_asnb, $file_mask_counts ) = (undef) x 2; 48 | if ( $$opts{-mask} ) { 49 | print STDERR "Create masking information using windowmask...\n"; 50 | $file_mask_counts = "$$opts{-out}_mask.counts"; 51 | $file_mask_asnb = "$$opts{-out}_mask.asnb"; 52 | $cmd 53 | = sprintf 54 | "windowmasker -in %s -infmt %s -mk_counts -out %s", 55 | quotemeta $$opts{-in}, $$opts{-input_type}, $file_mask_counts; 56 | $cmd .= " -parse_seqids" if $$opts{-parse_seqids}; 57 | my $fail = run($cmd); 58 | die "failed to run:$cmd\n" if $fail; 59 | 60 | print STDERR "Makeblastdb...\n"; 61 | $cmd 62 | = sprintf 63 | "windowmasker -in %s -infmt %s -ustat %s -outfmt %s -out %s", 64 | quotemeta $$opts{-in}, $$opts{-input_type}, $file_mask_counts, 65 | "maskinfo_asn1_bin", $file_mask_asnb; 66 | $cmd .= " -parse_seqids" if $$opts{-parse_seqids}; 67 | 68 | my $fail = run($cmd); 69 | die "failed to run:$cmd\n" if $fail; 70 | } 71 | 72 | $cmd 73 | = sprintf 74 | "makeblastdb -in %s -input_type %s -dbtype %s -out %s -title %s", 75 | quotemeta $$opts{-in}, $$opts{-input_type}, $$opts{-dbtype}, 76 | $$opts{-out}, $$opts{-title}; 77 | $cmd .= " -parse_seqids" if $$opts{-parse_seqids}; 78 | 79 | if ( $$opts{-mask} ) { 80 | $cmd .= " -mask_data 
$file_mask_asnb"; 81 | } 82 | 83 | print STDERR "$cmd\n"; 84 | my $fail = run($cmd); 85 | die "failed to run:$cmd\n" if $fail; 86 | -------------------------------------------------------------------------------- /blast/blast_best_hit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | 7 | if len(sys.argv) != 2: 8 | print "\nUsage: %s \n" % os.path.basename(sys.argv[0]) 9 | sys.exit(1) 10 | 11 | blast = sys.argv[1] 12 | 13 | with open(blast, 'r') as fp: 14 | init = "" 15 | for line in fp: 16 | if not line.startswith("#"): 17 | item = line.strip().split("\t") 18 | if init != item[0]: 19 | print line.strip() 20 | init = item[0] 21 | -------------------------------------------------------------------------------- /blast/blast_best_hit_outfmt6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # from https://github.com/jameslz/benchmark 4 | import sys 5 | 6 | if len(sys.argv) != 4: 7 | print "\nUsage: %s \n" % sys.argv[0] 8 | sys.exit(1) 9 | 10 | blast = sys.argv[1] 11 | max_evalue = float(sys.argv[2]) 12 | min_bit_score = float(sys.argv[3]) 13 | 14 | with open(blast, 'r') as fp: 15 | init = "" 16 | for line in fp: 17 | if not line.startswith("#"): 18 | item = line.strip().split("\t") 19 | evalue = float(item[10]) 20 | bit_score = float(item[11]) 21 | if init != item[0]: 22 | if evalue <= max_evalue and bit_score >= min_bit_score: 23 | print line.strip() 24 | init = item[0] 25 | -------------------------------------------------------------------------------- /blast/fasta_rename_head_before_blast.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Function: Delete illegal charactors of head line in fasta file before blast. 
4 | # Author : Wei Shen http://shenwei.me 5 | # Date : 2014-08-14 6 | 7 | use strict; 8 | use BioUtil::Seq; 9 | 10 | die "\nUsage: $0 fasta_file [fasta_file ...]\n\n" 11 | unless @ARGV > 0; 12 | 13 | while (@ARGV) { 14 | my $file = shift @ARGV; 15 | my $n = rename_fasta_header( '[^a-z\d\s\-\_\(\)\[\]\|]', '_', $file, 16 | "$file.rename.fa" ); 17 | print "$file: $n records renamed\n"; 18 | } 19 | -------------------------------------------------------------------------------- /enzyme/embossre.enz: -------------------------------------------------------------------------------- 1 | # 2 | # REBASE version 408 emboss_e.408 3 | # 4 | # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= 5 | # REBASE, The Restriction Enzyme Database http://rebase.neb.com 6 | # Copyright (c) Dr. Richard J. Roberts, 2014. All rights reserved. 7 | # =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= 8 | # 9 | # Rich Roberts Jul 30 2014 10 | # 11 | # REBASE enzyme patterns for EMBOSS (embossre.enz) 12 | # 13 | # Format: 14 | # 15 | # namepatternlenncutsbluntc1c2c3c4 16 | # 17 | # Where: 18 | # name = name of enzyme 19 | # pattern = recognition site 20 | # len = length of pattern 21 | # ncuts = number of cuts made by enzyme 22 | # Zero represents unknown 23 | # blunt = true if blunt end cut, false if sticky 24 | # c1 = First 5' cut 25 | # c2 = First 3' cut 26 | # c3 = Second 5' cut 27 | # c4 = Second 3' cut 28 | # 29 | # Examples: 30 | # AAC^TGG -> 6 2 1 3 3 0 0 31 | # A^ACTGG -> 6 2 0 1 5 0 0 32 | # AACTGG -> 6 0 0 0 0 0 0 33 | # AACTGG(-5/-1) -> 6 2 0 1 5 0 0 34 | # (8/13)GACNNNNNNTCA(12/7) -> 12 4 0 -9 -14 24 19 35 | # 36 | # i.e. cuts are always to the right of the given 37 | # residue and sequences are always with reference to 38 | # the 5' strand. 39 | # Sequences are numbered ... -3 -2 -1 1 2 3 ... with 40 | # the first residue of the pattern at base number 1. 
41 | # 42 | # 43 | AanI TTATAA 6 2 1 3 3 0 0 44 | AarI CACCTGC 7 2 0 11 15 0 0 45 | AasI GACNNNNNNGTC 12 2 0 7 5 0 0 46 | AatII GACGTC 6 2 0 5 1 0 0 47 | AbaCIII ctatcav 7 0 0 0 0 0 0 48 | AbaSI C 1 2 0 12 10 0 0 49 | AbsI CCTCGAGG 8 2 0 2 6 0 0 50 | AccI GTMKAC 6 2 0 2 4 0 0 51 | AccII CGCG 4 2 1 2 2 0 0 52 | AccIII TCCGGA 6 2 0 1 5 0 0 53 | Acc16I TGCGCA 6 2 1 3 3 0 0 54 | Acc36I ACCTGC 6 2 0 10 14 0 0 55 | Acc65I GGTACC 6 2 0 1 5 0 0 56 | AccB1I GGYRCC 6 2 0 1 5 0 0 57 | AccB7I CCANNNNNTGG 11 2 0 7 4 0 0 58 | AccBSI CCGCTC 6 2 1 3 3 0 0 59 | AceIII cagctc 6 2 0 13 17 0 0 60 | AciI CCGC 4 2 0 1 3 0 0 61 | AclI AACGTT 6 2 0 2 4 0 0 62 | AclWI GGATC 5 2 0 9 10 0 0 63 | AcoI YGGCCR 6 2 0 1 5 0 0 64 | AcsI RAATTY 6 2 0 1 5 0 0 65 | AcuI CTGAAG 6 2 0 22 20 0 0 66 | AcvI CACGTG 6 2 1 3 3 0 0 67 | AcyI GRCGYC 6 2 0 2 4 0 0 68 | AdeI CACNNNGTG 9 2 0 6 3 0 0 69 | AfaI GTAC 4 2 1 2 2 0 0 70 | AfeI AGCGCT 6 2 1 3 3 0 0 71 | AfiI CCNNNNNNNGG 11 2 0 7 4 0 0 72 | AflII CTTAAG 6 2 0 1 5 0 0 73 | AflIII ACRYGT 6 2 0 1 5 0 0 74 | AgeI ACCGGT 6 2 0 1 5 0 0 75 | AgsI TTSAA 5 2 0 3 2 0 0 76 | AhaIII tttaaa 6 2 1 3 3 0 0 77 | AhdI GACNNNNNGTC 11 2 0 6 5 0 0 78 | AhlI ACTAGT 6 2 0 1 5 0 0 79 | AjiI CACGTC 6 2 1 3 3 0 0 80 | AjnI CCWGG 5 2 0 -1 5 0 0 81 | AjuI GAANNNNNNNTTGG 14 4 0 -8 -13 25 20 82 | AleI CACNNNNGTG 10 2 1 5 5 0 0 83 | AlfI GCANNNNNNTGC 12 4 0 -11 -13 24 22 84 | AloI GAACNNNNNNTCC 13 4 0 -8 -13 25 20 85 | AluI AGCT 4 2 1 2 2 0 0 86 | AluBI AGCT 4 2 1 2 2 0 0 87 | AlwI GGATC 5 2 0 9 10 0 0 88 | Alw21I GWGCWC 6 2 0 5 1 0 0 89 | Alw26I GTCTC 5 2 0 6 10 0 0 90 | Alw44I GTGCAC 6 2 0 1 5 0 0 91 | AlwFI gaaaynnnnnrtg 13 0 0 0 0 0 0 92 | AlwNI CAGNNNCTG 9 2 0 6 3 0 0 93 | Ama87I CYCGRG 6 2 0 1 5 0 0 94 | Aor13HI TCCGGA 6 2 0 1 5 0 0 95 | Aor51HI AGCGCT 6 2 1 3 3 0 0 96 | AoxI ggcc 4 2 0 -1 4 0 0 97 | ApaI GGGCCC 6 2 0 5 1 0 0 98 | ApaBI gcannnnntgc 11 2 0 8 3 0 0 99 | ApaLI GTGCAC 6 2 0 1 5 0 0 100 | ApeKI GCWGC 5 2 0 1 4 0 0 101 | ApoI RAATTY 6 2 0 1 5 0 0 102 | ApyPI atcgac 6 
2 0 26 24 0 0 103 | AquII gccgnac 7 2 0 27 25 0 0 104 | AquIII gaggag 6 2 0 26 24 0 0 105 | AquIV grggaag 7 2 0 26 24 0 0 106 | ArsI GACNNNNNNTTYG 13 4 0 -9 -14 24 19 107 | AscI GGCGCGCC 8 2 0 2 6 0 0 108 | AseI ATTAAT 6 2 0 2 4 0 0 109 | Asi256I gatc 4 2 0 1 3 0 0 110 | AsiGI ACCGGT 6 2 0 1 5 0 0 111 | AsiSI GCGATCGC 8 2 0 5 3 0 0 112 | Asp700I GAANNNNTTC 10 2 1 5 5 0 0 113 | Asp718I GGTACC 6 2 0 1 5 0 0 114 | AspA2I CCTAGG 6 2 0 1 5 0 0 115 | AspBHI yscns 5 2 0 13 17 0 0 116 | AspLEI GCGC 4 2 0 3 1 0 0 117 | AspS9I GGNCC 5 2 0 1 4 0 0 118 | AssI AGTACT 6 2 1 3 3 0 0 119 | AsuI ggncc 5 2 0 1 4 0 0 120 | AsuII TTCGAA 6 2 0 2 4 0 0 121 | AsuC2I CCSGG 5 2 0 2 3 0 0 122 | AsuHPI GGTGA 5 2 0 13 12 0 0 123 | AsuNHI GCTAGC 6 2 0 1 5 0 0 124 | AvaI CYCGRG 6 2 0 1 5 0 0 125 | AvaII GGWCC 5 2 0 1 4 0 0 126 | AvaIII atgcat 6 0 0 0 0 0 0 127 | AvrII CCTAGG 6 2 0 1 5 0 0 128 | AxyI CCTNAGG 7 2 0 2 5 0 0 129 | BaeI ACNNNNGTAYC 11 4 0 -11 -16 23 18 130 | BaeGI GKGCMC 6 2 0 5 1 0 0 131 | BalI TGGCCA 6 2 1 3 3 0 0 132 | BamHI GGATCC 6 2 0 1 5 0 0 133 | BanI GGYRCC 6 2 0 1 5 0 0 134 | BanII GRGCYC 6 2 0 5 1 0 0 135 | BanLI rtcagg 6 0 0 0 0 0 0 136 | BarI GAAGNNNNNNTAC 13 4 0 -8 -13 25 20 137 | BasI CCANNNNNTGG 11 2 0 7 4 0 0 138 | BauI CACGAG 6 2 0 1 5 0 0 139 | Bbr7I gaagac 6 2 0 13 17 0 0 140 | BbrPI CACGTG 6 2 1 3 3 0 0 141 | BbsI GAAGAC 6 2 0 8 12 0 0 142 | BbvI GCAGC 5 2 0 13 17 0 0 143 | BbvII gaagac 6 2 0 8 12 0 0 144 | Bbv12I GWGCWC 6 2 0 5 1 0 0 145 | BbvCI CCTCAGC 7 2 0 2 5 0 0 146 | BccI CCATC 5 2 0 9 10 0 0 147 | Bce83I cttgag 6 2 0 22 20 0 0 148 | BceAI ACGGC 5 2 0 17 19 0 0 149 | R1.BceSIV gcagc 5 4 0 -8 -6 14 16 150 | BcefI acggc 5 2 0 17 18 0 0 151 | BcgI CGANNNNNNTGC 12 4 0 -11 -13 24 22 152 | BciT130I CCWGG 5 2 0 2 3 0 0 153 | BciVI GTATCC 6 2 0 12 11 0 0 154 | BclI TGATCA 6 2 0 1 5 0 0 155 | BcnI CCSGG 5 2 0 2 3 0 0 156 | BcoDI GTCTC 5 2 0 6 10 0 0 157 | BcuI ACTAGT 6 2 0 1 5 0 0 158 | BdaI tgannnnnntca 12 4 0 -11 -13 24 22 159 | BetI wccggw 6 2 0 1 5 0 0 160 | 
BfaI CTAG 4 2 0 1 3 0 0 161 | BfaSII ganggag 7 0 0 0 0 0 0 162 | BfiI actggg 6 2 0 11 10 0 0 163 | BfmI CTRYAG 6 2 0 1 5 0 0 164 | BfoI RGCGCY 6 2 0 5 1 0 0 165 | BfrI CTTAAG 6 2 0 1 5 0 0 166 | BfuI GTATCC 6 2 0 12 11 0 0 167 | BfuAI ACCTGC 6 2 0 10 14 0 0 168 | BfuCI GATC 4 2 0 -1 4 0 0 169 | BglI GCCNNNNNGGC 11 2 0 7 4 0 0 170 | BglII AGATCT 6 2 0 1 5 0 0 171 | BinI ggatc 5 2 0 9 10 0 0 172 | BisI GCNGC 5 2 0 2 3 0 0 173 | BlnI CCTAGG 6 2 0 1 5 0 0 174 | BlpI GCTNAGC 7 2 0 2 5 0 0 175 | BlsI GCNGC 5 2 0 3 2 0 0 176 | BmcAI AGTACT 6 2 1 3 3 0 0 177 | Bme18I GGWCC 5 2 0 1 4 0 0 178 | Bme1390I CCNGG 5 2 0 2 3 0 0 179 | BmeDI c 1 2 0 3 1 0 0 180 | BmeRI GACNNNNNGTC 11 2 0 6 5 0 0 181 | BmeT110I CYCGRG 6 2 0 1 5 0 0 182 | BmgI gkgccc 6 0 0 0 0 0 0 183 | BmgBI CACGTC 6 2 1 3 3 0 0 184 | BmgT120I GGNCC 5 2 0 1 4 0 0 185 | BmiI GGNNCC 6 2 1 3 3 0 0 186 | BmrI ACTGGG 6 2 0 11 10 0 0 187 | BmrFI CCNGG 5 2 0 2 3 0 0 188 | BmsI GCATC 5 2 0 10 14 0 0 189 | BmtI GCTAGC 6 2 0 5 1 0 0 190 | BmuI ACTGGG 6 2 0 11 10 0 0 191 | BoxI GACNNNNGTC 10 2 1 5 5 0 0 192 | BpiI GAAGAC 6 2 0 8 12 0 0 193 | BplI GAGNNNNNCTC 11 4 0 -9 -14 24 19 194 | BpmI CTGGAG 6 2 0 22 20 0 0 195 | Bpu10I CCTNAGC 7 2 0 2 5 0 0 196 | Bpu14I TTCGAA 6 2 0 2 4 0 0 197 | Bpu1102I GCTNAGC 7 2 0 2 5 0 0 198 | BpuEI CTTGAG 6 2 0 22 20 0 0 199 | BpuMI CCSGG 5 2 0 2 3 0 0 200 | BpvUI CGATCG 6 2 0 4 2 0 0 201 | BsaI GGTCTC 6 2 0 7 11 0 0 202 | Bsa29I ATCGAT 6 2 0 2 4 0 0 203 | BsaAI YACGTR 6 2 1 3 3 0 0 204 | BsaBI GATNNNNATC 10 2 1 5 5 0 0 205 | BsaHI GRCGYC 6 2 0 2 4 0 0 206 | BsaJI CCNNGG 6 2 0 1 5 0 0 207 | BsaWI WCCGGW 6 2 0 1 5 0 0 208 | BsaXI ACNNNNNCTCC 11 4 0 -10 -13 21 18 209 | BsbI caacac 6 2 0 27 25 0 0 210 | Bsc4I CCNNNNNNNGG 11 2 0 7 4 0 0 211 | BscAI gcatc 5 2 0 9 11 0 0 212 | BscGI cccgt 5 0 0 0 0 0 0 213 | Bse1I ACTGG 5 2 0 6 4 0 0 214 | Bse8I GATNNNNATC 10 2 1 5 5 0 0 215 | Bse21I CCTNAGG 7 2 0 2 5 0 0 216 | Bse118I RCCGGY 6 2 0 1 5 0 0 217 | BseAI TCCGGA 6 2 0 1 5 0 0 218 | BseBI CCWGG 5 2 0 2 3 0 0 
219 | BseCI ATCGAT 6 2 0 2 4 0 0 220 | BseDI CCNNGG 6 2 0 1 5 0 0 221 | Bse3DI GCAATG 6 2 0 8 6 0 0 222 | BseGI GGATG 5 2 0 7 5 0 0 223 | BseJI GATNNNNATC 10 2 1 5 5 0 0 224 | BseLI CCNNNNNNNGG 11 2 0 7 4 0 0 225 | BseMI GCAATG 6 2 0 8 6 0 0 226 | BseMII CTCAG 5 2 0 15 13 0 0 227 | BseNI ACTGG 5 2 0 6 4 0 0 228 | BsePI GCGCGC 6 2 0 1 5 0 0 229 | BseRI GAGGAG 6 2 0 16 14 0 0 230 | BseSI GKGCMC 6 2 0 5 1 0 0 231 | BseXI GCAGC 5 2 0 13 17 0 0 232 | BseX3I CGGCCG 6 2 0 1 5 0 0 233 | BseYI CCCAGC 6 2 0 1 5 0 0 234 | BsgI GTGCAG 6 2 0 22 20 0 0 235 | Bsh1236I CGCG 4 2 1 2 2 0 0 236 | Bsh1285I CGRYCG 6 2 0 4 2 0 0 237 | BshFI GGCC 4 2 1 2 2 0 0 238 | BshNI GGYRCC 6 2 0 1 5 0 0 239 | BshTI ACCGGT 6 2 0 1 5 0 0 240 | BshVI ATCGAT 6 2 0 2 4 0 0 241 | BsiI cacgag 6 2 0 1 5 0 0 242 | BsiEI CGRYCG 6 2 0 4 2 0 0 243 | BsiHKAI GWGCWC 6 2 0 5 1 0 0 244 | BsiHKCI CYCGRG 6 2 0 1 5 0 0 245 | BsiSI CCGG 4 2 0 1 3 0 0 246 | BsiWI CGTACG 6 2 0 1 5 0 0 247 | BsiYI ccnnnnnnngg 11 2 0 7 4 0 0 248 | BslI CCNNNNNNNGG 11 2 0 7 4 0 0 249 | BslFI GGGAC 5 2 0 15 19 0 0 250 | BsmI GAATGC 6 2 0 7 5 0 0 251 | BsmAI GTCTC 5 2 0 6 10 0 0 252 | BsmBI CGTCTC 6 2 0 7 11 0 0 253 | BsmFI GGGAC 5 2 0 15 19 0 0 254 | BsnI GGCC 4 2 1 2 2 0 0 255 | Bso31I GGTCTC 6 2 0 7 11 0 0 256 | BsoBI CYCGRG 6 2 0 1 5 0 0 257 | Bsp13I TCCGGA 6 2 0 1 5 0 0 258 | Bsp19I CCATGG 6 2 0 1 5 0 0 259 | Bsp24I gacnnnnnntgg 12 4 0 -9 -14 24 19 260 | Bsp68I TCGCGA 6 2 1 3 3 0 0 261 | Bsp119I TTCGAA 6 2 0 2 4 0 0 262 | Bsp120I GGGCCC 6 2 0 1 5 0 0 263 | Bsp143I GATC 4 2 0 -1 4 0 0 264 | Bsp1286I GDGCHC 6 2 0 5 1 0 0 265 | Bsp1407I TGTACA 6 2 0 1 5 0 0 266 | Bsp1720I GCTNAGC 7 2 0 2 5 0 0 267 | BspACI CCGC 4 2 0 1 3 0 0 268 | BspCNI CTCAG 5 2 0 14 12 0 0 269 | BspDI ATCGAT 6 2 0 2 4 0 0 270 | BspD6I gactc 5 2 0 9 11 0 0 271 | BspEI TCCGGA 6 2 0 1 5 0 0 272 | BspFNI CGCG 4 2 1 2 2 0 0 273 | BspGI ctggac 6 0 0 0 0 0 0 274 | BspHI TCATGA 6 2 0 1 5 0 0 275 | BspLI GGNNCC 6 2 1 3 3 0 0 276 | BspLU11I acatgt 6 2 0 1 5 0 0 277 | BspMI ACCTGC 
6 2 0 10 14 0 0 278 | BspMII tccgga 6 2 0 1 5 0 0 279 | BspNCI ccaga 5 0 0 0 0 0 0 280 | BspOI GCTAGC 6 2 0 5 1 0 0 281 | BspPI GGATC 5 2 0 9 10 0 0 282 | BspQI GCTCTTC 7 2 0 8 11 0 0 283 | BspTI CTTAAG 6 2 0 1 5 0 0 284 | BspT104I TTCGAA 6 2 0 2 4 0 0 285 | BspT107I GGYRCC 6 2 0 1 5 0 0 286 | BsrI ACTGG 5 2 0 6 4 0 0 287 | BsrBI CCGCTC 6 2 1 3 3 0 0 288 | BsrDI GCAATG 6 2 0 8 6 0 0 289 | BsrFI RCCGGY 6 2 0 1 5 0 0 290 | BsrGI TGTACA 6 2 0 1 5 0 0 291 | BsrSI ACTGG 5 2 0 6 4 0 0 292 | BssAI RCCGGY 6 2 0 1 5 0 0 293 | BssECI CCNNGG 6 2 0 1 5 0 0 294 | BssHII GCGCGC 6 2 0 1 5 0 0 295 | BssKI CCNGG 5 2 0 -1 5 0 0 296 | BssMI GATC 4 2 0 -1 4 0 0 297 | BssNI GRCGYC 6 2 0 2 4 0 0 298 | BssNAI GTATAC 6 2 1 3 3 0 0 299 | BssSI CACGAG 6 2 0 1 5 0 0 300 | BssT1I CCWWGG 6 2 0 1 5 0 0 301 | Bst6I CTCTTC 6 2 0 7 10 0 0 302 | Bst1107I GTATAC 6 2 1 3 3 0 0 303 | BstACI GRCGYC 6 2 0 2 4 0 0 304 | BstAFI CTTAAG 6 2 0 1 5 0 0 305 | BstAPI GCANNNNNTGC 11 2 0 7 4 0 0 306 | BstAUI TGTACA 6 2 0 1 5 0 0 307 | BstBI TTCGAA 6 2 0 2 4 0 0 308 | Bst2BI CACGAG 6 2 0 1 5 0 0 309 | BstBAI YACGTR 6 2 1 3 3 0 0 310 | Bst4CI ACNGT 5 2 0 3 2 0 0 311 | BstC8I GCNNGC 6 2 1 3 3 0 0 312 | BstDEI CTNAG 5 2 0 1 4 0 0 313 | BstDSI CCRYGG 6 2 0 1 5 0 0 314 | BstEII GGTNACC 7 2 0 1 6 0 0 315 | BstENI CCTNNNNNAGG 11 2 0 5 6 0 0 316 | BstF5I GGATG 5 2 0 7 5 0 0 317 | BstFNI CGCG 4 2 1 2 2 0 0 318 | BstH2I RGCGCY 6 2 0 5 1 0 0 319 | BstHHI GCGC 4 2 0 3 1 0 0 320 | BstKTI GATC 4 2 0 3 1 0 0 321 | BstMAI GTCTC 5 2 0 6 10 0 0 322 | BstMBI GATC 4 2 0 -1 4 0 0 323 | BstMCI CGRYCG 6 2 0 4 2 0 0 324 | BstMWI GCNNNNNNNGC 11 2 0 7 4 0 0 325 | BstNI CCWGG 5 2 0 2 3 0 0 326 | BstNSI RCATGY 6 2 0 5 1 0 0 327 | BstOI CCWGG 5 2 0 2 3 0 0 328 | BstPI GGTNACC 7 2 0 1 6 0 0 329 | BstPAI GACNNNNGTC 10 2 1 5 5 0 0 330 | BstSCI CCNGG 5 2 0 -1 5 0 0 331 | BstSFI CTRYAG 6 2 0 1 5 0 0 332 | BstSLI GKGCMC 6 2 0 5 1 0 0 333 | BstSNI TACGTA 6 2 1 3 3 0 0 334 | BstUI CGCG 4 2 1 2 2 0 0 335 | Bst2UI CCWGG 5 2 0 2 3 0 0 336 | BstV1I GCAGC 
5 2 0 13 17 0 0 337 | BstV2I GAAGAC 6 2 0 8 12 0 0 338 | BstXI CCANNNNNNTGG 12 2 0 8 4 0 0 339 | BstX2I RGATCY 6 2 0 1 5 0 0 340 | BstYI RGATCY 6 2 0 1 5 0 0 341 | BstZI CGGCCG 6 2 0 1 5 0 0 342 | BstZ17I GTATAC 6 2 1 3 3 0 0 343 | BsuI GTATCC 6 2 0 12 11 0 0 344 | Bsu15I ATCGAT 6 2 0 2 4 0 0 345 | Bsu36I CCTNAGG 7 2 0 2 5 0 0 346 | BsuRI GGCC 4 2 1 2 2 0 0 347 | BtgI CCRYGG 6 2 0 1 5 0 0 348 | BtgZI GCGATG 6 2 0 16 20 0 0 349 | BthCI gcngc 5 2 0 4 1 0 0 350 | BtrI CACGTC 6 2 1 3 3 0 0 351 | BtsIMutI CAGTG 5 2 0 7 5 0 0 352 | BtsI GCAGTG 6 2 0 8 6 0 0 353 | BtsCI GGATG 5 2 0 7 5 0 0 354 | BtuMI TCGCGA 6 2 1 3 3 0 0 355 | BveI ACCTGC 6 2 0 10 14 0 0 356 | Cac8I GCNNGC 6 2 1 3 3 0 0 357 | CaiI CAGNNNCTG 9 2 0 6 3 0 0 358 | CauII ccsgg 5 2 0 2 3 0 0 359 | CchII ggarga 6 2 0 17 15 0 0 360 | CchIII cccaag 6 2 0 26 24 0 0 361 | CciI TCATGA 6 2 0 1 5 0 0 362 | CciNI GCGGCCGC 8 2 0 2 6 0 0 363 | CcrNAIII cgaccag 7 0 0 0 0 0 0 364 | CdiI catcg 5 2 1 4 4 0 0 365 | Cdi630V caaaaa 6 0 0 0 0 0 0 366 | CdpI gcggag 6 2 0 26 24 0 0 367 | CfoI GCGC 4 2 0 3 1 0 0 368 | CfrI yggccr 6 2 0 1 5 0 0 369 | Cfr9I CCCGGG 6 2 0 1 5 0 0 370 | Cfr10I RCCGGY 6 2 0 1 5 0 0 371 | Cfr13I GGNCC 5 2 0 1 4 0 0 372 | Cfr42I CCGCGG 6 2 0 4 2 0 0 373 | Cgl13032I ggcgca 6 0 0 0 0 0 0 374 | Cgl13032II acgabgg 7 0 0 0 0 0 0 375 | ChaI gatc 4 2 0 4 -1 0 0 376 | CjeI ccannnnnngt 11 4 0 -9 -15 26 20 377 | CjeFIII gcaagg 6 0 0 0 0 0 0 378 | CjeFV ggrca 5 0 0 0 0 0 0 379 | CjeNII gagnnnnngt 10 0 0 0 0 0 0 380 | CjeNIII gkaayg 6 2 0 25 23 0 0 381 | CjePI ccannnnnnntc 12 4 0 -8 -14 26 20 382 | CjeP659IV cacnnnnnnngaa 13 0 0 0 0 0 0 383 | CjuI caynnnnnrtg 11 0 0 0 0 0 0 384 | CjuII caynnnnnctc 11 0 0 0 0 0 0 385 | ClaI ATCGAT 6 2 0 2 4 0 0 386 | CpoI CGGWCCG 7 2 0 2 5 0 0 387 | CseI GACGC 5 2 0 10 15 0 0 388 | CsiI ACCWGGT 7 2 0 1 6 0 0 389 | CspI CGGWCCG 7 2 0 2 5 0 0 390 | Csp6I GTAC 4 2 0 1 3 0 0 391 | CspAI ACCGGT 6 2 0 1 5 0 0 392 | CspCI CAANNNNNGTGG 12 4 0 -12 -14 24 22 393 | CstMI aaggag 6 2 0 26 24 0 0 
394 | CviAII CATG 4 2 0 1 3 0 0 395 | CviJI RGCY 4 2 1 2 2 0 0 396 | CviKI-1 RGCY 4 2 1 2 2 0 0 397 | CviQI GTAC 4 2 0 1 3 0 0 398 | CviRI tgca 4 2 1 2 2 0 0 399 | DdeI CTNAG 5 2 0 1 4 0 0 400 | DinI GGCGCC 6 2 1 3 3 0 0 401 | DpnI GATC 4 2 1 2 2 0 0 402 | DpnII GATC 4 2 0 -1 4 0 0 403 | DraI TTTAAA 6 2 1 3 3 0 0 404 | DraII rggnccy 7 2 0 2 5 0 0 405 | DraIII CACNNNGTG 9 2 0 6 3 0 0 406 | DraRI caagnac 7 2 0 27 25 0 0 407 | DrdI GACNNNNNNGTC 12 2 0 7 5 0 0 408 | DrdII gaacca 6 0 0 0 0 0 0 409 | DriI GACNNNNNGTC 11 2 0 6 5 0 0 410 | DsaI ccrygg 6 2 0 1 5 0 0 411 | DseDI GACNNNNNNGTC 12 2 0 7 5 0 0 412 | EaeI YGGCCR 6 2 0 1 5 0 0 413 | EagI CGGCCG 6 2 0 1 5 0 0 414 | Eam1104I CTCTTC 6 2 0 7 10 0 0 415 | Eam1105I GACNNNNNGTC 11 2 0 6 5 0 0 416 | EarI CTCTTC 6 2 0 7 10 0 0 417 | EciI GGCGGA 6 2 0 17 15 0 0 418 | Ecl136II GAGCTC 6 2 1 3 3 0 0 419 | EclXI CGGCCG 6 2 0 1 5 0 0 420 | Eco24I GRGCYC 6 2 0 5 1 0 0 421 | Eco31I GGTCTC 6 2 0 7 11 0 0 422 | Eco32I GATATC 6 2 1 3 3 0 0 423 | Eco47I GGWCC 5 2 0 1 4 0 0 424 | Eco47III AGCGCT 6 2 1 3 3 0 0 425 | Eco52I CGGCCG 6 2 0 1 5 0 0 426 | Eco57I CTGAAG 6 2 0 22 20 0 0 427 | Eco72I CACGTG 6 2 1 3 3 0 0 428 | Eco81I CCTNAGG 7 2 0 2 5 0 0 429 | Eco88I CYCGRG 6 2 0 1 5 0 0 430 | Eco91I GGTNACC 7 2 0 1 6 0 0 431 | Eco105I TACGTA 6 2 1 3 3 0 0 432 | Eco130I CCWWGG 6 2 0 1 5 0 0 433 | Eco147I AGGCCT 6 2 1 3 3 0 0 434 | EcoHI ccsgg 5 2 0 -1 5 0 0 435 | EcoICRI GAGCTC 6 2 1 3 3 0 0 436 | Eco57MI ctgrag 6 2 0 22 20 0 0 437 | EcoNI CCTNNNNNAGG 11 2 0 5 6 0 0 438 | EcoO65I GGTNACC 7 2 0 1 6 0 0 439 | EcoO109I RGGNCCY 7 2 0 2 5 0 0 440 | EcoRI GAATTC 6 2 0 1 5 0 0 441 | EcoRII CCWGG 5 2 0 -1 5 0 0 442 | EcoRV GATATC 6 2 1 3 3 0 0 443 | EcoT14I CCWWGG 6 2 0 1 5 0 0 444 | EcoT22I ATGCAT 6 2 0 5 1 0 0 445 | EcoT38I GRGCYC 6 2 0 5 1 0 0 446 | Eco53kI GAGCTC 6 2 1 3 3 0 0 447 | EgeI GGCGCC 6 2 1 3 3 0 0 448 | EheI GGCGCC 6 2 1 3 3 0 0 449 | ErhI CCWWGG 6 2 0 1 5 0 0 450 | EsaBC3I tcga 4 2 1 2 2 0 0 451 | EsaSSI gaccac 6 0 0 0 0 0 0 452 | EspI 
gctnagc 7 2 0 2 5 0 0 453 | Esp3I CGTCTC 6 2 0 7 11 0 0 454 | FaeI CATG 4 2 0 4 -1 0 0 455 | FaiI YATR 4 2 1 2 2 0 0 456 | FalI AAGNNNNNCTT 11 4 0 -9 -14 24 19 457 | FaqI GGGAC 5 2 0 15 19 0 0 458 | FatI CATG 4 2 0 -1 4 0 0 459 | FauI CCCGC 5 2 0 9 11 0 0 460 | FauNDI CATATG 6 2 0 2 4 0 0 461 | FbaI TGATCA 6 2 0 1 5 0 0 462 | FblI GTMKAC 6 2 0 2 4 0 0 463 | FinI gggac 5 0 0 0 0 0 0 464 | FmuI ggncc 5 2 0 4 1 0 0 465 | FnuDII cgcg 4 2 1 2 2 0 0 466 | Fnu4HI GCNGC 5 2 0 2 3 0 0 467 | FokI GGATG 5 2 0 14 18 0 0 468 | FriOI GRGCYC 6 2 0 5 1 0 0 469 | FseI GGCCGGCC 8 2 0 6 2 0 0 470 | FspI TGCGCA 6 2 1 3 3 0 0 471 | FspAI RTGCGCAY 8 2 1 4 4 0 0 472 | FspBI CTAG 4 2 0 1 3 0 0 473 | FspEI CC 2 2 0 14 18 0 0 474 | Fsp4HI GCNGC 5 2 0 2 3 0 0 475 | GauT27I cgcgcagg 8 0 0 0 0 0 0 476 | GdiII cggccr 6 2 0 1 5 0 0 477 | GlaI GCGC 4 2 1 2 2 0 0 478 | GluI GCNGC 5 2 0 2 3 0 0 479 | GsaI CCCAGC 6 2 0 5 1 0 0 480 | GsuI CTGGAG 6 2 0 22 20 0 0 481 | HaeI wggccw 6 2 1 3 3 0 0 482 | HaeII RGCGCY 6 2 0 5 1 0 0 483 | HaeIII GGCC 4 2 1 2 2 0 0 484 | HaeIV gaynnnnnrtc 11 4 0 -8 -14 25 20 485 | HapII CCGG 4 2 0 1 3 0 0 486 | HauII tggcca 6 2 0 17 15 0 0 487 | HgaI GACGC 5 2 0 10 15 0 0 488 | HgiAI gwgcwc 6 2 0 5 1 0 0 489 | HgiCI ggyrcc 6 2 0 1 5 0 0 490 | HgiEII accnnnnnnggt 12 0 0 0 0 0 0 491 | HgiJII grgcyc 6 2 0 5 1 0 0 492 | HhaI GCGC 4 2 0 3 1 0 0 493 | Hin1I GRCGYC 6 2 0 2 4 0 0 494 | Hin1II CATG 4 2 0 4 -1 0 0 495 | Hin4I gaynnnnnvtc 11 4 0 -9 -14 24 19 496 | Hin4II ccttc 5 2 0 11 10 0 0 497 | Hin6I GCGC 4 2 0 1 3 0 0 498 | HinP1I GCGC 4 2 0 1 3 0 0 499 | HincII GTYRAC 6 2 1 3 3 0 0 500 | HindII GTYRAC 6 2 1 3 3 0 0 501 | HindIII AAGCTT 6 2 0 1 5 0 0 502 | HinfI GANTC 5 2 0 1 4 0 0 503 | HpaI GTTAAC 6 2 1 3 3 0 0 504 | HpaII CCGG 4 2 0 1 3 0 0 505 | HphI GGTGA 5 2 0 13 12 0 0 506 | Hpy8I GTNNAC 6 2 1 3 3 0 0 507 | Hpy99I CGWCG 5 2 0 5 -1 0 0 508 | Hpy99XIII gccta 5 0 0 0 0 0 0 509 | Hpy99XIV ggwtaa 6 0 0 0 0 0 0 510 | Hpy99XIV-mut1 ggwcna 6 0 0 0 0 0 0 511 | Hpy99XXII tcannnnnntrg 
12 0 0 0 0 0 0 512 | Hpy166II GTNNAC 6 2 1 3 3 0 0 513 | Hpy178III tcnnga 6 2 0 2 4 0 0 514 | Hpy188I TCNGA 5 2 0 3 2 0 0 515 | Hpy188III TCNNGA 6 2 0 2 4 0 0 516 | HpyAV CCTTC 5 2 0 11 10 0 0 517 | HpyAXIV gcgta 5 0 0 0 0 0 0 518 | HpyAXVI-mut1 crttaa 6 0 0 0 0 0 0 519 | HpyAXVI-mut2 crtcna 6 0 0 0 0 0 0 520 | HpyCH4III ACNGT 5 2 0 3 2 0 0 521 | HpyCH4IV ACGT 4 2 0 1 3 0 0 522 | HpyCH4V TGCA 4 2 1 2 2 0 0 523 | HpyF3I CTNAG 5 2 0 1 4 0 0 524 | HpyF10VI GCNNNNNNNGC 11 2 0 7 4 0 0 525 | HpySE526I ACGT 4 2 0 1 3 0 0 526 | Hsp92I GRCGYC 6 2 0 2 4 0 0 527 | Hsp92II CATG 4 2 0 4 -1 0 0 528 | HspAI GCGC 4 2 0 1 3 0 0 529 | Jma19592I gtatnac 7 0 0 0 0 0 0 530 | KasI GGCGCC 6 2 0 1 5 0 0 531 | KflI GGGWCCC 7 2 0 2 5 0 0 532 | KpnI GGTACC 6 2 0 5 1 0 0 533 | Kpn2I TCCGGA 6 2 0 1 5 0 0 534 | KroI GCCGGC 6 2 0 1 5 0 0 535 | KspI CCGCGG 6 2 0 4 2 0 0 536 | Ksp22I TGATCA 6 2 0 1 5 0 0 537 | Ksp632I ctcttc 6 2 0 7 10 0 0 538 | KspAI GTTAAC 6 2 1 3 3 0 0 539 | Kzo9I GATC 4 2 0 -1 4 0 0 540 | LguI GCTCTTC 7 2 0 8 11 0 0 541 | LpnI rgcgcy 6 2 1 3 3 0 0 542 | LpnPI CCDG 4 2 0 14 18 0 0 543 | Lsp1109I GCAGC 5 2 0 13 17 0 0 544 | LweI GCATC 5 2 0 10 14 0 0 545 | MabI ACCWGGT 7 2 0 1 6 0 0 546 | MaeI CTAG 4 2 0 1 3 0 0 547 | MaeII ACGT 4 2 0 1 3 0 0 548 | MaeIII GTNAC 5 2 0 -1 5 0 0 549 | MalI GATC 4 2 1 2 2 0 0 550 | MaqI crttgac 7 2 0 28 26 0 0 551 | MauBI CGCGCGCG 8 2 0 2 6 0 0 552 | MbiI CCGCTC 6 2 1 3 3 0 0 553 | MboI GATC 4 2 0 -1 4 0 0 554 | MboII GAAGA 5 2 0 13 12 0 0 555 | McaTI gcgcgc 6 2 0 4 2 0 0 556 | McrI cgrycg 6 2 0 4 2 0 0 557 | MfeI CAATTG 6 2 0 1 5 0 0 558 | MflI RGATCY 6 2 0 1 5 0 0 559 | MhlI GDGCHC 6 2 0 5 1 0 0 560 | MjaIV gtnnac 6 0 0 0 0 0 0 561 | MkaDII gagaygt 7 0 0 0 0 0 0 562 | MlsI TGGCCA 6 2 1 3 3 0 0 563 | MluI ACGCGT 6 2 0 1 5 0 0 564 | MluCI AATT 4 2 0 -1 4 0 0 565 | MluNI TGGCCA 6 2 1 3 3 0 0 566 | MlyI GAGTC 5 2 1 10 10 0 0 567 | Mly113I GGCGCC 6 2 0 2 4 0 0 568 | MmeI TCCRAC 6 2 0 26 24 0 0 569 | MnlI CCTC 4 2 0 11 10 0 0 570 | Mox20I TGGCCA 6 2 1 3 
3 0 0 571 | Mph1103I ATGCAT 6 2 0 5 1 0 0 572 | MreI CGCCGGCG 8 2 0 2 6 0 0 573 | MroI TCCGGA 6 2 0 1 5 0 0 574 | MroNI GCCGGC 6 2 0 1 5 0 0 575 | MroXI GAANNNNTTC 10 2 1 5 5 0 0 576 | MscI TGGCCA 6 2 1 3 3 0 0 577 | MseI TTAA 4 2 0 1 3 0 0 578 | MslI CAYNNNNRTG 10 2 1 5 5 0 0 579 | MspI CCGG 4 2 0 1 3 0 0 580 | Msp20I TGGCCA 6 2 1 3 3 0 0 581 | MspA1I CMGCKG 6 2 1 3 3 0 0 582 | MspCI CTTAAG 6 2 0 1 5 0 0 583 | MspJI CNNR 4 2 0 13 17 0 0 584 | MspR9I CCNGG 5 2 0 2 3 0 0 585 | MssI GTTTAAAC 8 2 1 4 4 0 0 586 | MstI tgcgca 6 2 1 3 3 0 0 587 | MunI CAATTG 6 2 0 1 5 0 0 588 | MvaI CCWGG 5 2 0 2 3 0 0 589 | Mva1269I GAATGC 6 2 0 7 5 0 0 590 | MvnI CGCG 4 2 1 2 2 0 0 591 | MvrI CGATCG 6 2 0 4 2 0 0 592 | MwoI GCNNNNNNNGC 11 2 0 7 4 0 0 593 | NaeI GCCGGC 6 2 1 3 3 0 0 594 | NarI GGCGCC 6 2 0 2 4 0 0 595 | NciI CCSGG 5 2 0 2 3 0 0 596 | NcoI CCATGG 6 2 0 1 5 0 0 597 | NdeI CATATG 6 2 0 2 4 0 0 598 | NdeII GATC 4 2 0 -1 4 0 0 599 | NgoAVIII gacnnnnntga 11 4 0 -13 -15 24 22 600 | NgoMIV GCCGGC 6 2 0 1 5 0 0 601 | NhaXI caagrag 7 0 0 0 0 0 0 602 | NheI GCTAGC 6 2 0 1 5 0 0 603 | NlaIII CATG 4 2 0 4 -1 0 0 604 | NlaIV GGNNCC 6 2 1 3 3 0 0 605 | NlaCI catcac 6 2 0 25 23 0 0 606 | Nli3877I cycgrg 6 2 0 5 1 0 0 607 | NmeAIII GCCGAG 6 2 0 27 25 0 0 608 | NmeDI rccggy 6 4 0 -13 -8 13 18 609 | NmuCI GTSAC 5 2 0 -1 5 0 0 610 | NotI GCGGCCGC 8 2 0 2 6 0 0 611 | NruI TCGCGA 6 2 1 3 3 0 0 612 | NsbI TGCGCA 6 2 1 3 3 0 0 613 | NsiI ATGCAT 6 2 0 5 1 0 0 614 | NspI RCATGY 6 2 0 5 1 0 0 615 | NspV TTCGAA 6 2 0 2 4 0 0 616 | NspBII cmgckg 6 2 1 3 3 0 0 617 | OliI CACNNNNGTG 10 2 1 5 5 0 0 618 | PabI gtac 4 2 0 3 1 0 0 619 | PacI TTAATTAA 8 2 0 5 3 0 0 620 | PaeI GCATGC 6 2 0 5 1 0 0 621 | PaeR7I CTCGAG 6 2 0 1 5 0 0 622 | PagI TCATGA 6 2 0 1 5 0 0 623 | PalAI GGCGCGCC 8 2 0 2 6 0 0 624 | PasI CCCWGGG 7 2 0 2 5 0 0 625 | PauI GCGCGC 6 2 0 1 5 0 0 626 | PceI AGGCCT 6 2 1 3 3 0 0 627 | PciI ACATGT 6 2 0 1 5 0 0 628 | PciSI GCTCTTC 7 2 0 8 11 0 0 629 | PcsI WCGNNNNNNNCGW 13 2 0 7 6 0 0 630 | 
PctI GAATGC 6 2 0 7 5 0 0 631 | PdiI GCCGGC 6 2 1 3 3 0 0 632 | Pdi8503III ccggnag 7 0 0 0 0 0 0 633 | PdmI GAANNNNTTC 10 2 1 5 5 0 0 634 | PenI gcagt 5 0 0 0 0 0 0 635 | PfeI GAWTC 5 2 0 1 4 0 0 636 | Pfl23II CGTACG 6 2 0 1 5 0 0 637 | Pfl1108I tcgtag 6 0 0 0 0 0 0 638 | PflFI GACNNNGTC 9 2 0 4 5 0 0 639 | PflMI CCANNNNNTGG 11 2 0 7 4 0 0 640 | PfoI TCCNGGA 7 2 0 1 6 0 0 641 | PinAI ACCGGT 6 2 0 1 5 0 0 642 | PlaDI catcag 6 2 0 27 25 0 0 643 | PleI GAGTC 5 2 0 9 10 0 0 644 | Ple19I CGATCG 6 2 0 4 2 0 0 645 | PluTI GGCGCC 6 2 0 5 1 0 0 646 | PmaCI CACGTG 6 2 1 3 3 0 0 647 | PmeI GTTTAAAC 8 2 1 4 4 0 0 648 | Pme5II gacgag 6 0 0 0 0 0 0 649 | PmeS132I gacgag 6 0 0 0 0 0 0 650 | PmlI CACGTG 6 2 1 3 3 0 0 651 | PpiI gaacnnnnnctc 12 4 0 -8 -13 25 20 652 | PpsI GAGTC 5 2 0 9 10 0 0 653 | Ppu10I atgcat 6 2 0 1 5 0 0 654 | Ppu21I YACGTR 6 2 1 3 3 0 0 655 | PpuMI RGGWCCY 7 2 0 2 5 0 0 656 | PscI ACATGT 6 2 0 1 5 0 0 657 | PshAI GACNNNNGTC 10 2 1 5 5 0 0 658 | PshBI ATTAAT 6 2 0 2 4 0 0 659 | PsiI TTATAA 6 2 1 3 3 0 0 660 | Psp03I ggwcc 5 2 0 4 1 0 0 661 | Psp5II RGGWCCY 7 2 0 2 5 0 0 662 | Psp6I CCWGG 5 2 0 -1 5 0 0 663 | Psp1406I AACGTT 6 2 0 2 4 0 0 664 | Psp124BI GAGCTC 6 2 0 5 1 0 0 665 | PspCI CACGTG 6 2 1 3 3 0 0 666 | PspEI GGTNACC 7 2 0 1 6 0 0 667 | PspGI CCWGG 5 2 0 -1 5 0 0 668 | PspLI CGTACG 6 2 0 1 5 0 0 669 | PspN4I GGNNCC 6 2 1 3 3 0 0 670 | PspOMI GGGCCC 6 2 0 1 5 0 0 671 | PspOMII cgcccar 7 2 0 27 25 0 0 672 | PspPI GGNCC 5 2 0 1 4 0 0 673 | PspPPI RGGWCCY 7 2 0 2 5 0 0 674 | PspPRI ccycag 6 2 0 21 19 0 0 675 | PspXI VCTCGAGB 8 2 0 2 6 0 0 676 | PsrI GAACNNNNNNTAC 13 4 0 -8 -13 25 20 677 | PssI rggnccy 7 2 0 5 2 0 0 678 | PstI CTGCAG 6 2 0 5 1 0 0 679 | PstNI CAGNNNCTG 9 2 0 6 3 0 0 680 | PsuI RGATCY 6 2 0 1 5 0 0 681 | PsyI GACNNNGTC 9 2 0 4 5 0 0 682 | PteI GCGCGC 6 2 0 1 5 0 0 683 | PvuI CGATCG 6 2 0 4 2 0 0 684 | PvuII CAGCTG 6 2 1 3 3 0 0 685 | RceI catcgac 7 2 0 27 25 0 0 686 | RdeGBI ccgcag 6 0 0 0 0 0 0 687 | RdeGBII acccag 6 2 0 26 24 0 0 688 | 
RdeGBIII tgryca 6 4 0 -10 -12 17 15 689 | RflFIII cgccag 6 0 0 0 0 0 0 690 | RgaI GCGATCGC 8 2 0 5 3 0 0 691 | RigI GGCCGGCC 8 2 0 6 2 0 0 692 | RlaI vcw 3 0 0 0 0 0 0 693 | RleAI cccaca 6 2 0 18 15 0 0 694 | RpaI gtyggag 7 2 0 18 16 0 0 695 | RpaBI cccgcag 7 2 0 27 25 0 0 696 | RpaB5I cgrggac 7 2 0 27 25 0 0 697 | RpaTI grtggag 7 0 0 0 0 0 0 698 | RruI TCGCGA 6 2 1 3 3 0 0 699 | RsaI GTAC 4 2 1 2 2 0 0 700 | RsaNI GTAC 4 2 0 1 3 0 0 701 | RseI CAYNNNNRTG 10 2 1 5 5 0 0 702 | RsrII CGGWCCG 7 2 0 2 5 0 0 703 | Rsr2I CGGWCCG 7 2 0 2 5 0 0 704 | SacI GAGCTC 6 2 0 5 1 0 0 705 | SacII CCGCGG 6 2 0 4 2 0 0 706 | SalI GTCGAC 6 2 0 1 5 0 0 707 | SanDI gggwccc 7 2 0 2 5 0 0 708 | SapI GCTCTTC 7 2 0 8 11 0 0 709 | SaqAI TTAA 4 2 0 1 3 0 0 710 | SatI GCNGC 5 2 0 2 3 0 0 711 | SauI cctnagg 7 2 0 2 5 0 0 712 | Sau96I GGNCC 5 2 0 1 4 0 0 713 | Sau3AI GATC 4 2 0 -1 4 0 0 714 | SbfI CCTGCAGG 8 2 0 6 2 0 0 715 | ScaI AGTACT 6 2 1 3 3 0 0 716 | SchI GAGTC 5 2 1 10 10 0 0 717 | SciI ctcgag 6 2 1 3 3 0 0 718 | ScrFI CCNGG 5 2 0 2 3 0 0 719 | SdaI CCTGCAGG 8 2 0 6 2 0 0 720 | SdeAI cagrag 6 2 0 27 25 0 0 721 | SdeOSI gacnnnnrtga 11 4 0 -12 -14 23 21 722 | SduI GDGCHC 6 2 0 5 1 0 0 723 | SecI ccnngg 6 2 0 1 5 0 0 724 | SelI cgcg 4 2 0 -1 4 0 0 725 | Sen1736II gatcag 6 0 0 0 0 0 0 726 | SenTFV gatcag 6 0 0 0 0 0 0 727 | SetI ASST 4 2 0 4 -1 0 0 728 | SexAI ACCWGGT 7 2 0 1 6 0 0 729 | SfaAI GCGATCGC 8 2 0 5 3 0 0 730 | SfaNI GCATC 5 2 0 10 14 0 0 731 | SfcI CTRYAG 6 2 0 1 5 0 0 732 | SfeI ctryag 6 2 0 1 5 0 0 733 | SfiI GGCCNNNNNGGCC 13 2 0 8 5 0 0 734 | SfoI GGCGCC 6 2 1 3 3 0 0 735 | Sfr274I CTCGAG 6 2 0 1 5 0 0 736 | Sfr303I CCGCGG 6 2 0 4 2 0 0 737 | SfuI TTCGAA 6 2 0 2 4 0 0 738 | SgeI CNNG 4 2 0 13 17 0 0 739 | SgfI GCGATCGC 8 2 0 5 3 0 0 740 | SgrAI CRCCGGYG 8 2 0 2 6 0 0 741 | SgrBI CCGCGG 6 2 0 4 2 0 0 742 | SgrDI CGTCGACG 8 2 0 2 6 0 0 743 | SgrTI ccds 4 2 0 14 18 0 0 744 | SgsI GGCGCGCC 8 2 0 2 6 0 0 745 | SimI gggtc 5 2 0 2 5 0 0 746 | SlaI CTCGAG 6 2 0 1 5 0 0 747 | SmaI 
CCCGGG 6 2 1 3 3 0 0 748 | SmiI ATTTAAAT 8 2 1 4 4 0 0 749 | SmiMI CAYNNNNRTG 10 2 1 5 5 0 0 750 | SmlI CTYRAG 6 2 0 1 5 0 0 751 | SmoI CTYRAG 6 2 0 1 5 0 0 752 | SnaI gtatac 6 0 0 0 0 0 0 753 | SnaBI TACGTA 6 2 1 3 3 0 0 754 | Sno506I ggccgag 7 0 0 0 0 0 0 755 | SpeI ACTAGT 6 2 0 1 5 0 0 756 | SphI GCATGC 6 2 0 5 1 0 0 757 | SplI cgtacg 6 2 0 1 5 0 0 758 | SpoDI gcggrag 7 0 0 0 0 0 0 759 | SrfI gcccgggc 8 2 1 4 4 0 0 760 | Sse9I AATT 4 2 0 -1 4 0 0 761 | Sse232I cgccggcg 8 2 0 2 6 0 0 762 | Sse8387I CCTGCAGG 8 2 0 6 2 0 0 763 | Sse8647I aggwcct 7 2 0 2 5 0 0 764 | SseBI AGGCCT 6 2 1 3 3 0 0 765 | SsiI CCGC 4 2 0 1 3 0 0 766 | SspI AATATT 6 2 1 3 3 0 0 767 | SspDI GGCGCC 6 2 0 1 5 0 0 768 | SspD5I ggtga 5 2 1 13 13 0 0 769 | SstI GAGCTC 6 2 0 5 1 0 0 770 | SstE37I cgaagac 7 2 0 27 25 0 0 771 | Sth132I cccg 4 2 0 8 12 0 0 772 | Sth302II ccgg 4 2 1 2 2 0 0 773 | StrI CTCGAG 6 2 0 1 5 0 0 774 | StsI ggatg 5 2 0 15 19 0 0 775 | StuI AGGCCT 6 2 1 3 3 0 0 776 | StyI CCWWGG 6 2 0 1 5 0 0 777 | StyD4I CCNGG 5 2 0 -1 5 0 0 778 | SwaI ATTTAAAT 8 2 1 4 4 0 0 779 | TaaI ACNGT 5 2 0 3 2 0 0 780 | TaiI ACGT 4 2 0 4 -1 0 0 781 | TaqI TCGA 4 2 0 1 3 0 0 782 | TaqII GACCGA 6 2 0 17 15 0 0 783 | TasI AATT 4 2 0 -1 4 0 0 784 | TatI WGTACW 6 2 0 1 5 0 0 785 | TauI GCSGC 5 2 0 4 1 0 0 786 | TdeDII accagg 6 0 0 0 0 0 0 787 | TfiI GAWTC 5 2 0 1 4 0 0 788 | Tru1I TTAA 4 2 0 1 3 0 0 789 | Tru9I TTAA 4 2 0 1 3 0 0 790 | TscAI CASTG 5 2 0 7 -3 0 0 791 | TseI GCWGC 5 2 0 1 4 0 0 792 | TseFI GTSAC 5 2 0 -1 5 0 0 793 | TsoI tarcca 6 2 0 17 15 0 0 794 | Tsp45I GTSAC 5 2 0 -1 5 0 0 795 | Tsp4CI acngt 5 2 0 3 2 0 0 796 | TspDTI ATGAA 5 2 0 16 14 0 0 797 | TspEI aatt 4 2 0 -1 4 0 0 798 | TspGWI ACGGA 5 2 0 16 14 0 0 799 | TspMI CCCGGG 6 2 0 1 5 0 0 800 | TspRI CASTG 5 2 0 7 -3 0 0 801 | TssI gagnnnctc 9 0 0 0 0 0 0 802 | TstI cacnnnnnntcc 12 4 0 -9 -14 24 19 803 | TsuI gcgac 5 0 0 0 0 0 0 804 | Tth111I GACNNNGTC 9 2 0 4 5 0 0 805 | Tth111II caarca 6 2 0 17 15 0 0 806 | UbaF9I tacnnnnnrtgt 12 0 0 0 
0 0 0 807 | UbaF11I tcgta 5 0 0 0 0 0 0 808 | UbaF12I ctacnnngtc 10 0 0 0 0 0 0 809 | UbaF13I gagnnnnnnctgg 13 0 0 0 0 0 0 810 | UbaF14I ccannnnntcg 11 0 0 0 0 0 0 811 | UbaPI cgaacg 6 0 0 0 0 0 0 812 | UcoMSI gagctc 6 4 0 -8 -6 11 13 813 | UnbI ggncc 5 2 0 -1 5 0 0 814 | Van91I CCANNNNNTGG 11 2 0 7 4 0 0 815 | Vha464I CTTAAG 6 2 0 1 5 0 0 816 | VneI GTGCAC 6 2 0 1 5 0 0 817 | VpaK11AI ggwcc 5 2 0 -1 5 0 0 818 | VpaK11BI GGWCC 5 2 0 1 4 0 0 819 | VspI ATTAAT 6 2 0 2 4 0 0 820 | WviI cacrag 6 2 0 27 25 0 0 821 | XagI CCTNNNNNAGG 11 2 0 5 6 0 0 822 | XapI RAATTY 6 2 0 1 5 0 0 823 | XbaI TCTAGA 6 2 0 1 5 0 0 824 | XceI RCATGY 6 2 0 5 1 0 0 825 | XcmI CCANNNNNNNNNTGG 15 2 0 8 7 0 0 826 | XhoI CTCGAG 6 2 0 1 5 0 0 827 | XhoII RGATCY 6 2 0 1 5 0 0 828 | XmaI CCCGGG 6 2 0 1 5 0 0 829 | XmaIII cggccg 6 2 0 1 5 0 0 830 | XmaJI CCTAGG 6 2 0 1 5 0 0 831 | XmiI GTMKAC 6 2 0 2 4 0 0 832 | XmnI GAANNNNTTC 10 2 1 5 5 0 0 833 | XspI CTAG 4 2 0 1 3 0 0 834 | YkrI c 1 2 0 11 10 0 0 835 | ZraI GACGTC 6 2 1 3 3 0 0 836 | ZrmI AGTACT 6 2 1 3 3 0 0 837 | Zsp2I ATGCAT 6 2 0 5 1 0 0 838 | -------------------------------------------------------------------------------- /enzyme/enzs.list: -------------------------------------------------------------------------------- 1 | AatII 2 | ApaI 3 | AscI 4 | BamHI 5 | BbsI 6 | BglII 7 | CpoI 8 | EcoRI 9 | EndoIV 10 | EndoV 11 | EndoVIII 12 | FspI 13 | HindIII 14 | KpnI 15 | NarI 16 | NcoI 17 | NdeI 18 | NheI 19 | NotI 20 | PstI 21 | PvuI 22 | SacI 23 | SalI 24 | Sau3AI(MboI) 25 | ScaI 26 | SnaBI 27 | SpeI 28 | SphI 29 | XbaI 30 | XhoI 31 | -------------------------------------------------------------------------------- /enzyme/restrict_check_digested_sequence_number.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # Copyright 2014 Wei Shen (shenwei356#gmail.com). All rights reserved. 
4 | # Use of this source code is governed by a MIT-license 5 | # that can be found in the LICENSE file. 6 | # https://github.com/shenwei356/bio_scripts/ 7 | 8 | # embossre.enz 9 | # ftp://ftp.neb.com/pub/rebase/emboss_e.\d+ 10 | 11 | use strict; 12 | use File::Basename; 13 | use Getopt::Long; 14 | 15 | use BioUtil::Misc; 16 | use BioUtil::Seq; 17 | use BioUtil::Util; 18 | 19 | my $usage = sprintf " 20 | Usage: %s [options] 21 | 22 | Options: 23 | -e FILE Enzymefile (from Rebase) 24 | -i FILE Fasta file 25 | -l FILE Enzyme list file 26 | -t INT Threshold [%d] 27 | 28 | Example: 29 | 30 | %s -e embossre.enz -i test.fasta -t 10 31 | 32 | See more: https://github.com/shenwei356/bio_scripts 33 | ", basename($0), 1 << 30, basename($0); 34 | 35 | my $help = 0; 36 | my $enzymefile = ""; 37 | my $seqfile = ""; 38 | my $listfile = ""; 39 | my $threshold = 1 << 30; 40 | 41 | GetOptions( 42 | 'help|h' => \$help, 43 | 'e=s' => \$enzymefile, 44 | 'i=s' => \$seqfile, 45 | 'l=s' => \$listfile, 46 | 't=i' => \$threshold, 47 | ) or die $usage; 48 | 49 | die $usage if $help; 50 | die $usage unless $enzymefile ne "" and $seqfile ne ""; 51 | die "threshold should > 0\n" unless $threshold > 0; 52 | 53 | # =============================================================== 54 | 55 | my $enzs = parse_embossre($enzymefile); 56 | my %subenzs = (); 57 | my %listhash = (); 58 | 59 | if ( $listfile ne "" ) { 60 | my $list = get_column_data($listfile, 1); 61 | %listhash = map { $_ => 0 } @$list; 62 | for my $enz ( keys %$enzs ) { 63 | if ( exists $listhash{$enz} ) { 64 | $subenzs{$enz} = $$enzs{$enz}; 65 | } 66 | } 67 | } 68 | else { 69 | %subenzs = %$enzs; 70 | } 71 | 72 | %listhash = (); 73 | %listhash = map { $_ => 0 } keys %subenzs; 74 | 75 | # show process 76 | local $| = 1; 77 | my $n = 0; 78 | 79 | my $next_seq = FastaReader($seqfile); 80 | while ( my $fa = &$next_seq() ) { 81 | my ( $header, $seq ) = @$fa; 82 | $seq = uc $seq; 83 | my $revcom = revcom($seq); 84 | 85 | for my $enz ( keys 
%subenzs ) { 86 | my $e = $subenzs{$enz}; 87 | my $pattern = $$e{pattern_regexp}; 88 | 89 | # check enzyme digest site 90 | if ( $seq =~ /$pattern/ or $revcom =~ /$pattern/ ) { 91 | $listhash{$enz}++; 92 | if ( $listhash{$enz} >= $threshold ) { 93 | delete $subenzs{$enz}; 94 | delete $listhash{$enz}; 95 | } 96 | } 97 | } 98 | 99 | # show process 100 | $n++; 101 | print STDERR "\rcheck seq $n"; 102 | } 103 | $| = 0; 104 | 105 | print STDERR "\n"; 106 | for ( sort { $listhash{$b} <=> $listhash{$a} } keys %listhash ) { 107 | my $e = $subenzs{$_}; 108 | my $pattern = $$e{pattern}; 109 | printf "%s\t%s\t%s\n", $_, $pattern, $listhash{$_}; 110 | } 111 | -------------------------------------------------------------------------------- /enzyme/restrict_choose_enzyme_for_identify_genomes.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Fuction : Run EMBOSS restrict, and parse restriction fragments 3 | # Author : Wei Shen 4 | # Email : shenwei356@gmail.com, http://shenwei.me 5 | # Date : 2014-12-18 6 | # Last Update : 2015-01-31 7 | 8 | use strict; 9 | use File::Basename; 10 | use Getopt::Long; 11 | use Parallel::Runner; 12 | 13 | use BioUtil::Misc; 14 | use BioUtil::Seq; 15 | use BioUtil::Util; 16 | 17 | local $| = 1; 18 | 19 | $0 = basename($0); 20 | my $usage = < \$help, 49 | 'e=s' => \$enzymefile, 50 | 'i=s' => \$seqfile, 51 | 'l=s' => \$listfile, 52 | 't=i' => \$threads, 53 | 'linear' => \$linear_genome, 54 | ) or die $usage; 55 | 56 | die $usage if $help; 57 | die $usage unless $enzymefile ne "" and $seqfile ne ""; 58 | 59 | #=====================[ run restrict ]==================== 60 | 61 | my $enzs = parse_embossre($enzymefile); 62 | my %subenzs = (); 63 | my %listhash = (); 64 | 65 | my $dir = ''; 66 | if ( $listfile ne "" ) { 67 | my $list = get_column_data( $listfile, 1 ); 68 | %listhash = map { $_ => 0 } @$list; 69 | for my $enz ( keys %$enzs ) { 70 | if ( exists $listhash{$enz} ) { 71 | 
$subenzs{$enz} = $$enzs{$enz}; 72 | } 73 | } 74 | $dir = "re.$seqfile.digestedby.$listfile"; 75 | } 76 | else { 77 | %subenzs = %$enzs; 78 | $dir = "re.$seqfile.digestedby.$enzymefile"; 79 | } 80 | 81 | %listhash = (); 82 | %listhash = map { $_ => 0 } keys %subenzs; 83 | 84 | # unless ( -e $dir and -d $dir ) { 85 | rm_and_mkdir($dir); 86 | my $runner = Parallel::Runner->new($threads); 87 | for my $enz ( keys %subenzs ) { 88 | $runner->run( 89 | sub { 90 | run_emboss_restrict( $dir, $enz ); 91 | } 92 | ); 93 | } 94 | $runner->finish; 95 | 96 | # } 97 | 98 | sub run_emboss_restrict { 99 | my ( $dir, $enzyme ) = @_; 100 | my $resultfile = "$dir/$seqfile.$enzyme.re"; 101 | return if -e $resultfile; 102 | print STDERR "$enzyme\n"; 103 | my $cmd = "restrict -auto -solofragment -limit " 104 | . "-sequence $seqfile -outfile $resultfile -enzymes $enzyme "; 105 | $cmd .= " -plasmid " unless $linear_genome; 106 | 107 | my $fail = run($cmd); 108 | die "failed to run:$cmd\n" if $fail; 109 | } 110 | 111 | # ===========[ Parsing restriction fragments ]============= 112 | 113 | my @files = glob "$dir/*.re"; 114 | 115 | my $fragments = {}; 116 | my $stats = {}; 117 | for my $file (@files) { 118 | my ( $enzyme, $seq ) = (undef) x 2; 119 | 120 | open my $fh, $file 121 | or die "fail to read enzyme file $file\n"; 122 | while (<$fh>) { 123 | if (/^#\s+\-enzymes (.+)/) { # enzyme name 124 | $enzyme = $1; 125 | } 126 | elsif (/^# Sequence: (.+)\s+from/) { # sequence name 127 | $seq = $1; 128 | } 129 | elsif (/^# \t([\d\t]+)$/) { # fragment size 130 | if ( ref $$fragments{$enzyme}{$seq} ne ref [] ) { 131 | $$fragments{$enzyme}{$seq} = []; 132 | } 133 | push @{ $$fragments{$enzyme}{$seq} }, split( /\t/, $1 ); 134 | } 135 | } 136 | close $fh; 137 | 138 | my $n = 0; 139 | for my $seq ( keys %{ $$fragments{$enzyme} } ) { 140 | my @frags = sort { $b <=> $a } @{ $$fragments{$enzyme}{$seq} }; 141 | 142 | # print "$enzyme\n $seq\n @frags\n"; 143 | $$fragments{$enzyme}{$seq} = \@frags; 144 | $n += 
scalar @frags; 145 | } 146 | $$stats{$enzyme}{nfrags} = $n; 147 | } 148 | 149 | # ===========[ Output restriction fragments ]============= 150 | 151 | my $outfile = "$seqfile.digestedby.$listfile.frag"; 152 | 153 | open OUT, ">", $outfile 154 | or die "fail to write file $outfile\n"; 155 | 156 | my $frag = {}; 157 | for my $enzyme ( 158 | sort { $$stats{$a}{nfrags} <=> $$stats{$b}{nfrags} } 159 | keys %$fragments 160 | ) 161 | { 162 | 163 | print OUT "-" x 79, "\n", "$enzyme\n"; 164 | for my $seq ( sort keys %{ $$fragments{$enzyme} } ) { 165 | my @frags = @{ $$fragments{$enzyme}{$seq} }; 166 | print OUT "$seq: @frags\n"; 167 | } 168 | } 169 | 170 | close OUT; 171 | -------------------------------------------------------------------------------- /enzyme/restrict_with_T_tail.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # Copyright 2014 Wei Shen (shenwei356#gmail.com). All rights reserved. 4 | # Use of this source code is governed by a MIT-license 5 | # that can be found in the LICENSE file. 6 | # https://github.com/shenwei356/bio_scripts/ 7 | 8 | use strict; 9 | use BioUtil::Misc; 10 | 11 | die "usage: $0 embossre.enz\n" 12 | unless @ARGV == 1; 13 | 14 | my $file = shift @ARGV; 15 | my $d = shift @ARGV; 16 | 17 | my $enzs = parse_embossre($file); 18 | 19 | for my $enz (sort keys %$enzs) { 20 | my $e = $$enzs{$enz}; 21 | next unless $$e{cuts_number} == 2 22 | and $$e{c1} - $$e{c2} == 1 23 | and substr ($$e{pattern}, $$e{c1} - 1, 1) =~ /[aN]/i; 24 | print "$enz\n"; 25 | } 26 | 27 | # there's no enzyme meeting this condition 28 | -------------------------------------------------------------------------------- /enzyme/restrict_with_far_away_digest_site.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # Copyright 2014 Wei Shen (shenwei356#gmail.com). All rights reserved. 
4 | # Use of this source code is governed by a MIT-license 5 | # that can be found in the LICENSE file. 6 | # https://github.com/shenwei356/bio_scripts/ 7 | 8 | use strict; 9 | use BioUtil::Misc; 10 | 11 | die "usage: $0 embossre.enz d\n" 12 | unless @ARGV == 2; 13 | 14 | my $file = shift @ARGV; 15 | my $d = shift @ARGV; 16 | 17 | my $enzs = parse_embossre($file); 18 | 19 | for my $enz (sort keys %$enzs) { 20 | my $e = $$enzs{$enz}; 21 | next unless $$e{cuts_number} == 2 22 | and $$e{c1} - $$e{length} >= $d; 23 | print "$enz\n"; 24 | } 25 | -------------------------------------------------------------------------------- /enzyme/restrict_without_digest_site_in_sequences.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # Copyright 2014 Wei Shen (shenwei356#gmail.com). All rights reserved. 4 | # Use of this source code is governed by a MIT-license 5 | # that can be found in the LICENSE file. 6 | # https://github.com/shenwei356/bio_scripts/ 7 | 8 | # embossre.enz 9 | # ftp://ftp.neb.com/pub/rebase/ 10 | 11 | use strict; 12 | use File::Basename; 13 | use BioUtil::Misc; 14 | use BioUtil::Seq; 15 | use BioUtil::Util; 16 | 17 | my $usage = sprintf " 18 | Usage: %s [enzyme list file] 19 | 20 | ", basename($0); 21 | die $usage unless @ARGV == 3 or @ARGV == 2; 22 | 23 | my $enzymefile = shift @ARGV; 24 | my $seqfile = shift @ARGV; 25 | 26 | my $enzs = parse_embossre($enzymefile); 27 | my %subenzs = (); 28 | 29 | my $listfile = shift @ARGV; 30 | if ( defined $listfile ) { 31 | my $list = get_list_from_file($listfile); 32 | my %listhash = map { $_ => 0 } @$list; 33 | for my $enz ( keys %$enzs ) { 34 | if ( exists $listhash{$enz} ) { 35 | $subenzs{$enz} = $$enzs{$enz}; 36 | } 37 | } 38 | } 39 | else { 40 | %subenzs = %$enzs; 41 | } 42 | 43 | # show process 44 | local $| = 1; 45 | my $n = 0; 46 | my $sum = scalar keys %subenzs; 47 | my $left = $sum; 48 | 49 | my $next_seq = FastaReader($seqfile); 50 | while ( 
my $fa = &$next_seq() ) { 51 | my ( $header, $seq ) = @$fa; 52 | $seq = uc $seq; 53 | my $revcom = revcom($seq); 54 | 55 | for my $enz ( keys %subenzs ) { 56 | my $e = $subenzs{$enz}; 57 | my $pattern = $$e{pattern_regexp}; 58 | # check enzyme digest site 59 | if ( $seq =~ /$pattern/ or $revcom =~ /$pattern/ ) { 60 | delete $subenzs{$enz}; 61 | } 62 | } 63 | 64 | # show process 65 | $n++; 66 | $left = scalar keys %subenzs; 67 | print STDERR "\rcheck seq $n, candidate: $left / $sum"; 68 | } 69 | $| = 0; 70 | 71 | print STDERR "\n"; 72 | print "$_\n" for sort keys %subenzs; 73 | -------------------------------------------------------------------------------- /file_formats/add_annotations_to_myva.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # Fuction : To add annotations to COG aa file 3 | # Author : Wei Shen 4 | # Email : shenwei356@gmail.com 5 | # Date : 2011-04-08, cost 2 hour. 6 | # Last Update : 2011-04-08 7 | 8 | # Annotations are in following files downloaded from 9 | # ftp://ftp.ncbi.nlm.nih.gov/pub/COG/COG 10 | # 11 | # FILE DATA(* means important) 12 | # *fun.txt function_id(one letter) -> function 13 | # *myva protein_id*, aa sequence* 14 | # *myva=gb protein_id -> GI numbers* 15 | # *whog Organism(three letters)*, protein_id, 16 | # function_id, cog_id, protein* 17 | # *org.txt Organism(three letters) -> detail 18 | # 19 | # Output format 20 | # > ____ 21 | # aa sequence 22 | # 23 | # Therefore, five files with sign * will be used 24 | # 25 | # Attention: 26 | # 1. NOT all protein_ids from file myva=gb could be found in file whog 27 | # 2. 
NOT all protein_ids from file myva could be found in file whog 28 | 29 | use strict; 30 | 31 | # parse file fun 32 | my $fun = &parse_file_fun('fun.txt'); 33 | 34 | 35 | # parse file myva=gb 36 | my $pro_gi = &parse_file_myva_gb('myva=gb'); 37 | 38 | 39 | # parse file org 40 | my $org = &parse_file_org('org.txt'); 41 | 42 | 43 | # parse file whog 44 | my $whog = &parse_file_whog('whog'); 45 | 46 | # my @keys = keys %$whog; 47 | # print "$_\n" unless $_ ~~ @keys for keys %$pro_gi; 48 | # the result showed that not all protein_ids from file myva=gb 49 | # are in file whog 50 | 51 | # parse file myva and add annotation 52 | my $file = 'myva'; 53 | my $out_file = "$file.full_annotation.txt"; 54 | my ($head, $seq, $pro_id_trim); 55 | 56 | open IN, $file or die "File $file failed to open sequence.\n"; 57 | $/ = '>';; 58 | open OUT, ">", $out_file or die "File $out_file failed to open sequence.\n"; 59 | 60 | while ( ) { 61 | s/\r?\n>//; 62 | ( $head, $seq ) = split /\r?\n/, $_, 2; 63 | ## > ____ 64 | $pro_id_trim = $head; 65 | $pro_id_trim = $1 if $head =~ /(.+)\_\d+/; # for gi 66 | $head = $head 67 | . " __pro__". $$whog{$head}{protein} 68 | . "__fun_". $$whog{$head}{fun_id}. "_". $$fun{$$whog{$head}{fun_id}} 69 | . "__org__". $$org{$$whog{$head}{org_id}}{organism} 70 | . "__gi__". 
$$pro_gi{$pro_id_trim}; 71 | print OUT ">$head\n$seq\n"; 72 | } 73 | close IN; 74 | close OUT; 75 | 76 | #==================================================================== 77 | # out put data structure: 78 | # $hash_ref = {fun_id => function} 79 | sub parse_file_fun($){ 80 | my ($file) = @_; 81 | my $fun = {}; 82 | 83 | open IN, $file or die "File $file failed to open\n"; 84 | while () { 85 | next unless /\[(\w)\] (.+) $/; 86 | $$fun{$1} = $2; 87 | } 88 | close IN; 89 | # print scalar keys %$fun; 90 | return $fun; 91 | } 92 | 93 | # out put data structure: 94 | # $hash_ref = {protein_id => gi} 95 | sub parse_file_myva_gb($){ 96 | my ($file) = @_; 97 | my $pro_gi = {}; 98 | 99 | open IN, $file or die "File $file failed to open\n"; 100 | while () { 101 | next unless /^(.+)\s+(.+)$/; 102 | $$pro_gi{$1} = $2; 103 | } 104 | close IN; 105 | # print scalar keys %$pro_gi; 106 | return $pro_gi; 107 | } 108 | 109 | # out put data structure: 110 | # $hash_ref = {org_id => {kindom => kindom, organism => organism} } 111 | sub parse_file_org($){ 112 | my ($file) = @_; 113 | my $org = {}; 114 | 115 | open IN, $file or die "File $file failed to open\n"; 116 | while () { 117 | next unless /^(\w{3})\s+\d+\s+(.+?)\s+(.+)$/; 118 | $$org{$1}{kindom} = $2; 119 | $$org{$1}{organism} = $3; 120 | } 121 | close IN; 122 | # print scalar keys %$org; 123 | return $org; 124 | } 125 | 126 | # out put data structure: 127 | # $hash_ref = {protein_id => {org_id => org_id, cog_id => cog_id, 128 | # fun_id => fun_id, protein => protein} } 129 | sub parse_file_whog($){ 130 | my ($file) = @_; 131 | my $whog = {}; 132 | 133 | my ($fun_id, $cog_id, $protein, $org_id, $protein_id, @protein_ids); 134 | open IN, $file or die "File $file failed to open\n"; 135 | while () { 136 | if (/^\[(\w)\] (\w+) (.+)$/) { 137 | $fun_id = $1; 138 | $cog_id = $2; 139 | $protein = $3; 140 | } 141 | elsif (/^\s+(\w{3})\:\s+(.+)$/) { 142 | $org_id = $1; 143 | @protein_ids = split /\s+/, $2; 144 | for $protein_id 
(@protein_ids) { 145 | $$whog{$protein_id}{fun_id} = $fun_id; 146 | $$whog{$protein_id}{cog_id} = $cog_id; 147 | $$whog{$protein_id}{protein} = $protein; 148 | $$whog{$protein_id}{org_id} = $org_id; 149 | } 150 | } 151 | elsif (/ (.+)/) { 152 | @protein_ids = split /\s+/, $1; 153 | for $protein_id (@protein_ids) { 154 | $$whog{$protein_id}{fun_id} = $fun_id; 155 | $$whog{$protein_id}{cog_id} = $cog_id; 156 | $$whog{$protein_id}{protein} = $protein; 157 | $$whog{$protein_id}{org_id} = $org_id; 158 | } 159 | } 160 | elsif (/_______/) { 161 | } 162 | else { 163 | } 164 | } 165 | close IN; 166 | # print scalar keys %$whog; 167 | # print "$_\t". ($$whog{'PH0109_1'}{$_}). "\n" for keys %{$$whog{'PH0109_1'}}; 168 | return $whog; 169 | } 170 | -------------------------------------------------------------------------------- /file_formats/bam2gff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # https://github.com/shenwei356/bio_scripts 4 | 5 | import argparse 6 | import sys 7 | from collections import Counter, defaultdict 8 | 9 | import pysam 10 | 11 | parser = argparse.ArgumentParser( 12 | description="bam2gff. 
Extracting the locations of properly mapping paired (single) ends to GFF format.", 13 | epilog="https://github.com/shenwei356/bio_scripts") 14 | 15 | parser.add_argument('bamfile', type=str, help='bam file') 16 | parser.add_argument('-c', '--cache-size', type=int, default=1000, help='cache size [1000]') 17 | parser.add_argument('-m', '--match-proportion', type=float, default=0.75, 18 | help='minimum match proportion to define properly paired ends [0.75]') 19 | parser.add_argument('-se', '--single-end', action='store_true', help='single read mapping result') 20 | 21 | parser.add_argument("-v", "--verbose", help='verbosely print information', 22 | action="count", default=0) 23 | 24 | args = parser.parse_args() 25 | 26 | pairs = defaultdict(lambda: defaultdict(dict)) 27 | stats = Counter() 28 | samfile = pysam.AlignmentFile(args.bamfile, "rb") 29 | for read in samfile.fetch(): 30 | if args.single_end: 31 | if not read.reference_length or read.reference_length < read.query_length * args.match_proportion: # full match 32 | stats['bad match'] += 1 33 | continue 34 | ref = samfile.getrname(read.reference_id) 35 | if read.is_reverse: 36 | start, end, strand = read.reference_start, read.reference_end, '-' 37 | else: 38 | start, end, strand = read.reference_start, read.reference_end, '+' 39 | sys.stdout.write('\t'.join( 40 | [ref, 'bam2gff.py', 'single_ends', str(start + 1), str(end), '.', strand, '.', 41 | read.query_name]) + "\n") 42 | continue 43 | 44 | if read.is_proper_pair and not read.is_secondary: 45 | if read.reference_length < read.query_length * args.match_proportion: # full match 46 | stats['bad match'] += 1 47 | continue 48 | key = '_'.join([str(x) for x in sorted([read.reference_start, read.next_reference_start])]) 49 | pairs[read.query_name][key]['read1' if read.is_read1 else 'read2'] = {'start': read.reference_start, 50 | 'end': read.reference_end, 51 | 'ref': samfile.getrname( 52 | read.reference_id), 53 | 'reverse': read.is_reverse} 54 | 55 | if 'read1' in 
pairs[read.query_name][key] and 'read2' in pairs[read.query_name][key]: 56 | read1, read2 = pairs[read.query_name][key]['read1'], pairs[read.query_name][key]['read2'] 57 | 58 | if not read1['reverse']: 59 | strand, start, end = '+', read1['start'], read2['end'] 60 | else: 61 | strand, start, end = '-', read2['start'], read1['end'] 62 | 63 | sys.stdout.write('\t'.join( 64 | [read1['ref'], 'bam2gff.py', 'paired_ends', str(start + 1), str(end), '.', strand, '.', 65 | read.query_name]) + "\n") 66 | 67 | stats['paired'] += 1 68 | 69 | del pairs[read.query_name][key] 70 | 71 | samfile.close() 72 | 73 | for query, sites in pairs.items(): 74 | if len(sites) == 0: 75 | continue 76 | stats['unpaired'] += 1 77 | 78 | sys.stderr.write('{} summary: {}\n'.format(args.bamfile, stats)) 79 | -------------------------------------------------------------------------------- /file_formats/extract_cds_from_glimmer_predict_result.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # https://github.com/shenwei356/bio_scripts 3 | 4 | use strict; 5 | use File::Basename; 6 | use BioUtil::Seq; 7 | use BioUtil::Util; 8 | 9 | $0 = basename($0); 10 | my $usage = qq( 11 | usage: $0 [gff] 12 | 13 | ); 14 | die $usage unless @ARGV == 2 or @ARGV == 3; 15 | my $prfile = shift @ARGV; 16 | my $seqfile = shift @ARGV; 17 | my $gtf = shift @ARGV; 18 | 19 | my $genome = ( values %{ read_sequence_from_fasta_file($seqfile) } )[0]; 20 | 21 | my @data = (); 22 | my ( $genome, $name, $a, $b, $frame, $seq ); 23 | open my $fh, '<', $prfile or die "fail to open file: $prfile\n"; 24 | while (<$fh>) { 25 | s/\r?\n//g; 26 | $genome = $1 if /^>(.+)/; 27 | @data = split /\s+/, $_; 28 | next unless scalar(@data) == 5; 29 | ( $name, $a, $b, $frame ) = @data; 30 | next unless $a =~ /^\d+$/; 31 | if ($a > $b) { 32 | my $tmp = $a; 33 | $a = $b; 34 | $b = $tmp; 35 | } 36 | 37 | if ( $gtf eq 'gff' ) { 38 | my $strand = $frame > 0 ? 
def parse_args(argv=None):
    """Parse and validate the command-line options of the GenBank extractor.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, which is how the script calls it.

    Returns:
        argparse.Namespace with ``gbkfile``, ``type``, ``outfmt``, ``peptide``
        and ``table``. ``peptide`` is forced to True when ``--table`` is given.

    Exits with status 1, after a message on stderr, when ``--outfmt`` is not
    one of fasta, gtf or gff.
    """
    parser = argparse.ArgumentParser(
        description="Extract features from Genbank file",
        epilog="https://github.com/shenwei356/bio_scripts")

    parser.add_argument('gbkfile', type=str, help='Genbank file')
    parser.add_argument(
        '-t',
        '--type',
        type=str,
        default='CDS',
        help='Feature type (CDS tRNA). Multiple values should be separated by comma. "." for any types.')
    outfmt_choices = ['fasta', 'gtf', 'gff']
    parser.add_argument('-f',
                        '--outfmt',
                        type=str,
                        default='fasta',
                        help='Out format: fasta, gtf or gff [fasta]')

    parser.add_argument('-p',
                        '--peptide',
                        action="store_true",
                        help='Translate the nucleotides to peptides')
    parser.add_argument(
        '--table',
        type=int,
        help='Genetic code table (detail: http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi ) [1]')

    args = parser.parse_args(argv)

    # Validated by hand (not via argparse ``choices``) to keep exit status 1.
    if args.outfmt not in outfmt_choices:
        sys.stderr.write('[ERROR] -f | --outfmt should be in {}\n'.format(outfmt_choices))
        sys.exit(1)

    # Asking for a specific genetic code only makes sense when translating.
    if args.table:
        args.peptide = True

    return args


if __name__ == '__main__':
    args = parse_args()

    types = set(args.type.lower().split(','))
    # gzip.open defaults to binary mode; the GenBank parser needs text lines.
    with gzip.open(args.gbkfile, 'rt') if args.gbkfile.endswith('.gz') else open(args.gbkfile) as fh:
        for record in SeqIO.parse(fh, "genbank"):
            for f in record.features:
                if '.' not in types and f.type.lower() not in types:
                    continue

                start, end = f.location.start, f.location.end

                # The location strand is 1, -1, or None/0 for unstranded
                # features; comparing None with ``> 0`` raises on Python 3.
                strand_code = f.location.strand
                if strand_code == 1:
                    strand = '+'
                elif strand_code == -1:
                    strand = '-'
                else:
                    strand = '.'

                qualifiers = f.qualifiers
                product = qualifiers['product'][0] if 'product' in qualifiers else ''

                if 'gene' in qualifiers:
                    gene_id = qualifiers['gene'][0]
                elif 'locus_tag' in qualifiers:
                    gene_id = qualifiers['locus_tag'][0]
                else:
                    gene_id = ''

                if args.outfmt == 'fasta':
                    if args.peptide and 'translation' in qualifiers:
                        # Prefer the translation annotated in the file.
                        seq = Seq(qualifiers['translation'][0])
                    else:
                        # extract() honours strand and joined locations, so a
                        # minus-strand feature is reverse-complemented instead
                        # of being read off the forward strand.
                        seq = f.extract(record.seq)
                        if args.peptide:
                            if args.table:
                                transl_table = args.table
                            elif 'transl_table' in qualifiers:
                                # Qualifier values are lists of strings.
                                transl_table = int(qualifiers['transl_table'][0])
                            else:
                                sys.stderr.write(
                                    '[WARNING] neither translate table given or found in features. set 1\n')
                                transl_table = 1
                            seq = seq.translate(table=transl_table)

                    SeqIO.write(
                        [SeqRecord(seq,
                                   id='{}_{}..{}..{}'.format(record.id, start + 1, end, strand),
                                   description=product)],
                        sys.stdout,
                        "fasta")

                elif args.outfmt == 'gtf':
                    frame = int(qualifiers['codon_start'][0]) - 1 if 'codon_start' in qualifiers else 0

                    transcript_id = gene_id

                    attribute = 'gene_id "{}"; transcript_id "{}"'.format(gene_id, transcript_id)

                    if 'protein_id' in qualifiers:
                        attribute += '; protein_id "{}"'.format(qualifiers['protein_id'][0])

                    if 'db_xref' in qualifiers:
                        for ext in qualifiers['db_xref']:
                            attribute += '; db_xref "{}"'.format(ext)

                    if 'note' in qualifiers:
                        attribute += '; note "{}"'.format(qualifiers['note'][0])

                    attribute += '; product "{}"; '.format(product)

                    sys.stdout.write('\t'.join(
                        [record.id, 'genbank', f.type, str(start + 1), str(end), '.', strand, str(frame),
                         attribute]) + "\n")

                elif args.outfmt == 'gff':
                    if 'codon_start' in qualifiers:
                        frame = int(qualifiers['codon_start'][0]) - 1
                    else:
                        frame = 0
                    sys.stdout.write('\t'.join(
                        [record.id, 'genbank', f.type, str(start + 1),
                         str(end), '.', strand, str(frame),
                         "{},{}".format(gene_id, product)]) + "\n")
# Wrap sequence $seq into rows of $width characters.
# Every complete row is followed by "\n"; the incomplete remainder (possibly
# empty) is appended without a trailing newline.
sub format_seq($$) {
    my ( $seq, $width ) = @_;

    my $full_rows = int( length($seq) / $width );
    my $wrapped   = join '',
        map { substr( $seq, $_ * $width, $width ) . "\n" } 0 .. $full_rows - 1;

    # whatever is left after the last complete row
    $wrapped .= substr( $seq, $full_rows * $width );
    return $wrapped;
}
def read_gff_file(file):
    """Load the features of a GFF/GTF file, grouped by sequence name.

    Args:
        file: Path of a tab-delimited GFF/GTF file. A path ending in ``.gz``
            is read through gzip, matching how the script opens FASTA input.

    Returns:
        defaultdict(list) mapping the first column (sequence name) to a list
        of dicts with keys ``type``, ``start``, ``end`` (ints, as given in the
        file), ``strand`` and ``product`` (the raw ninth column).

    Raises:
        ValueError: if column 4 or 5 of a nine-column line is not an integer.
    """
    genes = defaultdict(list)
    opener = gzip.open if file.endswith('.gz') else open
    with opener(file, 'rt') as fh:
        for row in fh:
            # Directives and comments are never feature lines.
            if row.startswith('#'):
                continue
            data = row.strip().split('\t')
            if len(data) < 9:
                continue
            genes[data[0]].append({
                'type': data[2],
                'start': int(data[3]),
                'end': int(data[4]),
                'strand': data[6],
                'product': data[8],
            })

    return genes
args.down_stream: 131 | id = '{}_{}..{}..{}_{}'.format(name, gene['start'], gene['end'], 132 | gene['strand'], flag) 133 | else: 134 | id = '{}_{}..{}..{}'.format(name, gene['start'], gene['end'], 135 | gene['strand']) 136 | SeqIO.write( 137 | SeqRecord(seq, 138 | id=id, 139 | description=gene['product']), 140 | sys.stdout, 141 | 'fasta') 142 | fh.close() 143 | -------------------------------------------------------------------------------- /file_formats/gff_frame_start_coverage.plot.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | library(dplyr) 3 | library(ggplot2) 4 | library(reshape2) 5 | 6 | args <- commandArgs(TRUE) 7 | if (length(args) != 3) { 8 | write("\nusage: gff_frame_start_coverage.plot.R infile out.png title\n", stderr()) 9 | quit(status = 1) 10 | } 11 | 12 | df <- read.csv(args[1], sep = "\t") 13 | windows <- df['end'] - df['start'] 14 | window <- windows[1,1] + 1 15 | 16 | if (window == 1000) { 17 | ylabel = paste("Counts/", 1, "kb", sep='') 18 | } else if (window > 1000) { 19 | ylabel = paste("Counts/", window/1000, "kb", sep='') 20 | } else { 21 | ylabel = paste("Counts/", window, "bp", sep='') 22 | } 23 | 24 | df <- select(df, X.chr, strand, cnt_f0, cnt_f1, cnt_f2) 25 | 26 | df_m <- melt(df, id.vars = c("X.chr", "strand")) 27 | 28 | p <- ggplot(df_m, aes(variable, value, fill = strand)) + 29 | geom_violin(adjust=1, position = position_dodge(width = 0.75)) + 30 | scale_x_discrete(labels = c('0','1','2')) + 31 | xlab('Frame') + 32 | ylab(ylabel) + 33 | ggtitle(args[3]) + 34 | facet_grid(. 
~ X.chr) + 35 | theme_bw() + 36 | theme( 37 | panel.border = element_blank(), 38 | panel.grid.major = element_blank(), 39 | panel.grid.minor = element_blank(), 40 | axis.line = element_line(colour = "black"), 41 | legend.key = element_blank(), 42 | strip.background = element_rect( 43 | colour = "white", fill = "white", 44 | size = 0.2 45 | ) 46 | ) 47 | 48 | ggsave(p, file = args[2], width = 8, height = 4) 49 | -------------------------------------------------------------------------------- /file_formats/gff_frame_start_coverage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # https://github.com/shenwei356/bio_scripts 4 | # Author : Wei Shen 5 | # Contact : shenwei356@gmail.com 6 | # LastUpdate : 2015-07-17 7 | 8 | from __future__ import division, print_function 9 | 10 | import argparse 11 | import gzip 12 | import os 13 | import pickle 14 | import sys 15 | from collections import Counter, defaultdict 16 | 17 | import numpy as np 18 | 19 | parser = argparse.ArgumentParser(description="gff frame start coverage", 20 | epilog="https://github.com/shenwei356/bio_scripts") 21 | 22 | parser.add_argument('genome_size_file', type=str, help='genome size file. two fields (chr and size) per line. 
') 23 | parser.add_argument('gff_file', type=str, help='gff/gtf file') 24 | parser.add_argument('-w', '--window', type=int, default=1000, help='windows size [1000]') 25 | parser.add_argument('-s', '--step', type=int, default=30, help='step size [30]') 26 | 27 | args = parser.parse_args() 28 | 29 | # read genome size file 30 | sys.stderr.write('read genome size\n') 31 | genomesizes = defaultdict(int) 32 | with gzip.open(args.genome_size_file) if args.genome_size_file.endswith('.gz') else open(args.genome_size_file) as fh: 33 | for line in fh: 34 | if line.isspace() or line[0] == '#': 35 | continue 36 | data = line.rstrip().split() 37 | if len(data) < 2: 38 | sys.stderr.write('number of columns < 2! {}'.format(line)) 39 | continue 40 | chr, size = data[0], data[1] 41 | genomesizes[chr] = int(size) 42 | 43 | # read gff file 44 | sys.stderr.write('read gff file\n') 45 | coverages = defaultdict(dict) 46 | file_cov_pickle = '{}.cov.pickle'.format(args.gff_file) 47 | if not (os.path.exists(file_cov_pickle) and os.path.getsize(file_cov_pickle) > 0): 48 | with gzip.open(args.gff_file) if args.gff_file.endswith('.gz') else open(args.gff_file) as fh: 49 | chr = '' 50 | for line in fh: 51 | if line.isspace() or line[0] == '#': 52 | continue 53 | 54 | data = line.rstrip().split('\t') 55 | if len(data) != 9: 56 | sys.stderr.write('number of columns != 9: {}'.format(line)) 57 | 58 | g, start, end, strand = data[0], int(data[3]), int(data[4]), data[6] 59 | if g != chr: 60 | chr = g 61 | coverages[chr]['+'] = np.zeros(genomesizes[chr] + 1, dtype=np.uint32) 62 | coverages[chr]['-'] = np.zeros(genomesizes[chr] + 1, dtype=np.uint32) 63 | sys.stderr.write('read chr {}\n'.format(chr)) 64 | if strand == '+': 65 | coverages[chr][strand][start] += 1 66 | # print(chr, strand, start) 67 | else: 68 | coverages[chr][strand][end] += 1 69 | with open(file_cov_pickle, 'wb') as fh: 70 | pickle.dump(coverages, fh, pickle.HIGHEST_PROTOCOL) 71 | else: 72 | with open(file_cov_pickle, 'rb') as fh: 73 | 
def mean_coverage(data):
    """Return the mean count of (position, count) pairs, rounded to 2 places.

    An empty sequence yields the integer 0.
    """
    n_sites = len(data)
    if not n_sites > 0:
        return 0
    total = sum(count for _position, count in data)
    return round(total / n_sites, 2)
parser = argparse.ArgumentParser(description="gff intersect",
                                 epilog="https://github.com/shenwei356/bio_scripts")

parser.add_argument('query', type=str, help='query gff file')
parser.add_argument('subject', type=str, help='subject gff file')
parser.add_argument('-e', '--embeded', action='store_true',
                    help='see what genes (query) contained in specific regions (subject)')
parser.add_argument('-c', '--cover', action='store_true',
                    help='see what genes (query) containing specific regions (subject)')
parser.add_argument('-s', '--split', action='store_true',
                    help='split results into multiple files')
parser.add_argument('-o', '--split-dir', type=str,
                    help='directory for split results')
parser.add_argument('-eu', '--extend-upstream', type=int, default=0,
                    help='extend N bases in the upstream [0]')
parser.add_argument('-ed', '--extend-downstream', type=int, default=0,
                    help='extend N bases in the downstream [0]')

args = parser.parse_args()

if args.extend_upstream and args.extend_upstream <= 0:
    sys.stderr.write('value of option --extend-upstream should be greater than 0\n')
    sys.exit(1)

if args.extend_downstream and args.extend_downstream <= 0:
    sys.stderr.write('value of option --extend-downstream should be greater than 0\n')
    sys.exit(1)

if args.cover and args.embeded:
    sys.stderr.write('only one of option -e/--embeded and -c/--cover allowed\n')
    sys.exit(1)

# ---- index the subject features: one interval tree per sequence name ----
sys.stderr.write('building tree from {}\n'.format(args.subject))
trees = dict()
# 'rt': gzip.open defaults to binary, which breaks the str comparisons below.
with gzip.open(args.subject, 'rt') if args.subject.endswith('.gz') else open(args.subject) as fh:
    for line in fh:
        if line.isspace() or line[0] == '#':
            continue

        data = line.rstrip().split('\t')
        if len(data) != 9:
            sys.stderr.write('number of columns != 9: {}'.format(line))
            if len(data) < 7:
                # columns 1, 4, 5 and 7 are needed; skip instead of crashing
                continue

        genome, start, end, strand = data[0], int(data[3]), int(data[4]), data[6]
        # Create each tree once, so intervals survive when the lines of one
        # sequence are not contiguous in the file.
        if genome not in trees:
            trees[genome] = Intersecter()

        if strand == '+':
            start -= args.extend_upstream
            end += args.extend_downstream
        else:
            start -= args.extend_downstream
            end += args.extend_upstream

        if not args.embeded and strand == '-':  # complement strand
            # Minus-strand features are stored with negated coordinates and
            # decoded again when results are reported.
            # NOTE(review): queries below always use positive coordinates, so
            # these intervals look unreachable outside --embeded; confirm intent.
            start, end = -end, -start
        trees[genome].add_interval(Interval(start, end, value=data))

if args.split:
    if args.split_dir is None:
        outdir = '{}.intersect@{}'.format(os.path.normpath(os.path.basename(args.query)),
                                          os.path.normpath(os.path.basename(args.subject)))
    else:
        outdir = args.split_dir

    # The output directory is recreated from scratch on every run.
    if os.path.exists(outdir):
        shutil.rmtree(outdir)
    os.makedirs(outdir)

# ---- look up every query feature in the tree of its sequence ----
sys.stderr.write('querying\n')
with gzip.open(args.query, 'rt') if args.query.endswith('.gz') else open(args.query) as fh:
    for line in fh:
        if line.isspace() or line[0] == '#':
            continue
        data = line.rstrip().split('\t')
        if len(data) != 9:
            sys.stderr.write('number of columns != 9: {}'.format(line))
            if len(data) < 9:
                # the attribute column (9) is required below
                continue

        genome, start, end, strand, product = data[0], int(data[3]), int(data[4]), data[6], data[8]

        if genome not in trees:
            continue

        overlaps = trees[genome].find(start, end)
        if len(overlaps) == 0:
            continue

        overlap_data, stats = list(), Counter()
        for x in overlaps:
            s, e = x.start, x.end
            if args.embeded:
                strand2 = '.'
            elif s > 0:
                strand2 = '+'
            else:  # complement strand: undo the negation applied at indexing
                s, e = -x.end, -x.start
                strand2 = '-'

            overlap, t = 0, ''
            if s <= start:
                if e >= end:
                    # subject [s, e] spans the whole query [start, end]
                    overlap = end - start + 1
                    t = 'embed'
                    if args.cover:
                        continue
                else:
                    # subject begins at/before the query start and ends inside it
                    if args.embeded or args.cover:
                        continue
                    overlap = e - start + 1
                    t = 'overlap.downstream' if strand == '+' else 'overlap.upstream'
            else:
                if e >= end:
                    # subject begins inside the query and ends at/after its end
                    if args.embeded or args.cover:
                        continue
                    overlap = end - s + 1
                    t = 'overlap.upstream' if strand == '+' else 'overlap.downstream'
                else:
                    # subject lies strictly inside the query
                    if args.embeded:
                        continue
                    overlap = e - s + 1
                    t = 'cover'

            if args.embeded or args.cover:
                frame = '.'
            elif strand == '+':
                frame = abs(s - start) % 3
            else:
                frame = abs(e - end) % 3

            stats[t] += 1
            if args.embeded or args.cover:
                overlap_data.append(x.value)
            else:
                overlap_data.append([str(i) for i in
                                     [data[0], s, e, strand2, overlap,
                                      round(100 * overlap / (end - start + 1), 1), t, frame,
                                      x.value[-1]]])
        if len(overlap_data) == 0:
            continue

        if args.split:
            fh_out = open(os.path.join(outdir, '{}_{}..{}..{}_{}.gff'.format(
                genome, start, end, strand, product.replace('/', '_').replace('"', ''))), 'wt')
            fh_out.write('# {}'.format(line))
        else:
            fh_out = sys.stdout
            fh_out.write('>{}'.format(line))

        if args.embeded or args.cover:
            sorted_overlap_data = sorted(overlap_data, key=lambda o: (o[0], o[1]))
        else:
            fh_out.write('# summary: {}\n'.format(stats))
            fh_out.write(
                '\t'.join(['chr', 'start', 'end', 'strand', 'overlap', 'overlap%', 'type', 'frame',
                           'attribute']) + '\n')
            sorted_overlap_data = sorted(overlap_data, key=lambda o: (o[6], o[7], -float(o[5])))

        for overlap in sorted_overlap_data:
            fh_out.write('\t'.join(overlap) + '\n')

        if args.split:
            fh_out.close()
# Print every record of one group (an array reference), each prefixed with a
# space, then a newline that closes the group.
sub dosomthing {
    my $group = shift;
    print " $_" foreach @{$group};
    print "\n";
}
# Read a GFF file and group its features by sequence name.
#
# Argument: path of a tab-delimited GFF file.
# Returns : hash reference { seqname => [ { type, start, end, strand }, ... ] }.
# Dies if the file cannot be opened.
#
# Blank lines, comment/directive lines ("#...") and lines with fewer than
# nine tab-separated columns are skipped.
sub read_gff_file {
    my ($file) = @_;
    my $genes = {};
    open( my $fh, "<", $file ) or die "fail to open file: $file\n";
    while ( my $line = <$fh> ) {
        $line =~ s/\r?\n$//;
        next if $line =~ /^\s*(?:#|$)/;

        # GFF columns are separated by tabs; splitting on any whitespace
        # shifts the columns as soon as a field contains a space.
        # Limit -1 keeps a trailing empty attribute column.
        my @data = split( /\t/, $line, -1 );
        next unless scalar(@data) >= 9;

        my $name = $data[0];
        push @{ $$genes{$name} },
            {
            type   => $data[2],
            start  => $data[3],
            end    => $data[4],
            strand => $data[6],
            };
    }
    close($fh);
    return $genes;
}
3 | # Use of this source code is governed by a MIT-license 4 | # that can be found in the LICENSE file. 5 | use File::Basename; 6 | use Getopt::Long; 7 | use Digest::MD5; 8 | use strict; 9 | 10 | $0 = basename($0); 11 | my $usage = <<"USAGE"; 12 | =============================================================================== 13 | Function: Find common sequences in fasta files. Version 2. 14 | Features: 15 | 1) Comparing by name or sequence are both supported. 16 | 2) No files number limit. 17 | 3) Very low RAM usage. (Lower than Version 1). 18 | Note that: 19 | 1) Records with different names may have same sequences. 20 | 2) Case of sequence letters or name may be different. 21 | 3) Duplicated records may exist in a fasta file. 22 | Contact : Wei Shen 23 | Date : 2013-12-05 24 | Update : 2014-08-14 25 | Site : https://github.com/shenwei356/bio_scripts 26 | 27 | Usage : $0 [-s] [-i] fastafile fastafile2 [fastafile3 ...] 28 | Options : 29 | -s Comparing by sequence. 30 | -i Ignore case. 31 | =============================================================================== 32 | 33 | USAGE 34 | 35 | our $by_seq = 0; 36 | our $ignore_case = 0; 37 | GetOptions( 38 | "s" => \$by_seq, 39 | "i" => \$ignore_case, 40 | ) or die $usage; 41 | 42 | # at least two files; 43 | die "$usage\n>= 2 sequence file needed.\n" unless @ARGV >= 2; 44 | 45 | our $counts = {}; 46 | our $names = {}; 47 | 48 | our ( $file, $has_head, $last_head, $head, $head0, $seq_len, $seq_md5 ); 49 | our $md5; 50 | $md5 = Digest::MD5->new if $by_seq; 51 | 52 | # check files 53 | for $file (@ARGV) { 54 | die "File ($file) does not exists.\n" unless -e $file; 55 | } 56 | 57 | for $file (@ARGV) { 58 | open IN, "<", $file 59 | or die "Fail to open file: $file!\n"; 60 | 61 | $has_head = 0; 62 | $seq_len = 0; 63 | $md5->reset if $by_seq; 64 | 65 | while () { 66 | s/\r?\n//; 67 | if (/^\s*>/) { # fasta head 68 | s/>\s*//; 69 | s/\s+$//; 70 | 71 | recording(); 72 | 73 | $seq_len = 0; 74 | $has_head = 1; 75 | } 76 | 
# Register the record that has just been completed, then remember the header
# currently held in $_ as the pending one.
#
# Works entirely on package globals: reads $last_head, $seq_len, $md5, $file,
# $by_seq and $ignore_case; updates $last_head, $head0, $head, $seq_md5,
# %$counts and %$names. It is called on every header line and once more after
# the end of a file, so the data it records always belong to the PREVIOUS
# header ($last_head), not to the one in $_.
sub recording {
    $head0 = $last_head; # original name of the record being registered
    $last_head = $_; # store this head for the next call

    $head = $head0;
    $head = lc $head if $ignore_case;
    if ($by_seq) {
        # finalize the digest of the sequence lines added since the last
        # call, and reset the digest object for the next record
        $seq_md5 = $md5->hexdigest;
        $md5->reset;

        # ignore sequence records without sequence.
        return if $seq_len == 0;

        # count sequences with md5 $seq_md5 in $file
        $$counts{$seq_md5}{$file}++;

        # record the original sequence name.
        $$names{$seq_md5}{$file} = $head0;
    }
    else {
        # ignore sequence records without head
        return if $head eq '';

        # count sequences with name $head in $file
        $$counts{$head}{$file}++;
        $$names{$head}{$file} = $head0;
    }
}
154 | $$names_ok{$head} = 0; 155 | } 156 | } 157 | elsif ( $is_target == 1 ) { 158 | print $_; 159 | } 160 | } 161 | close IN; 162 | -------------------------------------------------------------------------------- /for_education/fasta_extract_sequence_by_id_file.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Function: Given a id file, extracting records in another file. 4 | # It works well for super big file. 5 | # Author : Wei Shen http://shenwei.me 6 | # Date : 2013-08-01 7 | # Update : 2014-11-14 8 | # Docment : http://blog.shenwei.me/extract_records_by_id_file/ 9 | 10 | use strict; 11 | use File::Basename; 12 | use BioUtil::Util; 13 | 14 | $0 = basename($0); 15 | my $usage = < 18 | 19 | USAGE 20 | 21 | die $usage unless @ARGV == 3; 22 | 23 | my $id_file = shift; 24 | my $seq_file = shift; 25 | my $out_file = shift; 26 | 27 | #-------------[ read ids ]------------- 28 | 29 | my %ids_hash 30 | ; # 用字典(查询效率更高)来存储id及每个id的命中数 31 | 32 | open ID, "<", $id_file 33 | or die "Failed to open file $id_file.\n"; 34 | while () { 35 | s/\r?\n//; # 记得把回车\r和换行符\n删掉 36 | next if /^\s*$/; 37 | s/^\s+|\s+$//; 38 | 39 | # 根据具体情况提取id !!!!!! 40 | # next unless /gi\|(\d+)/; # gi|12313|的情况 41 | next unless /(.+)/; # 整个一行作为id的情况 42 | 43 | $ids_hash{$1} = 0; # 加入字典 44 | } 45 | close ID; 46 | 47 | # show number of ids 48 | my @ids = keys %ids_hash; 49 | my $n = @ids; 50 | print "\nRead $n ids.\n\n"; 51 | 52 | #-------------[ searching ]------------- 53 | 54 | # 显示搜索进度的变量,当目标文件非常大的时候很有用 55 | my $count = 0; # 当前处理的序列数 56 | my $hits = 0; # 匹配到的序列数 57 | local $| = 1 58 | ; # 输出通道在每次打印或写之后都强制刷新,提高显示进度速度 59 | 60 | open OUT, ">", $out_file 61 | or die "Failed to open file $out_file.\n"; 62 | 63 | my $next_seq = FastaReader($seq_file); 64 | while ( my $fa = &$next_seq() ) { 65 | my ( $head, $seq ) = @$fa; 66 | 67 | $count++; 68 | 69 | $seq =~ s/\s+//g; 70 | 71 | # 根据具体情况提取id !!!!!!!!!!!!!!!!!!!!! 
# tsv2map FILE, INDEX
#
# Read the tab-separated FILE and return a hash reference that maps the
# value found in column INDEX (1-based, default 1) to the whole chomped
# line. When a key occurs on several lines, the last line wins.
#
# Dies if INDEX is not a positive integer, if FILE cannot be opened, or if
# a line has fewer than INDEX columns.
sub tsv2map ($$) {
    my ( $file, $index ) = @_;
    $index = 1 unless defined($index);    # index column, default: 1

    # An index of 0 (or below) would make $items[ $index - 1 ] count from
    # the end of the row and silently key on the wrong column.
    die "index ($index) should be a positive integer.\n"
        unless $index =~ /^\d+$/ and $index > 0;

    my $data = {};    # key => full line
    open( my $fh, "<", $file ) or die("failed to open file: $file\n");
    while (<$fh>) {
        chomp($_);
        my @items = split( /\t/, $_ );
        if ( scalar(@items) < $index ) {    # the key column must exist
            die "number of column in file ($file) < index ($index).\n";
        }
        my $key = $items[ $index - 1 ];     # get the key
        $$data{$key} = $_;                  # store key => value
    }
    close $fh;

    return $data;
}
# get_column_data FILE, COLUMN
#
# Return an array reference holding the values of the 1-based COLUMN of the
# tab-separated FILE, in file order. Lines that split into no fields are
# skipped. A COLUMN that is not a positive integer triggers a warning and
# falls back to column 1. Dies if FILE cannot be opened or if a line has
# fewer than COLUMN fields.
sub get_column_data {
    my ( $file, $column ) = @_;
    unless ( $column =~ /^(\d+)$/ and $column > 0 ) {
        warn
            "column number ($column) should be an integer and greater than 0.\n";
        $column = 1;
    }

    open my $in, "<", $file or die "failed to open file: $file\n";
    my @data = ();
    while ( my $line = <$in> ) {
        $line =~ s/\r?\n//;
        my @linedata = split /\t/, $line;
        my $n        = scalar @linedata;
        next unless $n > 0;

        if ( $column > $n ) {
            die
                "number of columns of this line ($n) is less than given column number ($column)\n";
        }

        push @data, $linedata[ $column - 1 ];
    }
    close $in;

    return \@data;
}

# you can also use modules
# use List::Util qw/max min sum/;

# max LISTREF
#
# Return the numerically largest element of the referenced array, or undef
# when it is empty. The array is only read: taking the first element with
# shift would shorten the caller's data and skew every later statistic.
sub max {
    my ($list) = @_;
    my $max = $list->[0];
    for my $value (@$list) {
        $max = $value if $value > $max;
    }
    return $max;
}

# min LISTREF
#
# Return the numerically smallest element of the referenced array, or undef
# when it is empty. The array is not modified.
sub min {
    my ($list) = @_;
    my $min = $list->[0];
    for my $value (@$list) {
        $min = $value if $value < $min;
    }
    return $min;
}

# mean_and_stdev LISTREF
#
# Return the list (mean, standard deviation) of the referenced numbers,
# using the population variance E[x^2] - E[x]^2. Returns (0, 0) for an
# empty array.
sub mean_and_stdev($) {
    my ($list) = @_;
    return ( 0, 0 ) if @$list == 0;
    my $sum = 0;
    $sum += $_ for @$list;
    my $sum_square = 0;
    $sum_square += $_ * $_ for @$list;
    my $mean     = $sum / @$list;
    my $variance = $sum_square / @$list - $mean * $mean;

    # Floating-point rounding can leave a tiny negative variance for
    # (nearly) constant data, and sqrt dies on a negative operand.
    $variance = 0 if $variance < 0;
    my $std = sqrt $variance;
    return ( $mean, $std );
}
def parse_key_index(key):
    """Parse a key specification into a list of 1-based column numbers.

    ``key`` is a single column number or several separated by commas,
    e.g. ``"2"`` or ``"1,3"``.

    Raises ``ValueError`` if an item is not an integer or is smaller than 1.
    Columns are 1-based; 0 or a negative number would otherwise be turned
    into a negative list index and silently select a column counted from
    the end of the row.
    """
    indexes = [int(i) for i in key.split(',')]
    for k in indexes:
        if k < 1:
            raise ValueError(
                "column number of key should be >= 1, got {}".format(k))
    return indexes


def parse_args():
    """Parse the command line and return the argparse namespace.

    The tab shortcuts are folded into the separator options before
    returning: ``-t1``, ``-t2`` and ``-to`` set ``f1``, ``f2`` and ``of``
    to a tab respectively, and ``-t`` sets all three.
    """
    parser = argparse.ArgumentParser(description="Merge csvfile2 to csvfile1. Multiple keys supported.",
                                     epilog="https://github.com/shenwei356/bio_scripts")

    parser.add_argument('csvfile1', type=str, help='CSV file 1')
    parser.add_argument("key1", type=str,
                        help='Column number of key in csvfile1. Multiple values should be separated by comma.')
    parser.add_argument('csvfile2', type=str, help='CSV file 2')
    parser.add_argument("key2", type=str,
                        help='Column number of key in csvfile2. Multiple values should be separated by comma.')

    parser.add_argument("-f1", type=str, default=",",
                        help='Field separator in csvfile1 [,]')
    parser.add_argument("-q1", type=str, default='"',
                        help='Quote char in csvfile1 ["]')
    parser.add_argument("-f2", type=str, default=",",
                        help='Field separator in csvfile2 [,]')
    parser.add_argument("-q2", type=str, default='"',
                        help='Quote char in csvfile2 ["]')
    parser.add_argument("-of", type=str, default=",",
                        help='Field separator in output [,]')

    # These flags override the field separators (not the quote chars).
    parser.add_argument("-t1", action='store_true',
                        help='csvfile1 is table file. Field separator is "\\t"')
    parser.add_argument("-t2", action='store_true',
                        help='csvfile2 is table file. Field separator is "\\t"')
    parser.add_argument("-to", action='store_true',
                        help='Output field separator is "\\t"')
    parser.add_argument("-t", action='store_true',
                        help='Abbreviation for "-t1 -t2 -to"')

    parser.add_argument("-k", "--keep-unmatched", action='store_true',
                        help='Keep rows in csvfile1 that match no row in csvfile2')

    args = parser.parse_args()

    if args.t or args.t1:
        args.f1 = '\t'
    if args.t or args.t2:
        args.f2 = '\t'
    if args.t or args.to:
        args.of = '\t'

    return args


def read_csv_file(file, key_index, fs, qc):
    """Index the rows of a delimited file by their key columns.

    ``file`` is the path to read, ``key_index`` a key specification as
    accepted by ``parse_key_index``, ``fs`` the field separator and ``qc``
    the quote character.

    Returns a dict that maps the key -- the stripped values of the key
    columns joined with ``'_'`` -- to the row (a list of fields). Rows
    without any field are skipped; when a key occurs more than once the
    last row wins.

    Logs an error and exits with status 1 if a row has fewer columns than
    a key column number.
    """
    # The key specification does not change from row to row: parse it once.
    key_columns = parse_key_index(key_index)

    data = dict()

    with open(file) as fh:
        reader = csv.reader(fh, delimiter=fs, quotechar=qc)
        for row in reader:
            ncolumn = len(row)
            if ncolumn == 0:
                continue

            key = list()
            for k in key_columns:
                if ncolumn < k:
                    logging.error(
                        "key ({}) is beyond number of column ({})".format(k, ncolumn))
                    sys.exit(1)
                key.append(row[k - 1].strip())

            data['_'.join(key)] = row

    return data
format="[%(levelname)s] %(message)s") 98 | 99 | args = parse_args() 100 | 101 | data = read_csv_file(args.csvfile2, args.key2, args.f2, args.q2) 102 | 103 | file, fs, qc, key_index = args.csvfile1, args.f1, args.q1, args.key1 104 | 105 | writer = csv.writer(sys.stdout, delimiter=args.of, quotechar=qc, quoting=csv.QUOTE_MINIMAL) 106 | with open(file) as fh: 107 | reader = csv.reader(fh, delimiter=fs, quotechar=qc) 108 | for row in reader: 109 | ncolumn = len(row) 110 | if ncolumn == 0: 111 | continue 112 | 113 | key = list() 114 | for k in parse_key_index(key_index): 115 | if ncolumn < k: 116 | logging.error( 117 | "key ({}) is beyond number of column ({})".format(k, ncolumn)) 118 | sys.exit(1) 119 | key.append(row[k - 1].strip()) 120 | key = '_'.join(key) 121 | 122 | if key in data: 123 | writer.writerow(row + data[key]) 124 | elif args.keep_unmatched: 125 | writer.writerow(row) 126 | -------------------------------------------------------------------------------- /not_used/csv_join_paired_lines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # https://github.com/shenwei356/bio_scripts 3 | # Author : Wei Shen 4 | # Contact : shenwei356@gmail.com 5 | # LastUpdate : 2015-02-04 6 | 7 | import argparse 8 | import csv 9 | import logging 10 | import sys 11 | import re 12 | 13 | # ===================================[ args ]================================= 14 | 15 | parser = argparse.ArgumentParser(description="Join paired lines from two files into one file") 16 | 17 | parser.add_argument("-v", "--verbose", help='Verbosely print information', 18 | action="count", default=0) 19 | 20 | parser.add_argument('infile1', type=argparse.FileType('r'), 21 | help='Input file 1') 22 | parser.add_argument('infile2', type=argparse.FileType('r'), 23 | help='Input file 2') 24 | parser.add_argument('outfile', nargs='*', type=argparse.FileType('w'), 25 | default=sys.stdout, help='Output file') 26 | 27 | 
def get_key_from_row(nrow, row):
    """Return the stripped key field of ``row``.

    ``nrow`` is the number of fields in ``row``. The key column is the
    1-based ``args.key``; values below 1 select the first column.

    Logs an error and exits with status 1 if ``args.key`` is larger than
    ``nrow``.
    """
    if nrow < args.key:
        logging.error(
            "-k ({}) is beyond number of column ({})".format(args.key, nrow))
        sys.exit(1)

    # Clamp locally: a lookup helper should not rewrite the parsed
    # command-line options as a side effect.
    column = max(args.key, 1)
    return row[column - 1].strip()
key2)) 87 | sys.exit(1) 88 | 89 | writer.writerow(row1) 90 | writer.writerow(row2) 91 | -------------------------------------------------------------------------------- /not_used/csv_split_paired_lines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # https://github.com/shenwei356/bio_scripts 3 | # Author : Wei Shen 4 | # Contact : shenwei356@gmail.com 5 | # LastUpdate : 2015-02-04 6 | 7 | import argparse 8 | import csv 9 | import logging 10 | import sys 11 | import re 12 | 13 | # ===================================[ args ]================================= 14 | 15 | parser = argparse.ArgumentParser(description="Split paired lines into two files") 16 | 17 | parser.add_argument('csvfile', nargs='*', type=argparse.FileType('r'), 18 | default=sys.stdin, help='Input file(s)') 19 | parser.add_argument("-v", "--verbose", help='Verbosely print information', 20 | action="count", default=0) 21 | 22 | parser.add_argument('outfile1', type=argparse.FileType('w'), 23 | default="out_1.tab", help='Output file 1') 24 | parser.add_argument('outfile2', type=argparse.FileType('w'), 25 | default="out_2.tab", help='Output file 2') 26 | 27 | parser.add_argument("-k", '--key', type=int, default=1, 28 | help='Column number of key in csvfile') 29 | parser.add_argument("-H", "--ignoretitle", help="Ignore title", 30 | action="store_true") 31 | parser.add_argument("-F", '--fs', type=str, default="\t", 32 | help='Field separator [\\t]') 33 | parser.add_argument("-Q", '--qc', type=str, default='"', 34 | help='Quote char["]') 35 | 36 | args = parser.parse_args() 37 | 38 | # logging level 39 | if args.verbose >= 2: 40 | logginglevel = logging.DEBUG 41 | elif args.verbose == 1: 42 | logginglevel = logging.INFO 43 | else: 44 | logginglevel = logging.WARN 45 | logging.basicConfig(level=logginglevel, 46 | format="[%(levelname)s] %(message)s") 47 | 48 | logging.info("Column number of key in csvfile: {}".format(args.key)) 49 | 50 | # 
def get_key_from_row(nrow, row):
    """Return the stripped key field of ``row``.

    ``nrow`` is the number of fields in ``row``. The key column is the
    1-based ``args.key``; values below 1 select the first column.

    Logs an error and exits with status 1 if ``args.key`` is larger than
    ``nrow``.
    """
    if nrow < args.key:
        logging.error(
            "-k ({}) is beyond number of column ({})".format(args.key, nrow))
        sys.exit(1)

    # Clamp locally: a lookup helper should not rewrite the parsed
    # command-line options as a side effect.
    column = max(args.key, 1)
    return row[column - 1].strip()
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | 4 | import sys 5 | import os 6 | 7 | from Bio import SeqIO 8 | from Bio.SeqUtils import GC 9 | 10 | import seaborn as sns 11 | import matplotlib as mpl 12 | import matplotlib.pyplot as plt 13 | 14 | usage = """ 15 | Usage: fasta_seq_gc_content_plot.py fastafile [fastafile...] 16 | """ 17 | 18 | if len(sys.argv) <= 1: 19 | print(usage) 20 | sys.exit(0) 21 | 22 | gc = [] 23 | 24 | for file in sys.argv[1:]: 25 | if not os.path.exists(file): 26 | print("file not exists: %s" % file) 27 | sys.exit(0) 28 | 29 | with open(file + ".gc", 'w') as fh: 30 | for seq in SeqIO.parse(file, "fasta"): 31 | gccontent = GC(seq.seq) 32 | gc.append(gccontent) 33 | fh.write("%s\t%d\n" % (seq.id, gccontent)) 34 | 35 | mpl.rc("figure", figsize=(8, 4)) 36 | sns.distplot(gc) 37 | plt.savefig(file + ".gc.png") 38 | -------------------------------------------------------------------------------- /not_used/fasta_seq_length_plot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | 4 | import sys 5 | import os 6 | 7 | from Bio import SeqIO 8 | 9 | import seaborn as sns 10 | import matplotlib as mpl 11 | import matplotlib.pyplot as plt 12 | 13 | usage = """ 14 | Usage: fasta_seq_length_plot.py fastafile [fastafile...] 
15 | """ 16 | 17 | if len(sys.argv) <= 1: 18 | print(usage) 19 | sys.exit(0) 20 | 21 | lengths = [] 22 | 23 | for file in sys.argv[1:]: 24 | if not os.path.exists(file): 25 | print("file not exists: %s" % file) 26 | sys.exit(0) 27 | 28 | with open(file + ".len", 'w') as fh: 29 | for seq in SeqIO.parse(file, "fasta"): 30 | l = len(seq) 31 | lengths.append(l) 32 | fh.write("%s\t%d\n" % (seq.id, l)) 33 | 34 | mpl.rc("figure", figsize=(8, 4)) 35 | sns.distplot(lengths) 36 | plt.savefig(file + ".len.png") 37 | -------------------------------------------------------------------------------- /plot/README.md: -------------------------------------------------------------------------------- 1 | # Plot utilities 2 | 3 | ## plot_barplot.R 4 | 5 | source is not open right now 6 | 7 | Sample output: 8 | 9 | ![Sample output](example/plot_barplot.png) 10 | 11 | ## plot_distribution.R 12 | 13 | Plot distribution 14 | 15 | Usage 16 | 17 | usage: ./plot_distribution.R [-h] [-bw BINWIDTH] [--xlab XLAB] [--ylab YLAB] 18 | [--width WIDTH] [--height HEIGHT] [-t title] 19 | infile outfile 20 | 21 | Plot distribution.Infile should be a tsv file of two columns (group and "value") 22 | 23 | positional arguments: 24 | infile infile 25 | outfile outfile 26 | 27 | optional arguments: 28 | -h, --help show this help message and exit 29 | -bw BINWIDTH, --binwidth BINWIDTH 30 | binwidth 31 | --xlab XLAB xlabel 32 | --ylab YLAB ylabel 33 | --width WIDTH output image width [20] 34 | --height HEIGHT output image height [5] 35 | -t title, --title title 36 | title 37 | 38 | Sample output: 39 | 40 | ![Sample output](example/data.tsv.dist.png) 41 | 42 | 43 | ## plot_distribution.py 44 | 45 | Distribution plot using seaborn 46 | 47 | Example: distribution of sequence length 48 | 49 | cat ../sequence/seq.fa | fasta2tab -l | cut -f 3 | \ 50 | plot_distribution.py -t "Disribution of sequence length" -x "sequence length" -o pic.png 51 | 52 | Sample output: 53 | 54 | ![Sample output](example/data.txt.png) 55 | 
56 | 57 | ## plot_heatmap.R 58 | 59 | Plot heatmap with *pheatmap* 60 | 61 | Usage 62 | 63 | usage: ./plot_heatmap.R [-h] [-H] [-F field_seperator] [-a] [-al] 64 | [--title title] [-s scale] [-ncr] [-ncc] [-c color] 65 | [-nrc] [--width width] [--height height] 66 | [-thr treeheight_row] [-thc treeheight_col] 67 | [-fo fontsize] [-fr fontsize_row] [-fc fontsize_col] 68 | infile outname 69 | 70 | Plot heatmap. Infile should be a csv/tsv file with header containing column names. Annotation for row is also supported, please put them in the last column. 71 | 72 | positional arguments: 73 | infile infile (tsv, with head 74 | outname outname 75 | 76 | optional arguments: 77 | -h, --help show this help message and exit 78 | -H, --header header 79 | -F field_seperator, --field-seperator field_seperator 80 | field seperator 81 | -a, --with-annot add annotation_row from the last column 82 | -al, --with-annot-legend 83 | show annotation_row_legend 84 | --title title title 85 | -s scale, --scale scale 86 | scale. 
row | column | none [row] 87 | -ncr, --not-cluster-rows 88 | do not cluster_rows 89 | -ncc, --not-cluster-cols 90 | do not cluster_cols 91 | -c color, --color color 92 | sequential palettes names [RdYlBu] 93 | -nrc, --not-reverse-color-order 94 | do not reverse color order 95 | --width width output image width 96 | --height height output image height 97 | -thr treeheight_row, --treeheight_row treeheight_row 98 | treeheight_row 99 | -thc treeheight_col, --treeheight_col treeheight_col 100 | treeheight_col 101 | -fo fontsize, --fontsize fontsize 102 | fontsize 103 | -fr fontsize_row, --fontsize_row fontsize_row 104 | fontsize_row 105 | -fc fontsize_col, --fontsize_col fontsize_col 106 | fontsize_col 107 | 108 | Sample output: 109 | 110 | ![Sample output](example/heatmap.png) -------------------------------------------------------------------------------- /plot/example/data.tsv: -------------------------------------------------------------------------------- 1 | group value 2 | A -2.478041594 3 | A -1.670924814 4 | A 0.302085867 5 | A 1.723464735 6 | A 1.732239018 7 | A 0.397676057 8 | A -1.550346069 9 | A -0.950674475 10 | A 1.681924819 11 | A 0.971857536 12 | A -0.866578709 13 | A -0.002960986 14 | A -1.099090501 15 | A -0.387710361 16 | A 0.833717474 17 | A 0.446464809 18 | A 0.041753848 19 | A -1.663461487 20 | A -0.89552437 21 | A -0.506462862 22 | A -1.593121423 23 | A -0.331644759 24 | A -0.876270979 25 | A 1.704732645 26 | A 0.350942092 27 | A -0.650102515 28 | A -0.20037638 29 | A -1.470960109 30 | A -0.779294956 31 | A -0.182033116 32 | A -2.161884173 33 | A 0.852318015 34 | A -0.642384841 35 | A 0.435884613 36 | A -0.156577243 37 | A 0.744276341 38 | A 1.307949666 39 | A 0.425873497 40 | A 0.444357135 41 | A -0.561607921 42 | A 0.548522524 43 | A 0.552736143 44 | A 1.04787135 45 | A 0.735733792 46 | A 0.500453334 47 | A -1.126314462 48 | A -0.707289961 49 | A 1.23401702 50 | A -1.003678165 51 | A 0.415567648 52 | A 0.962199827 53 | A 0.103141335 54 | A 
-0.836026032 55 | A -0.098418515 56 | A 1.563558927 57 | A 1.54544268 58 | A -0.029325997 59 | A -1.401309572 60 | A 1.521911545 61 | A -1.892086994 62 | A 1.64618857 63 | A 1.236972495 64 | A 0.269366887 65 | A -0.607749077 66 | A -1.314902995 67 | A -0.910285157 68 | A -0.019836224 69 | A -0.591987119 70 | A 0.123428499 71 | A -0.348583796 72 | A 1.337303538 73 | A 0.826278844 74 | A -0.375063867 75 | A 0.656301925 76 | A 0.090945182 77 | A -0.809690651 78 | A -1.334256525 79 | A 0.544701029 80 | A -0.39094792 81 | A -0.861895104 82 | A 1.292683052 83 | A -1.17919095 84 | A -1.774046316 85 | A 1.08309747 86 | A -0.73170333 87 | A 0.246381319 88 | A -1.045812696 89 | A 0.520570011 90 | A -0.998067032 91 | A -0.819041117 92 | A 0.299772419 93 | A -0.932132226 94 | A 0.874542401 95 | A 1.089482407 96 | A 1.598371819 97 | A 0.268834238 98 | A 0.338377536 99 | A -0.5965213 100 | A 0.43380957 101 | A 1.240334583 102 | A 0.383542102 103 | A 0.392334889 104 | A 1.582287577 105 | A 0.934345306 106 | A 0.185303317 107 | A -0.408425632 108 | A 1.320292991 109 | A 0.04157766 110 | A -1.416251282 111 | A 1.746497661 112 | A 0.331006288 113 | A 1.403564316 114 | A 1.301531005 115 | A -0.299222217 116 | A -0.105714073 117 | A 1.785036895 118 | A -0.134575051 119 | A -1.042480709 120 | A 1.78300135 121 | A -0.438683097 122 | A -1.795526974 123 | A 0.150512279 124 | A 0.981735884 125 | A 2.473467286 126 | A -0.132078898 127 | A 0.654606396 128 | A -1.759474484 129 | A 1.066364322 130 | A -0.169913279 131 | A -1.018935367 132 | A 1.081703818 133 | A -0.848156005 134 | A -0.845524301 135 | A 1.053516424 136 | A 0.153402096 137 | A 0.221617988 138 | A 0.101040281 139 | A 1.328408995 140 | A 0.498088679 141 | A 1.318025052 142 | A -0.814440626 143 | A -1.068784756 144 | A -0.198829267 145 | A -1.366966838 146 | A -0.362010626 147 | A -0.62462386 148 | A -1.362625316 149 | A -0.997938157 150 | A 1.268118962 151 | A -0.019648713 152 | A -0.695817179 153 | A -0.192844372 154 | A 
-0.637827077 155 | A -0.723913683 156 | A -0.991369846 157 | A 0.586160041 158 | A 2.726372112 159 | A -0.577510955 160 | A -0.990340728 161 | A 0.076611801 162 | A 0.064507967 163 | A -2.53536432 164 | A -1.464073017 165 | A -1.47193269 166 | A 1.172569386 167 | A -0.189016092 168 | A -0.274093583 169 | A 0.90876957 170 | A -0.131689974 171 | A -0.847336909 172 | A -1.119121979 173 | A 0.551980938 174 | A -0.575261175 175 | A 0.002730048 176 | A 0.940787285 177 | A -0.439276259 178 | A 0.134038804 179 | A 1.018411703 180 | A -1.137956506 181 | A -0.157175605 182 | A -0.315387133 183 | A 0.075165076 184 | A 1.345233144 185 | A 0.830714846 186 | A -0.873349342 187 | A -0.284379877 188 | A -0.580637572 189 | A 0.876275425 190 | A 0.294259868 191 | A -1.107709451 192 | A -1.064997494 193 | A 0.182133669 194 | A 0.284590814 195 | A -0.831312758 196 | A 0.781795593 197 | A -0.202621437 198 | A -0.839671657 199 | A -0.546233119 200 | A -0.887818316 201 | B -6.40E-001 202 | B -1.12E+000 203 | B 3.23E-001 204 | B 4.10E-001 205 | B 5.93E-001 206 | B 1.06E+000 207 | B 1.98E+000 208 | B 9.21E-001 209 | B 1.46E+000 210 | B -4.07E-001 211 | B 1.98E+000 212 | B 3.56E-001 213 | B 3.38E-001 214 | B 2.40E+000 215 | B -4.92E-001 216 | B 1.25E+000 217 | B -4.66E-001 218 | B -3.06E-001 219 | B -7.69E-001 220 | B 2.11E-001 221 | B -3.48E-001 222 | B -1.95E+000 223 | B 6.90E-002 224 | B 1.12E+000 225 | B 1.62E+000 226 | B -9.32E-001 227 | B 1.39E+000 228 | B 7.02E-001 229 | B 9.40E-001 230 | B 2.38E+000 231 | B -3.74E-001 232 | B 2.14E+000 233 | B 1.35E+000 234 | B 2.38E+000 235 | B 6.77E-001 236 | B 1.56E+000 237 | B 1.84E+000 238 | B 1.13E+000 239 | B 1.88E-001 240 | B -2.48E-001 241 | B 5.65E-001 242 | B 3.00E+000 243 | B 5.57E-001 244 | B 1.47E+000 245 | B 8.15E-001 246 | B 1.45E+000 247 | B -1.24E+000 248 | B -5.90E-001 249 | B 4.49E-001 250 | B -5.34E-001 251 | B 2.34E+000 252 | B 2.81E+000 253 | B -1.05E+000 254 | B 1.37E+000 255 | B 1.15E+000 256 | B 4.73E-001 257 | B 2.33E+000 
258 | B 2.65E+000 259 | B 7.09E-001 260 | B -2.81E-001 261 | B 7.41E-001 262 | B 8.63E-001 263 | B -1.60E-001 264 | B 5.80E-001 265 | B -2.73E-001 266 | B 1.34E+000 267 | B 2.66E+000 268 | B -1.18E+000 269 | B 5.74E-001 270 | B 3.83E-001 271 | B 3.55E+000 272 | B 1.56E+000 273 | B 8.55E-001 274 | B -1.34E+000 275 | B 9.92E-001 276 | B 3.70E-001 277 | B 2.79E-001 278 | B 6.87E-001 279 | B -5.85E-001 280 | B 1.96E+000 281 | B -1.56E+000 282 | B 4.47E-005 283 | B 5.39E-002 284 | B -1.14E+000 285 | B 2.67E+000 286 | B 1.14E+000 287 | B 2.73E+000 288 | B 2.18E+000 289 | B 1.99E+000 290 | B 1.49E+000 291 | B 1.61E+000 292 | B -4.01E-001 293 | B 9.41E-001 294 | B 1.31E+000 295 | B 1.44E-001 296 | B 2.09E+000 297 | B 1.55E+000 298 | B -1.64E-001 299 | B 4.08E-001 300 | B -2.13E-001 301 | B -9.80E-001 302 | B 9.76E-001 303 | B -1.81E-001 304 | B 5.84E-001 305 | B 8.30E-001 306 | B 1.14E+000 307 | B 1.49E+000 308 | B 1.87E+000 309 | B 7.76E-001 310 | B -1.01E+000 311 | B -1.63E+000 312 | B 2.14E+000 313 | B 7.27E-001 314 | B 1.86E+000 315 | B -1.38E+000 316 | B -1.97E-001 317 | B 4.70E-001 318 | B 1.95E-001 319 | B -7.66E-002 320 | B 1.86E+000 321 | B 3.13E+000 322 | B 2.46E+000 323 | B -9.66E-002 324 | B 1.96E+000 325 | B 1.43E+000 326 | B 7.85E-001 327 | B 1.16E+000 328 | B 1.10E+000 329 | B 2.53E-001 330 | B -1.55E-001 331 | B 5.81E-002 332 | B 7.55E-001 333 | B 1.42E+000 334 | B 2.24E+000 335 | B 1.04E+000 336 | B 1.49E-001 337 | B 2.61E+000 338 | B 1.58E+000 339 | B 1.64E+000 340 | B 1.20E+000 341 | B -7.85E-002 342 | B 6.80E-001 343 | B 3.56E-002 344 | B 1.17E-001 345 | B 8.46E-001 346 | B 9.02E-001 347 | B 1.37E+000 348 | B -4.92E-001 349 | B 1.17E+000 350 | B 2.90E+000 351 | B 2.81E+000 352 | B 8.50E-001 353 | B 1.18E+000 354 | B -5.11E-001 355 | B 2.93E+000 356 | B -4.87E-004 357 | B 4.52E-001 358 | B 1.00E+000 359 | B 1.00E+000 360 | B 2.85E+000 361 | B 8.16E-001 362 | B 1.32E+000 363 | B 1.37E+000 364 | B 4.03E-001 365 | B 3.60E-001 366 | B 4.25E-002 367 | B 
2.58E-002 368 | B 8.25E-001 369 | B 1.22E+000 370 | B 1.05E-001 371 | B -7.12E-003 372 | B 1.16E+000 373 | B 1.38E+000 374 | B -2.63E-001 375 | B 1.23E+000 376 | B 6.94E-001 377 | B 2.12E+000 378 | B 1.38E+000 379 | B -3.36E-001 380 | B 4.35E-001 381 | B 2.46E+000 382 | B 1.96E+000 383 | B 1.70E+000 384 | B 2.08E+000 385 | B 2.15E+000 386 | B 2.15E+000 387 | B 1.25E+000 388 | B 1.92E-001 389 | B -1.20E+000 390 | B 8.32E-001 391 | B 1.05E+000 392 | B 2.93E-001 393 | B 2.88E-001 394 | B 6.69E-001 395 | B 2.48E+000 396 | B 1.38E+000 397 | B 2.10E-001 398 | B 3.42E-001 399 | B 6.19E-001 400 | B -2.54E-001 401 | -------------------------------------------------------------------------------- /plot/example/data.tsv.dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/bio_scripts/703cec8d21903516346e2aae4d77d23385c30905/plot/example/data.tsv.dist.png -------------------------------------------------------------------------------- /plot/example/data.txt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/bio_scripts/703cec8d21903516346e2aae4d77d23385c30905/plot/example/data.txt.png -------------------------------------------------------------------------------- /plot/example/heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/bio_scripts/703cec8d21903516346e2aae4d77d23385c30905/plot/example/heatmap.png -------------------------------------------------------------------------------- /plot/example/plot_barplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/bio_scripts/703cec8d21903516346e2aae4d77d23385c30905/plot/example/plot_barplot.png -------------------------------------------------------------------------------- /plot/plot_distribution.R: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # https://github.com/shenwei356/bio_scripts 3 | 4 | library(methods) 5 | library(proto) 6 | library(argparse) 7 | library(ggplot2) 8 | library(reshape2) 9 | library(scales) 10 | 11 | #----------------------------------------------------------------------------- 12 | 13 | description <- paste( 14 | "Plot distribution.", 15 | "Infile should be a tsv file of two columns (group and \"value\")", sep = "" 16 | ) 17 | 18 | parser <- 19 | ArgumentParser(description = description, 20 | formatter_class = "argparse.RawTextHelpFormatter") 21 | 22 | #----------------------------------------------------------------------------- 23 | 24 | parser$add_argument("infile", type = "character", 25 | help = "infile") 26 | parser$add_argument("outfile", type = "character", 27 | help = "outfile") 28 | 29 | parser$add_argument( 30 | "-bw", "--binwidth", type = "double", 31 | default = 0.1, help = "binwidth" 32 | ) 33 | 34 | parser$add_argument("--xlab", type = "character", default = "Value", 35 | help = "xlabel") 36 | parser$add_argument("--ylab", type = "character", default = "Density", 37 | help = "ylabel") 38 | parser$add_argument("--width", type = "integer", default = 6, 39 | help = "output image width [20]") 40 | parser$add_argument("--height", type = "integer", default = 3, 41 | help = "output image height [5]") 42 | 43 | parser$add_argument( 44 | "-t", "--title", metavar = "title", type = "character", 45 | default = "", help = "title" 46 | ) 47 | 48 | #----------------------------------------------------------------------------- 49 | 50 | args <- parser$parse_args() 51 | 52 | if (args$title == "") { 53 | args$title = "" 54 | } 55 | 56 | #----------------------------------------------------------------------------- 57 | 58 | df <- read.csv(args$infile, sep = "\t") 59 | 60 | p <- ggplot(df, aes(x = value, fill = group, colour = group)) + 61 | geom_histogram( 62 | aes(y = 
def _parse_limits(value, option):
    """Parse an axis-limit option of the form "low,high".

    Returns ``[low, high]`` as floats. Negative numbers and scientific
    notation are accepted (pass negatives as ``--x_lim=-2,4`` so argparse
    does not mistake the value for an option). On malformed input an error
    naming ``option`` is printed to stderr and the script exits with
    status 1.
    """
    try:
        low, high = [float(part) for part in value.split(',')]
    except ValueError:
        # Raised both for a wrong number of fields and for non-numeric fields.
        print('Invalid option value for {0}. Example: {0} 1,100'.format(option),
              file=sys.stderr)
        sys.exit(1)
    return [low, high]


parser = argparse.ArgumentParser(description='Plot distribution',
                                 epilog="https://github.com/shenwei356/bio_scripts")

parser.add_argument('-i', '--infile', nargs='?', type=argparse.FileType('r'),
                    default=sys.stdin, help='Input file')

parser.add_argument('-o', '--outfile', nargs='?', type=str,
                    default='dist.png', help='Output file')

parser.add_argument('--width', type=int, default=8, help='Figure width')
parser.add_argument('--height', type=int, default=6, help='Figure height')
parser.add_argument('--x_lim', type=str, help='x_lim. format: "1,100"')
parser.add_argument('--y_lim', type=str, help='y_lim. format: "1,100"')
parser.add_argument('--bins', type=int, default=0, help='bins, 0 for None')

parser.add_argument(
    '-t', '--title', type=str, default='Distribution Plot', help='Figure Title')
parser.add_argument(
    '-x', '--xlabel', type=str, default='Value', help='Figure X label')
parser.add_argument(
    '-y', '--ylabel', type=str, default='Frequency', help='Figure Y label')

args = parser.parse_args()

# Validate the limits before reading any data so bad options fail fast.
y_lim = _parse_limits(args.y_lim, '--y_lim') if args.y_lim else None
x_lim = _parse_limits(args.x_lim, '--x_lim') if args.x_lim else None

# One numeric value per line; blank lines (e.g. a trailing newline) are skipped.
data = []
for lineno, line in enumerate(args.infile, 1):
    line = line.strip()
    if not line:
        continue
    try:
        data.append(float(line))
    except ValueError:
        print('Line {}: not a number: {!r}'.format(lineno, line), file=sys.stderr)
        sys.exit(1)

if not data:
    print('No data read from input', file=sys.stderr)
    sys.exit(1)

mpl.rc("figure", figsize=(args.width, args.height))

# seaborn picks the bin count itself when bins is None.
bins = args.bins if args.bins != 0 else None

# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# kept as-is so the output stays unchanged on the seaborn version in use.
figure = sns.distplot(data, bins=bins)

figure.set_title(args.title)
figure.set_xlabel(args.xlabel)
figure.set_ylabel(args.ylabel)

if x_lim:
    figure.set_xlim(x_lim)
if y_lim:
    figure.set_ylim(y_lim)

plt.savefig(args.outfile)
5 | # Author : Wei Shen http://shenwei.me 6 | # Date : 2013-10-16 7 | # Update : 2014-07-29 8 | 9 | use strict; 10 | use BioUtil::Seq; 11 | 12 | my $usage = <<"USAGE"; 13 | 14 | Function: Batch compute pI (isoelectric point) and Mw (molecular weight) 15 | via submiting sequences to Compute pI/Mw tool at ExPASy 16 | Contact: Wei Shen 17 | Usage: $0 amino_acid_fasta_file 18 | 19 | USAGE 20 | die $usage 21 | unless @ARGV == 1; 22 | my $aa_file = shift @ARGV; 23 | 24 | # initialize fasta file parser 25 | my $next_seq = FastaReader($aa_file); 26 | 27 | # initialize pI request 28 | my $PI = &compute_pi(); 29 | 30 | my ( $head, $seq ); 31 | my ( $success, $pi, $mw ); 32 | my $out_file = "$aa_file.result.txt"; 33 | open OUT, ">", $out_file 34 | or die "fail to write file $out_file\n"; 35 | 36 | while ( my $fa = &$next_seq() ) { 37 | my ( $header, $seq ) = @$fa; 38 | 39 | ( $success, $pi, $mw ) = &$PI($seq); 40 | unless ($success) { 41 | print 42 | "$pi. Please check whether the amino acid sequence contains illegal characters.\r\n" 43 | ; # here $pi is the status_line of response 44 | next; 45 | } 46 | print "$header\t$pi\t$mw\r\n"; 47 | print OUT "$header\t$pi\t$mw\r\n"; 48 | } 49 | 50 | close OUT; 51 | 52 | # Compute pI/Mw via submiting sequence to Compute pI/Mw tool at ExPASy. 
#
# See more: http://web.expasy.org/compute_pi/
#
# compute_pi() returns a closure. Call it as
#     ( $success, $pi, $mw ) = &$PI( $protein [, $resolution] );
# where $resolution is "average" (default) or "monoisotopic".
#
# Return value of the closure:
#     ( 1, $pi, $mw )      on success
#     ( 0, $message )      on failure; $message is the HTTP status line, or
#                          a short description when the sequence is empty or
#                          the response contains no pI/Mw result.
#
# Example:
#
#     my @proteins = qw/AYYAYYAYAYAY ACACAGACG ---/;
#     my $PI = &compute_pi();
#     my ( $success, $pi, $mw );
#     for my $protein (@proteins) {
#         ( $success, $pi, $mw ) = &$PI($protein, "average");
#         # ( $success, $pi, $mw ) = &$PI($protein, "monoisotopic");
#         unless ($success) {
#             print "$pi\n"; # here $pi is the failure message
#             next;
#         }
#         print "($pi, $mw)\n";
#     }
sub compute_pi() {
    use LWP::UserAgent;

    my $ua  = LWP::UserAgent->new;
    my $url = "http://web.expasy.org/cgi-bin/compute_pi/pi_tool";

    # The ($$) prototype is kept for compatibility only; prototypes are
    # ignored for calls made through a code reference.
    return sub($$) {
        my ( $protein, $resolution ) = @_;
        $resolution = "average" unless defined $resolution;    # or monoisotopic

        # Nothing to compute: avoid a pointless round trip to the server.
        return ( 0, "empty protein sequence" )
            unless defined $protein and length $protein;

        my $res = $ua->post(
            $url,
            [   protein    => $protein,
                resolution => $resolution,
                file       => ""
            ]
        );

        # 0 means failed
        return ( 0, $res->status_line )
            unless $res->is_success;

        # Only trust $1/$2 when the match succeeded; after a failed match
        # they still hold the captures of an earlier successful one.
        if ( $res->content =~ /Theoretical pI\/Mw: ([\d\.]+)\s\/\s([\d\.]+)/ ) {

            # 1 means success
            return ( 1, $1, $2 );
        }

        return ( 0, "no pI/Mw result found in response" );
    };
}
*fasta2tab* transforms the FASTA format to a two-column table: 14 | the first column is the header and the second is the sequence. 15 | It can also compute the reverse complement sequence and remove gaps. 16 | Sequence length and GC content can be output as additional columns, 17 | which can be used for filtering and sorting. tab2fasta just transforms the 18 | table back to FASTA format. Combined with shell tools like awk and sed, 19 | it’s easy to filter and sort FASTA files. 20 | 21 | #### Examples 22 | 23 | ##### 1. sort fasta by sequence length 24 | 25 | ``` 26 | cat seq.fa | fasta2tab -t -l | sort -r -t"`echo -e '\t'`" -n -k3,3 \ 27 | | tab2fasta -l 70 > seq.sorted.fa 28 | ``` 29 | 30 | ##### 2. extract sub sequence 31 | 32 | ``` 33 | fasta2tab -t -sub 3,10 -rc seq.fa | tab2fasta 34 | ``` 35 | 36 | ##### 3. extract sequence longer than 1000 bp 37 | 38 | ``` 39 | cat seq.fa | fasta2tab -t -l | awk -F'\t' '$3 >= 1000' | tab2fasta -l 70 40 | ``` 41 | 42 | ##### 4. extract aligned sequence of which the original sequence is longer than 1000 bp 43 | 44 | ``` 45 | cat seq.fa | fasta2tab -l2 | awk -F'\t' '$3 >= 1000' | tab2fasta -l 70 46 | ``` 47 | 48 | ##### 5. reverse complement sequence, uppercase, and trim gaps 49 | 50 | ``` 51 | zcat seq.fa.gz | fasta2tab -uc -rc -t | tab2fasta 52 | ``` 53 | 54 | ### fasta_extract_by_pattern.pl 55 | 56 | [fasta_extract_by_pattern.pl](https://github.com/shenwei356/bio_scripts/blob/master/sequence/fasta_extract_by_pattern.pl) 57 | can extract FASTA sequences by header or sequence; exact matching and regular 58 | expression matching are both supported. The query patterns can be read from files. 59 | The negation of the result is also easy to get. Most importantly, it can read from STDIN. 60 | 61 | Combining fasta2tab and tab2fasta with [*csv_grep*](https://github.com/shenwei356/bio_scripts/blob/master/util/csv_grep) 62 | can achieve the same result. 63 | 64 | #### Examples 65 | 66 | ##### 1. 
sequences WITH "bacteria" in header 67 | 68 | ``` 69 | fasta_extract_by_pattern.pl -r -p Bacteria *.fa > result.fa 70 | ``` 71 | 72 | ##### 2. sequences WITHOUT "bacteria" in header 73 | 74 | ``` 75 | fasta_extract_by_pattern.pl -r -n -p Bacteria seq1.fa seq2.fa > result.fa 76 | ``` 77 | 78 | ##### 3. sequences with TTSAA (AgsI digest site) in SEQUENCE. Base S stands for C or G. 79 | 80 | ``` 81 | fasta_extract_by_pattern.pl -r -s -p 'TT[C|G]AA' seq.fa > result.fa 82 | ``` 83 | 84 | ##### 4. sequences (read from STDIN) with a header that matches any pattern in the list file 85 | 86 | ``` 87 | zcat seq.fa.gz | fasta_extract_by_pattern.pl -pf name_list.txt > result.fa 88 | ``` 89 | 90 | ### fasta_common_seqs.pl 91 | 92 | [fasta_common_seqs.pl](https://github.com/shenwei356/bio_scripts/blob/master/sequence/fasta_common_seqs.pl) 93 | is used to find common sequences in multiple files. It supports comparing by header or sequence. 94 | By storing the MD5 value of sequences, it has low memory usage. It can also be 95 | used to remove duplicated records, by finding the common sequences between the 96 | file and its copy or soft link. 97 | 98 | ### fasta_remove_duplicates.pl 99 | 100 | [fasta_remove_duplicates.pl](https://github.com/shenwei356/bio_scripts/blob/master/sequence/fasta_remove_duplicates.pl) 101 | can remove duplicated records from a file or STDIN, by both sequence and header. 102 | 103 | ### fasta_locate_motif.pl 104 | 105 | [fasta_locate_motif.pl](https://github.com/shenwei356/bio_scripts/blob/master/sequence/fasta_locate_motif.pl) 106 | can find the locations of restriction enzyme recognition sites or other motifs. 
107 | 108 | ### fasta_gc_skew.py and fasta_gc_skew.plot.R 109 | 110 | Sample output: 111 | 112 | ![GC Skew](sample/gc_skew.png) 113 | 114 | ## FASTQ 115 | 116 | ### fastq2tab and tab2fastq 117 | 118 | [*fastq2tab*](https://github.com/shenwei356/bio_scripts/blob/master/sequence/fastq2tab) and [*tab2fastq*](https://github.com/shenwei356/bio_scripts/blob/master/sequence/tab2fastq) are similar to fasta2tab and tab2fasta. They can be used to filter FASTQ files with the help of [*csv_grep*](https://github.com/shenwei356/bio_scripts/blob/master/util/csv_grep). 119 | 120 | Example: removing contaminant reads 121 | 122 | zcat reads.fq.gz \ 123 | | fastq2tab \ 124 | | csv_grep -t -pf <(cat idlist) -i -d \ 125 | | tab2fastq \ 126 | | gzip -c \ 127 | > reads2.fq.gz 128 | -------------------------------------------------------------------------------- /sequence/fasta2tab: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # https://github.com/shenwei356/bio_scripts 3 | 4 | use strict; 5 | use Getopt::Long; 6 | use BioUtil::Seq; 7 | use BioUtil::Util; 8 | 9 | my $usage = q( 10 | fasta2tab - transform the fasta fromat to two-column table 11 | 12 | Usage: fasta2tab [options] [fastafiles...] 
13 | Options: 14 | -r, --reverse Reverse sequence 15 | -c, --complement Complement sequence 16 | -rc, --reversecomplement Reversecomplement 17 | -sub, --subseq INT,INT Substring of sequence, 1-based 18 | Examples: 19 | seq ACGAGACGTA 20 | index 1234567890 21 | 22 | option subseq 23 | -------------------- 24 | -sub 2,7 CGAGAC 25 | -sub 2,2 C 26 | -sub ,7 ACGAGAC 27 | -sub 2, CGAGACGTA 28 | -sub -3, GTA 29 | -sub -3,-2 GT 30 | -sub ,-3 ACGAGACG 31 | 32 | -t, --trim Trim non-Latin alphabet 33 | -lc, --lowercase Lowercase 34 | -uc, --uppercase Uppercase 35 | 36 | -l, --length Ouput sequence length at another column 37 | -l2, --length2 Ouput number of latin-letter in sequence 38 | at another column 39 | -bc, --bc STRING[,STRING] Ouput base content 40 | Examples: 41 | 'GC' : G+C content, 42 | 'G,C': G and C, in two column 43 | -gc, --gc Ouput GC content at another column 44 | 45 | -h, --help Show this help information 46 | 47 | Examples: 48 | 49 | 1. sort fasta by sequnece length 50 | cat seq.fa | fasta2tab -t -l | sort -r -t"`echo -e '\t'`" -n -k3,3 \ 51 | | tab2fasta -l 70 > seq.sorted.fa 52 | 53 | 2. extract sub sequence 54 | fasta2tab -t -sub 3,10 -rc seq.fa | tab2fasta 55 | 56 | 3. extract sequence longer than 1000 bp 57 | cat seq.fa | fasta2tab -t -l | awk -F'\t' '$3 >= 1000' | tab2fasta -l 70 58 | 59 | 4. extract aligned sequence of which the original sequence is longer than 1000 bp 60 | cat seq.fa | fasta2tab -l2 | awk -F'\t' '$3 >= 1000' | tab2fasta -l 70 61 | 62 | 5. reverse complement sequence, uppercase, and trim gaps 63 | zcat seq.fa.gz | fasta2tab -uc -rc -t | tab2fasta 64 | 65 | This script is usually used in pair with tab2fasta. 
66 | https://github.com/shenwei356/bio_scripts 67 | 68 | ); 69 | 70 | my $para = {}; 71 | GetOptions( 72 | 'help|h' => \$$para{help}, 73 | 74 | 'reverse|r' => \$$para{rev}, 75 | 'complement|c' => \$$para{comp}, 76 | 'reversecomplement|rc' => \$$para{rc}, 77 | 'subseq|sub=s' => \$$para{sub}, 78 | 79 | 'trim|t' => \$$para{trim}, 80 | 'lowercase|lc' => \$$para{lc}, 81 | 'uppercase|uc' => \$$para{uc}, 82 | 83 | 'length|l' => \$$para{len}, 84 | 'length2|l2' => \$$para{len2}, 85 | 'bc=s' => \$$para{bc}, 86 | 'gc' => \$$para{gc}, 87 | ) or die $usage; 88 | 89 | die $usage if $$para{help}; 90 | if ( $$para{sub} ) { 91 | die qq( 92 | parameter of -sub not correct. 93 | 94 | examples: 95 | seq ACGAGACGTA 96 | index 1234567890 97 | 98 | option subseq 99 | -------------------- 100 | -sub 2,7 CGAGAC 101 | -sub 2,2 C 102 | -sub ,7 ACGAGAC 103 | -sub 2, CGAGACGTA 104 | -sub -3, GTA 105 | -sub -3,-2 GT 106 | -sub ,-3 ACGAGACG 107 | 108 | ) unless $$para{sub} =~ /^(-?\d*),(-?\d*)$/; 109 | die "warning: end ($2) should be >= start ($1)\n" if $2 ne '' and $1 ne '' and $2 < $1 ; 110 | } 111 | 112 | my @files = file_list_from_argv(@ARGV); 113 | 114 | for my $file (@files) { 115 | my $next_seq = FastaReader($file); 116 | while ( my $fa = &$next_seq() ) { 117 | my ( $header, $seq ) = @$fa; 118 | 119 | $header =~ s/\t/__tab__/g; 120 | 121 | if ( $$para{trim} ) { 122 | $seq =~ s/[^a-zA-Z]+//g; 123 | } 124 | 125 | if ( $$para{sub} ) { 126 | my ( $start, $end ) = split /,/, $$para{sub}; 127 | if ( $start eq '' ) { 128 | $start = 1; 129 | } 130 | elsif ( $start < 0 ) { 131 | $start += 1; 132 | } 133 | 134 | if ( $end eq '' ) { 135 | $end = 1 + length $seq; 136 | } 137 | elsif ( $end < 0 ) { 138 | $end += 1; 139 | } 140 | $seq = substr $seq, $start - 1, $end - $start + 1; 141 | } 142 | 143 | if ( $$para{rc} ) { 144 | $seq = revcom($seq); 145 | } 146 | else { 147 | $seq = complement($seq) if $$para{comp}; 148 | $seq = reverse $seq if $$para{rev}; 149 | } 150 | 151 | if ( $$para{lc} ) { 152 | 
$seq = lc $seq; 153 | } 154 | elsif ( $$para{uc} ) { 155 | $seq = uc $seq; 156 | } 157 | 158 | print "$header\t$seq"; 159 | print "\t", length $seq if $$para{len}; 160 | if ( $$para{len2} ) { 161 | if ( $$para{trim} ) { 162 | print "\t", length $seq; 163 | } 164 | else { 165 | my $seq2 = $seq; 166 | $seq2 =~ s/[^a-zA-Z]+//g; 167 | print "\t", length $seq2; 168 | } 169 | } 170 | 171 | if ($$para{gc}) { 172 | print "\t", base_content( 'gc', $seq ); 173 | } elsif ($$para{bc}) { 174 | my @bases = split /,/, $$para{bc}; 175 | for my $base (@bases) { 176 | print "\t", base_content( $base, $seq ); 177 | } 178 | } 179 | print "\n"; 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /sequence/fasta_common_seqs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # Copyright 2014 Wei Shen (shenwei356#gmail.com). All rights reserved. 3 | # Use of this source code is governed by a MIT-license 4 | # that can be found in the LICENSE file. 5 | use strict; 6 | use File::Basename; 7 | use Getopt::Long; 8 | use Digest::MD5 'md5_hex'; 9 | use BioUtil::Seq; 10 | 11 | local $| = 1; 12 | $0 = basename($0); 13 | my $usage = <<"USAGE"; 14 | =============================================================================== 15 | Function: Find common sequences in fasta files. 16 | Features: 17 | 1) Comparing by name or sequence are both supported. 18 | 2) No files number limit. 19 | 3) Low RAM usage. 20 | Note that: 21 | 1) Records with different names may have same sequences. 22 | 2) Case of sequence letters or name may be different. 23 | 3) Duplicated records may exist in a fasta file. 24 | Contact : Wei Shen 25 | Date : 2013-11-07 26 | Update : 2014-08-14 27 | Site : https://github.com/shenwei356/bio_scripts 28 | 29 | Usage : $0 [-s] [-i] fastafile fastafile2 [fastafile3 ...] 30 | Options : 31 | -s Comparing by sequence. 32 | -i Ignore case. 33 | -l Output line length. 
[70] 34 | =============================================================================== 35 | 36 | USAGE 37 | 38 | my $by_seq = 0; 39 | my $ignore_case = 0; 40 | my $linelength = 70; 41 | GetOptions( 42 | "s" => \$by_seq, 43 | "i" => \$ignore_case, 44 | 'l=i' => \$linelength, 45 | ) or die $usage; 46 | 47 | # at least two files; 48 | die "$usage\n>= 2 sequence file needed.\n" unless @ARGV >= 2; 49 | 50 | my $counts = {}; 51 | my $names = {}; 52 | 53 | my ( $file, $next_seq, $head, $head0, $seq, $seq_md5 ); 54 | 55 | for $file (@ARGV) { 56 | print STDERR "\nparsing $file...\n"; 57 | my $n = 0; 58 | $next_seq = FastaReader($file); 59 | while ( my $fa = &$next_seq() ) { 60 | ( $head, $seq ) = @$fa; 61 | print STDERR "\r", ++$n; 62 | $head0 = $head; # orgin sequence name 63 | $head = lc $head if $ignore_case; 64 | 65 | if ($by_seq) { 66 | $seq =~ tr/A-Z/a-z/ if $ignore_case; 67 | $seq_md5 = md5_hex($seq); 68 | 69 | # count sequences with md5 $seq_md5 in $file 70 | $$counts{$seq_md5}{$file}++; # 71 | # record the origin sequence name. 72 | $$names{$seq_md5}{$file} = $head0; 73 | } 74 | else { 75 | # count sequences with name $head in $file 76 | $$counts{$head}{$file}++; 77 | $$names{$head}{$file} = $head0; 78 | } 79 | } 80 | } 81 | 82 | # output common sequences 83 | print STDERR "\nchecking...\n"; 84 | my $file_num = scalar @ARGV; 85 | $file = $ARGV[0]; # extract sequences from the first file. 86 | my $names_ok = {}; 87 | for my $key ( keys %$counts ) { 88 | 89 | # all files have a same record 90 | next unless ( scalar keys %{ $$counts{$key} } ) == $file_num; 91 | 92 | $$names_ok{ $$names{$key}{$file} } 93 | = $$counts{$key}{$file}; # save to a hash. 
94 | } 95 | 96 | print STDERR "extracting...\n"; 97 | my $n = 0; 98 | $next_seq = FastaReader($file); 99 | while ( my $fa = &$next_seq() ) { 100 | ( $head, $seq ) = @$fa; 101 | 102 | if ( exists $$names_ok{$head} and $$names_ok{$head} > 0 ) { 103 | print STDERR "\rhit: ", ++$n; 104 | print ">$head\n", format_seq( $seq, $linelength ); 105 | 106 | # just export one record for duplicated records. 107 | $$names_ok{$head} = 0; 108 | } 109 | } 110 | print STDERR "\n"; 111 | -------------------------------------------------------------------------------- /sequence/fasta_extract_by_pattern.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # https://github.com/shenwei356/bio_scripts 3 | 4 | use strict; 5 | 6 | use Getopt::Long; 7 | use File::Basename; 8 | use BioUtil::Seq; 9 | use BioUtil::Util; 10 | 11 | $0 = basename($0); 12 | my $usage = < result.fa 32 | 2) sequences WITHOUT "bacteria" in header 33 | $0 -r -n -p Bacteria seq1.fa seq2.fa > result.fa 34 | 3) sequences with TTSAA (AgsI digest site) in SEQUENCE. 35 | Base S stands for C or G. 36 | $0 -r -s -p 'TT[C|G]AA' seq.fa > result.fa 37 | 4) sequences (read from STDIN ) with header that matches any patterns 38 | in list file 39 | zcat seq.fa.gz | $0 -pf name_list.txt > result.fa 40 | 41 | https://github.com/shenwei356/bio_scripts 42 | 43 | USAGE 44 | 45 | my $para = {}; 46 | GetOptions( 47 | 'help|h' => \$$para{help}, 48 | 'useregexp|r' => \$$para{useregexp}, 49 | 'speedup|d' => \$$para{speedup}, 50 | 'not|n' => \$$para{not}, 51 | 'pattern|p=s' => \$$para{pattern}, 52 | 'patternfile|pf=s' => \$$para{patternfile}, 53 | 'byseq|s' => \$$para{byseq}, 54 | ) or die $usage; 55 | die $usage if $$para{help}; 56 | 57 | # get patterns 58 | my $patterns = {}; 59 | $$patterns{$$para{pattern}} = 1 if $$para{pattern}; 60 | if ( $$para{patternfile} ){ 61 | $$patterns{$_} = 1 for @{ get_column_data( $$para{patternfile}, 1 ) }; 62 | } 63 | die "no patterns given. 
Type \"$0 -h\" for help.\n" if keys %$patterns == 0; 64 | 65 | # get the file list 66 | my @files = file_list_from_argv(@ARGV); 67 | 68 | my $not_trim = 1; 69 | $not_trim = 0 if $$para{byseq}; 70 | 71 | my ( $sum, $n ) = ( 0, 0 ); 72 | 73 | for my $file (@files) { 74 | 75 | my $next_seq = FastaReader( $file, $not_trim ); 76 | while ( my $fa = &$next_seq() ) { 77 | my ( $header, $seq ) = @$fa; 78 | $sum++; 79 | 80 | # matching object, by header or sequence 81 | my $object = $header; 82 | if ( $$para{byseq} ) { 83 | $object = $seq; 84 | } 85 | 86 | my $hit = undef; 87 | if ( $$para{useregexp} ) { # use regular expression 88 | for my $p (keys %$patterns) { 89 | if ( $object =~ /$p/i ) { 90 | $hit = 1; 91 | delete $$patterns{$p} if $$para{speedup}; 92 | last; 93 | } 94 | } 95 | } 96 | else { # compare with full header | sequence 97 | if ( exists $$patterns{$object} ) { 98 | $hit = 1; 99 | } 100 | } 101 | 102 | if ( $$para{not} ) { # NOT 103 | next if $hit; 104 | } 105 | else { 106 | next unless $hit; 107 | } 108 | 109 | $n++; 110 | if ( $$para{byseq} ) { 111 | print ">$header\n", format_seq($seq); 112 | } 113 | else { 114 | print ">$header\n$seq"; 115 | } 116 | } 117 | } 118 | 119 | print STDERR "\rHits: $n / $sum\n"; 120 | -------------------------------------------------------------------------------- /sequence/fasta_extract_randomly.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # https://github.com/shenwei356/bio_scripts 3 | 4 | use strict; 5 | 6 | use File::Basename; 7 | use BioUtil::Seq; 8 | use BioUtil::Util; 9 | 10 | $0 = basename($0); 11 | my $usage = <= 1; 26 | 27 | my $p = shift @ARGV; 28 | die "Probability should between 0 and 1\n" 29 | unless $p =~ /^[\d\.]+$/ 30 | and $p > 0 31 | and $p <= 1; 32 | 33 | srand(); 34 | 35 | my @files = file_list_from_argv(@ARGV); 36 | 37 | my $n = 0; 38 | for my $file (@files) { 39 | my $next_seq = FastaReader( $file, 1 ); 40 | while ( my $fa = &$next_seq() ) 
# ---------------------------------------------------------------------------
# Command-line interface
# ---------------------------------------------------------------------------
cli <- ArgumentParser(
  description = "Plot GC and GC Skew with the result produced by fasta_gc_skew.py",
  formatter_class = "argparse.RawTextHelpFormatter"
)

cli$add_argument("infile",
                 type = "character",
                 help = "gcskew file produced by fasta_gc_skew.py")
cli$add_argument("outfile",
                 type = "character",
                 help = "outfile")
cli$add_argument("-xi", "--x-interval",
                 type = "integer",
                 default = 1000000,
                 help = "x axix interval [1,000,000]")
cli$add_argument("-n",
                 type = "integer",
                 default = 10,
                 help = "divide the normalized accum_gcskew by n so it looks better [10]")
cli$add_argument("--width",
                 metavar = "width",
                 type = "integer",
                 default = 20,
                 help = "output image width [20]")
cli$add_argument("--height",
                 metavar = "height",
                 type = "integer",
                 default = 5,
                 help = "output image height [5]")
cli$add_argument("-g", "--gc-content",
                 action = "store_true",
                 dest = "gc_content",
                 help = "only plot GC Content")
cli$add_argument("-s", "--gc-skew",
                 action = "store_true",
                 dest = "gc_skew",
                 help = "only plot GC Skew")
cli$add_argument("-t", "--title",
                 metavar = "title",
                 type = "character",
                 default = "GC Content/GC Skew",
                 help = "title")

opts <- cli$parse_args()

# An empty title means "no title at all".
if (opts$title == "") {
  opts$title <- NULL
}

# ---------------------------------------------------------------------------
# Data: tab-separated table with columns chr, loc, gc, gcskew, accum_gcskew
# ---------------------------------------------------------------------------
skew <- read.csv(opts$infile, sep = "\t")

# Scale the accumulated skew by its maximum, then shrink it by n.
skew["accum_gcskew"] <- skew["accum_gcskew"] / max(skew["accum_gcskew"]) / opts$n

# Drop the series the user did not ask for (-g only / -s only).
only_gc <- opts$gc_content && !opts$gc_skew
only_skew <- !opts$gc_content && opts$gc_skew
if (only_gc) {
  skew["gcskew"] <- NULL
  skew["accum_gcskew"] <- NULL
}
if (only_skew) {
  skew["gc"] <- NULL
}

# Long format: one row per (chr, loc, series).
skew_long <- gather(skew, group, value, -chr, -loc)

# ---------------------------------------------------------------------------
# Plot: one facet row per sequence, one coloured line per series
# ---------------------------------------------------------------------------
x_breaks <- seq(0, max(skew$loc), by = opts$x_interval)

fig <- ggplot(skew_long) +
  geom_line(aes(x = loc, y = value, color = group)) +
  geom_hline(aes(yintercept = 0), linetype = 2) +
  scale_size(range = c(0.1)) +
  scale_colour_wsj() +
  facet_grid(chr ~ .) +
  ylab(NULL) +
  xlab("Position (bp)") +
  scale_x_continuous(breaks = x_breaks, labels = comma) +
  ggtitle(opts$title) +
  shenwei356.theme() +
  theme(legend.position = "top")

ggsave(
  filename = opts$outfile,
  plot = fig,
  width = opts$width,
  height = opts$height
)
def GC_Skew(seq, window=10000, step=200, circular=False):
    """Compute GC content and GC skew in sliding windows along *seq*.

    Args:
        seq: sequence (a ``str`` or any object supporting slicing and
            ``str()``, such as a Biopython ``Seq``).
        window: window size in bases; must be positive.
        step: distance between consecutive window starts; must be positive.
        circular: if True, windows that run past the end of the sequence
            wrap around to its start; if False, only windows lying inside
            the sequence are produced.

    Returns:
        A tuple ``(GC, skew)`` of two numpy arrays with one entry per window:
        ``GC`` holds ``(G + C) / window_length`` and ``skew`` holds
        ``(G - C) / (G + C)``.  A window without any G or C gets skew 0.0.

    Raises:
        ValueError: if *window* or *step* is not positive.
    """
    if window <= 0 or step <= 0:
        raise ValueError('window and step must be positive integers')

    length = len(seq)
    if circular:
        end = length - step if length > step else 0
    else:
        end = length - window if length > window else 0
    locs = range(0, end + 1, step)

    GC, skew = np.zeros(len(locs)), np.zeros(len(locs))
    for cnt, i in enumerate(locs):
        s = str(seq[i:i + window])
        if circular and len(s) < window:
            # Only a circular genome may borrow bases from the start of the
            # sequence to complete a window that crosses the origin.
            s += str(seq[0:window - len(s)])

        g = s.count('g') + s.count('G')
        c = s.count('c') + s.count('C')
        # Normalise by the real window length: a sequence shorter than the
        # window yields a shorter window, not a diluted GC fraction.
        GC[cnt] = (g + c) / len(s) if s else 0.0
        # A window with no G and no C has no defined skew; report 0 instead
        # of raising ZeroDivisionError.
        skew[cnt] = (g - c) / (g + c) if g + c else 0.0
    return GC, skew
25 | When option -d given, regular expression may be wrong. 26 | For example: "\\w" -> "\\[AT]". In this case you can use "\\.+?" 27 | 28 | USAGE 29 | 30 | my $args = {}; 31 | GetOptions( 32 | 'help|h' => \$$args{help}, 33 | 'degenerate|d' => \$$args{degenerate}, 34 | ) or die $usage; 35 | die $usage if $$args{help}; 36 | die $usage unless @ARGV == 2; 37 | 38 | my $queries = read_sequence_from_fasta_file( shift @ARGV ); 39 | 40 | my $next_seq = FastaReader( shift @ARGV ); 41 | 42 | print "subject\tquery\tstart\tend\tstrand\tmatched\n"; 43 | while ( my $fa = &$next_seq() ) { 44 | my ( $header, $seq ) = @$fa; 45 | 46 | for my $qname ( sort keys %$queries ) { 47 | my $qseq = $$queries{$qname}; 48 | 49 | my $qseq_r = $qseq; 50 | $qseq_r = degenerate_seq_to_regexp($qseq_r) if $$args{degenerate}; 51 | 52 | my $matches = match_regexp( $qseq_r, $seq ); 53 | for my $match (@$matches) { 54 | my ( $start, $end, $matched ) = @$match; 55 | $start += 1; 56 | $end += 1; 57 | print "$header\t$qname\t$start\t$end\t+\t$matched\n"; 58 | } 59 | 60 | my $qseq_r = revcom($qseq); 61 | $qseq_r = degenerate_seq_to_regexp($qseq_r) if $$args{degenerate}; 62 | my $matches = match_regexp( $qseq_r, $seq ); 63 | for my $match (@$matches) { 64 | my ( $start, $end, $matched ) = @$match; 65 | $start += 1; 66 | $end += 1; 67 | print "$header\t$qname\t$start\t$end\t-\t" 68 | . revcom($matched) . "\n"; 69 | } 70 | } 71 | } 72 | 73 | =head2 degenerate_seq_to_regexp 74 | 75 | Translate degenerate sequence to regular expression. 76 | 77 | =cut 78 | 79 | sub degenerate_seq_to_regexp { 80 | my ($seq) = @_; 81 | my %bases = ( 82 | 'A' => 'A', 83 | 'T' => 'T', 84 | 'U' => 'U', 85 | 'C' => 'C', 86 | 'G' => 'G', 87 | 'R' => '[AG]', 88 | 'Y' => '[CT]', 89 | 'M' => '[AC]', 90 | 'K' => '[GT]', 91 | 'S' => '[CG]', 92 | 'W' => '[AT]', 93 | 'H' => '[ACT]', 94 | 'B' => '[CGT]', 95 | 'V' => '[ACG]', 96 | 'D' => '[AGT]', 97 | 'N' => '[ACGT]', 98 | ); 99 | return join '', map { exists $bases{$_} ? 
=head2 match_regexp

Find all sites matching the regular expression, including overlapping ones.

Arguments: the regular expression (string) and the subject string.
Matching is case-insensitive.

Returns a reference to a list of [ start, end, matched string ] triples;
start and end are 0-based and inclusive. Zero-length matches are ignored.

See https://github.com/shenwei356/bio_scripts/blob/master/sequence/fasta_locate_motif.pl

=cut

sub match_regexp {
    my ( $r, $s ) = @_;
    my @matched = ();

    # compile once instead of re-interpolating the pattern on every iteration
    my $re = qr/($r)/i;

    while ( $s =~ /$re/g ) {

        # offsets of capture group 1: $-[1] is the start, $+[1] is one past
        # the end
        my ( $start, $end ) = ( $-[1], $+[1] );

        if ( $end == $start ) {    # zero-length match: not a real site
            last if $start >= length $s;
            pos($s) = $start + 1;    # step forward so the scan cannot stall
            next;
        }

        push @matched, [ $start, $end - 1, $1 ];

        # restart one base after the start of this hit, so overlapping
        # sites are reported too
        pos($s) = $start + 1;
    }
    return \@matched;
}
$next_seq, $fa, $header, $seq, $target, $md5 ) = (undef) x 7; 64 | for $file (@files) { 65 | $next_seq = FastaReader($file); 66 | while ( $fa = &$next_seq() ) { 67 | ( $header, $seq ) = @$fa; 68 | 69 | if ($by_seq) { # comparing by seq 70 | $target = $seq; 71 | if ($by_head) { # comparing by head and seq 72 | $target = $header . $seq; 73 | } 74 | } 75 | elsif ($by_head) { # comparing by head 76 | $target = $header; 77 | } 78 | 79 | $target = lc $target if $ignore_case; 80 | $md5 = md5_hex($target); 81 | 82 | if ( $$md5s{$md5} == 1 ) { # duplicates 83 | $n++; 84 | } 85 | else { 86 | $$md5s{$md5} = 1; 87 | $sum++; 88 | print ">$header\n", format_seq( $seq, $linelength ); 89 | } 90 | print STDERR "\rremove: $n; remain: $sum"; 91 | } 92 | } 93 | 94 | print STDERR "\n"; 95 | -------------------------------------------------------------------------------- /sequence/fasta_rename_duplicated_names.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # https://github.com/shenwei356/bio_scripts 3 | 4 | use strict; 5 | use File::Basename; 6 | use Getopt::Long; 7 | use BioUtil::Seq; 8 | use BioUtil::Util; 9 | 10 | $0 = basename($0); 11 | my $usage = <= 0, 0 for no formating [70] 19 | -h Show this help information. 
20 | https://github.com/shenwei356/bio_scripts 21 | 22 | USAGE 23 | 24 | my $help = 0; 25 | my $linelength = 70; 26 | GetOptions( 27 | 'help|h' => \$help, 28 | 'l=i' => \$linelength, 29 | ) or die $usage; 30 | 31 | die $usage if $help; 32 | if ( $linelength < 0 ) { 33 | die sprintf "value of -l (%d) should be greatter or equal to 0\n", 34 | $linelength; 35 | } 36 | 37 | # get the file list 38 | my @files = file_list_from_argv(@ARGV); 39 | 40 | my $names = {}; 41 | for my $file (@files) { 42 | my $next_seq = FastaReader($file); 43 | while ( my $fa = &$next_seq() ) { 44 | my ( $header, $seq ) = @$fa; 45 | if ( exists $$names{$header} ) { 46 | $$names{$header}++; 47 | $header = "$header r$$names{$header}"; 48 | } 49 | else { 50 | $$names{$header} = 1; 51 | } 52 | 53 | if ( $linelength > 0 ) { 54 | print ">$header\n", format_seq( $seq, $linelength ); 55 | } 56 | else { 57 | print ">$header\n", $seq, "\n"; 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /sequence/fasta_reset_start_position_for_circular_genome.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | 4 | my $usage = < 8 | Example: 9 | 1. Set the 100th base as the new start position 10 | reset_start_position_for_circular_genome seq.fa 100 11 | 12 | Author: Wei Shen 13 | Change history: 14 | - 2014-04-30 rewrite. 15 | - 2011 first edition. 
# Break sequence $s into lines of at most $n characters.
#
# The lines are joined with "\n" and the result never ends with a newline:
# the caller appends the final one. (Previously a sequence whose length was
# an exact multiple of $n came back with a trailing newline, which produced
# a blank line inside the FASTA record.)
#
# An empty or undefined sequence yields ''. A width that is undefined or
# not positive returns the sequence unwrapped instead of dividing by zero.
sub format_seq($$) {
    my ( $s, $n ) = @_;
    return '' unless defined $s and length $s;
    return $s unless defined $n and $n > 0;

    # "(a$n)*" cuts the string into $n-character chunks; the last chunk
    # simply holds whatever is left
    return join "\n", unpack( "(a$n)*", $s );
}
); 12 | my $tmpfile_prefix = "fasta_trim_aligned_fasta_tmpfile_"; 13 | 14 | my $usage = < \$$para{help}, 37 | 'gaps|g=s' => \$$para{gaps}, 38 | 'linelength|l=i' => \$$para{linelength}, 39 | ) or die $usage; 40 | 41 | die $usage if $$para{help}; 42 | 43 | # gap symbols 44 | my %GAPSMAP = (); 45 | if ( $$para{gaps} ) { 46 | @GAPS = split //, $$para{gaps}; 47 | } 48 | $GAPSMAP{$_} = 1 for @GAPS; 49 | 50 | my $use_stdin = 0; 51 | my ( $tmp_file_fh, $tmp_file ) = (undef) x 2; 52 | 53 | my @files = (); 54 | for my $file (@ARGV) { 55 | for my $f ( glob $file ) { 56 | push @files, $f; 57 | } 58 | } 59 | if ( @files == 0 ) { 60 | push @files, 'STDIN'; 61 | ( $tmp_file_fh, $tmp_file ) 62 | = tempfile( $tmpfile_prefix . "XXXXXX", DIR => ".", SUFFIX => '.fa' ); 63 | 64 | $use_stdin = 1; 65 | } 66 | 67 | print STDERR "sequences from STDIN is saved in $tmp_file\n" if $use_stdin; 68 | print STDERR "check...\n"; 69 | 70 | my $gaploc = {}; # store the gap location 71 | my $do_once = 1; 72 | my ( $header, $seq, $len, $i, $base ) = (undef) x 5; 73 | my ( $sum, $n ) = (0) x 2; 74 | for my $file (@files) { 75 | my $next_seq = FastaReader($file); 76 | while ( my $fa = &$next_seq() ) { 77 | ( $header, $seq ) = @$fa; 78 | $sum++; 79 | print STDERR "\rcount: $sum"; 80 | if ($do_once) { 81 | $len = length $seq; 82 | $$gaploc{$_} = 1 for 0 .. ( $len - 1 ); 83 | $do_once = 0; 84 | } 85 | 86 | for $i ( 0 .. ( $len - 1 ) ) { 87 | $base = substr $seq, $i, 1; 88 | if ( $GAPSMAP{$base} != 1 ) { # it's not a gap! 
# Remove the temporary file that holds the sequences read from STDIN.
#
# Only this run's own file ($tmp_file, created by tempfile() above) is
# deleted. Globbing "$tmpfile_prefix*" would also delete the temporary files
# of other instances of this script running in the same directory.
# Dies if the file exists but cannot be removed.
sub remove_tmpfile {
    print STDERR "\nremove temporary files\n";
    return unless defined $tmp_file and -e $tmp_file;
    unlink $tmp_file or die "fail to remove $tmp_file\n";
}
15 | https://github.com/shenwei356/bio_scripts 16 | 17 | ); 18 | 19 | my $args = {}; 20 | GetOptions( 21 | 'help|h' => \$$args{help}, 22 | 23 | 'split|s' => \$$args{split}, 24 | ) or die $usage; 25 | die $usage if $$args{help}; 26 | 27 | my $line = ''; 28 | while ( my $record = <> ) { 29 | chomp($record); 30 | $record =~ s/^\@//; 31 | 32 | if ( $$args{split} ) { 33 | $record = ( split / /, $record )[0]; 34 | } 35 | 36 | for ( 1 .. 3 ) { 37 | $line = <>; 38 | chomp($line); 39 | $record .= "\t$line"; 40 | } 41 | 42 | print "$record\n"; 43 | } 44 | -------------------------------------------------------------------------------- /sequence/fastq_extract_paired_reads.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # make sure the reads in the two fastq files has same order! 4 | use strict; 5 | use Parallel::Runner; 6 | use File::Basename; 7 | 8 | $0 = basename($0); 9 | die "usage: $0 \n" 10 | unless @ARGV == 2; 11 | 12 | my $fqfile1 = shift @ARGV; 13 | my $fqfile2 = shift @ARGV; 14 | 15 | # =========================================================== 16 | 17 | print "read $fqfile1\n"; 18 | my $headers1 = get_headers($fqfile1); 19 | 20 | print "read $fqfile2\n"; 21 | my $headers2 = get_headers($fqfile2); 22 | 23 | # =========================================================== 24 | 25 | print "find common IDs: "; 26 | my $headers = {}; 27 | for my $header ( keys %$headers1 ) { 28 | next unless exists $$headers2{$header}; 29 | $$headers{$header} = 1; 30 | } 31 | my $n = keys %$headers; 32 | print "$n\n"; 33 | 34 | die "sadly, no paired reads found\n" if $n == 0; 35 | 36 | # =========================================================== 37 | 38 | my $runner = Parallel::Runner->new(2); 39 | 40 | print "extract $fqfile1\n"; 41 | $runner->run( sub { extract( $headers, $fqfile1 ); } ); 42 | 43 | print "extract $fqfile2\n"; 44 | $runner->run( sub { extract( $headers, $fqfile2 ); } ); 45 | 46 | $runner->finish; 47 | 
# ===========================================================

# Reduce a FASTQ header (without the leading '@') to the read ID shared by
# both mates: keep the text before the first space and drop a trailing
# "/<digit>" mate suffix.
sub read_id {
    my ($head) = @_;
    my $id = ( split / /, $head )[0];
    $id = $1 if $id =~ /(.+)\/\d$/;
    return $id;
}

# Write the reads of $fqfile whose ID is a key of %$headers to
# "<name without .fq/.fastq>.pe.fq". Dies if the output file cannot be
# opened or completely written.
sub extract {
    my ( $headers, $fqfile ) = @_;

    my $fqfileout = $fqfile;
    $fqfileout =~ s/\.(fq|fastq)$//i;
    $fqfileout .= ".pe.fq";
    open my $fh, ">", $fqfileout or die "fail to wrtie file: $fqfileout\n";

    my $next_seq = FastqReader($fqfile);
    while ( my $fq = &$next_seq() ) {
        my ( $head, $seq, $qual ) = @$fq;
        next unless exists $$headers{ read_id($head) };
        print $fh "\@$head\n$seq\n+\n$qual\n";
    }

    # buffered write errors (e.g. disk full) only show up on close
    close $fh or die "fail to close file: $fqfileout\n";
}

# Return a hash reference whose keys are the read IDs found in $fqfile.
sub get_headers {
    my ($fqfile) = @_;
    my $headers = {};

    my $next_seq = FastqReader($fqfile);
    while ( my $fq = &$next_seq() ) {
        $$headers{ read_id( $$fq[0] ) } = '1';
    }

    return $headers;
}

# Return an iterator (closure) over the records of a four-line FASTQ file.
#
# $file may be the string 'STDIN', a file name, a reference to an in-memory
# string, or an already opened file handle. Each call of the iterator
# returns [ header without '@', sequence, quality ], or nothing once the
# input is exhausted. Dies with "bad fq file" if a record does not start
# with '@' or is truncated.
sub FastqReader {
    my ($file) = @_;

    my ( $open_flg, $finished ) = ( 0, 0 );
    my ( $fh, $head, $seq, $qual ) = (undef) x 4;

    if ( $file =~ /^STDIN$/i ) {    # from stdin
        $fh = *STDIN;
    }
    elsif ( ref $file eq '' or ref $file eq 'SCALAR' ) {    # from file
        open $fh, '<', $file or die "fail to open file: $file!\n";
        $open_flg = 1;
    }
    else {    # glob, i.e. given file handler
        $fh = $file;
    }

    return sub {
        return if $finished;

        # read into a lexical: "while (<$fh>)" would overwrite the
        # caller's $_
        if ( defined( $head = <$fh> ) ) {
            die "bad fq file\n" if substr( $head, 0, 1 ) ne '@';

            $seq = <$fh>;
            my $separator = <$fh>;    # the '+' line, content not needed
            $qual = <$fh>;

            # a record cut short at end of file is an error, not a read
            # with undefined sequence or quality
            die "bad fq file\n"
                unless defined $seq
                and defined $separator
                and defined $qual;

            s/\r?\n$// for ( $head, $seq, $qual );
            substr( $head, 0, 1, '' );    # drop the leading '@'

            return [ $head, $seq, $qual ];
        }

        close $fh if $open_flg;
        $finished = 1;
        return;
    };
}
def seq_iter(file):
    """Yield the sequences to build the motif from.

    Args:
        file: path of a FASTA/FASTQ file, optionally gzip-compressed; the
            format is taken from the file name suffix
            (``fasta``/``fa``/``fastq``/``fq``, optionally followed by
            ``.gz``, case-insensitive).  If empty or None, sequences are
            read from stdin, one per line.

    Yields:
        ``Bio.Seq.Seq`` objects.

    Exits with status 1 (after a message on stderr) when the suffix of
    *file* is not recognised.
    """
    if not file:
        for line in sys.stdin:
            line = line.strip()
            if line:  # a blank (e.g. trailing) line is not a sequence
                yield Seq(line)
        return

    # The dot of ".gz" is escaped so that only a real ".gz" suffix selects
    # the gzip reader.
    found = re.search(r'(?i)(fasta|fa|fastq|fq)(\.gz)?$', file)
    if not found:
        sys.stderr.write("invalid file name suffix.\nfile name should like this: infile.[fasfa|fa|fastq|fq][.gz]\n")
        sys.exit(1)

    suffix, is_gz = found.groups()
    # The suffix is matched case-insensitively, so normalise it before
    # mapping it to the format name SeqIO expects ("x.FA" -> "fasta").
    suffix = suffix.lower()
    seq_format = {'fa': 'fasta', 'fq': 'fastq'}.get(suffix, suffix)

    # "with" closes the file even if the consumer abandons the generator early.
    with (gzip.open(file, 'rt') if is_gz else open(file, 'r')) as fh:
        for record in SeqIO.parse(fh, seq_format):
            yield record.seq
def parse_args(argv=None):
    """Parse and validate the command line of fastx_translate.

    Args:
        argv: list of argument strings to parse; ``None`` (the default)
            parses ``sys.argv[1:]``, as before.

    Returns:
        The ``argparse.Namespace`` with ``verbose``, ``stdin``, ``infile``,
        ``format`` and ``table``.

    Exits with status 1 (after a message on stderr) when neither ``--stdin``
    nor ``-i`` is given, when ``-f`` is not ``fasta`` or ``fastq``, or when
    ``--stdin`` is used without ``-f``.
    """
    parser = argparse.ArgumentParser(description="Translate DNA to peptide")

    parser.add_argument("-v", "--verbose", help='verbosely print information',
                        action="count", default=0)

    group = parser.add_mutually_exclusive_group()
    # stdin is parsed as FASTA/FASTQ by the caller, not line by line
    group.add_argument("--stdin", action="store_true",
                       help='read sequences from stdin (option -f is required)')
    group.add_argument('-i', '--infile', type=str,
                       help='file name should like this: infile.[fasta|fa|fastq|fq][.gz]')
    # no default: without -f the format is guessed from the file name
    parser.add_argument('-f', '--format', type=str,
                        help='sequence format: fasta | fastq [guessed from the suffix of infile]')
    parser.add_argument('-t', '--table', type=int, default=1,
                        help='genetic code table (detail: http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi ) [1]')

    args = parser.parse_args(argv)
    if not (args.stdin or args.infile):
        sys.stderr.write("option --stdin or -i should be given\n")
        sys.exit(1)
    if args.format and args.format not in ('fasta', 'fastq'):
        sys.stderr.write("option -f | --format should be 'fasta' or 'fastq'\n")
        sys.exit(1)
    if args.stdin and not args.format:
        sys.stderr.write("option -f | --format should be given when --stdin is set.\n")
        sys.exit(1)

    return args
11 | 12 | https://github.com/shenwei356/bio_scripts 13 | 14 | USAGE 15 | 16 | die $usage unless @ARGV >= 2; 17 | 18 | my $threads = shift @ARGV; 19 | 20 | for my $file (@ARGV) { 21 | my $fileout = "$file.align.fa"; 22 | my $cmd = "clustalo -i $file -o $fileout --force --outfmt fasta --threads=$threads"; 23 | my $fail = run($cmd); 24 | die "failed to run:$cmd\n" if $fail; 25 | } 26 | -------------------------------------------------------------------------------- /sequence/sample/gc_skew.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/bio_scripts/703cec8d21903516346e2aae4d77d23385c30905/sequence/sample/gc_skew.png -------------------------------------------------------------------------------- /sequence/sample/seq.fa: -------------------------------------------------------------------------------- 1 | >1234 gene=0001 2 | actgatcat-gtagagag 3 | tagatcagagtc 4 | >seq2 5 | atcgatcgaa 6 | >seq3 7 | atcgatcgaa 8 | >123 gene=00011 9 | acccccctct-ttcgg-tatgct-gata-tgatgatgtacg 10 | -tatgct-gata-tgatgtac 11 | acccccctct-ttcgg-tatgct-tgatgtac 12 | acccccctct-ttcgg-tatgct-tgatgtac 13 | acccccctct-ttcgg-tatgct-tgatgtac 14 | acccccctct-ttcgg-tatgct-tgatgtac 15 | acccccctct-ttcgg-tatgct-tgatgtac 16 | acccccctct-ttcgg-tatgct-tgatgtac 17 | acccccctct-ttcgg-tatgct-tgatgtac 18 | acccccctct-ttcgg-tatgct-tgatgtac 19 | acccccctct-ttcgg-tatgct- 20 | acccccctct-ttcgg-tatgct-gata-tgatgatgtacg 21 | -------------------------------------------------------------------------------- /sequence/sample/seq.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shenwei356/bio_scripts/703cec8d21903516346e2aae4d77d23385c30905/sequence/sample/seq.fq.gz -------------------------------------------------------------------------------- /sequence/seqcomp: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 
#!/usr/bin/env perl
# https://github.com/shenwei356/bio_scripts
#
# tab2fasta - convert a tab-delimited table into FASTA.
#
# Every non-blank, non-comment input line is split on TAB: the first column
# becomes the FASTA header, the second the sequence; further columns are
# ignored. The literal token "__tab__" inside a header is turned back into a
# real TAB character.

use strict;
use Getopt::Long;
use BioUtil::Util;
use BioUtil::Seq;

# An interpolating here-doc is used on purpose: with q(...) the "$0" in the
# usage line was printed literally instead of the script name.
my $usage = <<"USAGE";

tab2fasta - transform column table to fasta format

Usage: $0 [options] [tablefile...]

Options:

   -l, --linelength   Output line length
   -h, --help         Show this help information

This script is usually used in pair with fasta2tab.
https://github.com/shenwei356/bio_scripts

USAGE

my $para = {};
GetOptions(
    'help|h'         => \$$para{help},
    'linelength|l=i' => \$$para{linelength},
) or die $usage;

die $usage if $$para{help};

# "=i" accepts negative integers; reject them here instead of handing them
# to format_seq. A value of 0 keeps its old meaning: no line wrapping.
die "line length should not be negative: $$para{linelength}\n"
    if defined $$para{linelength} and $$para{linelength} < 0;

# NOTE(review): file_list_from_argv comes from BioUtil::Util; the loop below
# expects it to yield the literal name 'STDIN' for standard input - confirm.
my @files = file_list_from_argv(@ARGV);

for my $file (@files) {
    my $fh;
    my $is_stdin = $file eq 'STDIN' ? 1 : 0;

    if ($is_stdin) {
        $fh = *STDIN;
    }
    else {
        open $fh, "<", $file
            or die "fail to open file: $file\n";
    }

    while (<$fh>) {
        s/\r?\n//g;         # drop the line terminator (LF or CRLF)
        s/^\s+|\s+$//g;     # trim surrounding whitespace
        next if $_ eq ''    # blank line
            or /^#/;        # annotation

        # first column as header, and second column as sequence,
        # ignore others
        my ( $header, $seq ) = split /\t/, $_;

        # a row without a sequence column yields an empty sequence
        # rather than an undefined value
        $seq = '' unless defined $seq;

        $header =~ s/__tab__/\t/g;

        if ( $$para{linelength} ) {
            print ">$header\n", format_seq( $seq, $$para{linelength} );
        }
        else {
            print ">$header\n$seq\n";
        }
    }

    close $fh unless $is_stdin;
}
# ================[ command-line options ]==================
# Default cache location: <home directory>/.taxon/taxon_map.pickle
default_cache_path = os.path.join(
    os.path.expanduser("~"), '.taxon', 'taxon_map.pickle')

parser = argparse.ArgumentParser(
    epilog="https://github.com/shenwei356/bio_scripts/",
    description=
    "fetch taxon information by species name or taxid. Cache used to avoid repeatly search")

# One (names, settings) entry per argument, registered in this order so the
# generated help lists them exactly as before.
option_table = [
    (('infile', ),
     dict(help='species name/taxid list')),
    (('-n', '--by-name'),
     dict(action='store_true', help='search by species name')),
    (('-t', '--threads'),
     dict(type=int, default=4, help='threads number, default:4')),
    (('-c', '--cache-file'),
     dict(type=str,
          default=default_cache_path,
          help='taxon_map cache file, default: {}'.format(
              default_cache_path))),
    (('-d', '--delete-cache-file'),
     dict(action='store_true', help='delete cache file')),
]
for option_names, option_settings in option_table:
    parser.add_argument(*option_names, **option_settings)

args = parser.parse_args()
# ================[ fetching method ]==================
def get_tax_id(species):
    """Return the first taxonomy id that Entrez.esearch reports for a name.

    Args:
        species: species name; surrounding whitespace is ignored and inner
            spaces are sent as '+'.

    Returns:
        The first element of the 'IdList' field of the search result.

    Raises:
        IndexError: if the search returned no id at all.
    """
    # strip first: stripping after the replacement was a no-op, because the
    # surrounding spaces had already been turned into '+'
    term = species.strip().replace(" ", "+")

    search = Entrez.esearch(term=term, db="taxonomy", retmode="xml")
    try:
        record = Entrez.read(search)
    finally:
        search.close()

    id_list = record['IdList']
    if not id_list:
        raise IndexError(
            'no taxonomy record found for species: {}'.format(species))
    return id_list[0]


# ================[ fetching method ]==================
def get_tax_data(taxid):
    """Fetch the taxonomy record of a taxid.

    Args:
        taxid: taxonomy id as a string of digits.

    Returns:
        A tuple (data, data_xml): the parsed records as a list, and the raw
        XML text they were parsed from (the text is what gets cached).

    If taxid is not purely numeric the run is aborted with exit status 1.
    Before exiting, the in-memory cache is written to the cache file and the
    flag file is removed: the cache file was opened for writing (and thereby
    emptied) at start-up, so removing the flag without dumping would leave
    an empty pickle that the next run would try to load.
    """
    if not re.search(r'^\d+$', taxid):
        sys.stderr.write(
            '[ERROR] do you use species name as query? you may use flag: -n\n')
        pickle.dump(cache, cache_fh, -1)
        cache_fh.close()
        if os.path.exists(flag_file):
            os.unlink(flag_file)
        sys.exit(1)  # a usage error must not report success

    search = Entrez.efetch(id=taxid, db="taxonomy", retmode="xml")

    # return Entrez.read(search)  # if not using pickle, this is enough

    # keep the xml text as well: it is what can be pickled safely
    try:
        data_xml = search.read()
    finally:
        search.close()
    return list(Entrez.parse(StringIO(data_xml))), data_xml


# ================[ fetching and outputing ]==================
def fetch_taxon(query):
    """Resolve one query and print one tab-separated line for it.

    The query is looked up in the module-level cache first; otherwise it is
    fetched through Entrez and the XML is stored in the cache under the
    query itself and under its counterpart (the taxid when searching by
    name, the scientific name when searching by taxid).

    Printed columns:
        by name:  taxid, query, division, common name, lineage
        by taxid: query, scientific name, division, common name, lineage

    A query for which no record is found is reported on stderr and skipped,
    so that one bad entry does not abort the remaining queries.

    Args:
        query: species name (with --by-name) or taxid.
    """
    if query in cache:
        sys.stderr.write('[INFO] cached query: {}\n'.format(query))

        taxon_data_xml = cache[query]
        data = list(Entrez.parse(StringIO(taxon_data_xml)))
    else:
        sys.stderr.write('[INFO] new query: {}\n'.format(query))

        if args.by_name:
            try:
                taxid = get_tax_id(query)
            except IndexError:
                sys.stderr.write(
                    '[WARN] no taxonomy record found for: {}\n'.format(query))
                return
            data, taxon_data_xml = get_tax_data(taxid)
            counterpart = taxid
        else:
            data, taxon_data_xml = get_tax_data(query)
            counterpart = data[0]['ScientificName'] if data else None

        if data:
            # save xml for pickle, under both the query and its counterpart
            cache[counterpart] = taxon_data_xml
            cache[query] = taxon_data_xml

    if not data:
        sys.stderr.write(
            '[WARN] no taxonomy record found for: {}\n'.format(query))
        return

    # output
    record = data[0]
    lineage = record['Lineage']
    division = record['Division']
    taxid = record['TaxId']

    CommonName = ''
    if 'OtherNames' in record and 'GenbankCommonName' in record[
            'OtherNames']:
        CommonName = record['OtherNames']['GenbankCommonName']

    ScientificName = record['ScientificName']

    if args.by_name:
        print('\t'.join([taxid, query, division, CommonName, lineage]))
    else:
        print('\t'.join([query, ScientificName, division, CommonName,
                         lineage]))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Extract zip archives whose member names are encoded in GBK.

Usage: unzipGBK ARCHIVE.zip [ARCHIVE.zip ...]

Members are extracted relative to the current directory. Files that
already exist are left untouched.
"""

import os
import sys
import zipfile


def decode_member_name(name, is_utf8=False):
    """Return a zip member name as text, reading legacy names as GBK.

    Args:
        name: member name as reported by zipfile (bytes or text).
        is_utf8: True when the entry carries the "UTF-8 file name" flag,
            in which case zipfile has already decoded the name correctly.
    """
    if isinstance(name, bytes):
        # Python 2 hands out the raw stored bytes for un-flagged names.
        return name.decode('gbk')
    if is_utf8:
        return name
    # Python 3 decodes un-flagged names as cp437; undo that to recover the
    # stored bytes, then read them as GBK.
    try:
        return name.encode('cp437').decode('gbk')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return name


def unzip_gbk(zip_path, dest_dir=''):
    """Extract every member of zip_path below dest_dir.

    Args:
        zip_path: path of the zip archive.
        dest_dir: directory to extract into; '' means the current directory.

    NOTE(review): member names are used as given, so an archive containing
    absolute or '..' paths can write outside dest_dir - only use on trusted
    archives.
    """
    print("Processing File " + zip_path)

    archive = zipfile.ZipFile(zip_path, "r")
    try:
        for info in archive.infolist():
            member = decode_member_name(info.filename,
                                        bool(info.flag_bits & 0x800))
            print("Extracting " + member)

            target = os.path.join(dest_dir, member)
            parent = os.path.dirname(target)
            if parent != "" and not os.path.exists(parent):
                os.makedirs(parent)

            # Never overwrite. This also skips directory entries ("dir/"),
            # whose directory has just been created above.
            if os.path.exists(target):
                continue

            # binary mode: member data is bytes and must be written verbatim
            with open(target, "wb") as out:
                out.write(archive.read(info))
    finally:
        archive.close()


def main(argv):
    """Extract every archive named in argv."""
    for zip_path in argv:
        unzip_gbk(zip_path)


if __name__ == "__main__":
    main(sys.argv[1:])