├── .gitignore
├── LICENSE
├── README.md
├── _readme_maker
    ├── README.md
    ├── README.md.template
    ├── history.txt
    └── make_readme
├── archives
    ├── .content
    ├── archive.tar.gz
    └── archive.zip
├── files
    ├── .content
    ├── Green_Ok_Icon.png
    ├── R-package-edgeR.pdf
    ├── Sample.pdf
    ├── cars.csv
    ├── edgeR.url
    ├── gutenberg-freq.png
    ├── gutenberg-plot.png
    ├── introduction.txt
    ├── origin.txt
    ├── putty.png
    ├── roadrunner.gif
    ├── terminal.png
    ├── tree.gif
    ├── wine.csv
    └── wine.tsv
├── misc
    ├── .content
    ├── README.md
    ├── ecoli
    │   ├── ecoli.genes.fa.gz
    │   ├── ecoli.genome.fa.gz
    │   ├── ecoli.gff3.gz
    │   └── ecoli.proteins.fa.gz
    ├── genbank
    │   └── E.coli.genbank.gz
    └── test
    │   ├── README.txt
    │   ├── cat.jpg
    │   ├── data.txt
    │   ├── dna.fa
    │   ├── dog.jpg
    │   ├── excel_data.csv
    │   ├── goldfish.jpg
    │   ├── motifs.fa
    │   ├── oligos.txt
    │   ├── proteins.fa
    │   ├── song1.mp3
    │   ├── song2.mp3
    │   └── todo.txt
├── phage
    ├── .content
    ├── GCF_000840245.1_ViralProj14204
    ├── README.txt
    ├── annotation_hashes.txt
    ├── assembly_status.txt
    ├── md5checksums.txt
    ├── reads
    │   ├── sample1_R1.fastq.gz
    │   ├── sample1_R2.fastq.gz
    │   ├── sample2_R1.fastq.gz
    │   ├── sample2_R2.fastq.gz
    │   ├── sample3_R1.fastq.gz
    │   ├── sample3_R2.fastq.gz
    │   ├── sample4_R1.fastq.gz
    │   └── sample4_R2.fastq.gz
    ├── vir_assembly_report.txt
    ├── vir_assembly_stats.txt
    ├── vir_cds_from_genomic.fna
    ├── vir_feature_count.txt
    ├── vir_feature_table.txt
    ├── vir_genomic.fna
    ├── vir_genomic.gbff
    ├── vir_genomic.gff
    ├── vir_protein.faa
    ├── vir_protein.gpff
    ├── vir_rna_from_genomic.fna
    └── vir_translated_cds.faa
└── scripts
    ├── N50.pl
    ├── fasta_translate.py
    ├── gutenwords-plotzipf.py
    ├── gutenwords-topandplot.py
    ├── gutenwords_0.py
    ├── gutenwords_1.py
    ├── gutenwords_2.py
    ├── linkweb.sh
    ├── prodigal2vcontact.py
    ├── remap.py
    └── start_denovo.sh


/.gitignore:
--------------------------------------------------------------------------------
1 | *.fna.amb
2 | *.fna.ann
3 | *.fna.bwt
4 | *.fna.fai
5 | *.fna.pac
6 | *.fna.sa
7 | changeContigCoordBam.py
8 | phantarun.py


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 
2 | To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter to Creative Commons, 
3 | PO Box 1866, Mountain View, CA 94042, USA.
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Bioinformatics and the Linux Terminal
  2 | 
  3 | A list of example files to start using the Linux Command Line interface (CLI). 
  4 | 
  5 | ### Resources using this repository
  6 | 
  7 | - :book: [**Bash for bioinformatics course**](https://telatin.github.io/microbiome-bioinformatics/Bash-course/)
  8 | - :star: [**Intro to Bash scripting**](https://telatin.github.io/microbiome-bioinformatics/Bash-tutorial-1/)
  9 | 
 10 | ## :book: Bash tutorial
 11 | 
 12 | See the [Wiki](https://github.com/telatin/learn_bash/wiki) in this repository.
 13 | 
 14 | 
 15 | ## :calendar: History
 16 | 
 17 | | Date       | Description      | 
 18 | |------------|------------------|
 19 | | 2022-10-01 |  Cleanup and updates (removed legacy files, added scripts) |
 20 | | 2019-12-04 |  Added FASTQ files, Updated README maker with links |
 21 | | 2019-12-03 |  Refactoring the companion website (now using [github wiki](https://github.com/telatin/learn_bash/wiki)). |
 22 | | 2019-02-20 |  Readme file generator added (_readme_maker/make_readme) | 
 23 | | 2018-10-20 |  Added a "**misc**" subdirectory with examples taken from another training set| 
 24 | | 2018-03-05 |  Package updated, "files" directory added |
 25 | | 2018-02-29 |  First release|
 26 | 
 27 | 
 28 | 
 29 | ## :briefcase: Files in the repository
 30 | 
 31 | 
 32 | 
 33 | 
 34 | ### 📁 [misc](https://github.com/telatin/learn_bash/tree/master/misc) 7.4M
 35 | 
 36 | This is an extra subdirectory; see its own README file for details.
 37 | 
 38 |  -  [README.md](https://github.com/telatin/learn_bash/blob/master/misc/README.md)
 39 |  -  📁  [ecoli](https://github.com/telatin/learn_bash/blob/master/misc/ecoli)
 40 |  -  📁  [genbank](https://github.com/telatin/learn_bash/blob/master/misc/genbank)
 41 |  -  📁  [test](https://github.com/telatin/learn_bash/blob/master/misc/test)
 42 | 
 43 | 
 44 | ### 📁 [archives](https://github.com/telatin/learn_bash/tree/master/archives)  16K
 45 | 
 46 | Archives to test decompression tools
 47 | 
 48 |  -  [archive.tar.gz](https://github.com/telatin/learn_bash/blob/master/archives/archive.tar.gz)
 49 |  -  [archive.zip](https://github.com/telatin/learn_bash/blob/master/archives/archive.zip)
 50 | 
 51 | 
 52 | ### 📁 [scripts](https://github.com/telatin/learn_bash/tree/master/scripts)  72K
 53 |  -  [N50.pl](https://github.com/telatin/learn_bash/blob/master/scripts/N50.pl)
 54 |  -  [changeContigCoordBam.py](https://github.com/telatin/learn_bash/blob/master/scripts/changeContigCoordBam.py)
 55 |  -  [fasta_translate.py](https://github.com/telatin/learn_bash/blob/master/scripts/fasta_translate.py)
 56 |  -  [gutenwords-plotzipf.py](https://github.com/telatin/learn_bash/blob/master/scripts/gutenwords-plotzipf.py)
 57 |  -  [gutenwords-topandplot.py](https://github.com/telatin/learn_bash/blob/master/scripts/gutenwords-topandplot.py)
 58 |  -  [gutenwords_0.py](https://github.com/telatin/learn_bash/blob/master/scripts/gutenwords_0.py)
 59 |  -  [gutenwords_1.py](https://github.com/telatin/learn_bash/blob/master/scripts/gutenwords_1.py)
 60 |  -  [gutenwords_2.py](https://github.com/telatin/learn_bash/blob/master/scripts/gutenwords_2.py)
 61 |  -  [linkweb.sh](https://github.com/telatin/learn_bash/blob/master/scripts/linkweb.sh)
 62 |  -  [phantarun.py](https://github.com/telatin/learn_bash/blob/master/scripts/phantarun.py)
 63 |  -  [prodigal2vcontact.py](https://github.com/telatin/learn_bash/blob/master/scripts/prodigal2vcontact.py)
 64 |  -  [remap.py](https://github.com/telatin/learn_bash/blob/master/scripts/remap.py)
 65 |  -  [start_denovo.sh](https://github.com/telatin/learn_bash/blob/master/scripts/start_denovo.sh)
 66 | 
 67 | 
 68 | ### 📁 [phage](https://github.com/telatin/learn_bash/tree/master/phage) 4.5M
 69 | 
 70 | A set of files to test the parsing of bioinformatics formats, mostly related to the PhiX phage.
 71 | 
 72 |  -  [GCF_000840245.1_ViralProj14204](https://github.com/telatin/learn_bash/blob/master/phage/GCF_000840245.1_ViralProj14204)
 73 |  -  [README.txt](https://github.com/telatin/learn_bash/blob/master/phage/README.txt)
 74 |  -  [annotation_hashes.txt](https://github.com/telatin/learn_bash/blob/master/phage/annotation_hashes.txt)
 75 |  -  [assembly_status.txt](https://github.com/telatin/learn_bash/blob/master/phage/assembly_status.txt)
 76 |  -  [md5checksums.txt](https://github.com/telatin/learn_bash/blob/master/phage/md5checksums.txt)
 77 |  -  [vir_assembly_report.txt](https://github.com/telatin/learn_bash/blob/master/phage/vir_assembly_report.txt)
 78 |  -  [vir_assembly_stats.txt](https://github.com/telatin/learn_bash/blob/master/phage/vir_assembly_stats.txt)
 79 |  -  [vir_cds_from_genomic.fna](https://github.com/telatin/learn_bash/blob/master/phage/vir_cds_from_genomic.fna)
 80 |  -  [vir_feature_count.txt](https://github.com/telatin/learn_bash/blob/master/phage/vir_feature_count.txt)
 81 |  -  [vir_feature_table.txt](https://github.com/telatin/learn_bash/blob/master/phage/vir_feature_table.txt)
 82 |  -  [vir_genomic.fna](https://github.com/telatin/learn_bash/blob/master/phage/vir_genomic.fna)
 83 |  -  [vir_genomic.gbff](https://github.com/telatin/learn_bash/blob/master/phage/vir_genomic.gbff)
 84 |  -  [vir_genomic.gff](https://github.com/telatin/learn_bash/blob/master/phage/vir_genomic.gff)
 85 |  -  [vir_protein.faa](https://github.com/telatin/learn_bash/blob/master/phage/vir_protein.faa)
 86 |  -  [vir_protein.gpff](https://github.com/telatin/learn_bash/blob/master/phage/vir_protein.gpff)
 87 |  -  [vir_rna_from_genomic.fna](https://github.com/telatin/learn_bash/blob/master/phage/vir_rna_from_genomic.fna)
 88 |  -  [vir_translated_cds.faa](https://github.com/telatin/learn_bash/blob/master/phage/vir_translated_cds.faa)
 89 |  -  📁  [reads](https://github.com/telatin/learn_bash/blob/master/phage/reads)
 90 | 
 91 | 
 92 | ### 📁 [files](https://github.com/telatin/learn_bash/tree/master/files) 2.4M
 93 | 
 94 | Common file formats, both binary (e.g. PNG image) and text files (e.g. CSV). A PDF document is included to see how `less` can also handle them.
 95 | 
 96 |  -  [Green_Ok_Icon.png](https://github.com/telatin/learn_bash/blob/master/files/Green_Ok_Icon.png)
 97 |  -  [R-package-edgeR.pdf](https://github.com/telatin/learn_bash/blob/master/files/R-package-edgeR.pdf)
 98 |  -  [Sample.pdf](https://github.com/telatin/learn_bash/blob/master/files/Sample.pdf)
 99 |  -  [cars.csv](https://github.com/telatin/learn_bash/blob/master/files/cars.csv)
100 |  -  [edgeR.url](https://github.com/telatin/learn_bash/blob/master/files/edgeR.url)
101 |  -  [gutenberg-freq.png](https://github.com/telatin/learn_bash/blob/master/files/gutenberg-freq.png)
102 |  -  [gutenberg-plot.png](https://github.com/telatin/learn_bash/blob/master/files/gutenberg-plot.png)
103 |  -  [introduction.txt](https://github.com/telatin/learn_bash/blob/master/files/introduction.txt)
104 |  -  [putty.png](https://github.com/telatin/learn_bash/blob/master/files/putty.png)
105 |  -  [roadrunner.gif](https://github.com/telatin/learn_bash/blob/master/files/roadrunner.gif)
106 |  -  [terminal.png](https://github.com/telatin/learn_bash/blob/master/files/terminal.png)
107 |  -  [tree.gif](https://github.com/telatin/learn_bash/blob/master/files/tree.gif)
108 |  -  [wine.csv](https://github.com/telatin/learn_bash/blob/master/files/wine.csv)
109 |  -  [wine.tsv](https://github.com/telatin/learn_bash/blob/master/files/wine.tsv)
110 | 
111 | 


--------------------------------------------------------------------------------
/_readme_maker/README.md:
--------------------------------------------------------------------------------
1 | # Readme maker
2 | 
3 | The script `make_readme` will:
4 | * Use `README.md.template` as a template
5 | * Slurp the content from `history.txt` (markdown) and put it where the `{history.txt}` placeholder is. 
6 | * Replace `{content}` with a list of directories and their content as processed by the script, in order to add links and directory size
7 | * Automatically save the output in `../README.md`
8 | 


--------------------------------------------------------------------------------
/_readme_maker/README.md.template:
--------------------------------------------------------------------------------
 1 | # Bioinformatics and the Linux Terminal
 2 | 
 3 | A list of example files to start using the Linux Command Line interface (CLI). 
 4 | Originally prepared for [Bash for Bioinformatics](https://seq.space/notes/doku.php?id=bash-beginners) training
 5 | 
 6 | ## :book: Bash tutorial
 7 | 
 8 | See the [Wiki](https://github.com/telatin/learn_bash/wiki) in this repository.
 9 | 
10 | 
11 | ## :calendar: History
12 | 
13 | {history.txt}
14 | 
15 | ## :briefcase: Files in the repository
16 | 
17 | {content}
18 | 


--------------------------------------------------------------------------------
/_readme_maker/history.txt:
--------------------------------------------------------------------------------
 1 | | Date       | Description      | 
 2 | |------------|------------------|
 3 | | 2022-10-01 |  Cleanup and updates (removed legacy files, added scripts) |
 4 | | 2019-12-04 |  Added FASTQ files, Updated README maker with links |
 5 | | 2019-12-03 |  Refactoring the companion website (now using [github wiki](https://github.com/telatin/learn_bash/wiki)). |
 6 | | 2019-02-20 |  Readme file generator added (_readme_maker/make_readme) | 
 7 | | 2018-10-20 |  Added a "**misc**" subdirectory with examples taken from another training set| 
 8 | | 2018-03-05 |  Package updated, "files" directory added |
 9 | | 2018-02-29 |  First release|
10 | 
11 | 


--------------------------------------------------------------------------------
/_readme_maker/make_readme:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | use utf8;
 3 | 
 4 | # Generates "../README.md" (relative to this script) from README.md.template:
 5 | #  * {history.txt} is replaced with the content of history.txt (markdown)
 6 | #  * {content} is replaced with a linked listing of each top-level subdir
 7 | # Each subdir can optionally have a ".content" file to describe the use of the files
 8 | use v5.16;
 9 | use warnings;
10 | use File::Basename;
11 | use File::Slurp;
12 | use autodie;
13 | 
14 | my $script_directory = dirname($0);
15 | my $rep_directory    = "${script_directory}/..";
16 | my $base_url         = 'https://github.com/telatin/learn_bash';
17 | my $template = read_file("${script_directory}/README.md.template");
18 | my $history  = read_file("${script_directory}/history.txt");
19 | my $content  = '';
20 | 
21 | say STDERR "Loaded template: ", length($template), ' chars';
22 | say STDERR "Loaded history: ", length($history), ' chars';
23 | 
24 | # Get history from the 'history.txt' file,
25 | # that should be independently markdown formatted.
26 | # \Q...\E makes the placeholder match literally (braces and dot included).
27 | $template =~ s/\Q{history.txt}\E/$history/;
28 | 
29 | # Top-level directories only, skipping hidden ones and those starting with "_"
30 | opendir my $dir_content, $rep_directory;
31 | my @subdirs = grep { !/^[_.]/ and -d "$rep_directory/$_" } readdir $dir_content;
32 | closedir $dir_content;
33 | 
34 | # readdir order depends on the filesystem: sort for a reproducible README
35 | for my $subdir (sort @subdirs) {
36 |     say STDERR " * Parsing $subdir";
37 |     my $subdir_path = "$rep_directory/$subdir";
38 | 
39 |     # Some du implementations pad the size column, so strip all
40 |     # surrounding whitespace and not only the trailing newline
41 |     my $size = `du -hs "$subdir_path" | cut -f1` // '';
42 |     $size =~ s/^\s+|\s+$//g;
43 |     $content .= "\n\n\n### 📁 [$subdir]($base_url/tree/master/$subdir) $size\n";
44 | 
45 |     # Optional human-written description of the directory
46 |     if (-e "$subdir_path/.content") {
47 |         my $dir_description = read_file("$subdir_path/.content");
48 |         chomp($dir_description);
49 |         $content .= "\n$dir_description\n\n";
50 |     }
51 | 
52 |     opendir my $d, $subdir_path;
53 |     my @files = ();
54 |     while (my $f = readdir $d) {
55 |         # skip hidden files (this also drops the "." and ".." entries)
56 |         next if ($f =~ /^\./);
57 |         # Test the full path: a path relative to the current directory is
58 |         # only right when the script is launched from the repository root
59 |         my $dir = (-d "$subdir_path/$f") ? ' 📁 ' : '';
60 |         push(@files, " - $dir [$f]($base_url/blob/master/$subdir/$f)");
61 |     }
62 |     closedir $d;
63 |     $content .= join("\n", sort @files);
64 | }
65 | 
66 | $template =~ s/\Q{content}\E/$content/;
67 | 
68 | # Explicit close so that autodie reports write errors on the output file
69 | open my $output, '>', "$rep_directory/README.md";
70 | binmode $output, ":utf8";
71 | say {$output} $template;
72 | close $output;
73 | 
74 | 


--------------------------------------------------------------------------------
/archives/.content:
--------------------------------------------------------------------------------
1 | Archives to test decompression tools
2 | 


--------------------------------------------------------------------------------
/archives/archive.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/archives/archive.tar.gz


--------------------------------------------------------------------------------
/archives/archive.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/archives/archive.zip


--------------------------------------------------------------------------------
/files/.content:
--------------------------------------------------------------------------------
1 | Common file formats, both binary (e.g. PNG image) and text files (e.g. CSV). A PDF document is included to see how `less` can also handle them.
2 | 


--------------------------------------------------------------------------------
/files/Green_Ok_Icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/files/Green_Ok_Icon.png


--------------------------------------------------------------------------------
/files/R-package-edgeR.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/files/R-package-edgeR.pdf


--------------------------------------------------------------------------------
/files/Sample.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/files/Sample.pdf


--------------------------------------------------------------------------------
/files/cars.csv:
--------------------------------------------------------------------------------
 1 | model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
 2 | Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4
 3 | Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4
 4 | Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
 5 | Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
 6 | Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
 7 | Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1
 8 | Duster 360,14.3,8,360,245,3.21,3.57,15.84,0,0,3,4
 9 | Merc 240D,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2
10 | Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
11 | Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4
12 | Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4
13 | Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3
14 | Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3
15 | Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18,0,0,3,3
16 | Cadillac Fleetwood,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4
17 | Lincoln Continental,10.4,8,460,215,3,5.424,17.82,0,0,3,4
18 | Chrysler Imperial,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4
19 | Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1
20 | Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2
21 | Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1
22 | Toyota Corona,21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1
23 | Dodge Challenger,15.5,8,318,150,2.76,3.52,16.87,0,0,3,2
24 | AMC Javelin,15.2,8,304,150,3.15,3.435,17.3,0,0,3,2
25 | Camaro Z28,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4
26 | Pontiac Firebird,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2
27 | Fiat X1-9,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1
28 | Porsche 914-2,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2
29 | Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
30 | Ford Pantera L,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4
31 | Ferrari Dino,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6
32 | Maserati Bora,15,8,301,335,3.54,3.57,14.6,0,1,5,8
33 | Volvo 142E,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2
34 | 


--------------------------------------------------------------------------------
/files/edgeR.url:
--------------------------------------------------------------------------------
1 | https://www.bioconductor.org/packages/devel/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
2 | 


--------------------------------------------------------------------------------
/files/gutenberg-freq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/files/gutenberg-freq.png


--------------------------------------------------------------------------------
/files/gutenberg-plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/files/gutenberg-plot.png


--------------------------------------------------------------------------------
/files/introduction.txt:
--------------------------------------------------------------------------------
 1 | Introduction
 2 | WHEN on board H.M.S. Beagle as naturalist, I was much struck with certain facts in the distribution of the organic beings inhabiting South America, and in the geological relations of the present to the past inhabitants of that continent. These facts, as will be seen in the latter chapters of this volume, seemed to throw some light on the origin of species- that mystery of mysteries, as it has been called by one of our greatest philosophers. On my return home, it occurred to me, in 1837, that something might perhaps be made out on this question by patiently accumulating and reflecting on all sorts of facts which could possibly have any bearing on it. After five years' work I allowed myself to speculate on the subject, and drew up some short notes; these I enlarged in 1844 into a sketch of the conclusions, which then seemed to me probable: from that period to the present day I have steadily pursued the same object. I hope that I may be excused for entering on these personal details, as I give them to show that I have not been hasty in coming to a decision.
 3 | 
 4 | My work is now (1859) nearly finished; but as it will take me many more years to complete it, and as my health is far from strong, I have been urged to publish this abstract. I have more especially been induced to do this, as Mr. Wallace, who is now studying the natural history of the Malay Archipelago, has arrived at almost exactly the same general conclusions that I have on the origin of species. In 1858 he sent me a memoir on this subject, with a request that I would forward it to Sir Charles Lyell, who sent it to the Linnean Society, and it is published in the third volume of the Journal of that society. Sir C. Lyell and Dr. Hooker, who both knew of my work- the latter having read my sketch of 1844- honoured me by thinking it advisable to publish, with Mr. Wallace's excellent memoir, some brief extracts from my manuscripts.
 5 | 
 6 | This abstract, which I now publish, must necessarily be imperfect. I cannot here give references and authorities for my several statements; and I must trust to the reader reposing some confidence in my accuracy. No doubt errors will have crept in, though I hope I have always been cautious in trusting to good authorities alone. I can here give only the general conclusions at which I have arrived, with a few facts in illustration, but which, I hope, in most cases will suffice. No one can feel more sensible than I do of the necessity of hereafter publishing in detail all the facts, with references, on which my conclusions have been grounded; and I hope in a future work to do this. For I am well aware that scarcely a single point is discussed in this volume on which facts cannot be adduced, often apparently leading to conclusions directly opposite to those at which I have arrived. A fair result can be obtained only by fully stating and balancing the facts and arguments on both sides of each question; and this is here impossible.
 7 | 
 8 | I much regret that want of space prevents my having the satisfaction of acknowledging the generous assistance which I have received from very many naturalists, some of them personally unknown to me. I cannot, however, let this opportunity pass without expressing my deep obligations to Dr. Hooker, who, for the last fifteen years, has aided me in every possible way by his large stores of knowledge and his excellent judgment.
 9 | 
10 | In considering the Origin of Species, it is quite conceivable that a naturalist, reflecting on the mutual affinities of organic beings, on their embryological relations, their geographical distribution, geological succession, and other such facts, might come to the conclusion that species had not been independently created, but had descended, like varieties, from other species. Nevertheless, such a conclusion, even if well founded, would be unsatisfactory, until it could be shown how the innumerable species inhabiting this world have been modified, so as to acquire that perfection of structure and coadaptation which justly excites our admiration. Naturalists continually refer to external conditions, such as climate, food, &c., as the only possible cause of variation. In one limited sense, as we shall hereafter see, this may be true; but it is preposterous to attribute to mere external conditions, the structure, for instance, of the woodpecker, with its feet, tail, beak, and tongue, so admirably adapted to catch insects under the bark of trees. In the case of the mistletoe, which draws its nourishment from certain trees, which has seeds that must be transported by certain birds, and which has flowers with separate sexes absolutely requiring the agency of certain insects to bring pollen from one flower to the other, it is equally preposterous to account for the structure of this parasite, with its relations to several distinct organic beings, by the effects of external conditions, or of habit, or of the volition of the plant itself.
11 | 
12 | It is, therefore, of the highest importance to gain a clear insight into the means of modification and coadaptation. At the commencement of my observations it seemed to me probable that a careful study of domesticated animals and of cultivated plants would offer the best chance of making out this obscure problem. Nor have I been disappointed; in this and in all other perplexing cases I have invariably found that our knowledge, imperfect though it be, of variation under domestication, afforded the best and safest clue. I may venture to express my conviction of the high value of such studies, although they have been very commonly neglected by naturalists.
13 | 
14 | From these considerations, I shall devote the first chapter of this Abstract to Variation under Domestication. We shall thus see that a large amount of hereditary modification is at least possible; and, what is equally or more important, we shall see how great is the power of man in accumulating by his Selection successive slight variations. I will then pass on to the variability of species in a state of nature; but I shall, unfortunately, be compelled to treat this subject far too briefly, as it can be treated properly only by giving long catalogues of facts. We shall, however, be enabled to discuss what circumstances are most favourable to variation. In the next chapter the Struggle for Existence amongst all organic beings throughout the world, which inevitably follows from the high geometrical ratio of their increase, will be considered. This is the doctrine of Malthus, applied to the whole animal and vegetable kingdoms. As many more individuals of each species are born than can possibly survive; and as, consequently, there is a frequently recurring struggle for existence, it follows that any being, if it vary however slightly in any manner profitable to itself, under the complex and sometimes varying conditions of life, will have a better chance of surviving, and thus be naturally selected. From the strong principle of inheritance, any selected variety will tend to propagate its new and modified form.
15 | 
16 | This fundamental subject of Natural Selection will be treated at some length in the fourth chapter; and we shall then see how Natural Selection almost inevitably causes much Extinction of the less improved forms of life, and leads to what I have called Divergence of Character. In the next chapter I shall discuss the complex and little known laws of variation. In the five succeeding chapters, the most apparent and gravest difficulties in accepting the theory will be given: namely, first, the difficulties of transitions, or how a simple being or a simple organ can be changed and perfected into a highly developed being or into an elaborately constructed organ; secondly, the subject of Instinct, or the mental powers of animals; thirdly, Hybridism, or the infertility of species and the fertility of varieties when intercrossed; and fourthly, the imperfection of the Geological Record. In the next chapter I shall consider the geological succession of organic beings throughout time; in the twelfth and thirteenth, their geographical distribution throughout space; in the fourteenth, their classification or mutual affinities, both when mature and in an embryonic condition. In the last chapter I shall give a brief recapitulation of the whole work, and a few concluding remarks.
17 | 
18 | No one ought to feel surprise at much remaining as yet unexplained in regard to the origin of species and varieties, if he make due allowance for our profound ignorance in regard to the mutual relations of the many beings which live around us. Who can explain why one species ranges widely and is very numerous, and why another allied species has a narrow range and is rare? Yet these relations are of the highest importance, for they determine the present welfare and, as I believe, the future success and modification of every inhabitant of this world. Still less do we know of the mutual relations of the innumerable inhabitants of the world during the many past geological epochs in its history. Although much remains obscure, and will long remain obscure, I can entertain no doubt, after the most deliberate study and dispassionate judgment of which I am capable, that the view which most naturalists until recently entertained, and which I formerly entertained- namely, that each species has been independently created- is erroneous. I am fully convinced that species are not immutable; but that those belonging to what are called the same genera are lineal descendants of some other and generally extinct species, in the same manner as the acknowledged varieties of any one species are the descendants of that species. Furthermore, I am convinced that Natural Selection has been the most important, but not the exclusive, means of modification. 
19 | 
20 | -- From C. Darwin "On the Origin Of Species" (1859)
21 | 


--------------------------------------------------------------------------------
/files/putty.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/files/putty.png


--------------------------------------------------------------------------------
/files/roadrunner.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/files/roadrunner.gif


--------------------------------------------------------------------------------
/files/terminal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/files/terminal.png


--------------------------------------------------------------------------------
/files/tree.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/files/tree.gif


--------------------------------------------------------------------------------
/files/wine.csv:
--------------------------------------------------------------------------------
 1 | "","country","alcohol","deaths","heart","liver"
 2 | "1","Australia",2.5,785,211,15.3000001907349
 3 | "2","Austria",3.90000009536743,863,167,45.5999984741211
 4 | "3","Belg/Lux",2.90000009536743,883,131,20.7000007629395
 5 | "4","Canada",2.40000009536743,793,191,16.3999996185303
 6 | "5","Denmark",2.90000009536743,971,220,23.8999996185303
 7 | "6","Finland",0.800000011920929,970,297,19
 8 | "7","France",9.10000038146973,751,71,37.9000015258789
 9 | "8","Iceland",0.800000011920929,743,211,11.1999998092651
10 | "9","Ireland",0.699999988079071,1000,300,6.5
11 | "10","Israel",0.600000023841858,834,183,13.6999998092651
12 | "11","Italy",7.90000009536743,775,107,42.2000007629395
13 | "12","Japan",1.5,680,36,23.2000007629395
14 | "13","Netherlands",1.79999995231628,773,167,9.19999980926514
15 | "14","New Zealand",1.89999997615814,916,266,7.69999980926514
16 | "15","Norway",0.800000011920929,806,227,12.1999998092651
17 | "16","Spain",6.5,724,86,36.4000015258789
18 | "17","Sweden",1.60000002384186,743,207,11.1999998092651
19 | "18","Switzerland",5.80000019073486,693,115,20.2999992370605
20 | "19","UK",1.29999995231628,941,285,10.3000001907349
21 | "20","US",1.20000004768372,926,199,22.1000003814697
22 | "21","West Germany",2.70000004768372,861,172,36.7000007629395
23 | 


--------------------------------------------------------------------------------
/files/wine.tsv:
--------------------------------------------------------------------------------
 1 | #ID	Country	Alcohol	Deaths	Heart	Liver
 2 | 1	Australia	2.5	785	211	15.3000001907349
 3 | 2	Austria	3.90000009536743	863	167	45.5999984741211
 4 | 3	Belg/Lux	2.90000009536743	883	131	20.7000007629395
 5 | 4	Canada	2.40000009536743	793	191	16.3999996185303
 6 | 5	Denmark	2.90000009536743	971	220	23.8999996185303
 7 | 6	Finland	0.800000011920929	970	297	19
 8 | 7	France	9.10000038146973	751	71	37.9000015258789
 9 | 8	Iceland	0.800000011920929	743	211	11.1999998092651
10 | 9	Ireland	0.699999988079071	1000	300	6.5
11 | 10	Israel	0.600000023841858	834	183	13.6999998092651
12 | 11	Italy	7.90000009536743	775	107	42.2000007629395
13 | 12	Japan	1.5	680	36	23.2000007629395
14 | 13	Netherlands	1.79999995231628	773	167	9.19999980926514
15 | 14	New Zealand	1.89999997615814	916	266	7.69999980926514
16 | 15	Norway	0.800000011920929	806	227	12.1999998092651
17 | 16	Spain	6.5	724	86	36.4000015258789
18 | 17	Sweden	1.60000002384186	743	207	11.1999998092651
19 | 18	Switzerland	5.80000019073486	693	115	20.2999992370605
20 | 19	UK	1.29999995231628	941	285	10.3000001907349
21 | 20	US	1.20000004768372	926	199	22.1000003814697
22 | 21	West Germany	2.70000004768372	861	172	36.7000007629395
23 | 


--------------------------------------------------------------------------------
/misc/.content:
--------------------------------------------------------------------------------
1 | This is an extra subdirectory; see its own README file for details.
2 | 


--------------------------------------------------------------------------------
/misc/README.md:
--------------------------------------------------------------------------------
 1 | # "Bash training" dataset
 2 | 
 3 | Files for Bash training, with bioinformatics examples. This was formerly the 'telatin/bashtraining' repository and is now the 'misc' subdirectory of "learn_bash".
 4 | 
 5 | The introductory course program is available here: 
 6 | https://seq.space/notes/doku.php?id=bash-beginners
 7 | 
 8 | To download this repository:
 9 | ```
10 | git clone https://github.com/telatin/learn_bash
11 | ```
12 | 
13 | 
14 | ### Sources:
15 |  - Some of these files are from: http://korflab.ucdavis.edu/Unix_and_Perl/
16 | 


--------------------------------------------------------------------------------
/misc/ecoli/ecoli.genes.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/ecoli/ecoli.genes.fa.gz


--------------------------------------------------------------------------------
/misc/ecoli/ecoli.genome.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/ecoli/ecoli.genome.fa.gz


--------------------------------------------------------------------------------
/misc/ecoli/ecoli.gff3.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/ecoli/ecoli.gff3.gz


--------------------------------------------------------------------------------
/misc/ecoli/ecoli.proteins.fa.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/ecoli/ecoli.proteins.fa.gz


--------------------------------------------------------------------------------
/misc/genbank/E.coli.genbank.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/genbank/E.coli.genbank.gz


--------------------------------------------------------------------------------
/misc/test/README.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/test/README.txt


--------------------------------------------------------------------------------
/misc/test/cat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/test/cat.jpg


--------------------------------------------------------------------------------
/misc/test/data.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/test/data.txt


--------------------------------------------------------------------------------
/misc/test/dna.fa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/test/dna.fa


--------------------------------------------------------------------------------
/misc/test/dog.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/test/dog.jpg


--------------------------------------------------------------------------------
/misc/test/excel_data.csv:
--------------------------------------------------------------------------------
1 | sequence 1,acacagagagsequence 2,acacaggggaaasequence 3,ttcacagagasequence 4,cacaccaaacacsequence 5,tttatatttaatata


--------------------------------------------------------------------------------
/misc/test/goldfish.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/test/goldfish.jpg


--------------------------------------------------------------------------------
/misc/test/motifs.fa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/test/motifs.fa


--------------------------------------------------------------------------------
/misc/test/oligos.txt:
--------------------------------------------------------------------------------
 1 | M13	GTAAAACGACGGCCAGT
 2 | M13-rev	CAGGAAACAGCTATGAC
 3 | T7	TAATACGACTCACTATAGG
 4 | T3	ATTAACCCTCACTAAAG
 5 | SP6	ATTTAGGTGACACTATAG
 6 | CMV	CGCAAATGGGCGGTAGGCGTG
 7 | HA-F	TACCCATACGACGTCCCAGA
 8 | HA-R	TCTGGGACGTCGTATGGGTA
 9 | Neo-F	CGTTGGCTACCCGTGATATT
10 | Neo-R	GCCCAGTCATAGCCGAATAG
11 | Tac	GAGCGGATAACAATTTCACACAGG


--------------------------------------------------------------------------------
/misc/test/proteins.fa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/test/proteins.fa


--------------------------------------------------------------------------------
/misc/test/song1.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/test/song1.mp3


--------------------------------------------------------------------------------
/misc/test/song2.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/test/song2.mp3


--------------------------------------------------------------------------------
/misc/test/todo.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/misc/test/todo.txt


--------------------------------------------------------------------------------
/phage/.content:
--------------------------------------------------------------------------------
1 | A set of files to test parsing of bioinformatics formats, mostly related to PhiX phage.
2 | 


--------------------------------------------------------------------------------
/phage/GCF_000840245.1_ViralProj14204:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/phage/GCF_000840245.1_ViralProj14204


--------------------------------------------------------------------------------
/phage/README.txt:
--------------------------------------------------------------------------------
  1 | ################################################################################
  2 | README for ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/
  3 |            ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/
  4 |            ftp://ftp.ncbi.nlm.nih.gov/genomes/all/
  5 | 
  6 | Last updated: January 24, 2017
  7 | ################################################################################
  8 | 
  9 | ==========
 10 | Background
 11 | ==========
 12 | Sequence data is provided for all single organism genome assemblies that are 
 13 | included in NCBI's Assembly resource (www.ncbi.nlm.nih.gov/assembly/).  This 
 14 | includes submissions to databases of the International Nucleotide Sequence 
 15 | Database Collaboration, which are available in NCBI's GenBank database, as well 
 16 | as the subset of those submissions that are included in NCBI's RefSeq Genomes 
 17 | project. 
 18 | 
 19 | Available by anonymous FTP at:
 20 |      ftp://ftp.ncbi.nlm.nih.gov/genomes/
 21 | 
 22 | Please refer to README files and the FTP FAQ for additional information:
 23 |      https://www.ncbi.nlm.nih.gov/genome/doc/ftpfaq/
 24 | 
 25 | Subscribe to the genomes-announce mail list to be informed of changes to the
 26 | NCBI genomes FTP site:
 27 |      https://www.ncbi.nlm.nih.gov/mailman/listinfo/genomes-announce
 28 | 
 29 | 
 30 | =====================================================================
 31 | Genome sequence and annotation data is provided in three directories:
 32 | =====================================================================
 33 | 1) all:     content is the union of GenBank and RefSeq assemblies. The two 
 34 |             directories under "all" are named for the accession prefix (GCA or
 35 |             GCF) and these directories contain another three levels of 
 36 |             directories named for digits 1-3, 4-6 & 7-9 of the assembly 
 37 |             accession. The next level is the data directories for individual 
 38 |             assembly versions. Only data directories for "latest" assemblies
 39 |             are refreshed when annotation is updated or when software updates
 40 |             are released, so new file formats or improvements to existing 
 41 |             formats are not available for non-latest assemblies.
 42 | 2) genbank: content includes primary submissions of assembled genome sequence 
 43 |             and associated annotation data, if any, as exchanged among members 
 44 |             of the International Nucleotide Sequence Database Collaboration, 
 45 |             of which NCBI's GenBank database is a member. The GenBank directory 
 46 |             area includes genome sequence data for a larger number of organisms 
 47 |             than the RefSeq directory area; however, some assemblies are 
 48 |             unannotated. The sub-directory structure includes:
 49 |             a. archaea
 50 |             b. bacteria
 51 |             c. fungi
 52 |             d. invertebrate
 53 |             e. metagenomes
 54 |             f. other -  this directory includes synthetic genomes
 55 |             g. plant
 56 |             h. protozoa
 57 |             i. vertebrate_mammalian
 58 |             j. vertebrate_other
 59 | 3) refseq:  content includes assembled genome sequence and RefSeq annotation 
 60 |             data. All prokaryotic and eukaryotic RefSeq genomes have annotation. 
 61 |             RefSeq annotation data may be calculated by NCBI annotation  
 62 |             pipelines or propagated from the GenBank submission. The RefSeq 
 63 |             directory area includes fewer organisms than the GenBank directory
 64 |             area because not all genome assemblies are selected for the RefSeq
 65 |             project.
 66 |             Sub-directories include:
 67 |             a. archaea
 68 |             b. bacteria
 69 |             c. fungi
 70 |             d. invertebrate
 71 |             e. plant
 72 |             f. protozoa
 73 |             g. vertebrate_mammalian
 74 |             h. vertebrate_other 
 75 |             i. viral
 76 |             j. mitochondrion [Content of the mitochondrion, plasmid and plastid
 77 |             k. plasmid     directories is from the RefSeq release FTP site. See 
 78 |             l. plastid     ftp://ftp.ncbi.nlm.nih.gov/refseq/release/README]
 79 | 
 80 | Data are further organized within each of the above directories as a series of 
 81 | directories named as the species binomial. For example:
 82 |    ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/Escherichia_coli/
 83 |            - or - 
 84 |    ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/
 85 | 
 86 | The next hierarchy provides access to all assemblies for the species, latest 
 87 | assemblies, and selected reference or representative assemblies for the species 
 88 | (if any). Within these groupings, sequence and annotation (and other) data is 
 89 | provided per assembly in a series of directories that are named using the rule:
 90 | 
 91 |    [Assembly accession.version]_[assembly name]
 92 | 
 93 | For example, the directory hierarchy for the GenBank Escherichia coli K-12 
 94 | subst. MG1655 genome, which has the assembly accession GCA_000005845.2 and 
 95 | default assembly name ASM584v2 looks like this:  
 96 |    /genomes/genbank/bacteria/Escherichia_coli/all_assembly_versions/GCA_000005845.2_ASM584v2  
 97 | 
 98 | The directory hierarchy for the RefSeq annotated human reference genome which 
 99 | has the assembly accession GCF_000001405.30 and assembly name GRCh38.p4 looks 
100 | like this:
101 |    /genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.30_GRCh38.p4
102 | 
103 | Genome assemblies of interest can be identified using the NCBI Assembly resource
104 | (www.ncbi.nlm.nih.gov/assembly), or by using the assembly summary report files 
105 | that are provided for both all genbank and all refseq assemblies:
106 | ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt
107 | or ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt
108 | ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt
109 | or ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt
110 | 
111 | Assembly summary report files containing information on assemblies for a 
112 | particular taxonomic group or species are provided in the group and 
113 | Genus_species directories under the "genbank" and "refseq" directory trees. e.g.
114 | ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/bacteria/assembly_summary.txt
115 | ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Sulfolobus_islandicus/assembly_summary.txt
116 | 
117 | Search the meta-data fields, or filter the files, to find assemblies of 
118 | interest.
119 | 
120 | 
121 | ===========================
122 | Data provided per assembly:
123 | ===========================
124 | Sequence and other data files provided per assembly are named according to the 
125 | rule:
126 | [assembly accession.version]_[assembly name]_[content type].[optional format]
127 | 
128 | File formats and content:
129 | 
130 |    assembly_status.txt
131 |        A text file reporting the current status of the version of the assembly
132 |        for which data is provided. Any assembly anomalies are also reported.
133 |    *_assembly_report.txt file
134 |        Tab-delimited text file reporting the name, role and sequence 
135 |        accession.version for objects in the assembly. The file header contains 
136 |        meta-data for the assembly including: assembly name, assembly 
137 |        accession.version, scientific name of the organism and its taxonomy ID, 
138 |        assembly submitter, and sequence release date.
139 |    *_assembly_stats.txt file
140 |        Tab-delimited text file reporting statistics for the assembly including: 
141 |        total length, ungapped length, contig & scaffold counts, contig-N50, 
142 |        scaffold-L50, scaffold-N50, scaffold-N75, and scaffold-N90
143 |    *_assembly_regions.txt
144 |        Provided for assemblies that include alternate or patch assembly units. 
145 |        Tab-delimited text file reporting the location of genomic regions and 
146 |        the alt/patch scaffolds placed within those regions.
147 |    *_assembly_structure directory
148 |        This directory will only be present if the assembly has internal 
149 |        structure. When present, it will contain AGP files that define how 
150 |        component sequences are organized into scaffolds and/or chromosomes. 
151 |        Other files define how scaffolds and chromosomes are organized into 
152 |        non-nuclear and other assembly-units, and how any alternate or patch 
153 |        scaffolds are placed relative to the chromosomes. Refer to the README.txt
154 |        file in the assembly_structure directory for additional information.
155 |    *_cds_from_genomic.fna.gz
156 |        FASTA format of the nucleotide sequences corresponding to all CDS 
157 |        features annotated on the assembly, based on the genome sequence. See 
158 |        the "Description of files" section below for details of the file format.
159 |    *_feature_table.txt.gz
160 |        Tab-delimited text file reporting locations and attributes for a subset 
161 |        of annotated features. Included feature types are: gene, CDS, RNA (all 
162 |        types), operon, C/V/N/S_region, and V/D/J_segment. Replaces the .ptt & 
163 |        .rnt format files that were provided in the old genomes FTP directories.
164 |        See the "Description of files" section below for details of the file 
165 |        format.
166 |    *_genomic.fna.gz file
167 |        FASTA format of the genomic sequence(s) in the assembly. Repetitive 
168 |        sequences in eukaryotes are masked to lower-case (see below).
169 |        The FASTA title is formatted as sequence accession.version plus 
170 |        description. The genomic.fna.gz file includes all top-level sequences in
171 |        the assembly (chromosomes, plasmids, organelles, unlocalized scaffolds,
172 |        unplaced scaffolds, and any alternate loci or patch scaffolds). Scaffolds
173 |        that are part of the chromosomes are not included because they are
174 |        redundant with the chromosome sequences; sequences for these placed 
175 |        scaffolds are provided under the assembly_structure directory.
176 |    *_genomic.gbff.gz file
177 |        GenBank flat file format of the genomic sequence(s) in the assembly. This
178 |        file includes both the genomic sequence and the CONTIG description (for 
179 |        CON records), hence, it replaces both the .gbk & .gbs format files that 
180 |        were provided in the old genomes FTP directories.
181 |    *_genomic.gff.gz file
182 |        Annotation of the genomic sequence(s) in Generic Feature Format Version 3
183 |        (GFF3). Sequence identifiers are provided as accession.version.
184 |        Additional information about NCBI's GFF files is available at 
185 |        ftp://ftp.ncbi.nlm.nih.gov/genomes/README_GFF3.txt.
186 |    *_protein.faa.gz file
187 |        FASTA format of the accessioned protein products annotated on the genome 
188 |        assembly.
189 |        The FASTA title is formatted as sequence accession.version plus 
190 |        description.
191 |    *_protein.gpff.gz file
192 |        GenPept format of the accessioned protein products annotated on the 
193 |        genome assembly
194 |    *_rm.out.gz file
195 |        RepeatMasker output; 
196 |        Provided for Eukaryotes 
197 |    *_rm.run file
198 |        Documentation of the RepeatMasker version, parameters, and library; 
199 |        Provided for Eukaryotes 
200 |    *_rna.fna.gz file
201 |        FASTA format of accessioned RNA products annotated on the genome 
202 |        assembly; Provided for RefSeq assemblies as relevant (Note, RNA and mRNA 
203 |        products are not instantiated as a separate accessioned record in GenBank
204 |        but are provided for some RefSeq genomes, most notably the eukaryotes.)
205 |        The FASTA title is provided as sequence accession.version plus 
206 |        description.
207 |    *_rna.gbff.gz file
208 |        GenBank flat file format of RNA products annotated on the genome 
209 |        assembly; Provided for RefSeq assemblies as relevant
210 |    *_rna_from_genomic.fna.gz
211 |        FASTA format of the nucleotide sequences corresponding to all RNA 
212 |        features annotated on the assembly, based on the genome sequence. See 
213 |        the "Description of files" section below for details of the file format.
214 |    *_wgsmaster.gbff.gz
215 |        GenBank flat file format of the WGS master for the assembly (present only
216 |        if a WGS master record exists for the sequences in the assembly).
217 |    annotation_hashes.txt
218 |        Tab-delimited text file reporting hash values for different aspects
219 |        of the annotation data. See the "Description of files" section below 
220 |        for details of the file format.
221 |    md5checksums.txt file
222 |        file checksums are provided for all data files in the directory
223 | 
224 | 
225 | =====================
226 | Description of files:
227 | =====================
228 | 
229 | Masking of fasta sequences in genomic.fna.gz files
230 | --------------------------------------------------
231 | Repetitive sequences in eukaryotic genome assembly sequence files, as 
232 | identified by WindowMasker (Morgulis A, Gertz EM, Schaffer AA, Agarwala R. 
233 | 2006. Bioinformatics 22:134-41), have been masked to lower-case.
234 | 
235 | Alignment programs typically have parameters that control whether the program 
236 | will ignore lower-case masking, treat it as soft-masking (i.e. only for finding 
237 | initial matches) or treat it as hard-masking. By default NCBI BLAST will ignore 
238 | lower-case masking but this can be changed by adding options to the blastn 
239 | command-line.
240 | To have blastn treat lower-case masking in the query sequence as soft-masking 
241 | add:
242 |      -lcase_masking
243 | To have blastn treat lower-case masking in the query sequence as hard-masking 
244 | add:
245 |      -lcase_masking -soft_masking false
246 | 
247 | Alternatively, commands such as the following can be used to generate either 
248 | unmasked sequence or sequence masked with Ns.
249 | 
250 | Example commands to remove lower-case masking:
251 | perl -pe '/^[^>]/ and $_=uc' genomic.fna > genomic.unmasked.fna
252 |   -or-
253 | awk '{if(/^[^>]/)$0=toupper($0);print $0}' genomic.fna > genomic.unmasked.fna
254 | 
255 | Example commands to convert lower-case masking to masking with Ns (hard-masked):
256 | perl -pe '/^[^>]/ and $_=~ s/[a-z]/N/g' genomic.fna > genomic.N-masked.fna
257 |   -or-
258 | awk '{if(/^[^>]/)gsub(/[a-z]/,"N");print $0}' genomic.fna > genomic.N-masked.fna
259 | 
260 | 
261 | *_cds_from_genomic.fna.gz & *_rna_from_genomic.fna.gz
262 | -----------------------------------------------------
263 | FASTA sequences of individual features annotated on the genomic records. The 
264 | sequences are based solely on the genome sequence and annotated feature at a
265 | particular location. They may differ from the product sequences found in the 
266 | *_rna.fna.gz and *_protein.faa.gz files which may be based on transcript or 
267 | other data sources and include mismatches, indels, or additional sequence not 
268 | found at a particular genomic location.
269 | 
270 | Seq-ids are constructed based on the following rule to ensure uniqueness:
271 | lcl|<genomic accession.version>_<feature_type>_<product accession.version>_<counter>
272 | Note the seq-id is not intended to be stable if the annotation is updated; in 
273 | particular, addition or removal of feature(s) will cause the counter to change 
274 | on following features.
275 | 
276 | The remainder of the FASTA definition line is composed of a series of qualifiers
277 | bounded by brackets, as described at:
278 |   https://www.ncbi.nlm.nih.gov/Sequin/modifiers.html
279 |   The qualifiers that may appear in these files are:
280 | 	     gene
281 | 	     locus_tag
282 | 	     db_xref
283 | 	     protein
284 | 	     product
285 | 	     ncRNA_class
286 | 	     pseudo
287 | 	     pseudogene
288 | 	     frame
289 | 	     partial
290 | 	     transl_except
291 | 	     exception
292 | 	     protein_id
293 | 	     location
294 | 
295 | Note that some qualifier values such as product names may themselves contain 
296 | un-escaped brackets, which should be allowed for if parsing the files.
297 | 		 
298 | For CDS features that begin in frame 2 or 3, the first 1 or 2 bp of sequence
299 | are trimmed from the CDS FASTA so that it always begins with the first complete
300 | codon. The location and frame qualifiers are left unaltered; consequently, the 
301 | length of the ranges in the location string may be 1-2 bp longer than the FASTA 
302 | sequence.
303 | 
304 | For RefSeq assemblies annotated by NCBI's Eukaryotic Genome Annotation 
305 | Pipeline, a gene may have a frameshifting indel(s) in the genome that is 
306 | thought to result from a genome sequencing error; in these cases, the gene is 
307 | still considered to be protein-coding and annotated with mRNA and CDS features, 
308 | but the genome sequence won't translate correctly downstream from the 
309 | frameshift. To compensate, the FASTA sequence of the genomic CDS and RNA 
310 | features is modified with 1-2 bp gaps (aka "micro-introns") in order to 
311 | restore the predicted reading frame. This modification is reflected by 1-2 bp 
312 | micro-introns in the location qualifier. An equivalent modification is also
313 | made in the *_genomic.gff.gz file. A protein-coding gene may also be annotated
314 | with a CDS feature containing an in-frame stop codon that is translated as a
315 | selenocysteine, subject to stop-codon readthrough, or thought to result from a
316 | genome sequencing error; in these cases, a transl_except qualifier is provided
317 | indicating the genomic location of the stop codon and its proposed translation.
318 | For more details, see the section on "Annotation accommodations for putative 
319 | assembly errors" in:
320 | ftp://ftp.ncbi.nlm.nih.gov/genomes/README_GFF3.txt
321 | 
322 | Pseudogenes annotated with CDS features may be included in the 
323 | *_cds_from_genomic.fna.gz file, and have FASTAs that are disrupted by 
324 | frameshifting indels or in-frame stop codons. Pseudogene features can be
325 | identified and screened out based on the presence of a [pseudo=true] qualifier
326 | in the defline.
327 | 
328 | 
329 | *_feature_table.txt.gz
330 | ----------------------
331 | Tab-delimited text file reporting locations and attributes for a subset of 
332 | annotated features. Included feature types are: gene, CDS, RNA (all types), 
333 | operon, C/V/N/S_region, and V/D/J_segment. 
334 | 
335 | The file is tab delimited (including a #header) with the following columns:
336 | col 1: feature: INSDC feature type
337 | col 2: class: For ncRNA features, this is the ncRNA_class for the feature. For 
338 |        gene features, this is the gene biotype computed based on the set of 
339 |        child features for that gene. See the description of the gene_biotype 
340 |        attribute in the GFF3 documentation for more details:
341 |        ftp://ftp.ncbi.nlm.nih.gov/genomes/README_GFF3.txt
342 | col 3: assembly: assembly accession.version
343 | col 4: assembly_unit: name of the assembly unit, such as "Primary Assembly", 
344 |        "ALT_REF_LOCI_1", or "non-nuclear"
345 | col 5: seq_type: sequence type, computed from the "Sequence-Role" and 
346 |        "Assigned-Molecule-Location/Type" in the *_assembly_report.txt file. The
347 |        value is computed as:
348 |        if an assembled-molecule, then report the location/type value, e.g. 
349 |        chromosome, mitochondrion, or plasmid
350 |        if an unlocalized-scaffold, then report "unlocalized scaffold on <type>".
351 |        e.g. unlocalized scaffold on chromosome
352 |        else the role, e.g. alternate scaffold, fix patch, or novel patch
353 | col 6: chromosome
354 | col 7: genomic_accession
355 | col 8: start: feature start coordinate (base-1). start is always less than end
356 | col 9: end: feature end coordinate (base-1)
357 | col10: strand
358 | col11: product_accession: accession.version of the product referenced by this 
359 |        feature, if it exists
360 | col12: non-redundant_refseq: for bacteria and archaea assemblies, the 
361 |        non-redundant WP_ protein accession corresponding to the CDS feature. May
362 |        be the same as column 11, for RefSeq genomes annotated directly with WP_
363 |        RefSeq proteins, or may be different, for genomes annotated with 
364 |        genome-specific protein accessions (e.g. NP_ or YP_ RefSeq proteins) that
365 |        reference a WP_ RefSeq accession.
366 | col13: related_accession: for eukaryotic RefSeq annotations, the RefSeq protein
367 |        accession corresponding to the transcript feature, or the RefSeq 
368 |        transcript accession corresponding to the protein feature.
369 | col14: name: For genes, this is the gene description or full name. For RNA, CDS,
370 |        and some other features, this is the product name.
371 | col15: symbol: gene symbol
372 | col16: GeneID: NCBI GeneID, for those RefSeq genomes included in NCBI's Gene 
373 |        resource
374 | col17: locus_tag
375 | col18: feature_interval_length: sum of the lengths of all intervals for the 
376 |        feature (i.e. the length without introns for a joined feature)
377 | col19: product_length: length of the product corresponding to the 
378 |        accession.version in column 11. Protein product lengths are in amino acid
379 |        units, and do not include the stop codon which is included in column 18.
380 |        Additionally, product_length may differ from feature_interval_length if 
381 |        the product contains sequence differences vs. the genome, as found for 
382 |        some RefSeq transcript and protein products based on mRNA sequences and 
383 |        also for INSDC proteins that are submitted to correct genome 
384 |        discrepancies.
385 | col20: attributes: semicolon-delimited list of a controlled set of qualifiers.
386 |        The list currently includes:
387 |        partial, pseudo, pseudogene, ribosomal_slippage, trans_splicing, 
388 |        anticodon=NNN (for tRNAs), old_locus_tag=XXX 
389 | 
390 | 
391 | annotation_hashes.txt
392 | ---------------------
393 | Tab-delimited text file reporting hash values and change dates for specific 
394 | details of the annotation. Hashes are computed based on the underlying data in 
395 | ASN.1 format, and thus aren't affected by changes in file formats. In contrast,
396 | the checksums reported in the md5checksums.txt file will change with any change
397 | to the files, including file formats and differences in gzip compression. The
398 | hashes are useful to monitor for when annotation has changed in a way that is 
399 | significant for a particular use case and warrants downloading the updated 
400 | records.
401 | 
402 | The file is tab delimited (including a #header) with the following columns:
403 | col 1: Assembly accession: accession.version
404 | col 2: Descriptors hash: hash of all descriptors on top-level sequence records,
405 |        including BioSource, molinfo, user objects, publications, and dates
406 | col 3: Descriptors last changed: date and time of the last change to any 
407 |        descriptors
408 | col 4: Features hash: hash of all features annotated on the assembly, including
409 |        both locations and qualifiers stored directly on the genome records. For
410 |        RefSeq genomes annotated with WP proteins and some other cases, protein
411 |        product names aren't stored on the genome records and thus changes in 
412 |        protein names do not alter the features hash.
413 | col 5: Features last changed: date and time of the last change to any features
414 | col 6: Locations hash: hash of just the locations of all features annotated on
415 |        the assembly.
416 | col 7: Locations last changed: date and time of the last change to any feature
417 |        locations
418 | col 8: Protein names hash: hash of the protein names for all CDS features 
419 |        annotated on the assembly.
420 | col 9: Protein names last changed: date and time of the last change to any 
421 |        protein names.
422 | 
423 | Example use cases:
424 |   A change in the Locations hash indicates that at least one feature has been 
425 |      added, removed, or had its location altered.
426 |   A change in the Features hash but not the Locations hash implies that only
427 |      feature qualifiers have changed, such as names or db_xrefs.
428 |   A change in the Protein names hash indicates that at least one protein name
429 |      has changed compared to the previous files provided on the genomes FTP 
430 |      site. Note for RefSeq prokaryotic genomes, protein names are updated 
431 |      continuously but files on the FTP site are only refreshed intermittently
432 |      to minimize churn.
433 |   A change in the Descriptors hash but not the Features hash implies that only
434 |      record metadata has been touched, such as the addition of a publication.
435 | 
436 | NOTE: currently the descriptors hash values are not stable due to a bug.
437 | 
438 | 
439 | assembly_status.txt
440 | -------------------
441 | A text file reporting the current status of the version of the assembly for 
442 | which data is provided. Any assembly anomalies are also reported. Lines have the
443 | format tag=value.
444 | 
445 | First line: status=<value> 
446 |   where <value> is one of latest, replaced or suppressed
447 | Second line (if any): assembly anomaly=<value>
448 |   where <value> is a comma-separated list of assembly anomalies as described in
449 |   the "Anomalous assemblies" section of this web page:
450 |   https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/
451 | 
452 | ________________________________________________________________________________
453 | National Center for Biotechnology Information (NCBI)
454 | National Library of Medicine
455 | National Institutes of Health
456 | 8600 Rockville Pike
457 | Bethesda, MD 20894, USA
458 | tel: (301) 496-2475
459 | fax: (301) 480-9241
460 | e-mail: info@ncbi.nlm.nih.gov
461 | ________________________________________________________________________________
462 | 


--------------------------------------------------------------------------------
/phage/annotation_hashes.txt:
--------------------------------------------------------------------------------
1 | # Assembly accession	Descriptors hash	Descriptors last changed	Features hash	Features last changed	Locations hash	Locations last change	Protein names hash	Protein names last changed
2 | GCF_000840245.1	48CCF39FD72248A2AEDB863B2619B83E	2016/10/18 00:06:00	3FAE0C7922A801406BCD22F7799F774A	2016/11/01 00:08:00	2FCAC3A3082546ACDC8D93B730C2830C	2015/05/14 22:16:00	9BE9B620B30C8BD77B5DDAA9C2926204	2016/06/16 10:22:00
3 | 


--------------------------------------------------------------------------------
/phage/assembly_status.txt:
--------------------------------------------------------------------------------
1 | status=latest
2 | 


--------------------------------------------------------------------------------
/phage/md5checksums.txt:
--------------------------------------------------------------------------------
 1 | 53b863dbf700cebbb0b4bd7a946ba85d  ./GCF_000840245.1_ViralProj14204_assembly_report.txt
 2 | 7c7a3334d1c4cda0b268c88964336558  ./GCF_000840245.1_ViralProj14204_assembly_stats.txt
 3 | c847dc9d0b295c72e172d197ef2800b1  ./GCF_000840245.1_ViralProj14204_cds_from_genomic.fna.gz
 4 | 173a313ac0f4aa27013ff40e589180e0  ./GCF_000840245.1_ViralProj14204_feature_count.txt.gz
 5 | 6c3f4532d9b865d0218036d8923e71b6  ./GCF_000840245.1_ViralProj14204_feature_table.txt.gz
 6 | 7e74fba2c9e1107f228dbb12bada5c1c  ./GCF_000840245.1_ViralProj14204_genomic.fna.gz
 7 | 2288b910ef72acf4fd65347edd8c8170  ./GCF_000840245.1_ViralProj14204_genomic.gbff.gz
 8 | c87dfe2229b8d2e7fda450a2649c84ad  ./GCF_000840245.1_ViralProj14204_genomic.gff.gz
 9 | ac1825c7c89c36c42dbb6d0fe573bf49  ./GCF_000840245.1_ViralProj14204_protein.faa.gz
10 | 5970ed6ecd106a94ace900c046531b84  ./GCF_000840245.1_ViralProj14204_protein.gpff.gz
11 | d2ac5ffe4bbf51065ba8744a6af05a44  ./GCF_000840245.1_ViralProj14204_rna_from_genomic.fna.gz
12 | e8e3d9a181d720e922e3d0fcb0d074cf  ./GCF_000840245.1_ViralProj14204_translated_cds.faa.gz
13 | 8e8ea5a1c61b02a743bd28282fc10c13  ./annotation_hashes.txt
14 | 


--------------------------------------------------------------------------------
/phage/reads/sample1_R1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/phage/reads/sample1_R1.fastq.gz


--------------------------------------------------------------------------------
/phage/reads/sample1_R2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/phage/reads/sample1_R2.fastq.gz


--------------------------------------------------------------------------------
/phage/reads/sample2_R1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/phage/reads/sample2_R1.fastq.gz


--------------------------------------------------------------------------------
/phage/reads/sample2_R2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/phage/reads/sample2_R2.fastq.gz


--------------------------------------------------------------------------------
/phage/reads/sample3_R1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/phage/reads/sample3_R1.fastq.gz


--------------------------------------------------------------------------------
/phage/reads/sample3_R2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/phage/reads/sample3_R2.fastq.gz


--------------------------------------------------------------------------------
/phage/reads/sample4_R1.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/phage/reads/sample4_R1.fastq.gz


--------------------------------------------------------------------------------
/phage/reads/sample4_R2.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/telatin/learn_bash/3359e9e84fa252ce649a85ee0b9a2d26691266d6/phage/reads/sample4_R2.fastq.gz


--------------------------------------------------------------------------------
/phage/vir_assembly_report.txt:
--------------------------------------------------------------------------------
 1 | # Assembly name:  ViralProj14204
 2 | # Organism name:  Escherichia virus Lambda (viruses)
 3 | # Taxid:          10710
 4 | # BioProject:     PRJNA14204
 5 | # Submitter:      n/a
 6 | # Date:           1993-4-28
 7 | # Assembly type:  n/a
 8 | # Release type:   major
 9 | # Assembly level: Complete Genome
10 | # Genome representation: full
11 | # GenBank assembly accession: GCA_000840245.1
12 | # RefSeq assembly accession: GCF_000840245.1
13 | # RefSeq assembly and GenBank assemblies identical: yes
14 | #
15 | ## Assembly-Units:
16 | ## GenBank Unit Accession	RefSeq Unit Accession	Assembly-Unit name
17 | ## GCA_000840235.1	GCF_000840235.1	Primary assembly
18 | #
19 | # Ordered by chromosome/plasmid; the chromosomes/plasmids are followed by
20 | # unlocalized scaffolds.
21 | # Unplaced scaffolds are listed at the end.
22 | # RefSeq is equal or derived from GenBank object.
23 | #
24 | # Sequence-Name	Sequence-Role	Assigned-Molecule	Assigned-Molecule-Location/Type	GenBank-Accn	Relationship	RefSeq-Accn	Assembly-Unit	Sequence-Length	UCSC-style-name
25 | Unknown	assembled-molecule	Unknown	Segment	J02459.1	=	NC_001416.1	Primary assembly	48502	na
26 | 


--------------------------------------------------------------------------------
/phage/vir_assembly_stats.txt:
--------------------------------------------------------------------------------
 1 | # Assembly Statistics Report
 2 | # Assembly name:  ViralProj14204
 3 | # Organism name:  Escherichia virus Lambda (viruses)
 4 | # Taxid:          10710
 5 | # BioProject:     PRJNA14204
 6 | # Submitter:      n/a
 7 | # Date:           1993-4-28
 8 | # Assembly type:  n/a
 9 | # Release type:   major
10 | # Assembly level: Complete Genome
11 | # Genome representation: full
12 | # GenBank assembly accession: GCA_000840245.1
13 | # RefSeq assembly accession: GCF_000840245.1
14 | # RefSeq assembly and GenBank assemblies identical: yes
15 | #
16 | ## Assembly-Units:
17 | ## GenBank Unit Accession	RefSeq Unit Accession	Assembly-Unit name
18 | ## GCA_000840235.1	GCF_000840235.1	Primary assembly
19 | #
20 | # Statistic Types
21 | # Statistic	Description
22 | # molecule-count	Number of chromosomes and plasmids in full assembly
23 | # region-count	Number of genomic regions defined in full assembly
24 | # spanned-gaps	Number of spanned gaps. Spanned gaps are gaps within a scaffold
25 | # top-level-count	Number of chromosomes or plasmids, unplaced/unlocalized scaffolds, alt-loci scaffolds, and patch scaffolds
26 | # total-gap-length	Total length of gaps
27 | # total-length	Total sequence length including bases and gaps
28 | # ungapped-length	Total length excluding gaps
29 | # unspanned-gaps	Number of unspanned gaps. Unspanned gaps are gaps between scaffolds
30 | #
31 | # Sequence-type Description
32 | # all	statistic covers all the sequences in the unit-assembly and molecule(s) specified.
33 | # molecule	statistic covers the specified molecule. molecule-name and molecule-type/loc will be given.
34 | # unlocalized	statistic covers the sequences assigned to a molecule but with no position. molecule-name and molecule-type/loc will be given.
35 | # unplaced	statistic covers the sequences not assigned to any molecule in the assembly.
36 | #
37 | # unit-name	molecule-name	molecule-type/loc	sequence-type	statistic	value
38 | all	all	all	all	total-length	48502
39 | all	all	all	all	spanned-gaps	0
40 | all	all	all	all	unspanned-gaps	0
41 | all	all	all	all	region-count	0
42 | all	all	all	all	total-gap-length	0
43 | all	all	all	all	molecule-count	1
44 | all	all	all	all	top-level-count	1
45 | Primary Assembly	all	all	all	total-length	48502
46 | Primary Assembly	all	all	all	ungapped-length	48502
47 | Primary Assembly	all	all	all	spanned-gaps	0
48 | Primary Assembly	all	all	all	unspanned-gaps	0
49 | Primary Assembly	all	all	assembled-molecule	total-length	48502
50 | Primary Assembly	all	all	assembled-molecule	ungapped-length	48502
51 | Primary Assembly	all	all	assembled-molecule	spanned-gaps	0
52 | Primary Assembly	all	all	assembled-molecule	unspanned-gaps	0
53 | Primary Assembly	all	all	unlocalized-scaffold	total-length	0
54 | Primary Assembly	all	all	unlocalized-scaffold	ungapped-length	0
55 | Primary Assembly	all	all	unlocalized-scaffold	spanned-gaps	0
56 | Primary Assembly	all	all	unlocalized-scaffold	unspanned-gaps	0
57 | Primary Assembly	Unknown	Segment	all	total-length	48502
58 | Primary Assembly	Unknown	Segment	all	ungapped-length	48502
59 | Primary Assembly	Unknown	Segment	all	spanned-gaps	0
60 | Primary Assembly	Unknown	Segment	all	unspanned-gaps	0
61 | Primary Assembly	Unknown	Segment	assembled-molecule	total-length	48502
62 | Primary Assembly	Unknown	Segment	assembled-molecule	ungapped-length	48502
63 | Primary Assembly	Unknown	Segment	assembled-molecule	spanned-gaps	0
64 | Primary Assembly	Unknown	Segment	assembled-molecule	unspanned-gaps	0
65 | Primary Assembly	Unknown	Segment	unlocalized-scaffold	total-length	0
66 | Primary Assembly	Unknown	Segment	unlocalized-scaffold	ungapped-length	0
67 | Primary Assembly	Unknown	Segment	unlocalized-scaffold	spanned-gaps	0
68 | Primary Assembly	Unknown	Segment	unlocalized-scaffold	unspanned-gaps	0
69 | 


--------------------------------------------------------------------------------
/phage/vir_feature_count.txt:
--------------------------------------------------------------------------------
1 | # Feature	Class	Full Assembly	Assembly-unit accession	Assembly-unit name	Unique Ids	Placements
2 | CDS	with_protein	GCF_000840245.1	GCF_000840235.1	Primary assembly	73	73
3 | gene	other	GCF_000840245.1	GCF_000840235.1	Primary assembly	19	19
4 | gene	protein_coding	GCF_000840245.1	GCF_000840235.1	Primary assembly	73	73
5 | mRNA		GCF_000840245.1	GCF_000840235.1	Primary assembly	na	23
6 | 


--------------------------------------------------------------------------------
/phage/vir_feature_table.txt:
--------------------------------------------------------------------------------
  1 | # feature	class	assembly	assembly_unit	seq_type	chromosome	genomic_accession	start	end	strand	product_accession	non-redundant_refseq	related_accession	name	symbol	GeneID	locus_tag	feature_interval_length	product_length	attributes
  2 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	191	736	+					nu1	2703523	lambdap01	546		
  3 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	191	736	+	NP_040580.1			DNA packaging protein	nu1	2703523	lambdap01	546	181	
  4 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	711	2636	+					A	2703524	lambdap02	1926		
  5 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	711	2636	+	NP_040581.1			DNA packaging protein	A	2703524	lambdap02	1926	641	
  6 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	2633	2839	+					W	2703525	lambdap03	207		
  7 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	2633	2839	+	NP_040582.1			head-tail joining protein	W	2703525	lambdap03	207	68	
  8 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	2836	4437	+					B	2703526	lambdap04	1602		
  9 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	2836	4437	+	NP_040583.1			capsid component	B	2703526	lambdap04	1602	533	
 10 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	4418	5737	+					C	2703527	lambdap05	1320		
 11 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	4418	5737	+	NP_040584.1			capsid component	C	2703527	lambdap05	1320	439	
 12 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	5132	5737	+					nu3	2703528	lambdap06	606		
 13 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	5132	5737	+	NP_040585.1			capsid assembly protein	nu3	2703528	lambdap06	606	201	
 14 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	5747	6079	+					D	2703529	lambdap07	333		
 15 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	5747	6079	+	NP_040586.1			head-DNA stabilization protein	D	2703529	lambdap07	333	110	
 16 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	6135	7160	+					E	2703482	lambdap08	1026		
 17 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	6135	7160	+	NP_040587.1			capsid component	E	2703482	lambdap08	1026	341	
 18 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	7202	7600	+					Fi	2703483	lambdap09	399		
 19 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	7202	7600	+	NP_040588.1			DNA packaging protein	Fi	2703483	lambdap09	399	132	
 20 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	7612	7965	+					Fii	2703484	lambdap10	354		
 21 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	7612	7965	+	NP_040589.1			head-tail joining protein	Fii	2703484	lambdap10	354	117	
 22 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	7977	8555	+					Z	2703485	lambdap11	579		
 23 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	7977	8555	+	NP_040590.1			tail component	Z	2703485	lambdap11	579	192	
 24 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	8552	8947	+					U	2703486	lambdap12	396		
 25 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	8552	8947	+	NP_040591.1			tail component	U	2703486	lambdap12	396	131	
 26 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	8955	9695	+					V	2703487	lambdap13	741		
 27 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	8955	9695	+	NP_040592.1			tail component	V	2703487	lambdap13	741	246	
 28 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	9711	10133	+					G	2703488	lambdap14	423		
 29 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	9711	10133	+	NP_040593.1			tail component	G	2703488	lambdap14	423	140	
 30 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	10115	10549	+					T	2703489	lambdap15	435		
 31 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	10115	10549	+	NP_040594.1			tail component	T	2703489	lambdap15	435	144	
 32 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	10542	13103	+					H	2703511	lambdap16	2562		
 33 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	10542	13103	+	NP_040595.1			tail component	H	2703511	lambdap16	2562	853	
 34 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	13100	13429	+					M	2703512	lambdap17	330		
 35 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	13100	13429	+	NP_040596.1			tail component	M	2703512	lambdap17	330	109	
 36 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	13429	14127	+					L	2703513	lambdap18	699		
 37 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	13429	14127	+	NP_040597.1			tail component	L	2703513	lambdap18	699	232	
 38 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	14276	14875	+					K	2703514	lambdap19	600		
 39 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	14276	14875	+	NP_040598.1			tail component	K	2703514	lambdap19	600	199	
 40 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	14773	15444	+					I	2703515	lambdap20	672		
 41 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	14773	15444	+	NP_040599.1			tail component	I	2703515	lambdap20	672	223	
 42 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	15505	18903	+					J	2703516	lambdap21	3399		
 43 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	15505	18903	+	NP_040600.1			tail:host specificity protein	J	2703516	lambdap21	3399	1132	
 44 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	18482	35582	-						2703502	lambdap22	17101		
 45 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	18482	35582	-				mRNA-pl (alt.; via t'j4 terminator)		2703502	lambdap22	17101		
 46 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	18597	35582	-						2703536	lambdap23	16986		
 47 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	18597	35582	-				mRNA-pl (alt.; via t'j3 terminator)		2703536	lambdap23	16986		
 48 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	18637	35582	-						2703468	lambdap24	16946		
 49 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	18637	35582	-				mRNA-pl (alt.; via t'j2 terminator)		2703468	lambdap24	16946		
 50 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	18671	35582	-						2703472	lambdap25	16912		
 51 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	18671	35582	-				mRNA-pl (alt.; via t'j1 terminator)		2703472	lambdap25	16912		
 52 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	18965	19585	+					lom	2703517	lambdap26	621		
 53 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	18965	19585	+	NP_040601.1			outer host membrane	lom	2703517	lambdap26	621	206	
 54 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	19650	20855	+					orf-401	2703518	lambdap27	1206		
 55 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	19650	20855	+	NP_040602.1			Tail fiber protein	orf-401	2703518	lambdap27	1206	401	
 56 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	20147	20767	-					orf206b	3827061	lambdap90	621		
 57 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	20147	20767	-	NP_040603.1			hypothetical protein	orf206b	3827061	lambdap90	621	206	
 58 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	21029	21973	+					orf-314	2703519	lambdap28	945		
 59 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	21029	21973	+	NP_040604.1			Tail fiber	orf-314	2703519	lambdap28	945	314	
 60 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	21973	22557	+					orf-194	2703503	lambdap29	585		
 61 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	21973	22557	+	NP_040605.1			Putative fiber assembly protein	orf-194	2703503	lambdap29	585	194	
 62 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	22686	23918	-					ea47	3827051	lambdap80	1233		
 63 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	22686	23918	-	NP_040606.1			ea47	ea47	3827051	lambdap80	1233	410	
 64 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	23231	23231	-						2703520	lambdap30	1		
 65 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	23231	23231	-				mRNA-pbl		2703520	lambdap30	1		partial
 66 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	24509	25399	-					ea31	3827052	lambdap81	891		
 67 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	24509	25399	-	NP_040607.1			ea31	ea31	3827052	lambdap81	891	296	
 68 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	25396	26973	-					ea59	3827053	lambdap82	1578		
 69 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	25396	26973	-	NP_040608.1			ea59	ea59	3827053	lambdap82	1578	525	
 70 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	27538	35582	-					xis	2703469	lambdap31	8045		
 71 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	27538	35582	-				mRNA-pl (alt.; via ti terminator)	xis	2703469	lambdap31	8045		
 72 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	27538	29065	-						2703464	lambdap32	1528		
 73 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	27538	29065	-				mRNA int (integration; 356; via ti terminator)		2703464	lambdap32	1528		
 74 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	27812	28882	-					int	2703470	lambdap33	1071		
 75 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	27812	28882	-	NP_040609.1			integration protein	int	2703470	lambdap33	1071	356	
 76 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	28860	29078	-					xis	2703504	lambdap34	219		
 77 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	28860	29078	-	NP_040610.1			Excisionase	xis	2703504	lambdap34	219	72	
 78 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	29118	29285	-						2703530	lambdap35	168		
 79 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	29118	29285	-	NP_597778.1			hypothetical protein		2703530	lambdap35	168	55	
 80 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	29374	29655	-					ea8.5	2703505	lambdap36	282		
 81 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	29374	29655	-	NP_040611.1			ea8.5	ea8.5	2703505	lambdap36	282	93	
 82 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	29847	30395	-					ea22	3827054	lambdap83	549		
 83 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	29847	30395	-	NP_040612.1			ea22	ea22	3827054	lambdap83	549	182	
 84 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	30839	31024	-					orf61	2703506	lambdap37	186		
 85 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	30839	31024	-	NP_040613.1			hypothetical protein	orf61	2703506	lambdap37	186	61	
 86 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	31005	31196	-					orf63	2703507	lambdap38	192		
 87 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	31005	31196	-	NP_040614.1			hypothetical protein	orf63	2703507	lambdap38	192	63	
 88 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	31169	31351	-					orf60a	2703508	lambdap39	183		
 89 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	31169	31351	-	NP_040615.1			hypothetical protein	orf60a	2703508	lambdap39	183	60	
 90 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	31262	35582	-						2703535	lambdap40	4321		
 91 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	31262	35582	-				mRNA-pl (alt.; via tl3 terminator)		2703535	lambdap40	4321		
 92 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	31348	32028	-					exo	2703522	lambdap41	681		
 93 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	31348	32028	-	NP_040616.1			exonuclease	exo	2703522	lambdap41	681	226	
 94 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	32025	32810	-					bet	3827055	lambdap84	786		
 95 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	32025	32810	-	NP_040617.1			bet	bet	3827055	lambdap84	786	261	
 96 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	32816	33232	-					gam	2703509	lambdap42	417		
 97 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	32816	33232	-	NP_040618.1			host-nuclease inhibitor protein Gam	gam	2703509	lambdap42	417	138	
 98 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	33100	35582	-						2703542	lambdap43	2483		
 99 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	33100	35582	-				mRNA-pl (alt.; via tl2d terminator)		2703542	lambdap43	2483		
100 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	33141	35582	-						2703471	lambdap44	2442		
101 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	33141	35582	-				mRNA-pl (alt.; via tl2c terminator)		2703471	lambdap44	2442		
102 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	33187	33330	-					kil	3827057	lambdap85	144		
103 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	33187	33330	-	NP_040619.1			host-killing protein	kil	3827057	lambdap85	144	47	
104 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	33299	33463	-					cIII	3827056	lambdap86	165		
105 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	33299	33463	-	NP_040620.1			antitermination protein	cIII	3827056	lambdap86	165	54	
106 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	33494	35582	-					ea10	2703541	lambdap45	2089		
107 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	33494	35582	-			NP_040621.1	mRNA-pl (alt.; via tl2b terminator)	ea10	2703541	lambdap45	2089		
108 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	33536	33904	-	NP_040621.1			Putative single-stranded DNA binding protein	ea10	2703541	lambdap45	369	122	
109 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	33930	35582	-					ral	2703473	lambdap46	1653		
110 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	33930	35582	-			NP_040622.1	mRNA-pl (alt.; via tl2a terminator)	ral	2703473	lambdap46	1653		
111 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	34087	34287	-	NP_040622.1			restriction alleviation protein	ral	2703473	lambdap46	201	66	
112 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	34271	34357	-					orf28	2703510	lambdap47	87		
113 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	34271	34357	-	NP_040623.1			hypothetical protein	orf28	2703510	lambdap47	87	28	
114 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	34482	35036	+						2703531	lambdap48	555		
115 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	34482	35036	+	NP_597779.1			Superinfection exclusion protein B		2703531	lambdap48	555	184	
116 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	34560	35582	-					N	2703540	lambdap49	1023		
117 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	34560	35582	-			NP_040625.1	mRNA-pl (alt.; via tl1 terminator)	N	2703540	lambdap49	1023		
118 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	35037	35438	-	NP_040625.1			early gene regulator	N	2703540	lambdap49	402	133	
119 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	35798	38343	-						2703537	lambdap50	2546		
120 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	35798	38343	-				mRNA-pre (via timm terminator)		2703537	lambdap50	2546		
121 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	35798	37940	-						2703538	lambdap51	2143		
122 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	35798	37940	-				mRNA-prm (via timm terminator)		2703538	lambdap51	2143		
123 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	35798	36256	-						2703465	lambdap52	459		
124 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	35798	36256	-				mRNA-plit (via timm terminator)		2703465	lambdap52	459		
125 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	35825	36259	-					rexb	2703493	lambdap53	435		
126 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	35825	36259	-	NP_040626.1			exclusion protein	rexb	2703493	lambdap53	435	144	
127 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	36275	37114	-					rexa	3827058	lambdap87	840		
128 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	36275	37114	-	NP_040627.1			exclusion protein	rexa	3827058	lambdap87	840	279	
129 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	37227	37940	-					cI	3827059	lambdap88	714		
130 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	37227	37940	-	NP_040628.1			repressor	cI	3827059	lambdap88	714	237	
131 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38023	40624	+						2703490	lambdap54	2602		
132 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38023	40624	+				mRNA-pr (alt.; via tr2 terminator)		2703490	lambdap54	2602		
133 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38023	38370	+						2703491	lambdap55	348		
134 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38023	38370	+				mRNA-pr (alt.; via tr1c terminator)		2703491	lambdap55	348		
135 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38023	38337	+						2703466	lambdap56	315		
136 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38023	38337	+				mRNA-pr (alt.; via tr1b terminator)		2703466	lambdap56	315		
137 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38023	38315	+					cro	2703467	lambdap57	293		
138 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38023	38315	+			NP_040629.1	mRNA-pr (alt.; via tr1a terminator)	cro	2703467	lambdap57	293		
139 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38023	38135	+						2703539	lambdap58	113		
140 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38023	38135	+				mRNA-pr (alt.; via tr0 terminator)		2703539	lambdap58	113		
141 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38041	38241	+	NP_040629.1			antirepressor	cro	2703467	lambdap57	201	66	
142 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38360	38653	+					cII	2703494	lambdap59	294		
143 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38360	38653	+	NP_040630.1			cII protein	cII	2703494	lambdap59	294	97	
144 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38599	38675	-						2703492	lambdap60	77		
145 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38599	38675	-				mRNA-oop transcription mRNA		2703492	lambdap60	77		
146 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38686	39585	+					O	3827060	lambdap89	900		
147 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	38686	39585	+	NP_040631.1			DNA replication protein	O	3827060	lambdap89	900	299	
148 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	39582	40283	+					P	2703495	lambdap61	702		
149 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	39582	40283	+	NP_040632.1			DNA replication protein	P	2703495	lambdap61	702	233	
150 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	40280	40570	+					ren	2703496	lambdap62	291		
151 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	40280	40570	+	NP_040633.1			ren exclusion protein	ren	2703496	lambdap62	291	96	
152 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	40644	41084	+					NinB	2703497	lambdap63	441		
153 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	40644	41084	+	NP_040634.1			NinB	NinB	2703497	lambdap63	441	146	
154 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	41081	41953	+					NinC	2703498	lambdap64	873		
155 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	41081	41953	+	NP_040635.1			NinC protein	NinC	2703498	lambdap64	873	290	
156 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	41950	42123	+					NinD	2703499	lambdap65	174		
157 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	41950	42123	+	NP_040636.1			NinD protein	NinD	2703499	lambdap65	174	57	
158 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	42090	42272	+					NinE	2703500	lambdap66	183		
159 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	42090	42272	+	NP_040637.1			NinE protein	NinE	2703500	lambdap66	183	60	
160 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	42269	42439	+					NinF	2703501	lambdap67	171		
161 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	42269	42439	+	NP_040638.1			NinF protein	NinF	2703501	lambdap67	171	56	
162 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	42429	43043	+					NinG	2703474	lambdap68	615		
163 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	42429	43043	+	NP_040639.1			NinG protein	NinG	2703474	lambdap68	615	204	
164 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	43040	43246	+					NinH	2703475	lambdap69	207		
165 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	43040	43246	+	NP_040640.1			NinH protein	NinH	2703475	lambdap69	207	68	
166 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	43224	43889	+					NinI	2703476	lambdap70	666		
167 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	43224	43889	+	NP_040641.1			NinI protein	NinI	2703476	lambdap70	666	221	
168 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	43886	44509	+					Q	2703477	lambdap71	624		
169 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	43886	44509	+	NP_040642.1			late gene regulator	Q	2703477	lambdap71	624	207	
170 | gene	other	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	44587	44780	+						2703521	lambdap72	194		
171 | mRNA		GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	44587	44780	+				mRNA-pr' transcription (late genes) mRNA		2703521	lambdap72	194		
172 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	44621	44815	+					orf-64	2703478	lambdap73	195		
173 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	44621	44815	+	NP_040643.1			hypothetical protein	orf-64	2703478	lambdap73	195	64	
174 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	45186	45509	+					S	2703479	lambdap74	324		
175 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	45186	45509	+	NP_040644.1			anti-holin	S	2703479	lambdap74	324	107	
176 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	45192	45509	+					S'	5740919	lambdap92	318		
177 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	45192	45509	+	YP_001551775.1			holin	S'	5740919	lambdap92	318	105	
178 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	45493	45969	+					R	2703480	lambdap75	477		
179 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	45493	45969	+	NP_040645.1			endolysin	R	2703480	lambdap75	477	158	
180 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	45966	46427	+					Rz	2703481	lambdap76	462		
181 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	45966	46427	+	NP_040646.1			cell lysis protein	Rz	2703481	lambdap76	462	153	
182 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	46186	46368	+					Rz1	5739319	lambdap91	183		
183 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	46186	46368	+	YP_001551744.1			Rz1 protein	Rz1	5739319	lambdap91	183	60	
184 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	46459	46752	-					bor	2703532	lambdap77	294		
185 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	46459	46752	-	NP_597780.1			Bor protein precursor	bor	2703532	lambdap77	294	97	
186 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	47042	47575	-						2703533	lambdap78	534		
187 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	47042	47575	-	NP_597781.1			putative envelope protein		2703533	lambdap78	534	177	
188 | gene	protein_coding	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	47738	47944	+						2703534	lambdap79	207		
189 | CDS	with_protein	GCF_000840245.1	Primary assembly	segment	Unknown	NC_001416.1	47738	47944	+	NP_597782.1			hypothetical protein		2703534	lambdap79	207	68	
190 | 


--------------------------------------------------------------------------------
/phage/vir_protein.faa:
--------------------------------------------------------------------------------
  1 | >NP_040580.1 DNA packaging protein [Escherichia virus Lambda]
  2 | MEVNKKQLADIFGASIRTIQNWQEQGMPVLRGGGKGNEVLYDSAAVIKWYAERDAEIENEKLRREVEELRQASEADLQPG
  3 | TIEYERHRLTRAQADAQELKNARDSAEVVETAFCTFVLSRIAGEIASILDGLPLSVQRRFPELENRHVDFLKRDIIKAMN
  4 | KAAALDELIPGLLSEYIEQSG
  5 | >NP_040581.1 DNA packaging protein [Escherichia virus Lambda]
  6 | MNISNSQVNRLRHFVRAGLRSLFRPEPQTAVEWADANYYLPKESAYQEGRWETLPFQRAIMNAMGSDYIREVNVVKSARV
  7 | GYSKMLLGVYAYFIEHKQRNTLIWLPTDGDAENFMKTHVEPTIRDIPSLLALAPWYGKKHRDNTLTMKRFTNGRGFWCLG
  8 | GKAAKNYREKSVDVAGYDELAAFDDDIEQEGSPTFLGDKRIEGSVWPKSIRGSTPKVRGTCQIERAASESPHFMRFHVAC
  9 | PHCGEEQYLKFGDKETPFGLKWTPDDPSSVFYLCEHNACVIRQQELDFTDARYICEKTGIWTRDGILWFSSSGEEIEPPD
 10 | SVTFHIWTAYSPFTTWVQIVKDWMKTKGDTGKRKTFVNTTLGETWEAKIGERPDAEVMAERKEHYSAPVPDRVAYLTAGI
 11 | DSQLDRYEMRVWGWGPGEESWLIDRQIIMGRHDDEQTLLRVDEAINKTYTRRNGAEMSISRICWDTGGIDPTIVYERSKK
 12 | HGLFRVIPIKGASVYGKPVASMPRKRNKNGVYLTEIGTDTAKEQIYNRFTLTPEGDEPLPGAVHFPNNPDIFDLTEAQQL
 13 | TAEEQVEKWVDGRKKILWDSKKRRNEALDCFVYALAALRISISRWQLDLSALLASLQEEDGAATNKKTLADYARALSGED
 14 | E
 15 | >NP_040582.1 head-tail joining protein [Escherichia virus Lambda]
 16 | MTRQEELAAARAALHDLMTGKRVATVQKDGRRVEFTATSVSDLKKYIAELEVQTGMTQRRRGPAGFYV
 17 | >NP_040583.1 capsid component [Escherichia virus Lambda]
 18 | MKTPTIPTLLGPDGMTSLREYAGYHGGGSGFGGQLRSWNPPSESVDAALLPNFTRGNARADDLVRNNGYAANAIQLHQDH
 19 | IVGSFFRLSHRPSWRYLGIGEEEARAFSREVEAAWKEFAEDDCCCIDVERKRTFTMMIREGVAMHAFNGELFVQATWDTS
 20 | SSRLFRTQFRMVSPKRISNPNNTGDSRNCRAGVQINDSGAALGYYVSEDGYPGWMPQKWTWIPRELPGGRASFIHVFEPV
 21 | EDGQTRGANVFYSVMEQMKMLDTLQNTQLQSAIVKAMYAATIESELDTQSAMDFILGANSQEQRERLTGWIGEIAAYYAA
 22 | APVRLGGAKVPHLMPGDSLNLQTAQDTDNGYSVFEQSLLRYIAAGLGVSYEQLSRNYAQMSYSTARASANESWAYFMGRR
 23 | KFVASRQASQMFLCWLEEAIVRRVVTLPSKARFSFQEARSAWGNCDWIGSGRMAIDGLKEVQEAVMLIEAGLSTYEKECA
 24 | KRGDDYQEIFAQQVRETMERRAAGLKPPAWAAAAFESGLRQSTEEEKSDSRAA
 25 | >NP_040584.1 capsid component [Escherichia virus Lambda]
 26 | MTAELRNLPHIASMAFNEPLMLEPAYARVFFCALAGQLGISSLTDAVSGDSLTAQEALATLALSGDDDGPRQARSYQVMN
 27 | GIAVLPVSGTLVSRTRALQPYSGMTGYNGIIARLQQAASDPMVDGILLDMDTPGGMVAGAFDCADIIARVRDIKPVWALA
 28 | NDMNCSAGQLLASAASRRLVTQTARTGSIGVMMAHSNYGAALEKQGVEITLIYSGSHKVDGNPYSHLPDDVRETLQSRMD
 29 | ATRQMFAQKVSAYTGLSVQVVLDTEAAVYSGQEAIDAGLADELVNSTDAITVMRDALDARKSRLSGGRMTKETQSTTVSA
 30 | TASQADVTDVVPATEGENASAAQPDVNAQITAAVAAENSRIMGILNCEEAHGREEQARVLAETPGMTVKTARRILAAAPQ
 31 | SAQARSDTALDRLMQGAPAPLAAGNPASDAVNDLLNTPV
 32 | >NP_040585.1 capsid assembly protein [Escherichia virus Lambda]
 33 | MDATRQMFAQKVSAYTGLSVQVVLDTEAAVYSGQEAIDAGLADELVNSTDAITVMRDALDARKSRLSGGRMTKETQSTTV
 34 | SATASQADVTDVVPATEGENASAAQPDVNAQITAAVAAENSRIMGILNCEEAHGREEQARVLAETPGMTVKTARRILAAA
 35 | PQSAQARSDTALDRLMQGAPAPLAAGNPASDAVNDLLNTPV
 36 | >NP_040586.1 head-DNA stabilization protein [Escherichia virus Lambda]
 37 | MTSKETFTHYQPQGNSDPAHTATAPGGLSAKAPAMTPLMLDTSSRKLVAWDGTTDGAAVGILAVAADQTSTTLTFYKSGT
 38 | FRYEDVLWPEAASDETKKRTAFAGTAISIV
 39 | >NP_040587.1 capsid component [Escherichia virus Lambda]
 40 | MSMYTTAQLLAANEQKFKFDPLFLRLFFRESYPFTTEKVYLSQIPGLVNMALYVSPIVSGEVIRSRGGSTSEFTPGYVKP
 41 | KHEVNPQMTLRRLPDEDPQNLADPAYRRRRIIMQNMRDEELAIAQVEEMQAVSAVLKGKYTMTGEAFDPVEVDMGRSEEN
 42 | NITQSGGTEWSKRDKSTYDPTDDIEAYALNASGVVNIIVFDPKGWALFRSFKAVKEKLDTRRGSNSELETAVKDLGKAVS
 43 | YKGMYGDVAIVVYSGQYVENGVKKNFLPDNTMVLGNTQARGLRTYGCIQDADAQREGINASARYPKNWVTTGDPAREFTM
 44 | IQSAPLMLLADPDEFVSVQLA
 45 | >NP_040588.1 DNA packaging protein [Escherichia virus Lambda]
 46 | MTKDELIARLRSLGEQLNRDVSLTGTKEELALRVAELKEELDDTDETAGQDTPLSRENVLTGHENEVGSAQPDTVILDTS
 47 | ELVTVVALVKLHTDALHATRDEPVAFVLPGTAFRVSAGVAAEMTERGLARMQ
 48 | >NP_040589.1 head-tail joining protein [Escherichia virus Lambda]
 49 | MADFDNLFDAAIARADETIRGYMGTSATITSGEQSGAVIRGVFDDPENISYAGQGVRVEGSSPSLFVRTDEVRQLRRGDT
 50 | LTIGEENFWVDRVSPDDGGSCHLWLGRGVPPAVNRRR
 51 | >NP_040590.1 tail component [Escherichia virus Lambda]
 52 | MAIKGLEQAVENLSRISKTAVPGAAAMAINRVASSAISQSASQVARETKVRRKLVKERARLKRATVKNPQARIKVNRGDL
 53 | PVIKLGNARVVLSRRRRRKKGQRSSLKGGGSVLVVGNRRIPGAFIQQLKNGRWHVMQRVAGKNRYPIDVVKIPMAVPLTT
 54 | AFKQNIERIRRERLPKELGYALQHQLRMVIKR
 55 | >NP_040591.1 tail component [Escherichia virus Lambda]
 56 | MKHTELRAAVLDALEKHDTGATFFDGRPAVFDEADFPAVAVYLTGAEYTGEELDSDTWQAELHIEVFLPAQVPDSELDAW
 57 | MESRIYPVMSDIPALSDLITSMVASGYDYRRDDDAGLWSSADLTYVITYEM
 58 | >NP_040592.1 tail component [Escherichia virus Lambda]
 59 | MPVPNPTMPVKGAGTTLWVYKGSGDPYANPLSDVDWSRLAKVKDLTPGELTAESYDDSYLDDEDADWTATGQGQKSAGDT
 60 | SFTLAWMPGEQGQQALLAWFNEGDTRAYKIRFPNGTVDVFRGWVSSIGKAVTAKEVITRTVKVTNVGRPSMAEDRSTVTA
 61 | ATGMTVTPASTSVVKGQSTTLTVAFQPEGVTDKSFRAVSADKTKATVSVSGMTITVNGVAAGKVNIPVVSGNGEFAAVAE
 62 | ITVTAS
 63 | >NP_040593.1 tail component [Escherichia virus Lambda]
 64 | MFLKTESFEHNGVTVTLSELSALQRIEHLALMKRQAEQAESDSNRKFTVEDAIRTGAFLVAMSLWHNHPQKTQMPSMNEA
 65 | VKQIEQEVLTTWPTEAISHAENVVYRLSGMYEFVVNNAPEQTEDAGPAEPVSAGKCSTVS
 66 | >NP_040594.1 tail component [Escherichia virus Lambda]
 67 | MFDGELSFALKLAREMGRPDWRAMLAGMSSTEYADWHRFYSTHYFHDVLLDMHFSGLTYTVLSLFFSDPDMHPLDFSLLN
 68 | RREADEEPEDDVLMQKAAGLAGGVRFGPDGNEVIPASPDVADMTEDDVMLMTVSEGIAGGVRYG
 69 | >NP_040595.1 tail component [Escherichia virus Lambda]
 70 | MAEPVGDLVVDLSLDAARFDEQMARVRRHFSGTESDAKKTAAVVEQSLSRQALAAQKAGISVGQYKAAMRMLPAQFTDVA
 71 | TQLAGGQSPWLILLQQGGQVKDSFGGMIPMFRGLAGAITLPMVGATSLAVATGALAYAWYQGNSTLSDFNKTLVLSGNQA
 72 | GLTADRMLVLSRAGQAAGLTFNQTSESLSALVKAGVSGEAQIASISQSVARFSSASGVEVDKVAEAFGKLTTDPTSGLTA
 73 | MARQFHNVSAEQIAYVAQLQRSGDEAGALQAANEAATKGFDDQTRRLKENMGTLETWADRTARAFKSMWDAVLDIGRPDT
 74 | AQEMLIKAEAAYKKADDIWNLRKDDYFVNDEARARYWDDREKARLALEAARKKAEQQTQQDKNAQQQSDTEASRLKYTEE
 75 | AQKAYERLQTPLEKYTARQEELNKALKDGKILQADYNTLMAAAKKDYEATLKKPKQSSVKVSAGDRQEDSAHAALLTLQA
 76 | ELRTLEKHAGANEKISQQRRDLWKAESQFAVLEEAAQRRQLSAQEKSLLAHKDETLEYKRQLAALGDKVTYQERLNALAQ
 77 | QADKFAQQQRAKRAAIDAKSRGLTDRQAEREATEQRLKEQYGDNPLALNNVMSEQKKTWAAEDQLRGNWMAGLKSGWSEW
 78 | EESATDSMSQVKSAATQTFDGIAQNMAAMLTGSEQNWRSFTRSVLSMMTEILLKQAMVGIVGSIGSAIGGAVGGGASASG
 79 | GTAIQAAAAKFHFATGGFTGTGGKYEPAGIVHRGEFVFTKEATSRIGVGNLYRLMRGYATGGYVGTPGSMADSRSQASGT
 80 | FEQNNHVVINNDGTNGQIGPAALKAVYDMARKGARDEIQTQMRDGGLFSGGGR
 81 | >NP_040596.1 tail component [Escherichia virus Lambda]
 82 | MKTFRWKVKPGMDVASVPSVRKVRFGDGYSQRAPAGLNANLKTYSVTLSVPREEATVLESFLEEHGGWKSFLWTPPYEWR
 83 | QIKVTCAKWSSRVSMLRVEFSAEFEQVVN
 84 | >NP_040597.1 tail component [Escherichia virus Lambda]
 85 | MQDIRQETLNECTRAEQSASVVLWEIDLTEVGGERYFFCNEQNEKGEPVTWQGRQYQPYPIQGSGFELNGKGTSTRPTLT
 86 | VSNLYGMVTGMAEDMQSLVGGTVVRRKVYARFLDAVNFVNGNSYADPEQEVISRWRIEQCSELSAVSASFVLSTPTETDG
 87 | AVFPGRIMLANTCTWTYRGDECGYSGPAVADEYDQPTSDITKDKCSKCLSGCKFRNNVGNFGGFLSINKLSQ
 88 | >NP_040598.1 tail component [Escherichia virus Lambda]
 89 | MSPEDWLQAEMQGEIVALVHSHPGGLPWLSEADRRLQVQSDLPWWLVCRGTIHKFRCVPHLTGRRFEHGVTDCYTLFRDA
 90 | YHLAGIEMPDFHREDDWWRNGQNLYLDNLEATGLYQVPLSAAQPGDVLLCCFGSSVPNHAAIYCGDGELLHHIPEQLSKR
 91 | ERYTDKWQRRTHSLWRHRAWRASAFTGIYNDLVAASTFV
 92 | >NP_040599.1 tail component [Escherichia virus Lambda]
 93 | MAATHTLPLASPGMARICLYGDLQRFGRRIDLRVKTGAEAIRALATQLPAFRQKLSDGWYQVRIAGRDVSTSGLTAQLHE
 94 | TLPDGAVIHIVPRVAGAKSGGVFQIVLGAAAIAGSFFTAGATLAAWGAAIGAGGMTGILFSLGASMVLGGVAQMLAPKAR
 95 | TPRIQTTDNGKQNTYFSSLDNMVAQGNVLPVLYGEMRVGSRVVSQEISTADEGDGGQVVVIGR
 96 | >NP_040600.1 tail:host specificity protein [Escherichia virus Lambda]
 97 | MGKGSSKGHTPREAKDNLKSTQLLSVIDAISEGPIEGPVDGLKSVLLNSTPVLDTEGNTNISGVTVVFRAGEQEQTPPEG
 98 | FESSGSETVLGTEVKYDTPITRTITSANIDRLRFTFGVQALVETTSKGDRNPSEVRLLVQIQRNGGWVTEKDITIKGKTT
 99 | SQYLASVVMGNLPPRPFNIRMRRMTPDSTTDQLQNKTLWSSYTEIIDVKQCYPNTALVGVQVDSEQFGSQQVSRNYHLRG
100 | RILQVPSNYNPQTRQYSGIWDGTFKPAYSNNMAWCLWDMLTHPRYGMGKRLGAADVDKWALYVIGQYCDQSVPDGFGGTE
101 | PRITCNAYLTTQRKAWDVLSDFCSAMRCMPVWNGQTLTFVQDRPSDKTWTYNRSNVVMPDDGAPFRYSFSALKDRHNAVE
102 | VNWIDPNNGWETATELVEDTQAIARYGRNVTKMDAFGCTSRGQAHRAGLWLIKTELLETQTVDFSVGAEGLRHVPGDVIE
103 | ICDDDYAGISTGGRVLAVNSQTRTLTLDREITLPSSGTALISLVDGSGNPVSVEVQSVTDGVKVKVSRVPDGVAEYSVWE
104 | LKLPTLRQRLFRCVSIRENDDGTYAITAVQHVPEKEAIVDNGAHFDGEQSGTVNGVTPPAVQHLTAEVTADSGEYQVLAR
105 | WDTPKVVKGVSFLLRLTVTADDGSERLVSTARTTETTYRFTQLALGNYRLTVRAVNAWGQQGDPASVSFRIAAPAAPSRI
106 | ELTPGYFQITATPHLAVYDPTVQFEFWFSEKQIADIRQVETSTRYLGTALYWIAASINIKPGHDYYFYIRSVNTVGKSAF
107 | VEAVGRASDDAEGYLDFFKGKITESHLGKELLEKVELTEDNASRLEEFSKEWKDASDKWNAMWAVKIEQTKDGKHYVAGI
108 | GLSMEDTEEGKLSQFLVAANRIAFIDPANGNETPMFVAQGNQIFMNDVFLKRLTAPTITSGGNPPAFSLTPDGKLTAKNA
109 | DISGSVNANSGTLSNVTIAENCTINGTLRAEKIVGDIVKAASAAFPRQRESSVDWPSGTRTVTVTDDHPFDRQIVVLPLT
110 | FRGSKRTVSGRTTYSMCYLKVLMNGAVIYDGAANEAVQVFSRIVDMPAGRGNVILTFTLTSTRHSADIPPYTFASDVQVM
111 | VIKKQALGISVV
112 | >NP_040601.1 outer host membrane [Escherichia virus Lambda]
113 | MRNVCIAVAVFAALAVTVTPARAEGGHGTFTVGYFQVKPGTLPSLSGGDTGVSHLKGINVKYRYELTDSVGVMASLGFAA
114 | SKKSSTVMTGEDTFHYESLRGRYVSVMAGPVLQISKQVSAYAMAGVAHSRWSGSTMDYRKTEITPGYMKETTTARDESAM
115 | RHTSVAWSAGIQINPAASVVVDIAYEGSGSGDWRTDGFIVGVGYKF
116 | >NP_040602.1 Tail fiber protein [Escherichia virus Lambda]
117 | MAVKISGVLKDGTGKPVQNCTIQLKARRNSTTVVVNTVGSENPDEAGRYSMDVEYGQYSVILQVDGFPPSHAGTITVYED
118 | SQPGTLNDFLCAMTEDDARPEVLRRLELMVEEVARNASVVAQSTADAKKSAGDASASAAQVAALVTDATDSARAASTSAG
119 | QAASSAQEASSGAEAASAKATEAEKSAAAAESSKNAAATSAGAAKTSETNAAASQQSAATSASTAATKASEAATSARDAV
120 | ASKEAAKSSETNASSSAGRAASSATAAENSARAAKTSETNARSSETAAERSASAAADAKTAAAGSASTASTKATEAAGSA
121 | VSASQSKSAAEAAAIRAKNSAKRAEDIASAVALEDADTTRKGIVQLSSATNSTSETLAATPKAVKVVMDETNRKAHWTVR
122 | H
123 | >NP_040603.1 hypothetical protein lambdap90 [Escherichia virus Lambda]
124 | MLLVALLSCTIPFLVVSASSSATAEAISSARFAEFFARIAAASAALLLCDADTALPAASVAFVDAVDALPAAAVFASAAA
125 | EALRSAAVSDDLAFVSDVFAALAEFSAAVAEEAARPALDDAFVSDDFAASFEATASRAEVAASDAFVAAVEADVAADCCD
126 | AAAFVSDVFAAPALVAAAFFEDSAAAALFSASVAFADAASAPEDAS
127 | >NP_040604.1 Tail fiber [Escherichia virus Lambda]
128 | MTNALAGKQPKNATLTALAGLSTAKNKLPYFAENDAASLTELTQVGRDILAKNSVADVLEYLGAGENSAFPAGAPIPWPS
129 | DIVPSGYVLMQGQAFDKSAYPKLAVAYPSGVLPDMRGWTIKGKPASGRAVLSQEQDGIKSHTHSASASGTDLGTKTTSSF
130 | DYGTKTTGSFDYGTKSTNNTGAHAHSLSGSTGAAGAHAHTSGLRMNSSGWSQYGTATITGSLSTVKGTSTQGIAYLSKTD
131 | SQGSHSHSLSGTAVSAGAHAHTVGIGAHQHPVVIGAHAHSFSIGSHGHTITVNAAGNAENTVKNIAFNYIVRLA
132 | >NP_040605.1 Putative fiber assembly protein [Escherichia virus Lambda]
133 | MAFRMSEQPRTIKIYNLLAGTNEFIGEGDAYIPPHTGLPANSTDIAPPDIPAGFVAVFNSDEASWHLVEDHRGKTVYDVA
134 | SGDALFISELGPLPENFTWLSPGGEYQKWNGTAWVKDTEAEKLFRIREAEETKKSLMQVASEHIAPLQDAADLEIATKEE
135 | TSLLEAWKKYRVLLNRVDTSTAPDIEWPAVPVME
136 | >NP_040606.1 ea47 [Escherichia virus Lambda]
137 | MTKKPWERRLKDLSHLLKCCIDTYFDPELFRLNLNQFLQTARTVTFIIQKNKNQIIGYDIWYNNNVIEKWKNDPLMAWAK
138 | NSRNTIEKQGDLEMYSEAKATLISSYIEENDIEFITNESMLNIGIKKLVRLAQKKLPSYLTESSIIKSERRWVANTLKDY
139 | ELLHALAIIYGRMYNCCNSLGIQINNPMGDDVISPTSFDSLFDEARRITYLKLKDYSISKLSFSMIQYDNKIIPEDIKER
140 | LKLVDKPKNITSTEELVDYTAKLAETTFLKDGYHIQTLIFYDKQFHPIDLINTTFEDQADKYIFWRYAADRAKITNAYGF
141 | IWISELWLRKASIYSNKPIHTMPIIDERLQVIGIDSNNNQKCISWKIVRENEEKKPTLEISTADSKHDEKPYFMRSVLKA
142 | IGGDVNTMNN
143 | >NP_040607.1 ea31 [Escherichia virus Lambda]
144 | MKKLPLPARTYSEMLNKCSEGMMQINVRNNFITHFPTFLQKEQQYRILSSTGQLFTYDRTHPLEPTTLVVGNLTKVKLEK
145 | LYENNLRDKNKPARTYYDDMLVSSGEKCPFCGDIGQTKNIDHFLPIAHYPEFSVMPINLVPSCRDCNMGEKGQVFAVDEV
146 | HQAIHPYIDKDIFFREQWVYANFVSGTPGAISFYVECPANWRQEDKHRALHHFKLLNIANRYRLEAGKHLSEVITQRNSF
147 | VKVIRKYSSTATFQQLQSEFIEANLKPIIDLNDFPNYWKRVMYQCLANSEDFFRGI
148 | >NP_040608.1 ea59 [Escherichia virus Lambda]
149 | MLEFSVIERGGYIPAVEKNKAFLRADGWNDYSFVTMFYLTVFDEHGEKCDIGNVKIGFVGQKEEVSTYSLIDKKFSQLPE
150 | MFFSLGESIDYYVNLSKLSDGFKHNLLKAIQDLVVWPNRLADIENESVLNTSLLRGVTLSEIHGQFARVLNGLPELSDFH
151 | FSFNRKSAPGFSDLTIPFEVTVNSMPSTNIHAFIGRNGCGKTTILNGMIGAITNPENNEYFFSENNRLIESRIPKGYFRS
152 | LVSVSFSAFDPFTPPKEQPDPAKGTQYFYIGLKNAASNSLKSLGDLRLEFISAFIGCMRVDRKRQLWLEAIKKLSSDENF
153 | SNMELISLISKYEELRRNEPQIQVDDDKFTKLFYDNIQKYLLRMSSGHAIVLFTITRLVDVVGEKSLVLFDEPEVHLHPP
154 | LLSAFLRTLSDLLDARNGVAIIATHSPVVLQEVPKSCMWKVLRSREAINIIRPDIETFGENLGVLTREVFLLEVTNSGYH
155 | HLLSQSVDSELSYETILKNYNGQIGLEGRTVLKAMIMNRDEGKVQ
156 | >NP_040609.1 integration protein [Escherichia virus Lambda]
157 | MGRRRSHERRDLPPNLYIRNNGYYCYRDPRTGKEFGLGRDRRIAITEAIQANIELFSGHKHKPLTARINSDNSVTLHSWL
158 | DRYEKILASRGIKQKTLINYMSKIKAIRRGLPDAPLEDITTKEIAAMLNGYIDEGKAASAKLIRSTLSDAFREAIAEGHI
159 | TTNHVAATRAAKSEVRRSRLTADEYLKIYQAAESSPCWLRLAMELAVVTGQRVGDLCEMKWSDIVDGYLYVEQSKTGVKI
160 | AIPTALHIDALGISMKETLDKCKEILGGETIIASTRREPLSSGTVSRYFMRARKASGLSFEGDPPTFHELRSLSARLYEK
161 | QISDKFAQHLLGHKSDTMASQYRDDRGREWDKIEIK
162 | >NP_040610.1 Excisionase [Escherichia virus Lambda]
163 | MYLTLQEWNARQRRPRSLETVRRWVRECRIFPPPVKDGREYLFHESAVKVDLNRPVTGGLLKRIRNGKKAKS
164 | >NP_040611.1 ea8.5 [Escherichia virus Lambda]
165 | MSINELESEQKDWALSMLCRSGVLSPCRHHEGVYVDEGIDIESAYKYSMKVYKSNEDKSPFCNVREMTDTVQNYYHEYGG
166 | NDTCPLCTKHIDD
167 | >NP_040612.1 ea22 [Escherichia virus Lambda]
168 | MSEINSQALREAAEQAMHDDWGFDADLFHELVTPSIVLELLDERERNQQYIKRRDQENEDIALTVGKLRVELETAKSKLN
169 | EQREYYEGVISDGSKRIAKLESNEVREDGNQFLVVRHPGKTPVIKHCTGDLEEFLRQLIEQDPLVTIDIITHRYYGVGGQ
170 | WVQDAGEYLHMMSDAGIRIKGE
171 | >NP_040613.1 hypothetical protein lambdap37 [Escherichia virus Lambda]
172 | MRETRYDNHGMHFSGSGLHILCAYACRHGTCSMTPQQENALRSIARQANSEIKKSQTAVSG
173 | >NP_040614.1 hypothetical protein lambdap38 [Escherichia virus Lambda]
174 | MHKASSVELRTSIEMAHSLAQIGIRFVPIPVETDEEFHTLAASLSQKLEMMVAKAEADERNQV
175 | >NP_040615.1 hypothetical protein lambdap39 [Escherichia virus Lambda]
176 | MTHPHDNIRVGAITFVYSVTKRGWVFPGLSVIRNPLKAQRLAEEINNKRGAVCTKHLLLS
177 | >NP_040616.1 exonuclease [Escherichia virus Lambda]
178 | MTPDIILQRTGIDVRAVEQGDDAWHKLRLGVITASEVHNVIAKPRSGKKWPDMKMSYFHTLLAEVCTGVAPEVNAKALAW
179 | GKQYENDARTLFEFTSGVNVTESPIIYRDESMRTACSPDGLCSDGNGLELKCPFTSRDFMKFRLGGFEAIKSAYMAQVQY
180 | SMWVTRKNAWYFANYDPRMKREGLHYVVIERDEKYMASFDEIVPEFIEKMDEALAEIGFVFGEQWR
181 | >NP_040617.1 bet [Escherichia virus Lambda]
182 | MSTALATLAGKLAERVGMDSVDPQELITTLRQTAFKGDASDAQFIALLIVANQYGLNPWTKEIYAFPDKQNGIVPVVGVD
183 | GWSRIINENQQFDGMDFEQDNESCTCRIYRKDRNHPICVTEWMDECRREPFKTREGREITGPWQSHPKRMLRHKAMIQCA
184 | RLAFGFAGIYDKDEAERIVENTAYTAERQPERDITPVNDETMQEINTLLIALDKTWDDDLLPLCSQIFRRDIRASSELTQ
185 | AEAVKALGFLKQKAAEQKVAA
186 | >NP_040618.1 host-nuclease inhibitor protein Gam [Escherichia virus Lambda]
187 | MDINTETEIKQKHSLTPFPVFLISPAFRGRYFHSYFRSSAMNAYYIQDRLEAQSWARHYQQLAREEKEAELADDMEKGLP
188 | QHLFESLCIDHLQRHGASKKSITRAFDDDVEFQERMAEHIRYMVETIAHHQVDIDSEV
189 | >NP_040619.1 host-killing protein [Escherichia virus Lambda]
190 | MDQTLMAIQTKFTIATFIGDEKMFREAVDAYKKWILILKLRSSKSIH
191 | >NP_040620.1 antitermination protein [Escherichia virus Lambda]
192 | MQYAIAGWPVAGCPSESLLERITRKLRDGWKRLIDILNQPGVPKNGSNTYGYPD
193 | >NP_040621.1 Putative single-stranded DNA binding protein [Escherichia virus Lambda]
194 | MSNIKKYIIDYDWKASIEIEIDHDVMTEEKLHQINNFWSDSEYRLNKHGSVLNAVLIMLAQHALLIAISSDLNAYGVVCE
195 | FDWNDGNGQEGWPPMDGSEGIRITDIDTSGIFDSDDMTIKAA
196 | >NP_040622.1 restriction alleviation protein [Escherichia virus Lambda]
197 | MTTTIDKNQWCGQFKRCNGCKLQSECMVKPEEMFPVMEDGKYVDKWAIRTTAMIARELGKQNNKAA
198 | >NP_040623.1 hypothetical protein lambdap47 [Escherichia virus Lambda]
199 | MEEEFEEFEEHPQDVMEQYQDYPYDYDY
200 | >NP_040625.1 early gene regulator [Escherichia virus Lambda]
201 | MCQSRGVFVQDYNCHTPPKLTDRRIQMDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKSRVESALNP
202 | IDLTVLAEYHKQIESNLQRIERKNQRTWYSKPGERGITCSGRQKIKGKSIPLI
203 | >NP_040626.1 exclusion protein [Escherichia virus Lambda]
204 | MRNRIMPGVYIVIIPYVIVSICYLLFRHYIPGVSFSAHRDGLGATLSSYAGTMIAILIAALTFLIGSRTRRLAKIREYGY
205 | MTSVVIVYALSFVELGALFFCGLLLLSSISGYMIPTIAIGIASASFIHICILVFQLYNLTREQE
206 | >NP_040627.1 exclusion protein [Escherichia virus Lambda]
207 | MKNGFYATYRSKNKGKDKRSINLSVFLNSLLADNHHLQVGSNYLYIHKIDGKTFLFTKTNDKSLVQKINRSKASVEDIKN
208 | SLADDESLGFPSFLFVEGDTIGFARTVFGPTTSDLTDFLIGKGMSLSSGERVQIEPLMRGTTKDDVMHMHFIGRTTVKVE
209 | AKLPVFGDILKVLGATDIEGELFDSLDIVIKPKFKRDIKKVAKDIIFNPSPQFSDISLRAKDEAGDILTEHYLSEKGHLS
210 | APLNKVTNAEIAEEMAYCYARMKSDILECFKRQVGKVKD
211 | >NP_040628.1 repressor [Escherichia virus Lambda]
212 | MSTKKKPLTQEQLEDARRLKAIYEKKKNELGLSQESVADKMGMGQSGVGALFNGINALNAYNAALLAKILKVSVEEFSPS
213 | IAREIYEMYEAVSMQPSLRSEYEYPVFSHVQAGMFSPELRTFTKGDAERWVSTTKKASDSAFWLEVEGNSMTAPTGSKPS
214 | FPDGMLILVDPEQAVEPGDFCIARLGGDEFTFKKLIRDSGQVFLQPLNPQYPMIPCNESCSVVGKVIASQWPEETFG
215 | >NP_040629.1 antirepressor [Escherichia virus Lambda]
216 | MEQRITLKDYAMRFGQTKTAKDLGVYQSAINKAIHAGRKIFLTINADGSVYAEEVKPFPSNKKTTA
217 | >NP_040630.1 cII protein [Escherichia virus Lambda]
218 | MVRANKRNEALRIESALLNKIAMLGTEKTAEAVGVDKSQISRWKRDWIPKFSMLLAVLEWGVVDDDMARLARQVAAILTN
219 | KKRPAATERSEQIQMEF
220 | >NP_040631.1 DNA replication protein [Escherichia virus Lambda]
221 | MTNTAKILNFGRGNFAGQERNVADLDDGYARLSNMLLEAYSGADLTKRQFKVLLAILRKTYGWNKPMDRITDSQLSEITK
222 | LPVKRCNEAKLELVRMNIIKQQGGMFGPNKNISEWCIPQNEGKSPKTRDKTSLKLGDCYPSKQGDTKDTITKEKRKDYSS
223 | ENSGESSDQPENDLSVVKPDAAIQSGSKWGTAEDLTAAEWMFDMVKTIAPSARKPNFAGWANDIRLMRERDGRNHRDMCV
224 | LFRWACQDNFWSGNVLSPAKLRDKWTQLEINRNKQQAGVTASKPKLDLTNTDWIYGVDL
225 | >NP_040632.1 DNA replication protein [Escherichia virus Lambda]
226 | MKNIAAQMVNFDREQMRRIANNMPEQYDEKPQVQQVAQIINGVFSQLLATFPASLANRDQNEVNEIRRQWVLAFRENGIT
227 | TMEQVNAGMRVARRQNRPFLPSPGQFVAWCREEASVTAGLPNVSELVDMVYEYCRKRGLYPDAESYPWKSNAHYWLVTNL
228 | YQNMRANALTDAELRRKAADELVHMTARINRGEAIPEPVKQLPVMGGRPLNRAQALAKIAEIKAKFGLKGASV
229 | >NP_040633.1 ren exclusion protein [Escherichia virus Lambda]
230 | MTGKEAIIHYLGTHNSFCAPDVAALTGATVTSINQAAAKMARAGLLVIEGKVWRTVYYRFATREEREGKMSTNLVFKECR
231 | QSAAMKRVLAVYGVKR
232 | >NP_040634.1 NinB [Escherichia virus Lambda]
233 | MKKLTFEIRSPAHQQNAIHAVQQILPDPTKPIVVTIQERNRSLDQNRKLWACLGDVSRQVEWHGRWLDAESWKCVFTAAL
234 | KQQDVVPNLAGNGFVVIGQSTSRMRVGEFAELLELIQAFGTERGVKWSDEARLALEWKARWGDRAA
235 | >NP_040635.1 NinC protein [Escherichia virus Lambda]
236 | MINVVSFSGGRTSAYLLWLMEQKRRAGKDVHYVFMDTGCEHPMTYRFVREVVKFWDIPLTVLQVDINPELGQPNGYTVWE
237 | PKDIQTRMPVLKPFIDMVKKYGTPYVGGAFCTDRLKLVPFTKYCDDHFGRGNYTTWIGIRADEPKRLKPKPGIRYLAELS
238 | DFEKEDILAWWKQQPFDLQIPEHLGNCIFCIKKSTQKIGLACKDEEGLQRVFNEVITGSHVRDGHRETPKEIMYRGRMSL
239 | DGIAKMYSENDYQALYQDMVRAKRFDTGSCSESCEIFGGQLDFDFGREAA
240 | >NP_040636.1 NinD protein [Escherichia virus Lambda]
241 | MMRCYRCGECKEDNRFRPNQPYWNRWCLRCERTPTGVLPLPQEKEDVWRDSDEVSPT
242 | >NP_040637.1 NinE protein [Escherichia virus Lambda]
243 | MARQRRSITDIICENCKYLPTKRTRNKPKPIPKESDVKTFNYTAHLWDIRWLRRRARKTR
244 | >NP_040638.1 NinF protein [Escherichia virus Lambda]
245 | MIDQNRSYEQESVERALTCANCGQKLHVLEVHVCEHCCAELMSDPNSSMHEEEDDG
246 | >NP_040639.1 NinG protein [Escherichia virus Lambda]
247 | MMAKPARRRCKNDECREWFHPAFANQWWCSPECGTKIALERRSKEREKAEKAAEKKRRREEQKQKDKLKIRKLALKPRSY
248 | WIKQAQQAVNAFIRERDRDLPCISCGTLTSAQWDAGHYRTTAAAPQLRFNERNIHKQCVVCNQHKSGNLVPYRVELISRI
249 | GQEAVDEIESNHNRHRWTIEECKAIKAEYQQKLKDLRNSRSEAA
250 | >NP_040640.1 NinH protein [Escherichia virus Lambda]
251 | MTFSVKTIPDMLVEAYGNQTEVARRLKCSRGTVRKYVDDKDGKMHAIVNDVLMVHRGWSERDALLRKN
252 | >NP_040641.1 NinI protein [Escherichia virus Lambda]
253 | MRYYEKIDGSKYRNIWVVGDLHGCYTNLMNKLDTIGFDNKKDLLISVGDLVDRGAENVECLELITFPWFRAVRGNHEQMM
254 | IDGLSERGNVNHWLLNGGGWFFNLDYDKEILAKALAHKADELPLIIELVSKDKKYVICHADYPFDEYEFGKPVDHQQVIW
255 | NRERISNSQNGIVKEIKGADTFIFGHTPAVKPLKFANQMYIDTGAVFCGNLTLIQVQGEGA
256 | >NP_040642.1 late gene regulator [Escherichia virus Lambda]
257 | MRLESVAKFHSPKSPMMSDSPRATASDSLSGTDVMAAMGMAQSQAGFGMAAFCGKHELSQNDKQKAINYLMQFAHKVSGK
258 | YRGVAKLEGNTKAKVLQVLATFAYADYCRSAATPGARCRDCHGTGRAVDIAKTELWGRVVEKECGRCKGVGYSRMPASAA
259 | YRAVTMLIPNLTQPTWSRTVKPLYDALVVQCHKEESIADNILNAVTR
260 | >NP_040643.1 hypothetical protein lambdap73 [Escherichia virus Lambda]
261 | MKRGGAYYRFRLVGHFDVSSGTPTIAGREVCKMQSRNSSQVIVRACITVSGFFISAQQVRALSR
262 | >NP_040644.1 anti-holin [Escherichia virus Lambda]
263 | MKMPEKHDLLAAILAAKEQGIGAILAFAMAYLRGRYNGGAFTKTVIDATMCAIIAWFIRDLLDFAGLSSNLAYITSVFIG
264 | YIGTDSIGSLIKRFAAKKAGVEDGRNQ
265 | >NP_040645.1 endolysin [Escherichia virus Lambda]
266 | MVEINNQRKAFLDMLAWSEGTDNGRQKTRNHGYDVIVGGELFTDYSDHPRKLVTLNPKLKSTGAGRYQLLSRWWDAYRKQ
267 | LGLKDFSPKSQDAVALQQIKERGALPMIDRGDIRQAIDRCSNIWASLPGAGYGQFEHKADSLIAKFKEAGGTVREIDV
268 | >NP_040646.1 cell lysis protein [Escherichia virus Lambda]
269 | MSRVTAIISALVICIIVCLSWAVNHYRDNAITYKAQRDKNARELKLANAAITDMQMRQRDVAALDAKYTKELADAKAEND
270 | ALRDDVAAGRRRLHIKAVCQSVREATTASGVDNAASPRLADTAERDYFTLRERLITMQKQLEGTQKYINEQCR
271 | >NP_597778.1 hypothetical protein lambdap35 [Escherichia virus Lambda]
272 | MHFRVTGEWNGEPFNRVIEAENINDCYDHWMIWAQIAHADVTNIRIEELKEHQAA
273 | >NP_597779.1 Superinfection exclusion protein B [Escherichia virus Lambda]
274 | MMSIEMDPLVILGRVFSNEPLERTMYMIVIWVGLLLLSPDNWPEYVNERIGIPHVWHVFVFALAFSLAINVHRLSAIASA
275 | RYKRFKLRKRIKMQNDKVRSVIQNLTEEQSMVLCAALNEGRKYVVTSKQFPYISELIELGVLNKTFSRWNGKHILFPIED
276 | IYWTELVASYDPYNIEIKPRPISK
277 | >NP_597780.1 Bor protein precursor [Escherichia virus Lambda]
278 | MKKMLLATALALLITGCAQQTFTVQNKPAAVAPKETITHHFFVSGIGQKKTVDAAKICGGAENVVKTETQQTFVNGLLGF
279 | ITLGIYTPLEARVYCSQ
280 | >NP_597781.1 putative envelope protein [Escherichia virus Lambda]
281 | MKTFLIFDINHQQNTRRSDQIETIKTIMQTTRPRITWKVLPMAQVAIFKEIFDQVRKDLDCELFYSELKRHNVSHYIYYL
282 | ATDNIHIVLENDNTVLIKGLKKVVNVKFSRNTHLIETSYDRLKSREITFQQYRENLAKAGVFRWVTNIHEHKRYYYTFDN
283 | SLLFTESIQNTTQIFPR
284 | >NP_597782.1 hypothetical protein lambdap79 [Escherichia virus Lambda]
285 | MNKEQSADDPSVDLIRVKNMLNSTISMSYPDVVIACIEHKVSLEAFRAIEAALVKHDNNMKDYSLVVD
286 | >YP_001551744.1 Rz1 protein [Escherichia virus Lambda]
287 | MLKLKMMLCVMMLPLVVVGCTSKQSVSQCVKPPPPPAWIMQPPPDWQTPLNGIISPSERG
288 | >YP_001551775.1 holin [Escherichia virus Lambda]
289 | MPEKHDLLAAILAAKEQGIGAILAFAMAYLRGRYNGGAFTKTVIDATMCAIIAWFIRDLLDFAGLSSNLAYITSVFIGYI
290 | GTDSIGSLIKRFAAKKAGVEDGRNQ
291 | 


--------------------------------------------------------------------------------
/phage/vir_translated_cds.faa:
--------------------------------------------------------------------------------
  1 | >lcl|NC_001416.1_prot_NP_040580.1_1 [gene=nu1] [locus_tag=lambdap01] [db_xref=GeneID:2703523] [protein=DNA packaging protein] [protein_id=NP_040580.1] [location=191..736] [gbkey=CDS]
  2 | MEVNKKQLADIFGASIRTIQNWQEQGMPVLRGGGKGNEVLYDSAAVIKWYAERDAEIENEKLRREVEELRQASEADLQPG
  3 | TIEYERHRLTRAQADAQELKNARDSAEVVETAFCTFVLSRIAGEIASILDGLPLSVQRRFPELENRHVDFLKRDIIKAMN
  4 | KAAALDELIPGLLSEYIEQSG
  5 | >lcl|NC_001416.1_prot_NP_040581.1_2 [gene=A] [locus_tag=lambdap02] [db_xref=GeneID:2703524] [protein=DNA packaging protein] [protein_id=NP_040581.1] [location=711..2636] [gbkey=CDS]
  6 | MNISNSQVNRLRHFVRAGLRSLFRPEPQTAVEWADANYYLPKESAYQEGRWETLPFQRAIMNAMGSDYIREVNVVKSARV
  7 | GYSKMLLGVYAYFIEHKQRNTLIWLPTDGDAENFMKTHVEPTIRDIPSLLALAPWYGKKHRDNTLTMKRFTNGRGFWCLG
  8 | GKAAKNYREKSVDVAGYDELAAFDDDIEQEGSPTFLGDKRIEGSVWPKSIRGSTPKVRGTCQIERAASESPHFMRFHVAC
  9 | PHCGEEQYLKFGDKETPFGLKWTPDDPSSVFYLCEHNACVIRQQELDFTDARYICEKTGIWTRDGILWFSSSGEEIEPPD
 10 | SVTFHIWTAYSPFTTWVQIVKDWMKTKGDTGKRKTFVNTTLGETWEAKIGERPDAEVMAERKEHYSAPVPDRVAYLTAGI
 11 | DSQLDRYEMRVWGWGPGEESWLIDRQIIMGRHDDEQTLLRVDEAINKTYTRRNGAEMSISRICWDTGGIDPTIVYERSKK
 12 | HGLFRVIPIKGASVYGKPVASMPRKRNKNGVYLTEIGTDTAKEQIYNRFTLTPEGDEPLPGAVHFPNNPDIFDLTEAQQL
 13 | TAEEQVEKWVDGRKKILWDSKKRRNEALDCFVYALAALRISISRWQLDLSALLASLQEEDGAATNKKTLADYARALSGED
 14 | E
 15 | >lcl|NC_001416.1_prot_NP_040582.1_3 [gene=W] [locus_tag=lambdap03] [db_xref=GeneID:2703525] [protein=head-tail joining protein] [protein_id=NP_040582.1] [location=2633..2839] [gbkey=CDS]
 16 | MTRQEELAAARAALHDLMTGKRVATVQKDGRRVEFTATSVSDLKKYIAELEVQTGMTQRRRGPAGFYV
 17 | >lcl|NC_001416.1_prot_NP_040583.1_4 [gene=B] [locus_tag=lambdap04] [db_xref=GeneID:2703526] [protein=capsid component] [protein_id=NP_040583.1] [location=2836..4437] [gbkey=CDS]
 18 | MKTPTIPTLLGPDGMTSLREYAGYHGGGSGFGGQLRSWNPPSESVDAALLPNFTRGNARADDLVRNNGYAANAIQLHQDH
 19 | IVGSFFRLSHRPSWRYLGIGEEEARAFSREVEAAWKEFAEDDCCCIDVERKRTFTMMIREGVAMHAFNGELFVQATWDTS
 20 | SSRLFRTQFRMVSPKRISNPNNTGDSRNCRAGVQINDSGAALGYYVSEDGYPGWMPQKWTWIPRELPGGRASFIHVFEPV
 21 | EDGQTRGANVFYSVMEQMKMLDTLQNTQLQSAIVKAMYAATIESELDTQSAMDFILGANSQEQRERLTGWIGEIAAYYAA
 22 | APVRLGGAKVPHLMPGDSLNLQTAQDTDNGYSVFEQSLLRYIAAGLGVSYEQLSRNYAQMSYSTARASANESWAYFMGRR
 23 | KFVASRQASQMFLCWLEEAIVRRVVTLPSKARFSFQEARSAWGNCDWIGSGRMAIDGLKEVQEAVMLIEAGLSTYEKECA
 24 | KRGDDYQEIFAQQVRETMERRAAGLKPPAWAAAAFESGLRQSTEEEKSDSRAA
 25 | >lcl|NC_001416.1_prot_NP_040584.1_5 [gene=C] [locus_tag=lambdap05] [db_xref=GeneID:2703527] [protein=capsid component] [protein_id=NP_040584.1] [location=4418..5737] [gbkey=CDS]
 26 | MTAELRNLPHIASMAFNEPLMLEPAYARVFFCALAGQLGISSLTDAVSGDSLTAQEALATLALSGDDDGPRQARSYQVMN
 27 | GIAVLPVSGTLVSRTRALQPYSGMTGYNGIIARLQQAASDPMVDGILLDMDTPGGMVAGAFDCADIIARVRDIKPVWALA
 28 | NDMNCSAGQLLASAASRRLVTQTARTGSIGVMMAHSNYGAALEKQGVEITLIYSGSHKVDGNPYSHLPDDVRETLQSRMD
 29 | ATRQMFAQKVSAYTGLSVQVVLDTEAAVYSGQEAIDAGLADELVNSTDAITVMRDALDARKSRLSGGRMTKETQSTTVSA
 30 | TASQADVTDVVPATEGENASAAQPDVNAQITAAVAAENSRIMGILNCEEAHGREEQARVLAETPGMTVKTARRILAAAPQ
 31 | SAQARSDTALDRLMQGAPAPLAAGNPASDAVNDLLNTPV
 32 | >lcl|NC_001416.1_prot_NP_040585.1_6 [gene=nu3] [locus_tag=lambdap06] [db_xref=GeneID:2703528] [protein=capsid assembly protein] [protein_id=NP_040585.1] [location=5132..5737] [gbkey=CDS]
 33 | MDATRQMFAQKVSAYTGLSVQVVLDTEAAVYSGQEAIDAGLADELVNSTDAITVMRDALDARKSRLSGGRMTKETQSTTV
 34 | SATASQADVTDVVPATEGENASAAQPDVNAQITAAVAAENSRIMGILNCEEAHGREEQARVLAETPGMTVKTARRILAAA
 35 | PQSAQARSDTALDRLMQGAPAPLAAGNPASDAVNDLLNTPV
 36 | >lcl|NC_001416.1_prot_NP_040586.1_7 [gene=D] [locus_tag=lambdap07] [db_xref=GeneID:2703529] [protein=head-DNA stabilization protein] [protein_id=NP_040586.1] [location=5747..6079] [gbkey=CDS]
 37 | MTSKETFTHYQPQGNSDPAHTATAPGGLSAKAPAMTPLMLDTSSRKLVAWDGTTDGAAVGILAVAADQTSTTLTFYKSGT
 38 | FRYEDVLWPEAASDETKKRTAFAGTAISIV
 39 | >lcl|NC_001416.1_prot_NP_040587.1_8 [gene=E] [locus_tag=lambdap08] [db_xref=GeneID:2703482] [protein=capsid component] [protein_id=NP_040587.1] [location=6135..7160] [gbkey=CDS]
 40 | MSMYTTAQLLAANEQKFKFDPLFLRLFFRESYPFTTEKVYLSQIPGLVNMALYVSPIVSGEVIRSRGGSTSEFTPGYVKP
 41 | KHEVNPQMTLRRLPDEDPQNLADPAYRRRRIIMQNMRDEELAIAQVEEMQAVSAVLKGKYTMTGEAFDPVEVDMGRSEEN
 42 | NITQSGGTEWSKRDKSTYDPTDDIEAYALNASGVVNIIVFDPKGWALFRSFKAVKEKLDTRRGSNSELETAVKDLGKAVS
 43 | YKGMYGDVAIVVYSGQYVENGVKKNFLPDNTMVLGNTQARGLRTYGCIQDADAQREGINASARYPKNWVTTGDPAREFTM
 44 | IQSAPLMLLADPDEFVSVQLA
 45 | >lcl|NC_001416.1_prot_NP_040588.1_9 [gene=Fi] [locus_tag=lambdap09] [db_xref=GeneID:2703483] [protein=DNA packaging protein] [protein_id=NP_040588.1] [location=7202..7600] [gbkey=CDS]
 46 | MTKDELIARLRSLGEQLNRDVSLTGTKEELALRVAELKEELDDTDETAGQDTPLSRENVLTGHENEVGSAQPDTVILDTS
 47 | ELVTVVALVKLHTDALHATRDEPVAFVLPGTAFRVSAGVAAEMTERGLARMQ
 48 | >lcl|NC_001416.1_prot_NP_040589.1_10 [gene=Fii] [locus_tag=lambdap10] [db_xref=GeneID:2703484] [protein=head-tail joining protein] [protein_id=NP_040589.1] [location=7612..7965] [gbkey=CDS]
 49 | MADFDNLFDAAIARADETIRGYMGTSATITSGEQSGAVIRGVFDDPENISYAGQGVRVEGSSPSLFVRTDEVRQLRRGDT
 50 | LTIGEENFWVDRVSPDDGGSCHLWLGRGVPPAVNRRR
 51 | >lcl|NC_001416.1_prot_NP_040590.1_11 [gene=Z] [locus_tag=lambdap11] [db_xref=GeneID:2703485] [protein=tail component] [protein_id=NP_040590.1] [location=7977..8555] [gbkey=CDS]
 52 | MAIKGLEQAVENLSRISKTAVPGAAAMAINRVASSAISQSASQVARETKVRRKLVKERARLKRATVKNPQARIKVNRGDL
 53 | PVIKLGNARVVLSRRRRRKKGQRSSLKGGGSVLVVGNRRIPGAFIQQLKNGRWHVMQRVAGKNRYPIDVVKIPMAVPLTT
 54 | AFKQNIERIRRERLPKELGYALQHQLRMVIKR
 55 | >lcl|NC_001416.1_prot_NP_040591.1_12 [gene=U] [locus_tag=lambdap12] [db_xref=GeneID:2703486] [protein=tail component] [protein_id=NP_040591.1] [location=8552..8947] [gbkey=CDS]
 56 | MKHTELRAAVLDALEKHDTGATFFDGRPAVFDEADFPAVAVYLTGAEYTGEELDSDTWQAELHIEVFLPAQVPDSELDAW
 57 | MESRIYPVMSDIPALSDLITSMVASGYDYRRDDDAGLWSSADLTYVITYEM
 58 | >lcl|NC_001416.1_prot_NP_040592.1_13 [gene=V] [locus_tag=lambdap13] [db_xref=GeneID:2703487] [protein=tail component] [protein_id=NP_040592.1] [location=8955..9695] [gbkey=CDS]
 59 | MPVPNPTMPVKGAGTTLWVYKGSGDPYANPLSDVDWSRLAKVKDLTPGELTAESYDDSYLDDEDADWTATGQGQKSAGDT
 60 | SFTLAWMPGEQGQQALLAWFNEGDTRAYKIRFPNGTVDVFRGWVSSIGKAVTAKEVITRTVKVTNVGRPSMAEDRSTVTA
 61 | ATGMTVTPASTSVVKGQSTTLTVAFQPEGVTDKSFRAVSADKTKATVSVSGMTITVNGVAAGKVNIPVVSGNGEFAAVAE
 62 | ITVTAS
 63 | >lcl|NC_001416.1_prot_NP_040593.1_14 [gene=G] [locus_tag=lambdap14] [db_xref=GeneID:2703488] [protein=tail component] [protein_id=NP_040593.1] [location=9711..10133] [gbkey=CDS]
 64 | MFLKTESFEHNGVTVTLSELSALQRIEHLALMKRQAEQAESDSNRKFTVEDAIRTGAFLVAMSLWHNHPQKTQMPSMNEA
 65 | VKQIEQEVLTTWPTEAISHAENVVYRLSGMYEFVVNNAPEQTEDAGPAEPVSAGKCSTVS
 66 | >lcl|NC_001416.1_prot_NP_040594.1_15 [gene=T] [locus_tag=lambdap15] [db_xref=GeneID:2703489] [protein=tail component] [protein_id=NP_040594.1] [location=10115..10549] [gbkey=CDS]
 67 | MFDGELSFALKLAREMGRPDWRAMLAGMSSTEYADWHRFYSTHYFHDVLLDMHFSGLTYTVLSLFFSDPDMHPLDFSLLN
 68 | RREADEEPEDDVLMQKAAGLAGGVRFGPDGNEVIPASPDVADMTEDDVMLMTVSEGIAGGVRYG
 69 | >lcl|NC_001416.1_prot_NP_040595.1_16 [gene=H] [locus_tag=lambdap16] [db_xref=GeneID:2703511] [protein=tail component] [protein_id=NP_040595.1] [location=10542..13103] [gbkey=CDS]
 70 | MAEPVGDLVVDLSLDAARFDEQMARVRRHFSGTESDAKKTAAVVEQSLSRQALAAQKAGISVGQYKAAMRMLPAQFTDVA
 71 | TQLAGGQSPWLILLQQGGQVKDSFGGMIPMFRGLAGAITLPMVGATSLAVATGALAYAWYQGNSTLSDFNKTLVLSGNQA
 72 | GLTADRMLVLSRAGQAAGLTFNQTSESLSALVKAGVSGEAQIASISQSVARFSSASGVEVDKVAEAFGKLTTDPTSGLTA
 73 | MARQFHNVSAEQIAYVAQLQRSGDEAGALQAANEAATKGFDDQTRRLKENMGTLETWADRTARAFKSMWDAVLDIGRPDT
 74 | AQEMLIKAEAAYKKADDIWNLRKDDYFVNDEARARYWDDREKARLALEAARKKAEQQTQQDKNAQQQSDTEASRLKYTEE
 75 | AQKAYERLQTPLEKYTARQEELNKALKDGKILQADYNTLMAAAKKDYEATLKKPKQSSVKVSAGDRQEDSAHAALLTLQA
 76 | ELRTLEKHAGANEKISQQRRDLWKAESQFAVLEEAAQRRQLSAQEKSLLAHKDETLEYKRQLAALGDKVTYQERLNALAQ
 77 | QADKFAQQQRAKRAAIDAKSRGLTDRQAEREATEQRLKEQYGDNPLALNNVMSEQKKTWAAEDQLRGNWMAGLKSGWSEW
 78 | EESATDSMSQVKSAATQTFDGIAQNMAAMLTGSEQNWRSFTRSVLSMMTEILLKQAMVGIVGSIGSAIGGAVGGGASASG
 79 | GTAIQAAAAKFHFATGGFTGTGGKYEPAGIVHRGEFVFTKEATSRIGVGNLYRLMRGYATGGYVGTPGSMADSRSQASGT
 80 | FEQNNHVVINNDGTNGQIGPAALKAVYDMARKGARDEIQTQMRDGGLFSGGGR
 81 | >lcl|NC_001416.1_prot_NP_040596.1_17 [gene=M] [locus_tag=lambdap17] [db_xref=GeneID:2703512] [protein=tail component] [protein_id=NP_040596.1] [location=13100..13429] [gbkey=CDS]
 82 | MKTFRWKVKPGMDVASVPSVRKVRFGDGYSQRAPAGLNANLKTYSVTLSVPREEATVLESFLEEHGGWKSFLWTPPYEWR
 83 | QIKVTCAKWSSRVSMLRVEFSAEFEQVVN
 84 | >lcl|NC_001416.1_prot_NP_040597.1_18 [gene=L] [locus_tag=lambdap18] [db_xref=GeneID:2703513] [protein=tail component] [protein_id=NP_040597.1] [location=13429..14127] [gbkey=CDS]
 85 | MQDIRQETLNECTRAEQSASVVLWEIDLTEVGGERYFFCNEQNEKGEPVTWQGRQYQPYPIQGSGFELNGKGTSTRPTLT
 86 | VSNLYGMVTGMAEDMQSLVGGTVVRRKVYARFLDAVNFVNGNSYADPEQEVISRWRIEQCSELSAVSASFVLSTPTETDG
 87 | AVFPGRIMLANTCTWTYRGDECGYSGPAVADEYDQPTSDITKDKCSKCLSGCKFRNNVGNFGGFLSINKLSQ
 88 | >lcl|NC_001416.1_prot_NP_040598.1_19 [gene=K] [locus_tag=lambdap19] [db_xref=GeneID:2703514] [protein=tail component] [protein_id=NP_040598.1] [location=14276..14875] [gbkey=CDS]
 89 | MSPEDWLQAEMQGEIVALVHSHPGGLPWLSEADRRLQVQSDLPWWLVCRGTIHKFRCVPHLTGRRFEHGVTDCYTLFRDA
 90 | YHLAGIEMPDFHREDDWWRNGQNLYLDNLEATGLYQVPLSAAQPGDVLLCCFGSSVPNHAAIYCGDGELLHHIPEQLSKR
 91 | ERYTDKWQRRTHSLWRHRAWRASAFTGIYNDLVAASTFV
 92 | >lcl|NC_001416.1_prot_NP_040599.1_20 [gene=I] [locus_tag=lambdap20] [db_xref=GeneID:2703515] [protein=tail component] [protein_id=NP_040599.1] [location=14773..15444] [gbkey=CDS]
 93 | MAATHTLPLASPGMARICLYGDLQRFGRRIDLRVKTGAEAIRALATQLPAFRQKLSDGWYQVRIAGRDVSTSGLTAQLHE
 94 | TLPDGAVIHIVPRVAGAKSGGVFQIVLGAAAIAGSFFTAGATLAAWGAAIGAGGMTGILFSLGASMVLGGVAQMLAPKAR
 95 | TPRIQTTDNGKQNTYFSSLDNMVAQGNVLPVLYGEMRVGSRVVSQEISTADEGDGGQVVVIGR
 96 | >lcl|NC_001416.1_prot_NP_040600.1_21 [gene=J] [locus_tag=lambdap21] [db_xref=GeneID:2703516] [protein=tail:host specificity protein] [protein_id=NP_040600.1] [location=15505..18903] [gbkey=CDS]
 97 | MGKGSSKGHTPREAKDNLKSTQLLSVIDAISEGPIEGPVDGLKSVLLNSTPVLDTEGNTNISGVTVVFRAGEQEQTPPEG
 98 | FESSGSETVLGTEVKYDTPITRTITSANIDRLRFTFGVQALVETTSKGDRNPSEVRLLVQIQRNGGWVTEKDITIKGKTT
 99 | SQYLASVVMGNLPPRPFNIRMRRMTPDSTTDQLQNKTLWSSYTEIIDVKQCYPNTALVGVQVDSEQFGSQQVSRNYHLRG
100 | RILQVPSNYNPQTRQYSGIWDGTFKPAYSNNMAWCLWDMLTHPRYGMGKRLGAADVDKWALYVIGQYCDQSVPDGFGGTE
101 | PRITCNAYLTTQRKAWDVLSDFCSAMRCMPVWNGQTLTFVQDRPSDKTWTYNRSNVVMPDDGAPFRYSFSALKDRHNAVE
102 | VNWIDPNNGWETATELVEDTQAIARYGRNVTKMDAFGCTSRGQAHRAGLWLIKTELLETQTVDFSVGAEGLRHVPGDVIE
103 | ICDDDYAGISTGGRVLAVNSQTRTLTLDREITLPSSGTALISLVDGSGNPVSVEVQSVTDGVKVKVSRVPDGVAEYSVWE
104 | LKLPTLRQRLFRCVSIRENDDGTYAITAVQHVPEKEAIVDNGAHFDGEQSGTVNGVTPPAVQHLTAEVTADSGEYQVLAR
105 | WDTPKVVKGVSFLLRLTVTADDGSERLVSTARTTETTYRFTQLALGNYRLTVRAVNAWGQQGDPASVSFRIAAPAAPSRI
106 | ELTPGYFQITATPHLAVYDPTVQFEFWFSEKQIADIRQVETSTRYLGTALYWIAASINIKPGHDYYFYIRSVNTVGKSAF
107 | VEAVGRASDDAEGYLDFFKGKITESHLGKELLEKVELTEDNASRLEEFSKEWKDASDKWNAMWAVKIEQTKDGKHYVAGI
108 | GLSMEDTEEGKLSQFLVAANRIAFIDPANGNETPMFVAQGNQIFMNDVFLKRLTAPTITSGGNPPAFSLTPDGKLTAKNA
109 | DISGSVNANSGTLSNVTIAENCTINGTLRAEKIVGDIVKAASAAFPRQRESSVDWPSGTRTVTVTDDHPFDRQIVVLPLT
110 | FRGSKRTVSGRTTYSMCYLKVLMNGAVIYDGAANEAVQVFSRIVDMPAGRGNVILTFTLTSTRHSADIPPYTFASDVQVM
111 | VIKKQALGISVV
112 | >lcl|NC_001416.1_prot_NP_040601.1_22 [gene=lom] [locus_tag=lambdap26] [db_xref=GeneID:2703517] [protein=outer host membrane] [protein_id=NP_040601.1] [location=18965..19585] [gbkey=CDS]
113 | MRNVCIAVAVFAALAVTVTPARAEGGHGTFTVGYFQVKPGTLPSLSGGDTGVSHLKGINVKYRYELTDSVGVMASLGFAA
114 | SKKSSTVMTGEDTFHYESLRGRYVSVMAGPVLQISKQVSAYAMAGVAHSRWSGSTMDYRKTEITPGYMKETTTARDESAM
115 | RHTSVAWSAGIQINPAASVVVDIAYEGSGSGDWRTDGFIVGVGYKF
116 | >lcl|NC_001416.1_prot_NP_040602.1_23 [gene=orf-401] [locus_tag=lambdap27] [db_xref=GeneID:2703518] [protein=Tail fiber protein] [protein_id=NP_040602.1] [location=19650..20855] [gbkey=CDS]
117 | MAVKISGVLKDGTGKPVQNCTIQLKARRNSTTVVVNTVGSENPDEAGRYSMDVEYGQYSVILQVDGFPPSHAGTITVYED
118 | SQPGTLNDFLCAMTEDDARPEVLRRLELMVEEVARNASVVAQSTADAKKSAGDASASAAQVAALVTDATDSARAASTSAG
119 | QAASSAQEASSGAEAASAKATEAEKSAAAAESSKNAAATSAGAAKTSETNAAASQQSAATSASTAATKASEAATSARDAV
120 | ASKEAAKSSETNASSSAGRAASSATAAENSARAAKTSETNARSSETAAERSASAAADAKTAAAGSASTASTKATEAAGSA
121 | VSASQSKSAAEAAAIRAKNSAKRAEDIASAVALEDADTTRKGIVQLSSATNSTSETLAATPKAVKVVMDETNRKAHWTVR
122 | H
123 | >lcl|NC_001416.1_prot_NP_040603.1_24 [gene=orf206b] [locus_tag=lambdap90] [db_xref=GeneID:3827061] [protein=hypothetical protein] [protein_id=NP_040603.1] [location=complement(20147..20767)] [gbkey=CDS]
124 | MLLVALLSCTIPFLVVSASSSATAEAISSARFAEFFARIAAASAALLLCDADTALPAASVAFVDAVDALPAAAVFASAAA
125 | EALRSAAVSDDLAFVSDVFAALAEFSAAVAEEAARPALDDAFVSDDFAASFEATASRAEVAASDAFVAAVEADVAADCCD
126 | AAAFVSDVFAAPALVAAAFFEDSAAAALFSASVAFADAASAPEDAS
127 | >lcl|NC_001416.1_prot_NP_040604.1_25 [gene=orf-314] [locus_tag=lambdap28] [db_xref=GeneID:2703519] [protein=Tail fiber] [protein_id=NP_040604.1] [location=21029..21973] [gbkey=CDS]
128 | MTNALAGKQPKNATLTALAGLSTAKNKLPYFAENDAASLTELTQVGRDILAKNSVADVLEYLGAGENSAFPAGAPIPWPS
129 | DIVPSGYVLMQGQAFDKSAYPKLAVAYPSGVLPDMRGWTIKGKPASGRAVLSQEQDGIKSHTHSASASGTDLGTKTTSSF
130 | DYGTKTTGSFDYGTKSTNNTGAHAHSLSGSTGAAGAHAHTSGLRMNSSGWSQYGTATITGSLSTVKGTSTQGIAYLSKTD
131 | SQGSHSHSLSGTAVSAGAHAHTVGIGAHQHPVVIGAHAHSFSIGSHGHTITVNAAGNAENTVKNIAFNYIVRLA
132 | >lcl|NC_001416.1_prot_NP_040605.1_26 [gene=orf-194] [locus_tag=lambdap29] [db_xref=GeneID:2703503] [protein=Putative fiber assembly protein] [protein_id=NP_040605.1] [location=21973..22557] [gbkey=CDS]
133 | MAFRMSEQPRTIKIYNLLAGTNEFIGEGDAYIPPHTGLPANSTDIAPPDIPAGFVAVFNSDEASWHLVEDHRGKTVYDVA
134 | SGDALFISELGPLPENFTWLSPGGEYQKWNGTAWVKDTEAEKLFRIREAEETKKSLMQVASEHIAPLQDAADLEIATKEE
135 | TSLLEAWKKYRVLLNRVDTSTAPDIEWPAVPVME
136 | >lcl|NC_001416.1_prot_NP_040606.1_27 [gene=ea47] [locus_tag=lambdap80] [db_xref=GeneID:3827051] [protein=ea47] [protein_id=NP_040606.1] [location=complement(22686..23918)] [gbkey=CDS]
137 | MTKKPWERRLKDLSHLLKCCIDTYFDPELFRLNLNQFLQTARTVTFIIQKNKNQIIGYDIWYNNNVIEKWKNDPLMAWAK
138 | NSRNTIEKQGDLEMYSEAKATLISSYIEENDIEFITNESMLNIGIKKLVRLAQKKLPSYLTESSIIKSERRWVANTLKDY
139 | ELLHALAIIYGRMYNCCNSLGIQINNPMGDDVISPTSFDSLFDEARRITYLKLKDYSISKLSFSMIQYDNKIIPEDIKER
140 | LKLVDKPKNITSTEELVDYTAKLAETTFLKDGYHIQTLIFYDKQFHPIDLINTTFEDQADKYIFWRYAADRAKITNAYGF
141 | IWISELWLRKASIYSNKPIHTMPIIDERLQVIGIDSNNNQKCISWKIVRENEEKKPTLEISTADSKHDEKPYFMRSVLKA
142 | IGGDVNTMNN
143 | >lcl|NC_001416.1_prot_NP_040607.1_28 [gene=ea31] [locus_tag=lambdap81] [db_xref=GeneID:3827052] [protein=ea31] [protein_id=NP_040607.1] [location=complement(24509..25399)] [gbkey=CDS]
144 | MKKLPLPARTYSEMLNKCSEGMMQINVRNNFITHFPTFLQKEQQYRILSSTGQLFTYDRTHPLEPTTLVVGNLTKVKLEK
145 | LYENNLRDKNKPARTYYDDMLVSSGEKCPFCGDIGQTKNIDHFLPIAHYPEFSVMPINLVPSCRDCNMGEKGQVFAVDEV
146 | HQAIHPYIDKDIFFREQWVYANFVSGTPGAISFYVECPANWRQEDKHRALHHFKLLNIANRYRLEAGKHLSEVITQRNSF
147 | VKVIRKYSSTATFQQLQSEFIEANLKPIIDLNDFPNYWKRVMYQCLANSEDFFRGI
148 | >lcl|NC_001416.1_prot_NP_040608.1_29 [gene=ea59] [locus_tag=lambdap82] [db_xref=GeneID:3827053] [protein=ea59] [protein_id=NP_040608.1] [location=complement(25396..26973)] [gbkey=CDS]
149 | MLEFSVIERGGYIPAVEKNKAFLRADGWNDYSFVTMFYLTVFDEHGEKCDIGNVKIGFVGQKEEVSTYSLIDKKFSQLPE
150 | MFFSLGESIDYYVNLSKLSDGFKHNLLKAIQDLVVWPNRLADIENESVLNTSLLRGVTLSEIHGQFARVLNGLPELSDFH
151 | FSFNRKSAPGFSDLTIPFEVTVNSMPSTNIHAFIGRNGCGKTTILNGMIGAITNPENNEYFFSENNRLIESRIPKGYFRS
152 | LVSVSFSAFDPFTPPKEQPDPAKGTQYFYIGLKNAASNSLKSLGDLRLEFISAFIGCMRVDRKRQLWLEAIKKLSSDENF
153 | SNMELISLISKYEELRRNEPQIQVDDDKFTKLFYDNIQKYLLRMSSGHAIVLFTITRLVDVVGEKSLVLFDEPEVHLHPP
154 | LLSAFLRTLSDLLDARNGVAIIATHSPVVLQEVPKSCMWKVLRSREAINIIRPDIETFGENLGVLTREVFLLEVTNSGYH
155 | HLLSQSVDSELSYETILKNYNGQIGLEGRTVLKAMIMNRDEGKVQ
156 | >lcl|NC_001416.1_prot_NP_040609.1_30 [gene=int] [locus_tag=lambdap33] [db_xref=GeneID:2703470] [protein=integration protein] [protein_id=NP_040609.1] [location=complement(27812..28882)] [gbkey=CDS]
157 | MGRRRSHERRDLPPNLYIRNNGYYCYRDPRTGKEFGLGRDRRIAITEAIQANIELFSGHKHKPLTARINSDNSVTLHSWL
158 | DRYEKILASRGIKQKTLINYMSKIKAIRRGLPDAPLEDITTKEIAAMLNGYIDEGKAASAKLIRSTLSDAFREAIAEGHI
159 | TTNHVAATRAAKSEVRRSRLTADEYLKIYQAAESSPCWLRLAMELAVVTGQRVGDLCEMKWSDIVDGYLYVEQSKTGVKI
160 | AIPTALHIDALGISMKETLDKCKEILGGETIIASTRREPLSSGTVSRYFMRARKASGLSFEGDPPTFHELRSLSARLYEK
161 | QISDKFAQHLLGHKSDTMASQYRDDRGREWDKIEIK
162 | >lcl|NC_001416.1_prot_NP_040610.1_31 [gene=xis] [locus_tag=lambdap34] [db_xref=GeneID:2703504] [protein=Excisionase] [protein_id=NP_040610.1] [location=complement(28860..29078)] [gbkey=CDS]
163 | MYLTLQEWNARQRRPRSLETVRRWVRECRIFPPPVKDGREYLFHESAVKVDLNRPVTGGLLKRIRNGKKAKS
164 | >lcl|NC_001416.1_prot_NP_597778.1_32 [locus_tag=lambdap35] [db_xref=GeneID:2703530] [protein=hypothetical protein] [protein_id=NP_597778.1] [location=complement(29118..29285)] [gbkey=CDS]
165 | MHFRVTGEWNGEPFNRVIEAENINDCYDHWMIWAQIAHADVTNIRIEELKEHQAA
166 | >lcl|NC_001416.1_prot_NP_040611.1_33 [gene=ea8.5] [locus_tag=lambdap36] [db_xref=GeneID:2703505] [protein=ea8.5] [protein_id=NP_040611.1] [location=complement(29374..29655)] [gbkey=CDS]
167 | MSINELESEQKDWALSMLCRSGVLSPCRHHEGVYVDEGIDIESAYKYSMKVYKSNEDKSPFCNVREMTDTVQNYYHEYGG
168 | NDTCPLCTKHIDD
169 | >lcl|NC_001416.1_prot_NP_040612.1_34 [gene=ea22] [locus_tag=lambdap83] [db_xref=GeneID:3827054] [protein=ea22] [protein_id=NP_040612.1] [location=complement(29847..30395)] [gbkey=CDS]
170 | MSEINSQALREAAEQAMHDDWGFDADLFHELVTPSIVLELLDERERNQQYIKRRDQENEDIALTVGKLRVELETAKSKLN
171 | EQREYYEGVISDGSKRIAKLESNEVREDGNQFLVVRHPGKTPVIKHCTGDLEEFLRQLIEQDPLVTIDIITHRYYGVGGQ
172 | WVQDAGEYLHMMSDAGIRIKGE
173 | >lcl|NC_001416.1_prot_NP_040613.1_35 [gene=orf61] [locus_tag=lambdap37] [db_xref=GeneID:2703506] [protein=hypothetical protein] [protein_id=NP_040613.1] [location=complement(30839..31024)] [gbkey=CDS]
174 | MRETRYDNHGMHFSGSGLHILCAYACRHGTCSMTPQQENALRSIARQANSEIKKSQTAVSG
175 | >lcl|NC_001416.1_prot_NP_040614.1_36 [gene=orf63] [locus_tag=lambdap38] [db_xref=GeneID:2703507] [protein=hypothetical protein] [protein_id=NP_040614.1] [location=complement(31005..31196)] [gbkey=CDS]
176 | MHKASSVELRTSIEMAHSLAQIGIRFVPIPVETDEEFHTLAASLSQKLEMMVAKAEADERNQV
177 | >lcl|NC_001416.1_prot_NP_040615.1_37 [gene=orf60a] [locus_tag=lambdap39] [db_xref=GeneID:2703508] [protein=hypothetical protein] [protein_id=NP_040615.1] [location=complement(31169..31351)] [gbkey=CDS]
178 | MTHPHDNIRVGAITFVYSVTKRGWVFPGLSVIRNPLKAQRLAEEINNKRGAVCTKHLLLS
179 | >lcl|NC_001416.1_prot_NP_040616.1_38 [gene=exo] [locus_tag=lambdap41] [db_xref=GeneID:2703522] [protein=exonuclease] [protein_id=NP_040616.1] [location=complement(31348..32028)] [gbkey=CDS]
180 | MTPDIILQRTGIDVRAVEQGDDAWHKLRLGVITASEVHNVIAKPRSGKKWPDMKMSYFHTLLAEVCTGVAPEVNAKALAW
181 | GKQYENDARTLFEFTSGVNVTESPIIYRDESMRTACSPDGLCSDGNGLELKCPFTSRDFMKFRLGGFEAIKSAYMAQVQY
182 | SMWVTRKNAWYFANYDPRMKREGLHYVVIERDEKYMASFDEIVPEFIEKMDEALAEIGFVFGEQWR
183 | >lcl|NC_001416.1_prot_NP_040617.1_39 [gene=bet] [locus_tag=lambdap84] [db_xref=GeneID:3827055] [protein=bet] [protein_id=NP_040617.1] [location=complement(32025..32810)] [gbkey=CDS]
184 | MSTALATLAGKLAERVGMDSVDPQELITTLRQTAFKGDASDAQFIALLIVANQYGLNPWTKEIYAFPDKQNGIVPVVGVD
185 | GWSRIINENQQFDGMDFEQDNESCTCRIYRKDRNHPICVTEWMDECRREPFKTREGREITGPWQSHPKRMLRHKAMIQCA
186 | RLAFGFAGIYDKDEAERIVENTAYTAERQPERDITPVNDETMQEINTLLIALDKTWDDDLLPLCSQIFRRDIRASSELTQ
187 | AEAVKALGFLKQKAAEQKVAA
188 | >lcl|NC_001416.1_prot_NP_040618.1_40 [gene=gam] [locus_tag=lambdap42] [db_xref=GeneID:2703509] [protein=host-nuclease inhibitor protein Gam] [protein_id=NP_040618.1] [location=complement(32816..33232)] [gbkey=CDS]
189 | MDINTETEIKQKHSLTPFPVFLISPAFRGRYFHSYFRSSAMNAYYIQDRLEAQSWARHYQQLAREEKEAELADDMEKGLP
190 | QHLFESLCIDHLQRHGASKKSITRAFDDDVEFQERMAEHIRYMVETIAHHQVDIDSEV
191 | >lcl|NC_001416.1_prot_NP_040619.1_41 [gene=kil] [locus_tag=lambdap85] [db_xref=GeneID:3827057] [protein=host-killing protein] [protein_id=NP_040619.1] [location=complement(33187..33330)] [gbkey=CDS]
192 | MDQTLMAIQTKFTIATFIGDEKMFREAVDAYKKWILILKLRSSKSIH
193 | >lcl|NC_001416.1_prot_NP_040620.1_42 [gene=cIII] [locus_tag=lambdap86] [db_xref=GeneID:3827056] [protein=antitermination protein] [protein_id=NP_040620.1] [location=complement(33299..33463)] [gbkey=CDS]
194 | MQYAIAGWPVAGCPSESLLERITRKLRDGWKRLIDILNQPGVPKNGSNTYGYPD
195 | >lcl|NC_001416.1_prot_NP_040621.1_43 [gene=ea10] [locus_tag=lambdap45] [db_xref=GeneID:2703541] [protein=Putative single-stranded DNA binding protein] [protein_id=NP_040621.1] [location=complement(33536..33904)] [gbkey=CDS]
196 | MSNIKKYIIDYDWKASIEIEIDHDVMTEEKLHQINNFWSDSEYRLNKHGSVLNAVLIMLAQHALLIAISSDLNAYGVVCE
197 | FDWNDGNGQEGWPPMDGSEGIRITDIDTSGIFDSDDMTIKAA
198 | >lcl|NC_001416.1_prot_NP_040622.1_44 [gene=ral] [locus_tag=lambdap46] [db_xref=GeneID:2703473] [protein=restriction alleviation protein] [protein_id=NP_040622.1] [location=complement(34087..34287)] [gbkey=CDS]
199 | MTTTIDKNQWCGQFKRCNGCKLQSECMVKPEEMFPVMEDGKYVDKWAIRTTAMIARELGKQNNKAA
200 | >lcl|NC_001416.1_prot_NP_040623.1_45 [gene=orf28] [locus_tag=lambdap47] [db_xref=GeneID:2703510] [protein=hypothetical protein] [protein_id=NP_040623.1] [location=complement(34271..34357)] [gbkey=CDS]
201 | MEEEFEEFEEHPQDVMEQYQDYPYDYDY
202 | >lcl|NC_001416.1_prot_NP_597779.1_46 [locus_tag=lambdap48] [db_xref=GeneID:2703531] [protein=Superinfection exclusion protein B] [protein_id=NP_597779.1] [location=34482..35036] [gbkey=CDS]
203 | MMSIEMDPLVILGRVFSNEPLERTMYMIVIWVGLLLLSPDNWPEYVNERIGIPHVWHVFVFALAFSLAINVHRLSAIASA
204 | RYKRFKLRKRIKMQNDKVRSVIQNLTEEQSMVLCAALNEGRKYVVTSKQFPYISELIELGVLNKTFSRWNGKHILFPIED
205 | IYWTELVASYDPYNIEIKPRPISK
206 | >lcl|NC_001416.1_prot_NP_040625.1_47 [gene=N] [locus_tag=lambdap49] [db_xref=GeneID:2703540] [protein=early gene regulator] [protein_id=NP_040625.1] [location=complement(35037..35438)] [gbkey=CDS]
207 | MCQSRGVFVQDYNCHTPPKLTDRRIQMDAQTRRRERRAEKQAQWKAANPLLVGVSAKPVNRPILSLNRKPKSRVESALNP
208 | IDLTVLAEYHKQIESNLQRIERKNQRTWYSKPGERGITCSGRQKIKGKSIPLI
209 | >lcl|NC_001416.1_prot_NP_040626.1_48 [gene=rexb] [locus_tag=lambdap53] [db_xref=GeneID:2703493] [protein=exclusion protein] [protein_id=NP_040626.1] [location=complement(35825..36259)] [gbkey=CDS]
210 | MRNRIMPGVYIVIIPYVIVSICYLLFRHYIPGVSFSAHRDGLGATLSSYAGTMIAILIAALTFLIGSRTRRLAKIREYGY
211 | MTSVVIVYALSFVELGALFFCGLLLLSSISGYMIPTIAIGIASASFIHICILVFQLYNLTREQE
212 | >lcl|NC_001416.1_prot_NP_040627.1_49 [gene=rexa] [locus_tag=lambdap87] [db_xref=GeneID:3827058] [protein=exclusion protein] [protein_id=NP_040627.1] [location=complement(36275..37114)] [gbkey=CDS]
213 | MKNGFYATYRSKNKGKDKRSINLSVFLNSLLADNHHLQVGSNYLYIHKIDGKTFLFTKTNDKSLVQKINRSKASVEDIKN
214 | SLADDESLGFPSFLFVEGDTIGFARTVFGPTTSDLTDFLIGKGMSLSSGERVQIEPLMRGTTKDDVMHMHFIGRTTVKVE
215 | AKLPVFGDILKVLGATDIEGELFDSLDIVIKPKFKRDIKKVAKDIIFNPSPQFSDISLRAKDEAGDILTEHYLSEKGHLS
216 | APLNKVTNAEIAEEMAYCYARMKSDILECFKRQVGKVKD
217 | >lcl|NC_001416.1_prot_NP_040628.1_50 [gene=cI] [locus_tag=lambdap88] [db_xref=GeneID:3827059] [protein=repressor] [protein_id=NP_040628.1] [location=complement(37227..37940)] [gbkey=CDS]
218 | MSTKKKPLTQEQLEDARRLKAIYEKKKNELGLSQESVADKMGMGQSGVGALFNGINALNAYNAALLAKILKVSVEEFSPS
219 | IAREIYEMYEAVSMQPSLRSEYEYPVFSHVQAGMFSPELRTFTKGDAERWVSTTKKASDSAFWLEVEGNSMTAPTGSKPS
220 | FPDGMLILVDPEQAVEPGDFCIARLGGDEFTFKKLIRDSGQVFLQPLNPQYPMIPCNESCSVVGKVIASQWPEETFG
221 | >lcl|NC_001416.1_prot_NP_040629.1_51 [gene=cro] [locus_tag=lambdap57] [db_xref=GeneID:2703467] [protein=antirepressor] [protein_id=NP_040629.1] [location=38041..38241] [gbkey=CDS]
222 | MEQRITLKDYAMRFGQTKTAKDLGVYQSAINKAIHAGRKIFLTINADGSVYAEEVKPFPSNKKTTA
223 | >lcl|NC_001416.1_prot_NP_040630.1_52 [locus_tag=lambdap54] [db_xref=GeneID:2703490] [protein=cII protein] [protein_id=NP_040630.1] [location=38360..38653] [gbkey=CDS]
224 | MVRANKRNEALRIESALLNKIAMLGTEKTAEAVGVDKSQISRWKRDWIPKFSMLLAVLEWGVVDDDMARLARQVAAILTN
225 | KKRPAATERSEQIQMEF
226 | >lcl|NC_001416.1_prot_NP_040631.1_53 [locus_tag=lambdap54] [db_xref=GeneID:2703490] [protein=DNA replication protein] [protein_id=NP_040631.1] [location=38686..39585] [gbkey=CDS]
227 | MTNTAKILNFGRGNFAGQERNVADLDDGYARLSNMLLEAYSGADLTKRQFKVLLAILRKTYGWNKPMDRITDSQLSEITK
228 | LPVKRCNEAKLELVRMNIIKQQGGMFGPNKNISEWCIPQNEGKSPKTRDKTSLKLGDCYPSKQGDTKDTITKEKRKDYSS
229 | ENSGESSDQPENDLSVVKPDAAIQSGSKWGTAEDLTAAEWMFDMVKTIAPSARKPNFAGWANDIRLMRERDGRNHRDMCV
230 | LFRWACQDNFWSGNVLSPAKLRDKWTQLEINRNKQQAGVTASKPKLDLTNTDWIYGVDL
231 | >lcl|NC_001416.1_prot_NP_040632.1_54 [locus_tag=lambdap54] [db_xref=GeneID:2703490] [protein=DNA replication protein] [protein_id=NP_040632.1] [location=39582..40283] [gbkey=CDS]
232 | MKNIAAQMVNFDREQMRRIANNMPEQYDEKPQVQQVAQIINGVFSQLLATFPASLANRDQNEVNEIRRQWVLAFRENGIT
233 | TMEQVNAGMRVARRQNRPFLPSPGQFVAWCREEASVTAGLPNVSELVDMVYEYCRKRGLYPDAESYPWKSNAHYWLVTNL
234 | YQNMRANALTDAELRRKAADELVHMTARINRGEAIPEPVKQLPVMGGRPLNRAQALAKIAEIKAKFGLKGASV
235 | >lcl|NC_001416.1_prot_NP_040633.1_55 [locus_tag=lambdap54] [db_xref=GeneID:2703490] [protein=ren exclusion protein] [protein_id=NP_040633.1] [location=40280..40570] [gbkey=CDS]
236 | MTGKEAIIHYLGTHNSFCAPDVAALTGATVTSINQAAAKMARAGLLVIEGKVWRTVYYRFATREEREGKMSTNLVFKECR
237 | QSAAMKRVLAVYGVKR
238 | >lcl|NC_001416.1_prot_NP_040634.1_56 [gene=NinB] [locus_tag=lambdap63] [db_xref=GeneID:2703497] [protein=NinB] [protein_id=NP_040634.1] [location=40644..41084] [gbkey=CDS]
239 | MKKLTFEIRSPAHQQNAIHAVQQILPDPTKPIVVTIQERNRSLDQNRKLWACLGDVSRQVEWHGRWLDAESWKCVFTAAL
240 | KQQDVVPNLAGNGFVVIGQSTSRMRVGEFAELLELIQAFGTERGVKWSDEARLALEWKARWGDRAA
241 | >lcl|NC_001416.1_prot_NP_040635.1_57 [gene=NinC] [locus_tag=lambdap64] [db_xref=GeneID:2703498] [protein=NinC protein] [protein_id=NP_040635.1] [location=41081..41953] [gbkey=CDS]
242 | MINVVSFSGGRTSAYLLWLMEQKRRAGKDVHYVFMDTGCEHPMTYRFVREVVKFWDIPLTVLQVDINPELGQPNGYTVWE
243 | PKDIQTRMPVLKPFIDMVKKYGTPYVGGAFCTDRLKLVPFTKYCDDHFGRGNYTTWIGIRADEPKRLKPKPGIRYLAELS
244 | DFEKEDILAWWKQQPFDLQIPEHLGNCIFCIKKSTQKIGLACKDEEGLQRVFNEVITGSHVRDGHRETPKEIMYRGRMSL
245 | DGIAKMYSENDYQALYQDMVRAKRFDTGSCSESCEIFGGQLDFDFGREAA
246 | >lcl|NC_001416.1_prot_NP_040636.1_58 [gene=NinD] [locus_tag=lambdap65] [db_xref=GeneID:2703499] [protein=NinD protein] [protein_id=NP_040636.1] [location=41950..42123] [gbkey=CDS]
247 | MMRCYRCGECKEDNRFRPNQPYWNRWCLRCERTPTGVLPLPQEKEDVWRDSDEVSPT
248 | >lcl|NC_001416.1_prot_NP_040637.1_59 [gene=NinE] [locus_tag=lambdap66] [db_xref=GeneID:2703500] [protein=NinE protein] [protein_id=NP_040637.1] [location=42090..42272] [gbkey=CDS]
249 | MARQRRSITDIICENCKYLPTKRTRNKPKPIPKESDVKTFNYTAHLWDIRWLRRRARKTR
250 | >lcl|NC_001416.1_prot_NP_040638.1_60 [gene=NinF] [locus_tag=lambdap67] [db_xref=GeneID:2703501] [protein=NinF protein] [protein_id=NP_040638.1] [location=42269..42439] [gbkey=CDS]
251 | MIDQNRSYEQESVERALTCANCGQKLHVLEVHVCEHCCAELMSDPNSSMHEEEDDG
252 | >lcl|NC_001416.1_prot_NP_040639.1_61 [gene=NinG] [locus_tag=lambdap68] [db_xref=GeneID:2703474] [protein=NinG protein] [protein_id=NP_040639.1] [location=42429..43043] [gbkey=CDS]
253 | MMAKPARRRCKNDECREWFHPAFANQWWCSPECGTKIALERRSKEREKAEKAAEKKRRREEQKQKDKLKIRKLALKPRSY
254 | WIKQAQQAVNAFIRERDRDLPCISCGTLTSAQWDAGHYRTTAAAPQLRFNERNIHKQCVVCNQHKSGNLVPYRVELISRI
255 | GQEAVDEIESNHNRHRWTIEECKAIKAEYQQKLKDLRNSRSEAA
256 | >lcl|NC_001416.1_prot_NP_040640.1_62 [gene=NinH] [locus_tag=lambdap69] [db_xref=GeneID:2703475] [protein=NinH protein] [protein_id=NP_040640.1] [location=43040..43246] [gbkey=CDS]
257 | MTFSVKTIPDMLVEAYGNQTEVARRLKCSRGTVRKYVDDKDGKMHAIVNDVLMVHRGWSERDALLRKN
258 | >lcl|NC_001416.1_prot_NP_040641.1_63 [gene=NinI] [locus_tag=lambdap70] [db_xref=GeneID:2703476] [protein=NinI protein] [protein_id=NP_040641.1] [location=43224..43889] [gbkey=CDS]
259 | MRYYEKIDGSKYRNIWVVGDLHGCYTNLMNKLDTIGFDNKKDLLISVGDLVDRGAENVECLELITFPWFRAVRGNHEQMM
260 | IDGLSERGNVNHWLLNGGGWFFNLDYDKEILAKALAHKADELPLIIELVSKDKKYVICHADYPFDEYEFGKPVDHQQVIW
261 | NRERISNSQNGIVKEIKGADTFIFGHTPAVKPLKFANQMYIDTGAVFCGNLTLIQVQGEGA
262 | >lcl|NC_001416.1_prot_NP_040642.1_64 [gene=Q] [locus_tag=lambdap71] [db_xref=GeneID:2703477] [protein=late gene regulator] [protein_id=NP_040642.1] [location=43886..44509] [gbkey=CDS]
263 | MRLESVAKFHSPKSPMMSDSPRATASDSLSGTDVMAAMGMAQSQAGFGMAAFCGKHELSQNDKQKAINYLMQFAHKVSGK
264 | YRGVAKLEGNTKAKVLQVLATFAYADYCRSAATPGARCRDCHGTGRAVDIAKTELWGRVVEKECGRCKGVGYSRMPASAA
265 | YRAVTMLIPNLTQPTWSRTVKPLYDALVVQCHKEESIADNILNAVTR
266 | >lcl|NC_001416.1_prot_NP_040643.1_65 [gene=orf-64] [locus_tag=lambdap73] [db_xref=GeneID:2703478] [protein=hypothetical protein] [protein_id=NP_040643.1] [location=44621..44815] [gbkey=CDS]
267 | MKRGGAYYRFRLVGHFDVSSGTPTIAGREVCKMQSRNSSQVIVRACITVSGFFISAQQVRALSR
268 | >lcl|NC_001416.1_prot_NP_040644.1_66 [gene=S] [locus_tag=lambdap74] [db_xref=GeneID:2703479] [protein=anti-holin] [protein_id=NP_040644.1] [location=45186..45509] [gbkey=CDS]
269 | MKMPEKHDLLAAILAAKEQGIGAILAFAMAYLRGRYNGGAFTKTVIDATMCAIIAWFIRDLLDFAGLSSNLAYITSVFIG
270 | YIGTDSIGSLIKRFAAKKAGVEDGRNQ
271 | >lcl|NC_001416.1_prot_YP_001551775.1_67 [gene=S'] [locus_tag=lambdap92] [db_xref=GeneID:5740919] [protein=holin] [protein_id=YP_001551775.1] [location=45192..45509] [gbkey=CDS]
272 | MPEKHDLLAAILAAKEQGIGAILAFAMAYLRGRYNGGAFTKTVIDATMCAIIAWFIRDLLDFAGLSSNLAYITSVFIGYI
273 | GTDSIGSLIKRFAAKKAGVEDGRNQ
274 | >lcl|NC_001416.1_prot_NP_040645.1_68 [gene=R] [locus_tag=lambdap75] [db_xref=GeneID:2703480] [protein=endolysin] [protein_id=NP_040645.1] [location=45493..45969] [gbkey=CDS]
275 | MVEINNQRKAFLDMLAWSEGTDNGRQKTRNHGYDVIVGGELFTDYSDHPRKLVTLNPKLKSTGAGRYQLLSRWWDAYRKQ
276 | LGLKDFSPKSQDAVALQQIKERGALPMIDRGDIRQAIDRCSNIWASLPGAGYGQFEHKADSLIAKFKEAGGTVREIDV
277 | >lcl|NC_001416.1_prot_NP_040646.1_69 [gene=Rz] [locus_tag=lambdap76] [db_xref=GeneID:2703481] [protein=cell lysis protein] [protein_id=NP_040646.1] [location=45966..46427] [gbkey=CDS]
278 | MSRVTAIISALVICIIVCLSWAVNHYRDNAITYKAQRDKNARELKLANAAITDMQMRQRDVAALDAKYTKELADAKAEND
279 | ALRDDVAAGRRRLHIKAVCQSVREATTASGVDNAASPRLADTAERDYFTLRERLITMQKQLEGTQKYINEQCR
280 | >lcl|NC_001416.1_prot_YP_001551744.1_70 [gene=Rz1] [locus_tag=lambdap91] [db_xref=GeneID:5739319] [protein=Rz1 protein] [protein_id=YP_001551744.1] [location=46186..46368] [gbkey=CDS]
281 | MLKLKMMLCVMMLPLVVVGCTSKQSVSQCVKPPPPPAWIMQPPPDWQTPLNGIISPSERG
282 | >lcl|NC_001416.1_prot_NP_597780.1_71 [gene=bor] [locus_tag=lambdap77] [db_xref=GeneID:2703532] [protein=Bor protein precursor] [protein_id=NP_597780.1] [location=complement(46459..46752)] [gbkey=CDS]
283 | MKKMLLATALALLITGCAQQTFTVQNKPAAVAPKETITHHFFVSGIGQKKTVDAAKICGGAENVVKTETQQTFVNGLLGF
284 | ITLGIYTPLEARVYCSQ
285 | >lcl|NC_001416.1_prot_NP_597781.1_72 [locus_tag=lambdap78] [db_xref=GeneID:2703533] [protein=putative envelope protein] [protein_id=NP_597781.1] [location=complement(47042..47575)] [gbkey=CDS]
286 | MKTFLIFDINHQQNTRRSDQIETIKTIMQTTRPRITWKVLPMAQVAIFKEIFDQVRKDLDCELFYSELKRHNVSHYIYYL
287 | ATDNIHIVLENDNTVLIKGLKKVVNVKFSRNTHLIETSYDRLKSREITFQQYRENLAKAGVFRWVTNIHEHKRYYYTFDN
288 | SLLFTESIQNTTQIFPR
289 | >lcl|NC_001416.1_prot_NP_597782.1_73 [locus_tag=lambdap79] [db_xref=GeneID:2703534] [protein=hypothetical protein] [protein_id=NP_597782.1] [location=47738..47944] [gbkey=CDS]
290 | MNKEQSADDPSVDLIRVKNMLNSTISMSYPDVVIACIEHKVSLEAFRAIEAALVKHDNNMKDYSLVVD
291 | 


--------------------------------------------------------------------------------
/scripts/N50.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | #ABSTRACT: A script to calculate N50 from one or multiple FASTA/FASTQ files, or from STDIN.
  3 | 
  4 | use v5.12;
  5 | use Pod::Usage;
  6 | use Term::ANSIColor  qw(:constants colorvalid colored);
  7 | use Getopt::Long;
  8 | use File::Basename;
  9 | use JSON::PP;
 10 | our $BASE = basename($0);
 11 | local $Term::ANSIColor::AUTORESET = 1;
 12 | 
 13 | our %program = (  # Program metadata: name, author, contact address and version string
 14 |   'NAME'      => 'FASTx N50 CALCULATOR',
 15 |   'AUTHOR'    => 'Andrea Telatin',
 16 |   'MAIL'      => 'andrea.telatin@quadram.ac.uk',
 17 |   'VERSION'   => '1.1',
 18 | );
 19 | my $opt_separator = "\t";  # Default field separator; replaced by -s/--separator, or by ',' when the format is csv
 20 | my $opt_format = 'default';  # Default output format; replaced by -f/--format
 21 | my %formats = (  # Accepted --format names => description; the description 'Not implemented' marks an unavailable format
 22 |   'default' => 'Prints only N50 for single file, TSV for multiple files',
 23 |   'tsv'     => 'Tab separated output (file, seqs, total size, N50)',
 24 |   'full'    => 'Not implemented',
 25 |   'json'    => 'JSON (JavaScript Object Notation) output',
 26 |   'short'   => 'Not Implemented',  # NOTE(review): capitalised differently from the 'full' entry
 27 |   'csv'     => 'Alias for tsv',
 28 |   'custom'  => 'Custom format with --template STRING',
 29 |  );
 30 | 
 31 | my ($opt_help, 
 32 | 	$opt_version, 
 33 | 	$opt_input, 
 34 | 	$opt_verbose, 
 35 | 	$opt_debug, 
 36 | 	$opt_color, 
 37 | 	$opt_nonewline,
 38 | 	$opt_noheader,
 39 | 	$opt_pretty,
 40 | 	$opt_basename,
 41 | 	$opt_template,
 42 | );
 43 | our $tab  = "\t";
 44 | our $new  = "\n";
 45 | my $result = GetOptions(
 46 |     'f|format=s'    => \$opt_format,
 47 |     's|separator=s' => \$opt_separator,
 48 |     'p|pretty'      => \$opt_pretty,
 49 |     'n|nonewline'   => \$opt_nonewline,
 50 |     'j|noheader'    => \$opt_noheader,
 51 |     'b|basename'    => \$opt_basename,
 52 |     't|template=s'  => \$opt_template,
 53 |     'c|color'       => \$opt_color,
 54 |     'h|help'        => \$opt_help,
 55 |     'v|version'     => \$opt_version,
 56 |     'd|debug'       => \$opt_debug,
 57 | );
 58 | pod2usage({-exitval => 0, -verbose => 2}) if $opt_help;      # --help wins over everything else
 59 | version() if defined $opt_version;
 60 | pod2usage({-exitval => 2, -verbose => 1}) unless $result;    # unknown/malformed option: usage, exit 2
 61 | 
 62 | our %output_object;
 63 | 
 64 | # Validate --format (case-insensitive) and apply per-format defaults.
 65 | if (defined $opt_format) {
 66 | 	$opt_format = lc($opt_format);
 67 | 	if (!$formats{$opt_format}) {
 68 | 		my @list = sort keys(%formats);
 69 | 		die " FATAL ERROR:\n Output format not valid (--format '$opt_format').\n Use one of the following: " .
 70 | 			join(', ',@list) . ".\n";
 71 | 	}
 72 | 	# Case-insensitive test: %formats spells the placeholder both 'Not implemented' and 'Not Implemented'.
 73 | 	if (lc($formats{$opt_format}) eq 'not implemented') {
 74 | 		print STDERR " WARNING: Format '$opt_format' not implemented yet. Switching to 'tsv'.\n";
 75 | 		$opt_format = 'tsv';
 76 | 	}
 77 | 	# 'csv' is the tsv layout with a comma as separator.
 78 | 	if ($opt_format eq 'csv') {
 79 | 		$opt_separator = ',';
 80 | 	}
 81 | }
 82 | 
 83 | if (not defined $ARGV[0]) {
 84 |     print STDERR GREEN, "n50 - calculate N50 of FASTA/FASTQ files\n", RESET;
 85 |     print STDERR "USAGE: $BASE [options] FILE1 FILE2 FILE3...\n";
 86 |     print STDERR "No input files specified.\n";
 87 |     exit;
 88 | }
 89 | foreach my $file (@ARGV) {
 90 | 	# Each input is attached to STDIN so that readfq() always reads the same handle;
 91 | 	# '-' means "keep the current STDIN" and is reported as '<STDIN>'.
 92 | 	if ($file eq '-') {
 93 | 		$file = '<STDIN>';
 94 | 	} elsif (!-e $file) {
 95 | 		die " FATAL ERROR:\n File not found ($file).\n";
 96 | 	} elsif ($file =~ /\.gz$/) {
 97 | 		# List form of open: no shell is involved, so unusual characters in $file are harmless.
 98 | 		# 'or' (not '||') is required: '||' binds to the last argument and the die never runs.
 99 | 		open(STDIN, '-|', 'gzip', '-dc', $file)
100 | 			or die " FATAL ERROR:\n Unable to open file for reading ($file).\n";
101 | 	} else {
102 | 		open(STDIN, '<', $file)
103 | 			or die " FATAL ERROR:\n Unable to open file for reading ($file).\n";
104 | 	}
105 | 
106 | 	my @aux;                      # parser state for readfq(), private to this file
107 | 	my %sizes;                    # sequence length => number of sequences of that length
108 | 	my ($n, $slen) = (0, 0);      # sequence count, sum of sequence lengths
109 | 
110 | 	while (my ($name, $seq) = readfq(\*STDIN, \@aux)) {
111 | 		next if ($name eq '');    # records without a name are not counted
112 | 		$n++;
113 | 		my $size = length($seq);
114 | 		$slen += $size;
115 | 		$sizes{$size}++;
116 | 	}
117 | 	my $n50 = n50fromHash(\%sizes, $slen);
118 | 
119 | 	say STDERR "[$file]\tTotalSize:$slen;N50:$n50;Sequences:$n" if ($opt_debug);
120 | 
121 | 	# Results are keyed by the (optionally shortened) path: identical keys overwrite each other.
122 | 	$file = basename($file) if ($opt_basename);
123 | 	$output_object{$file} = {
124 | 		'seqs' => $n,
125 | 		'N50'  => $n50,
126 | 		'size' => $slen,
127 | 	};
128 | }
129 | 
130 | my $file_num = scalar keys %output_object;
131 | my @paths = sort keys %output_object;   # sorted so the report order is stable between runs
132 | if (!$opt_format or $opt_format eq 'default') {
133 | # DEFAULT: bare N50 for a single input, "path<separator>N50" lines otherwise
134 | 	if ($file_num == 1) {
135 | 		print $output_object{$paths[0]}{'N50'};
136 | 		print "\n" unless $opt_nonewline;   # --nonewline is documented for this format
137 | 	} else {
138 | 		foreach my $r (@paths) {
139 | 			say $r, $opt_separator ,$output_object{$r}{'N50'};
140 | 		}
141 | 	}
142 | } elsif ($opt_format eq 'json') {
143 | 	# NOTE: the output is always pretty-printed; --pretty is accepted but has no effect here.
144 | 	my $json = JSON::PP->new->allow_nonref;
145 | 	my $pretty_printed = $json->pretty->encode( \%output_object );
146 | 	say $pretty_printed;
147 | 
148 | } elsif ($opt_format eq 'tsv' or $opt_format eq 'csv') {
149 | 	# One row per input: the path, then the metrics named in @fields (index 0 is the path itself).
150 | 	my @fields = ('path', 'seqs', 'size', 'N50');
151 | 	say '#', join($opt_separator, @fields) if (!defined $opt_noheader);
152 | 	foreach my $r (@paths) {
153 | 		print $r,$opt_separator; 
154 | 		for (my $i = 1; $i <= $#fields; $i++) {
155 | 			print $output_object{$r}{$fields[$i]};
156 | 			if ($i == $#fields and !$opt_nonewline) {
157 | 				print "\n";
158 | 			} else {
159 | 				print $opt_separator;
160 | 			}
161 | 		}
162 | 	}
163 | } elsif ($opt_format eq 'custom') {
164 | 	die " FATAL ERROR:\n Format 'custom' requires --template STRING.\n" unless defined $opt_template;
165 | 	foreach my $r (@paths) {
166 | 		my $output_string = $opt_template;
167 | 		$output_string =~s/\{new\}/$new/g;
168 | 		$output_string =~s/\{tab\}/$tab/g;
169 | 		# {path} goes first: the generic {metric} rule below would otherwise replace it with nothing.
170 | 		$output_string =~s/\{path\}/$r/g;
171 | 		$output_string =~s/\{(\w+)\}/$output_object{$r}{$1}/g;
172 | 		print $output_string;
173 | 	}
174 | }
175 | 
176 | 
177 | sub debug {
178 | 	# Send $message to printMessage() with an upper-cased title ('INFO' when none is given).
179 | 	my ($message, $title) = @_;
180 | 	my $label = uc($title // 'INFO');
181 | 	printMessage($message, $label, 'green', 'reset');
182 | }
183 | sub printMessage {
184 | 	# Print "title<TAB>message" on STDERR; a colour becomes 'reset' when missing, invalid or --color is off.
185 | 	my ($message, $title, @colors) = @_[0 .. 3];
186 | 	@colors = map { (defined $_ and colorvalid($_) and $opt_color) ? $_ : 'reset' } @colors;
187 | 	say STDERR colored("$title", $colors[0]), "\t", colored("$message", $colors[1]);
188 | }
189 | sub n50fromHash {
190 | 	# N50 = largest length L such that sequences of length >= L sum to at least half of $total.
191 | 	my ($hash_ref, $total, $tlen) = (@_[0, 1], 0);   # $hash_ref: length => count; $tlen: running sum
192 | 	foreach my $s (sort {$b <=> $a} keys %{$hash_ref}) {   # longest first, as the definition requires
193 | 		$tlen += $s * ${$hash_ref}{$s};
194 | 		return $s if ($tlen >= ($total/2));
195 | 	}
196 | 	return 0;   # no sequences at all
197 | }
198 | 
199 | sub version {
200 | 	# Print program name, version and author through printMessage(), then exit.
201 | 	printMessage("$program{NAME}, ver. $program{VERSION}", '', 'RESET', 'bold green');
202 | 	printMessage(qq(
203 | 	$program{AUTHOR}
204 | 
205 | 	Program to calculate N50 from multiple FASTA/FASTQ files.
206 | 	Type --help (or -h) to see the full documentation.), '', 'blue', 'green');
207 |     exit;
208 | }
209 | sub readfq {
210 |     # Return the next record from $fh: ($name, $seq) for FASTA, ($name, $seq, $qual) for FASTQ;
211 |     # an empty list means end of input. $aux is caller-owned state: [pending header line, EOF flag].
212 |     my ($fh, $aux) = @_;
213 |     @$aux = (undef, 0) if (!(@$aux));   # a list: "[undef, 0]" would store one array ref as the header
214 |     return if ($aux->[1]);
215 |     if (!defined($aux->[0])) {
216 |         # No header carried over from the previous call: scan forward to the next one.
217 |         while (<$fh>) {
218 |             chomp;
219 |             if (substr($_, 0, 1) eq '>' || substr($_, 0, 1) eq '@') {
220 |                 $aux->[0] = $_;
221 |                 last;
222 |             }
223 |         }
224 |         if (!defined($aux->[0])) {
225 |             $aux->[1] = 1;
226 |             return;
227 |         }
228 |     }
229 |     # The name comes from the stored header line, not from whatever $_ currently holds.
230 |     my $name = $aux->[0] =~ /^.(\S+)/ ? $1 : '';
231 |     my $seq = '';
232 |     my $c = '';
233 |     $aux->[0] = undef;
234 |     while (<$fh>) {
235 |         chomp;
236 |         $c = substr($_, 0, 1);
237 |         last if ($c eq '>' || $c eq '@' || $c eq '+');
238 |         $seq .= $_;
239 |     }
240 |     $aux->[0] = $_;                          # next header, or undef at end of input
241 |     $aux->[1] = 1 if (!defined($aux->[0]));
242 |     return ($name, $seq) if ($c ne '+');     # FASTA record
243 |     # FASTQ: collect quality lines until they are at least as long as the sequence.
244 |     my $qual = '';
245 |     while (<$fh>) {
246 |         chomp;
247 |         $qual .= $_;
248 |         if (length($qual) >= length($seq)) {
249 |             $aux->[0] = undef;
250 |             return ($name, $seq, $qual);
251 |         }
252 |     }
253 |     $aux->[1] = 1;                           # input ended inside the quality block
254 |     return ($name, $seq);
255 | }
256 | 
257 | __END__
258 | 
259 | =head1 NAME
260 |  
261 | B<n50.pl> - A program to calculate N50 from FASTA/FASTQ files
262 |  
263 | =head1 AUTHOR
264 |  
265 | Andrea Telatin <andrea.telatin@quadram.ac.uk>
266 | 
267 | =head1 DESCRIPTION
268 |  
269 | This program parses a list of FASTA/FASTQ files calculating for each one
270 | the number of sequences, the sum of sequences lengths and the N50.
271 | It will print the result in different formats: by default only the N50 is
272 | printed for a single file, and the path followed by the N50 for each of multiple files.
273 |  
274 | =head1 SYNOPSIS
275 |  
276 |   n50.pl [options] [FILE1 FILE2 FILE3...]
277 | 
278 | =head1 PARAMETERS
279 | 
280 | =over 12
281 | 
282 | =item B<-f, --format>
283 | 
284 | Output format: default, tsv, csv, json, custom. 
285 | See below for format specific switches.
286 | 
287 | =item B<-s, --separator>
288 | 
289 | Separator to be used in 'tsv' output. Default: tab.
290 | The 'tsv' format will print a header line, followed
291 | by a line for each file given as input with: file path,
292 | as received, total number of sequences, total size in bp,
293 | and finally N50.
294 | 
295 | =item B<-b, --basename>
296 | 
297 | Instead of printing the path of each file, will only print
298 | the filename, stripping relative or absolute paths to it.
299 | 
300 | =item B<-j, --noheader>
301 | 
302 | When used with 'tsv' output format, will suppress header
303 | line.
304 | 
305 | =item B<-n, --nonewline>
306 | 
307 | If used with 'default' or 'csv' output format, will NOT print the
308 | newline character after the N50. Usually used in bash scripting.
309 | 
310 | =item B<-t, --template>
311 | 
312 | String to be used with 'custom' format. Will be used as template
313 | string for each sample, replacing {new} with newlines, {tab} with
314 | tab and {N50}, {seqs}, {size}, {path} with sample's N50, number of sequences,
315 | total size in bp and file path respectively (the latter will
316 | respect --basename if used).
317 | 
318 | =item B<-p, --pretty>
319 | 
320 | If used with 'json' output format, will format the JSON
321 | in pretty print mode. Example:
322 | 
323 |  
324 |  {
325 |    "file1.fa" : {
326 |      "size" : 290,
327 |      "N50" : "290",
328 |      "seqs" : 2
329 |   },
330 |    "file2.fa" : {
331 |      "N50" : "456",
332 |      "size" : 456,
333 |      "seqs" : 2
334 |   }
335 |  }
336 |  
337 | =item B<-h, --help>
338 | 
339 | Will display this full help message and quit, even if other
340 | arguments are supplied.
341 | 
342 | =back
343 | 
344 | =head1 INSTALLATION
345 | 
346 | A complete package with more features is Proch::N50, installable with
347 | 
348 |   cpan Proch::N50
349 | 
350 | Or from Bioconda with:
351 | 
352 |   conda install -c bioconda n50
353 | 
354 | And a complete suite of tools including statistics, is available as
355 | 
356 |   conda install -c bioconda "seqfu>=1.10"
357 | 
358 | =head1 CITATION
359 | 
360 | Telatin A, Fariselli P, Birolo G. 
361 | SeqFu: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files. 
362 | Bioengineering 2021, 8, 59. L<10.3390/bioengineering8050059|https://doi.org/10.3390/bioengineering8050059>
363 | 
364 | =head1 COPYRIGHT
365 |  
366 | Copyright (C) 2017 Andrea Telatin 
367 |  
368 | This program is free software: you can redistribute it and/or modify
369 | it under the terms of the GNU General Public License as published by
370 | the Free Software Foundation, either version 3 of the License, or
371 | (at your option) any later version.
372 |  
373 | This program is distributed in the hope that it will be useful,
374 | but WITHOUT ANY WARRANTY; without even the implied warranty of
375 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
376 | GNU General Public License for more details.
377 |  
378 | You should have received a copy of the GNU General Public License
379 | along with this program.  If not, see <http://www.gnu.org/licenses/>.
380 |  
381 | =cut
382 | 


--------------------------------------------------------------------------------
/scripts/fasta_translate.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | Reads FASTA file(s) and translates each nucleotide sequence into protein.
 4 | Every sequence is translated from its first base, codon by codon, with the
 5 | standard genetic code; stop codons are written as '*'.
 6 | Input:
 7 | Fasta file(s), optionally gzip-compressed (.gz)
 8 | Output:
 9 | - One line per sequence: the translated protein
10 | - followed by the nucleotide length of the sequence (len=N)
11 | """
12 | 
13 | import sys
14 | import os
15 | import argparse
16 | 
17 | def translate(seq):
18 |     # From: https://www.geeksforgeeks.org/dna-protein-python-3/
19 |     stop = '*'
20 |     table = {
21 |         'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
22 |         'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
23 |         'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
24 |         'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',                 
25 |         'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
26 |         'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
27 |         'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
28 |         'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
29 |         'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
30 |         'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
31 |         'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
32 |         'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
33 |         'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
34 |         'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
35 |         'TAC':'Y', 'TAT':'Y', 'TAA': stop, 'TAG': stop,
36 |         'TGC':'C', 'TGT':'C', 'TGA': stop, 'TGG':'W',
37 |     }
38 |     protein =""
39 |     
40 |     for i in range(0, len(seq), 3):
41 |         codon = seq[i:i + 3]
42 |         if len(codon) != 3:
43 |             return protein
44 |         protein+= table[codon]
45 |     return protein
46 | 
47 | def read_fasta(path):
48 |     # FASTA name/seq/comment iterator
49 |     if path.endswith('.gz'):  
50 |         import gzip      
51 |         fasta = gzip.open(path, 'rt')
52 |     else:
53 |         fasta = open(path, 'rt')
54 |     name = None
55 |     comment = ''
56 |         
57 |     for line in fasta:
58 |         if line.startswith('>'):
59 |             if name is not None:
60 |                 yield name, seq, comment
61 |             nameparts = line[1:].rstrip().split()
62 |             name = nameparts[0]
63 |             comment = ' '.join(nameparts[1:]) if len(nameparts) > 1 else ''
64 |             seq = ''
65 |         else:
66 |             seq += line.rstrip()
67 |     yield name, seq, comment
68 |  
69 | def main():
70 |     args = argparse.ArgumentParser()
71 |     args.add_argument("FASTA", help="Fasta file(s) with alleles having name as locus_id", nargs="+")
72 |     args.add_argument("-o", "--out", help="Output file")
73 |     args.add_argument("--verbose", help="Print verbose information", action="store_true")
74 |     args = args.parse_args()
75 | 
76 |     if args.out is None:
77 |         output = sys.stdout
78 |     else:
79 |         output = open(args.out, 'wt')
80 |     
81 | 
82 |     # Precheck all input files exist
83 |     for fastafile in args.FASTA:
84 |         if not os.path.exists(fastafile):
85 |             raise ValueError("ERROR: File %s does not exist" % fastafile)
86 |     
87 |     for fastafile in args.FASTA:
88 |         for name, seq, comment in read_fasta(fastafile):
89 |             print(translate(seq), " len=%s" % len(seq))
90 | 
91 | 
92 | if __name__ == "__main__":
93 |     quit(main())


--------------------------------------------------------------------------------
/scripts/gutenwords-plotzipf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Version modeled after
  4 | https://www.thepythoncode.com/article/plot-zipfs-law-using-matplotlib-python
  5 | but keeping the original data structure
  6 | """
  7 | 
  8 | import logging
  9 | import re
 10 | def title_lines(file, from_line="", to_line=""):
 11 |     """
 12 |     Iterator on file lines (stripped from newlines):
 13 |     Return title, and words
 14 |     """
 15 |     title = "UNDEFINED"
 16 |     text = ""
 17 |     store_line = False if from_line  != "" else True
 18 |     
 19 |     try:
 20 |         with open(file, mode="r", encoding="latin-1") as f:
 21 |             for line in f:
 22 |                 # Check if line matches regex "from_line"
 23 |                 if from_line != "" and re.match(from_line, line):
 24 |                 #if line.startswith(to_line):
 25 |                     # the title is what follows "EBOOK"
 26 |                     if "EBOOK" in line:
 27 |                         title = line.split("EBOOK")[1].strip().strip("* ")
 28 |                     store_line = True
 29 |                 elif to_line  != "" and re.match(to_line, line):
 30 |                     store_line = False
 31 |                     break
 32 |                 elif store_line:
 33 |                     text += line
 34 | 
 35 |         # Strip non A-Z characters from title
 36 |         title = "".join(c for c in title if c.isalpha() or c.isspace()) if title != "UNDEFINED" else os.path.basename(file)
 37 |         return title, text
 38 |     except FileNotFoundError:
 39 |         logging.error("[lines] File %s not found", file)
 40 |         return "ERROR1", "ERROR"
 41 |     except Exception as e:
 42 |         logging.error("[lines] Error reading %s: %s", file, e)
 43 |         return "ERROR2", "ERROR"
 44 | 
 45 | def words(line):
 46 |     """
 47 |     Return words from a line stripping punctuation
 48 |     """
 49 |     for word in line.split():
 50 |         try:
 51 |             w = word.strip(".,;:?!()[]_*{}\"'").lower()
 52 |             # Strip 's and 'll from the end of words
 53 |             if w.endswith("'s"):
 54 |                 w = w[:-2]
 55 |             if w.endswith("'ll"):
 56 |                 w = w[:-3]
 57 |             # Discard hypenated words
 58 |             if "-" in w:
 59 |                 continue
 60 |             # Check for digits
 61 |             if any(c.isdigit() for c in w):
 62 |                 continue
 63 |             yield w
 64 |         except AttributeError:
 65 |             pass
 66 | 
 67 |  
 68 | if __name__ == "__main__":
 69 |     import argparse
 70 |     args = argparse.ArgumentParser("Analyse the frequency of words in a set of Gutenberg files")
 71 |     args.add_argument("FILES", help="Input files", nargs="+")
 72 |     args.add_argument("-o", "--output", help="Plot the top words as png file", required=True)
 73 |     args.add_argument("-m", "--max", help="Max files to process, 0 for all [default: %(default)s]", type=int, default=1000)
 74 |     args.add_argument("-p", "--max-plot", help="Max files to plot, 0 for all [default: %(default)s]", type=int, default=20)
 75 |     
 76 |     args.add_argument("--start", help="Start of the manuscript [default: %(default)s]", default="\*\*\*\s?START OF")
 77 |     args.add_argument("--end", help="End of the manuscript [default: %(default)s]", default="\*\*\*\s?END OF")
 78 |     args.add_argument("--verbose", help="Verbose output", action="store_true")
 79 |     args.add_argument("--debug", help="Debug output", action="store_true")
 80 |     args = args.parse_args()
 81 | 
 82 |     # Logger
 83 |     logFormat = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 84 |     
 85 |     if args.debug:
 86 |         logging.basicConfig(format=logFormat, level=logging.DEBUG)
 87 |     elif args.verbose:
 88 |         logging.basicConfig(format=logFormat, level=logging.INFO)
 89 |     else:
 90 |         logging.basicConfig(format=logFormat, level=logging.WARNING)
 91 |     
 92 | 
 93 |     # Create logger with formatter
 94 |   
 95 |     logger = logging.getLogger("gutenwords")
 96 | 
 97 | 
 98 |     if True:
 99 |         # Imports
100 |         import os
101 |         from matplotlib import pyplot as plt
102 |         import string
103 |         import numpy as np
104 |         from scipy.interpolate import make_interp_spline
105 | 
106 |         # define some dictionaries
107 |         texts = {}
108 |         textlengths = {}
109 |         textwordamounts = {}
110 | 
111 |         unwantedCharacters = list(string.punctuation)
112 | 
113 |         # How many ranks we'll show
114 |         depth = 50
115 |         xAxis = [str(number) for number in range(1, depth+1)]
116 | 
117 |         allText = ""
118 |         texts["ALL"] = ""
119 |         for file in args.FILES:
120 |             # Process at most args.max files
121 |             if args.max > 0 and len(texts) >= args.max:
122 |                 break
123 | 
124 |            
125 |             title, text = title_lines(file, from_line=args.start, to_line=args.end)
126 | 
127 |             if title == "UNDEFINED":
128 |                 logger.warning("Title not found in %s: skipping", file)
129 |                 continue
130 |             texts[title] = text
131 |             allText += " " + text
132 |             
133 |         
134 |         texts["ALL"] = allText
135 | 
136 |         
137 |         # Remove duplicates
138 |         files_to_process = args.FILES
139 |         files_to_process = list(dict.fromkeys(files_to_process))
140 | 
141 |         # Cleaning and counting the Text
142 |         done = 0
143 |         denominator = int(len(files_to_process) / 15)
144 |         denominator = 1 if denominator == 0 else denominator
145 |         for text in texts:
146 |             done = done + 1
147 |             perc = 100 * done / len(files_to_process)
148 |             if done == 1 or done % denominator == 0:
149 |                 logger.info("%d%%: Processing file #%d: %s" % (perc, done, text))
150 |             # Remove unwanted characters from the texts
151 |             for character in unwantedCharacters:
152 |                 texts[text] = texts[text].replace(character, '').lower()
153 |             
154 |             #splittedText = texts[text].split(' ')
155 |             splittedText = list(words(texts[text]))
156 |             # Saving the text length to show in the label of the line later
157 |             textlengths[text] = len(splittedText)
158 |             # Here will be the amount of occurence of each word stored
159 |             textwordamounts[text] = {}
160 |             # Loop through all words in the text
161 |             for i in splittedText:
162 |                 # Add to the word at the given position if it already exists
163 |                 # Else set the amount to one essentially making a new item in the dict
164 |                 if i in textwordamounts[text].keys():
165 |                     textwordamounts[text][i] += 1
166 |                 else:
167 |                     textwordamounts[text][i] = 1
168 |             # Sorting the dict by the values with sorted
169 |             # define custom key so the function knows what to use when sorting
170 |             textwordamounts[text] = dict(
171 |                 sorted(
172 |                     textwordamounts[text ].items(),
173 |                     key=lambda x: x[1],
174 |                     reverse=True)[0:depth]
175 |                 )
176 |         
177 |         # Get the percentage value of a given max value
178 |         def percentify(value, max):
179 |             return round(value / max * 100)
180 | 
181 |         # Generate smooth curvess
182 |         def smoothify(yInput):
183 |             x = np.array(range(0, depth))
184 |             y = np.array(yInput)
185 |             # define x as 600 equally spaced values between the min and max of original x
186 |             x_smooth = np.linspace(x.min(), x.max(), 600) 
187 |             # define spline with degree k=3, which determines the amount of wiggle
188 |             spl = make_interp_spline(x, y, k=3)
189 |             y_smooth = spl(x_smooth)
190 |             # Return the x and y axis
191 |             return x_smooth, y_smooth
192 | 
193 | 
194 |         # Make the perfect Curve
195 |         logger.info("Smoothing curve...")
196 |         ziffianCurveValues = [100/i for i in range(1, depth+1)]
197 |         x, y = smoothify(ziffianCurveValues)
198 |         logger.info("Preparing plot")
199 |         # Set plot size
200 |         plt.figure(figsize=(20, 10))
201 |         plt.plot(x, y, label='Ziffian Curve', ls=':', color='grey')
202 | 
203 | 
204 |         # Plot the texts
205 |         # Get the first ten keys of 
206 |         keys_top = list(textwordamounts.keys())[0:args.max_plot]
207 |         keys_top[0] = "ALL" if not "ALL" in keys_top else keys_top[0]
208 |         for i in keys_top:
209 |             logger.info("Plotting %s" % i)
210 |             maxValue = list(textwordamounts[i].values())[0]
211 |             yAxis = [percentify(value, maxValue) for value in list(textwordamounts[i].values())]
212 |             x, y = smoothify(yAxis)
213 |             if i == "ALL":
214 |                 plt.plot(x, y, label='%s (%d words)' % (i, textlengths[i]), lw=2, color='black')
215 |             else:
216 |                 plt.plot(x, y, label=i+f' [{textlengths[i]}]', lw=1, alpha=0.5)
217 |         
218 |         
219 |         plt.xticks(range(0, depth), xAxis)
220 | 
221 |         plt.legend()
222 |         plt.savefig(args.output, dpi=300)
223 | 


--------------------------------------------------------------------------------
/scripts/gutenwords-topandplot.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Given a set of text files from the Gutenberg project, analyse the frequency of words
  4 | from the "start" of the manuscript to its "end". The "start" and "end" are defined
  5 | as after "*** START" and before "*** END" respectively.
  6 | 
  7 | Usage:
  8 | gutenwords.py --plot-file plot.png -n 100 FILE1.txt FILE2.txt ...
  9 | """
 10 | 
 11 | import logging
 12 | 
 13 | def lines(file, from_line="***START OF", to_line="***END OF"):
 14 |     """
 15 |     Iterator on file lines (stripped from newlines):
 16 |     Given a text file will return all the lines between line starting with
 17 |     'from_line' and line starting with 'to_line'
 18 |     """
 19 |     can_yield = False
 20 |     try:
 21 |         with open(file, mode="r", encoding="latin-1") as f:
 22 |             for line in f:
 23 |                 if line.startswith(to_line):
 24 |                     break
 25 |                 elif can_yield:
 26 |                     yield line.strip()
 27 |                 else:
 28 |                     can_yield = line.startswith(from_line)
 29 |     except FileNotFoundError:
 30 |         logging.error("[lines] File %s not found", file)
 31 |     except Exception as e:
 32 |         logging.error("[lines] Error reading %s: %s", file, e)
 33 | 
 34 | def words(line):
 35 |     """
 36 |     Return words from a line stripping punctuation
 37 |     """
 38 |     for word in line.split():
 39 |         try:
 40 |             w = word.strip(".,;:?!()[]_*{}\"'").lower()
 41 |             # Strip 's and 'll from the end of words
 42 |             if w.endswith("'s"):
 43 |                 w = w[:-2]
 44 |             if w.endswith("'ll"):
 45 |                 w = w[:-3]
 46 |             # Discard hypenated words
 47 |             if "-" in w:
 48 |                 continue
 49 |             # Check for digits
 50 |             if any(c.isdigit() for c in w):
 51 |                 continue
 52 |             yield w
 53 |         except AttributeError:
 54 |             pass
 55 | 
 56 | def top_n_words(dict, n=10, reverse=False):
 57 |     """
 58 |     Given a dictionary of key:counts, return a dictionary
 59 |     with the top n key:values
 60 |     """
 61 |     rev = not reverse
 62 |     return {k: v for k, v in sorted(dict.items(), key=lambda item: item[1], reverse=rev)[:n]}
 63 | 
 64 | if __name__ == "__main__":
 65 |     import argparse
 66 |     args = argparse.ArgumentParser("Analyse the frequency of words in a set of Gutenberg files")
 67 |     args.add_argument("FILES", help="Input files", nargs="+")
 68 |     args.add_argument("-n", "--num", help="Number of top words to report", type=int, default=10)
 69 |     args.add_argument("--start", help="Start of the manuscript", default="*** START OF")
 70 |     args.add_argument("--end", help="End of the manuscript", default="*** END OF")
 71 |     args.add_argument("--plot-file", help="Plot the top words as png file")
 72 |     args.add_argument("--verbose", help="Verbose output", action="store_true")
 73 |     args.add_argument("--debug", help="Debug output", action="store_true")
 74 |     args = args.parse_args()
 75 | 
 76 |     # Logger
 77 |     logFormat = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
 78 |     
 79 |     if args.debug:
 80 |         logging.basicConfig(format=logFormat, level=logging.DEBUG)
 81 |     elif args.verbose:
 82 |         logging.basicConfig(format=logFormat, level=logging.INFO)
 83 |     else:
 84 |         logging.basicConfig(format=logFormat, level=logging.WARNING)
 85 |     
 86 | 
 87 |     # Create logger with formatter
 88 |   
 89 |     logger = logging.getLogger("gutenwords")
 90 | 
 91 |     word_count = {}
 92 |     for file in args.FILES:
 93 |         logger.info("Processing file %s" % file)
 94 |         for line in lines(file):
 95 |             for word in words(line):
 96 |                 word_count[word] = 1 if word not in word_count else word_count[word] + 1
 97 |     
 98 |     total_words = sum(word_count.values())
 99 |     
100 |     for w, c in top_n_words(word_count, n=args.num, reverse=False).items():
101 |         print("%s\t%s\t%s" % (w, c, c/total_words))
102 |     print("---")
103 |     for w, c in top_n_words(word_count, n=args.num, reverse=True).items():
104 |         print("%s\t%s\t%s" % (w, c, 100*c/total_words))
105 | 
106 | 
107 |     if args.plot_file:
108 |         """
109 |         Plot the top n words in a line plot, save as {args.plot_file}.png
110 |         only if args.plot_file is supplied (--plot-file FILE)
111 |         """
112 |         import matplotlib.pyplot as plt
113 |         import numpy as np
114 |         top_words = top_n_words(word_count, n=args.num, reverse=False)
115 |         plot_title = "Top %s words out of %s in %s files" % (args.num, len(word_count), len(args.FILES))
116 |         plt.title(plot_title)
117 |         # Plot the top words having their counts as y axis
118 |         plt.figure(figsize=(20, 8))
119 |         plt.bar(top_words.keys(), top_words.values())
120 |         plt.xticks(rotation=90)
121 |         plt.savefig(args.plot_file)
122 | 


--------------------------------------------------------------------------------
/scripts/gutenwords_0.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | A very simple script returning the top N words in a text file
 4 | 
 5 | Arguments:
 6 | - filename
 7 | - n
 8 | 
 9 | """
10 | import sys
11 | import os
12 | 
13 | 
14 | # The program requires two arguments: let's check we got them
15 | if len(sys.argv) != 3:
16 |     print("Usage: python3 words.py n filename")
17 |     print("ERROR: Supply two arguments")
18 |     exit(1)
19 | 
20 | # Retrieve the two parameters from the command line using sys.argv
21 | filename, n = sys.argv[1], int(sys.argv[2])
22 | 
23 | # Check that the file exists
24 | if not os.path.exists(filename):
25 |     print(f"ERROR: File {filename} not found")
26 |     exit(2)
27 | 
28 | # Read the file
29 | with open(filename, mode="r", encoding="latin-1") as f:
30 |     text = f.read()
31 | 
32 | # Split the text into words
33 | words = text.split()
34 | 
35 | # Create a dictionary of word:count
36 | word_counts = {}
37 | for word in words:
38 |     # TODO: We can consider doing some cleaning of the words here (e.g. stripping punctuation)
39 |     if word not in word_counts:
40 |         word_counts[word] = 0
41 |     word_counts[word] += 1
42 | 
43 | # Sort the dictionary by value: see for example https://www.freecodecamp.org/news/sort-dictionary-by-value-in-python/
44 | sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
45 | 
46 | # Print the top n words
47 | for word, count in sorted_word_counts[:n]:
48 |     frequency = count / len(words) * 100
49 |     print(f"{word}\t{count}\t{frequency:.2f}%")
50 | 


--------------------------------------------------------------------------------
/scripts/gutenwords_1.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | A very simple script returning the top N words in a text file
 4 | 
 5 | Arguments:
 6 | - filename
 7 | - -n, --top N (optional, default 10)
 8 | 
 9 | CHANGES:
10 | - Use argparse to parse the command line arguments
11 | - Add a cleanup step to polish the words
12 | 
13 | """
14 | import sys
15 | import os
16 | import argparse
17 | 
18 | # NEW: cleanup function
19 | def cleanup(word):
20 |     """
21 |     Clean up a word by removing punctuation and making it lowercase
22 |     """
23 |     stripped = word.strip(".,;:!?-").lower()
24 |     # Unwanted suffixes
25 |     suffixes = ["'s", "'d", "'ll", "'ve", "'re", "'m"]
26 |     for suffix in suffixes:
27 |         if stripped.endswith(suffix):
28 |             stripped = stripped[:-len(suffix)]
29 |             # Assume only one suffix
30 |             break
31 | 
32 |     return stripped
33 | 
34 | # NEW: Use argparse to parse the command line arguments
35 | args = argparse.ArgumentParser("A very simple script returning the top N words in a text file")
36 | # Positional argument: filename (always required)
37 | args.add_argument("filename", help="Input text file")
38 | # Argument -n INT: required but a default is provided (10)
39 | args.add_argument("-n", "--top", help="Number of words to return [default: %(default)s]", type=int, default=10)
40 | # Flag/Switch
41 | args.add_argument("-v", "--verbose", help="Verbose output", action="store_true")
42 | 
43 | args = args.parse_args()
44 | 
45 | 
46 | # Check that the file exists
47 | if not os.path.exists(args.filename):
48 |     print(f"ERROR: File {args.filename} not found", file=sys.stderr)
49 |     exit(2)
50 | 
51 | # Read the file
52 | 
53 | with open(args.filename, mode="r", encoding="latin-1") as f:
54 |     text = f.read()
55 | 
56 | # Add verbose output
57 | if args.verbose:
58 |     print(f"Read {len(text)} characters from {args.filename}", file=sys.stderr)
59 | 
60 | # Split the text into words
61 | words = text.split()
62 | 
63 | # Add verbose output
64 | if args.verbose:
65 |     print(f"Read {len(words)} words from {args.filename}")
66 | 
67 | 
68 | # Create a dictionary of word:count
69 | word_counts = {}
70 | for word in words:
71 |     # NEW: Some cleanup
72 |     word = cleanup(word)
73 | 
74 |     if word not in word_counts:
75 |         word_counts[word] = 0
76 |     word_counts[word] += 1
77 | 
78 | # Add verbose output
79 | if args.verbose:
80 |     print(f"Counted {len(word_counts)} unique words from {args.filename}")
81 |     
82 | # Sort the dictionary by value: see for example https://www.freecodecamp.org/news/sort-dictionary-by-value-in-python/
83 | sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
84 | 
85 | # Print the top n words
86 | for word, count in sorted_word_counts[:args.top]:
87 | 
88 |     frequency = count / len(words) * 100
89 |     print(f"{word}\t{count}\t{frequency:.2f}%")
90 | 


--------------------------------------------------------------------------------
/scripts/gutenwords_2.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | A very simple script returning the top N words in a text file
 4 | 
 5 | Arguments:
 6 | - n
 7 | - filename
 8 | 
 9 | CHANGES:
10 | - Multiple input files allowed
11 | """
12 | import sys
13 | import os
14 | import argparse
15 | 
16 | def cleanup(word):
17 |     """
18 |     Clean up a word by removing punctuation and making it lowercase
19 |     """
20 |     stripped = word.strip(".,;:!?-").lower()
21 |     # Unwanted suffixes
22 |     suffixes = ["'s", "'d", "'ll", "'ve", "'re", "'m"]
23 |     for suffix in suffixes:
24 |         if stripped.endswith(suffix):
25 |             stripped = stripped[:-len(suffix)]
26 |             # Assume only one suffix
27 |             break
28 | 
29 |     return stripped
30 | 
31 | 
32 | 
33 | args = argparse.ArgumentParser("A very simple script returning the top N words in a text file")
34 | 
35 | # Positional argument: filename (always required)
36 | args.add_argument("filenames", help="Input text file", nargs="+")   # NEW: multiple arguments with nargs="+"
37 | # Argument -n INT: required but a default is provided (10)
38 | args.add_argument("-n", "--top", help="Number of words to return [default: %(default)s]", type=int, default=10)
39 | # Flag/Switch
40 | args.add_argument("-v", "--verbose", help="Verbose output", action="store_true")
41 | 
42 | args = args.parse_args()
43 | 
44 | 
45 | # Check that all the file exist
46 | for file in args.filenames:
47 |     if not os.path.exists(file):
48 |         print(f"ERROR: File {file} not found", file=sys.stderr)
49 |         exit(2)
50 | 
51 | # Read the file
52 | text = ""
53 | for file in args.filenames:
54 |     if args.verbose:
55 |         print(f"# Reading {file}...", file=sys.stderr)
56 | 
57 |     with open(file, mode="r", encoding="latin-1") as f:
58 |         text += " " + f.read()
59 | 
60 | # Add verbose output
61 | if args.verbose:
62 |     print(f"# Read {len(text)} characters from {len(args.filenames)} file(s)", file=sys.stderr)
63 | 
64 | # Split the text into words
65 | words = text.split()
66 | 
67 | # Add verbose output
68 | if args.verbose:
69 |     print(f"# Read {len(words)} words from {len(args.filenames)} file(s)")
70 | 
71 | # cleanup all the words
72 | words = [cleanup(word) for word in words]
73 | 
74 | # Create a dictionary of word:count
75 | word_counts = {}
76 | 
77 | for word in words:
78 |     if word not in word_counts:
79 |         word_counts[word] = 0
80 |     word_counts[word] += 1
81 | 
82 | # Add verbose output
83 | if args.verbose:
84 |     print(f"# Counted {len(word_counts)} unique words from {len(args.filenames)} file(s)")
85 |     
86 | # Sort the dictionary by value: see for example https://www.freecodecamp.org/news/sort-dictionary-by-value-in-python/
87 | sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
88 | 
89 | # Print the top n words
90 | for word, count in sorted_word_counts[:args.top]:
91 | 
92 |     frequency = count / len(words) * 100
93 |     print(f"{word}\t{count}\t{frequency:.2f}%")
94 | 


--------------------------------------------------------------------------------
/scripts/linkweb.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Give the "ubuntu" user ownership of the public_html directory of the
 3 | # "researcher" user and link it as /home/ubuntu/web.
 4 | 
 5 | PUBLIC_DIR=/home/researcher/public_html
 6 | LINK=/home/ubuntu/web
 7 | 
 8 | if [[ ! -d "$PUBLIC_DIR" ]]; then
 9 | 	echo "This script is designed for training using the MRC CLIMB infrastructure (GVL image)" >&2
10 | 	exit 1
11 | fi
12 | 
13 | # -e is false for a dangling symlink, so -L is tested as well: in both
14 | # cases "ln -s" below could not create the link
15 | if [[ -e "$LINK" || -L "$LINK" ]]; then
16 | 	echo "Link already found: ~/web"
17 | 	exit 0
18 | fi
19 | 
20 | # Stop (with a non-zero status) as soon as a privileged step fails,
21 | # instead of reporting a success that did not happen
22 | sudo chown ubuntu "$PUBLIC_DIR/" || exit 1
23 | if [[ -d /home/ubuntu/ ]]; then
24 | 	sudo ln -s "$PUBLIC_DIR/" "$LINK" || exit 1
25 | 	echo "Public directory linked: ~/web"
26 | fi


--------------------------------------------------------------------------------
/scripts/prodigal2vcontact.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | Use the FASTA output from Prodigal to create a VContact file
 4 | protein to genome mapping
 5 | """
 6 | 
 7 | import argparse
 8 | import sys
 9 | """
10 | >k141_2513||full_1 # 1 # 264 # -1 # ID=1_1;partial=10;start_type=ATG;rbs_motif=None;rbs_spacer=None;gc_cont=0.402
11 | MQKIEKQIKIQEKLKDEMQKKCAKEGTEFDESKFESGIPIESLEMFENIAFLMHKHGDPD
12 | QPDDINEWLDQFETFDIYEILPEIMEMW
13 | >k141_2513||full_2 # 381 # 917 # -1 # ID=1_2;partial=01;start_type=Edge;rbs_motif=None;rbs_spacer=None;gc_cont=0.447
14 | LQESDTGEITFDTPFAVPGSVSLSLEAQGELTPFYADGIKYYVSSSNSGYEGDWEMALIT
15 | DEFREKILSEYIDKNKVMLEEATAKVKRFALGFEIDGDVRGTRFWFYCCTSTRPTTESST
16 | TEDAIEPTTDTVTVSASAVQLGTAKKMAVRAKTTADTTDDLYEKWFDKVYIPDQEVAA*
17 | >k141_636||full_1 # 1 # 573 # 1 # ID=2_1;partial=10;start_type=Edge;rbs_motif=None;rbs_spacer=None;gc_cont=0.478
18 | DRKTFARLDRLAKSNNVSKKDFLSCALEYFEKYGINPVEHESPAKEMQKLIKRCDQVIAF
19 | IRKQEQDFLRPACEAMGSTSMRVTMSMDSILTEKKFSQYQKDNDLFMRDLASLAGIREQA
20 | """
21 | 
22 | def read_fasta(path):
23 |     import gzip
24 |     seqName = None
25 |     seqComment = None
26 |     with (gzip.open if path.endswith('.gz') else open)(path, 'rt') as fasta:
27 |         for line in fasta:
28 |             if line.startswith('>'):
29 |                 if seqName is not None:
30 |                     yield seqName, seqComment, sequence
31 |                 seqName = line[1:].split()[0]
32 |                 seqComment = line[1:].split()[1:] if len(line[1:].split()) > 1 else ""
33 |                 sequence = ""
34 |                 
35 |             else:
36 |                 sequence += line.strip()
37 |     yield seqName, seqComment, sequence
38 | if __name__ == "__main__":
39 |     args = argparse.ArgumentParser("Create a VContact file from Prodigal output")
40 |     args.add_argument("FASTA", help="FASTA file")
41 |     args.add_argument("-o", "--output", help="VContact mapping file")
42 |     args = args.parse_args()
43 | 
44 |     outfh = open(args.output, 'w') if args.output else sys.stdout
45 |     print("protein_id,contig_id,keywords", file=outfh)
46 | 
47 |     for name, comment, seq in read_fasta(args.FASTA):
48 |         if '||' not in name:
49 |             raise ValueError("FASTA file does not appear to be from Prodigal")
50 |         contig = name.split("||")[0]
51 |         print("{},{},{}".format(name, contig, ""), file=outfh)
52 | 
53 | 
54 |         


--------------------------------------------------------------------------------
/scripts/remap.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | """
  4 | Extract the reads from a BAM file and map them against a new reference
  5 | Requires:
  6 |     - Python3
  7 |     - samtools
  8 |     - minimap2
  9 | """
 10 | 
 11 | import argparse
 12 | import tempfile
 13 | import os, sys, re
 14 | import subprocess
 15 | 
 16 | def checkversion(cmd, grep_kw="."):
 17 |     """
 18 |     Check if a command is available and return its version
 19 |     """
 20 |     try:
 21 |         output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
 22 |     except subprocess.CalledProcessError:
 23 |         print("ERROR: {} not found".format(cmd[0]), file=sys.stderr)
 24 |         sys.exit(1)
 25 |     except Exception as e:
 26 |         print("ERROR: {} not found.\n{}".format(cmd[0], e), file=sys.stderr)
 27 |         sys.exit(1)
 28 | 
 29 |     try:
 30 |         version = re.search(grep_kw, output.decode("utf-8")).group(0)
 31 |         return version
 32 |     except Exception as e:
 33 |         print("ERROR: Version unparsable: {}".format(e), file=sys.stderr)
 34 |         sys.exit(1)
 35 | 
 36 | def execute(cmd):
 37 |     """
 38 |     Execute a command passed as a list of arguments.
 39 |     """
 40 |     command_str = " ".join(cmd)
 41 |     try:
 42 |         print("% {}".format(command_str), file=sys.stderr)
 43 |         retcode = subprocess.call(cmd)
 44 |         if retcode == 0:
 45 |             print("\tDone ✅", file=sys.stderr)
 46 |         elif retcode < 0:
 47 |             print("\tChild was terminated by signal", -retcode, file=sys.stderr)
 48 |         else:
 49 |             print("\tChild returned", retcode, file=sys.stderr)
 50 |     except OSError as e:
 51 |         print("ERROR: Execution failed:", e, file=sys.stderr)
 52 |         sys.exit(1)
 53 |     print("----------------------------------------", file=sys.stderr)
 54 | 
 55 | if __name__ == "__main__":
 56 |     args = argparse.ArgumentParser("Extract reads to a single file")
 57 |     args.add_argument("bam", help="Input BAM file")
 58 |     args.add_argument("-o", "--output", help="Output BAM file", required=True)
 59 |     args.add_argument("-r", "--reference", help="New reference FASTA file", required=True)
 60 |     args.add_argument("-n", "--seqname", help="Extract only reads from this target")
 61 |     args.add_argument("-s", "--singletons", help="Map singletons", action="store_true")
 62 |     args.add_argument("-t", "--threads", help="Number of threads", type=int, default=1)
 63 |     args.add_argument("--keep", help="Keep temporary files", action="store_true")
 64 |     args = args.parse_args()
 65 | 
 66 |     # Check dependencies
 67 |     samtools_version = checkversion(["samtools", "--version"], r"\d+\.\d+\.\d+")
 68 |     minimap2_version = checkversion(["minimap2", "--version"], r".+")
 69 | 
 70 |     print("samtools version: {}".format(samtools_version), file=sys.stderr)
 71 |     print("minimap2 version: {}".format(minimap2_version), file=sys.stderr)
 72 | 
 73 |     # Prepare paths
 74 |     tmpdir = tempfile.mkdtemp(prefix="remaptmp_", dir=os.getcwd())
 75 |     print("Temp dir: %s" % tmpdir, file=sys.stderr)
 76 |     forfile = os.path.join(tmpdir, "read_R1.fq")
 77 |     revfile = os.path.join(tmpdir, "read_R2.fq")
 78 |     tmpfile = os.path.join(tmpdir, "tmp.fq")
 79 |     sngfile = os.path.join(tmpdir, "sing.fq")
 80 |     bamfile = os.path.join(tmpdir, "original.bam")
 81 |     
 82 | 
 83 |     # Extract the reference sequences matching the target
 84 |     # or simply copy the bam to the temp dir for easier access
 85 |     if args.seqname is not None:
 86 |         get_bam_cmd = ["samtools", "view", "-o", bamfile, args.bam, args.seqname]
 87 |     else:
 88 |         get_bam_cmd = ["cp", args.bam, bamfile]
 89 |     
 90 |     execute(get_bam_cmd)
 91 | 
 92 |     # Get reads from the original BAM using samtools
 93 |     if not args.singletons:
 94 |         get_fastq_cmd = ["samtools", "fastq", "-1", forfile, "-2", revfile, "--threads", str(args.threads), bamfile]
 95 |         execute(get_fastq_cmd)
 96 |     else:
 97 |         get_fastq_cmd = ["samtools", "fastq", "-o", forfile, "--threads", str(args.threads), bamfile]
 98 |         execute(get_fastq_cmd)
 99 |       
100 | 
101 |     # Map them
102 |     
103 |     if not args.singletons:
104 |         align_cmd    = ["minimap2", "-x", "sr", "-a" ,"-t", str(args.threads), args.reference, forfile, revfile]
105 |     else:
106 |         align_cmd    = ["minimap2", "-x", "sr", "-a" ,"-t", str(args.threads), args.reference, forfile]
107 |     samtools_cmd = ["samtools", "view", "-bS"]
108 |     sort_cmd     = ["samtools", "sort", "-o", args.output, "-"]
109 | 
110 |     # Compose the pipe using subprocess
111 |     print("% {}".format(align_cmd), file=sys.stderr)
112 |     p1 = subprocess.Popen(align_cmd, stdout=subprocess.PIPE)
113 |     p2 = subprocess.Popen(samtools_cmd, stdin=p1.stdout, stdout=subprocess.PIPE)
114 |     p3 = subprocess.Popen(sort_cmd, stdin=p2.stdout, stdout=None)
115 | 
116 |     # Wait until the pipes have completed
117 |     output, err = p3.communicate()
118 | 
119 | 
120 |     # Remove temporary dir and its contents
121 |     if not args.keep:
122 |         import shutil
123 |         shutil.rmtree(tmpdir)
124 | 
125 | 
126 |     
127 | 


--------------------------------------------------------------------------------
/scripts/start_denovo.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | CONDA_VERSION=$(conda --version)
 5 | DATADIR="$HOME"/denovo-data/
 6 | BOLD=$(tput bold)
 7 | RESET=$(tput sgr0)
 8 | ENVNAME="denovotut"
 9 | CONDA="conda"
10 | 
11 | if [[ ! -z ${1+x} ]]; then
12 |     if [[ $CONDA_VERSION == *"conda 4."* ]]; then
13 |         echo "[INFO] Conda version 4.x detected"
14 |     else
15 |         CONDA="notfound"
16 |     fi
17 | 
18 |     # Check if mamba is available
19 |     if command -v mamba &> /dev/null
20 |     then
21 |         echo "[INFO] Mamba is also available!"
22 |         CONDA="mamba"
23 |     fi
24 | 
25 |     if [[ $CONDA == "notfound" ]]; then
26 |         echo "ERROR: conda/mamba not found"
27 |         exit 1
28 |     fi
29 | 
30 |     # check if the environmnet dtp is already present
31 |     ENV_FOUND=$($CONDA info --envs | grep $ENVNAME | wc -l)
32 |     if [[ $ENV_FOUND == *"0"* ]]; then
33 |         echo -e "=== ${BOLD}Installing packages for the \"de novo\" tutorial${RESET}"
34 |         $CONDA create -n $ENVNAME -c conda-forge -c bioconda --quiet  --yes "seqfu>1.12" "flye" "fastp" "unicycler" "skesa" "abricate" 
35 |     fi
36 | else
37 |     echo "Conda environment will not be created"
38 |     echo 'Try: conda create -n denovo -c conda-forge -c bioconda "seqfu>1.12" "flye" "fastp" "unicycler" "skesa" "abricate" '
39 | fi
40 | #Prepare a folder under HOME USER
41 | echo -e "=== ${BOLD}Downloading datasets${RESET}"
42 | mkdir -p "$DATADIR"
43 | 
44 | 
45 | #Download data
46 | # From the paper Klemm et al 2018: https://doi.org/10.1128/mBio.00105-18
47 | for FILE in illumina_1.fastq.gz illumina_2.fastq.gz JUb129_canu1.6.fa nanopore.fastq.gz Salmonella_enterica_subsp_enterica_serovar_Typhi_str_pBL60006_v1.1.fa Salmonella_enterica_subsp_enterica_serovar_Typhi_str_pBL60006_v1.1.gff;
48 | do
49 |   wget --quiet -O "$DATADIR"/${FILE} "ftp://ftp.sanger.ac.uk/pub/project/pathogens/ap13/${FILE}"
50 |   echo " * Downloaded $FILE"
51 | done
52 | echo -e "${BOLD}Done${RESET}"
53 | 


--------------------------------------------------------------------------------