├── taxdump_readme.txt
├── README.md
└── taxdump_edit.pl


/taxdump_readme.txt:
--------------------------------------------------------------------------------
  1 | This directory contains the following NCBI Taxonomy database dump files:
  2 | 
  3 |    taxdmp.zip
  4 |    taxdump.tar.Z
  5 |    taxdump.tar.gz
  6 | 
  7 | All these files contain exactly the same information and are arranged so
  8 | for the convenience of unpacking them on various operating environments.
  9 | In addition there are files:
 10 | 
 11 |    taxdmp.zip.md5
 12 |    taxdump.tar.Z.md5
 13 |    taxdump.tar.gz.md5
 14 | 
 15 | which contain MD5 sums for the corresponding archive files. These files
 16 | might be used to check correctness of the download of corresponding 
 17 | archive file.
 18 | 
 19 | taxdmp.zip
 20 | ----------
 21 | 
 22 | Is intended for zip-capable utilities such as pkunzip, unzip, and WinZip.
 23 | These utilities are widely available in almost all operating environments.
 24 | To unpack it with command-line pkunzip or unzip:
 25 | 
 26 |         pkunzip taxdmp.zip
 27 | or
 28 |         unzip taxdmp.zip
 29 | 
 30 | Note: pkunzip and/or unzip executables must be in the executable search path
 31 | and taxdmp.zip must be in the current directory. Files will be unzipped into
 32 | current directory. For desired dump files placement and more please refer to
 33 | the manual and/or option descriptions of pkunzip and unzip utilities.
 34 | 
 35 | taxdump.tar.Z
 36 | -------------
 37 | 
 38 | This file is to be unpacked by uncompress utility and subsequent tar 
 39 | archiver. These utilities are usually used in UNIX-like environment. 
 40 | Unpacking instructions follow:
 41 | 
 42 |            uncompress -c taxdump.tar.Z | tar xf - 
 43 | 
 44 | taxdump.tar.gz
 45 | --------------
 46 | 
 47 | This file is to be unpacked by GNU unzip utility and subsequent tar 
 48 | archiver. These utilities are usually used in UNIX-like environment. 
 49 | Unpacking instructions follow:
 50 | 
 51 |            gunzip -c taxdump.tar.gz | tar xf - 
 52 | 
 53 | The content of the archive
 54 | --------------------------
 55 | 
 56 | It may look like this:
 57 | 
 58 | citations.dmp
 59 | delnodes.dmp
 60 | division.dmp
 61 | gencode.dmp
 62 | merged.dmp
 63 | names.dmp
 64 | nodes.dmp
 65 | readme.txt
 66 | 
 67 | The readme.txt file gives a brief description of *.dmp files. These files
 68 | contain taxonomic information and are briefly described below. Each of the
 69 | files stores one record on a single line, terminated by the "\t|\n"
 70 | (tab, vertical bar, and newline) characters. Each record consists of one 
 71 | or more fields delimited by "\t|\t" (tab, vertical bar, and tab) characters.
 72 | The brief description of field position and meaning for each file follows.
 73 | 
 74 | nodes.dmp
 75 | ---------
 76 | 
 77 | This file represents taxonomy nodes. The description for each node includes 
 78 | the following fields:
 79 | 
 80 | 	tax_id					-- node id in GenBank taxonomy database
 81 |  	parent tax_id				-- parent node id in GenBank taxonomy database
 82 |  	rank					-- rank of this node (superkingdom, kingdom, ...) 
 83 |  	embl code				-- locus-name prefix; not unique
 84 |  	division id				-- see division.dmp file
 85 |  	inherited div flag  (1 or 0)		-- 1 if node inherits division from parent
 86 |  	genetic code id				-- see gencode.dmp file
 87 |  	inherited GC  flag  (1 or 0)		-- 1 if node inherits genetic code from parent
 88 |  	mitochondrial genetic code id		-- see gencode.dmp file
 89 |  	inherited MGC flag  (1 or 0)		-- 1 if node inherits mitochondrial gencode from parent
 90 |  	GenBank hidden flag (1 or 0)            -- 1 if name is suppressed in GenBank entry lineage
 91 |  	hidden subtree root flag (1 or 0)       -- 1 if this subtree has no sequence data yet
 92 |  	comments				-- free-text comments and citations
 93 | 
 94 | names.dmp
 95 | ---------
 96 | Taxonomy names file has these fields:
 97 | 
 98 | 	tax_id					-- the id of node associated with this name
 99 | 	name_txt				-- name itself
100 | 	unique name				-- the unique variant of this name if name not unique
101 | 	name class				-- (synonym, common name, ...)
102 | 
103 | division.dmp
104 | ------------
105 | Divisions file has these fields:
106 | 	division id				-- taxonomy database division id
107 | 	division cde				-- GenBank division code (three characters)
108 | 	division name				-- e.g. BCT, PLN, VRT, MAM, PRI...
109 | 	comments
110 | 
111 | gencode.dmp
112 | -----------
113 | Genetic codes file:
114 | 
115 | 	genetic code id				-- GenBank genetic code id
116 | 	abbreviation				-- genetic code name abbreviation
117 | 	name					-- genetic code name
118 | 	cde					-- translation table for this genetic code
119 | 	starts					-- start codons for this genetic code
120 | 
121 | delnodes.dmp
122 | ------------
123 | Deleted nodes (nodes that existed but were deleted) file field:
124 | 
125 | 	tax_id					-- deleted node id
126 | 
127 | merged.dmp
128 | ----------
129 | Merged nodes file fields:
130 | 
131 | 	old_tax_id                              -- id of nodes which has been merged
132 | 	new_tax_id                              -- id of nodes which is result of merging
133 | 
134 | citations.dmp
135 | -------------
136 | Citations file fields:
137 | 
138 | 	cit_id					-- the unique id of citation
139 | 	cit_key					-- citation key
140 |         medline_id                              -- unique id in MedLine database (0 if not in MedLine)
141 | 	pubmed_id				-- unique id in PubMed database (0 if not in PubMed)
142 | 	url					-- URL associated with citation
143 | 	text					-- any text (usually article name and authors)
144 | 						-- The following characters are escaped in this text by a backslash:
145 | 						-- newline (appear as "\n"),
146 | 						-- tab character ("\t"),
147 | 						-- double quotes ('\"'),
148 | 						-- backslash character ("\\").
149 | 	taxid_list				-- list of node ids separated by a single space
150 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Taxdump Edit
  2 | 
  3 | ## Why?
  4 | The taxdump files from NCBI, along with the 'nr' database, are often used in meta -genomics and -transcriptomics software to inform taxonomic identification of reads, contigs and ORFs. However, if you are working with organisms that have little to no representation in the NCBI databases then you may find yourself a bit stuck.
  5 | 
  6 | Many researchers in this situation will have custom databases of genomic/transcriptomic data and want to use it, but may still find their organism(s) unavailable within the NCBI taxonomy DB. If your organism does not have a valid TaxID in NCBI then you are unable to use many of the software packages that rely on 'taxdump' to extract taxonomic lineage and naming information with your custom DBs.
  7 | 
  8 | ## What?
  9 | This tool will allow you to modify the 'taxdump' (appending new data to names.dmp and nodes.dmp) files from NCBI, to temporarily include your organisms - until they find representation of their own in the NCBI taxonomy lineage.
 10 | 
 11 | ## How?
 12 | The script will automatically find the largest taxonomic ID in nodes.dmp and add 10^(length-1) to it, where length is the number of digits in that ID (e.g. a largest ID of 2304349 gives 2304349 + 10^6 = 3304349), and assign the result to your new taxa. This large addition is to avoid future conflicts with taxdump updates. Once added, you can then run *makeblastdb* with the '-taxid' option and your newly assigned TaxID.
 13 | 
 14 | ## Usage
 15 | ```
 16 | 	taxdump_edit.pl -names names.dmp -nodes nodes.dmp -taxa NAME -parent XXX -rank NAME -division X
 17 | 
 18 | 	Required Input:
 19 | 		-names names.dmp location
 20 | 		-nodes nodes.dmp location
 21 | 		-taxa new taxa/group name
 22 | 		-parent parent TaxID
 23 | 		-rank rank name (see -help)
 24 | 		-division division ID (see -help)
 25 | 	Optional Input
 26 | 		-override TaxID from previous
 27 | 	Optional Input (names.dmp):
 28 | 		unique name
 29 | 	Default Values (names.dmp):
 30 | 		name class (scientific name) (see -help)
 31 | 	Optional Input (nodes.dmp):
 32 | 		embl code
 33 | 		genetic code (1) (see -help)
 34 | 		mitochondria genetic code (1) (see -help)
 35 | 		comments
 36 | 	Default Values (nodes.dmp):
 37 | 		inherited div flag = 1
 38 | 		inherited GC flag = 1
 39 | 		inherited MGC flag = 1
 40 | 		GenBank hidden flag = 1
 41 | 		hidden subtree root flag = 1
 42 | ```
 43 | ## Example
 44 | ### New 'Species'
 45 | Adding a new 'species' lineage, for example, MAST-4A. We know by looking at the NCBI Taxonomy that there is already a group for "Stramenopiles MAST-4" at TaxID:[1735725](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1735725) with a lineage of "cellular organisms; Eukaryota; Stramenopiles; unclassified stramenopiles". This is correct for our new organism, so we need to note down the TaxID of '1735725'. Then use the script as below, this assumes some default options which have been noted in the [Usage](https://github.com/guyleonard/taxdump_edit/blob/master/README.md#usage) section above:
 46 | 
 47 |     taxdump_edit.pl -names names.dmp -nodes nodes.dmp -taxa MAST-4A -parent 1735725 -rank species -division 11
 48 | We have given the script the location of both names.dmp and nodes.dmp, along with the new taxa name of 'MAST-4A'. We are saying that the parental lineage is TaxID:1735725 and that the rank of the organism is 'species'. The division number is from the [Division](https://github.com/guyleonard/taxdump_edit/blob/master/README.md#divisions) list below, and is 'Environmental Samples' - number 11 - to reflect the provenance of our sample and unlike many other Stramenopiles in NCBI which are listed as '4' - Plants and Fungi. :/ 
 49 | 
 50 | This will show the output:
 51 | 
 52 |     Your calculated TaxID = 3304349. Please use this with makeblastdb and your fasta sequences.
 53 |     Backing up orginal names.dmp
 54 |     Appending new line
 55 |     Done.
 56 |     Backing up orginal nodes.dmp
 57 |     Appending new line
 58 |     Finished.
 59 | Remember your new TaxID of '3304349', this is the ID you will need to use with *makeblastdb*.
 60 | 
 61 | At the end of the names.dmp file, you will now have a new record:
 62 | 
 63 |     3304349	|	MAST-4A	|		|	scientific name	|
 64 | Along with the corresponding record in nodes.dmp
 65 |     
 66 |     3304349	|	1735725	|	species	|		|	11	|	1	|	1	|	1	|	1	|	1	|	1	|	1	|
 67 | The original nodes.dmp and names.dmp have been backed up in the same location as nodes_backup.dmp and names_backup.dmp.
 68 | 
 69 | ### New Group
 70 | This is done in much the same way, but you will have to add the different lineage levels one-by-one in order to build the taxonomic relationships. However, we don't want the TaxID to keep on jumping by 10^(length-1), so use the -override option to supply the TaxID to assign to each further entry (for example, the previously assigned TaxID plus 1); the script uses the value exactly as given. Add the highest (most inclusive) rank of your new lineage first, e.g. kingdom before class and then finally genus and species, so that each new entry can use the TaxID of the one before it as its -parent.
 71 | 
 72 | ### Variable Options
 73 | #### Divisions
 74 | 	0 -> Bacteria
 75 | 	1 -> Invertebrates
 76 | 	2 -> Mammals
 77 | 	3 -> Phages
 78 | 	4 -> Plants and Fungi
 79 | 	5 -> Primates
 80 | 	6 -> Rodents
 81 | 	7 -> Synthetic and Chimeric
 82 | 	~~8 -> Unassigned - Do Not Use~~
 83 | 	9 -> Viruses
 84 | 	10 -> Vertebrates
 85 | 	11 -> Environmental Samples
 86 | #### Genetic Code
 87 | 	0 -> Unspecified
 88 | 	1 -> Standard
 89 | 	2 -> Vertebrate Mitochondrial
 90 | 	3 -> Yeast Mitochondrial
 91 | 	4 -> Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma
 92 | 	5 -> Invertebrate Mitochondrial
 93 | 	6 -> Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear
 94 | 	9 -> Echinoderm Mitochondrial; Flatworm Mitochondrial
 95 | 	10 -> Euplotid Nuclear
 96 | 	11 -> Bacterial, Archaeal and Plant Plastid
 97 | 	12 -> Alternative Yeast Nuclear
 98 | 	13 -> Ascidian Mitochondrial
 99 | 	14 -> Alternative Flatworm Mitochondrial
100 | 	15 -> Blepharisma Macronuclear
101 | 	16 -> Chlorophycean Mitochondrial
102 | 	21 -> Trematode Mitochondrial
103 | 	22 -> Scenedesmus obliquus mitochondrial
104 | 	23 -> Thraustochytrium mitochondrial code
105 | 	24 -> Pterobranchia Mitochondrial
106 | 	25 -> Candidate Division SR 1 and Gracilibacteria
107 | 	26 -> Pachysolen tannophilus Nuclear
108 | 	27 -> Karyorelict Nuclear
109 | 	28 -> Condylostoma Nuclear
110 | 	29 -> Mesodinium Nuclear
111 | 	30 -> Peritrich Nuclear
112 | 	31 -> Blastocrithidia Nuclear
113 | #### Name Class
114 | 	Acronym
115 | 	Anamorph
116 | 	Authority
117 | 	Blast Name
118 | 	Common Name
119 | 	Equivalent Name
120 | 	Genbank Acronym
121 | 	Genbank Anamorph
122 | 	Genbank Common Name
123 | 	Genbank Synonym
124 | 	Includes
125 | 	In-part
126 | 	Misnomer
127 | 	Misspelling
128 | 	Scientific Name
129 | 	Synonym
130 | 	Teleomorph
131 | 	Type Material
132 | #### Taxonomic Rank
133 | 	no rank
134 | 	superkingdom
135 | 		kingdom
136 | 			subkingdom
137 | 	superphylum
138 | 		phylum
139 | 			subphylum
140 | 	superclass
141 | 		class
142 | 			subclass
143 | 				infraclass
144 | 	cohort
145 | 	superorder
146 | 		order
147 | 			suborder
148 | 				infraorder
149 | 					parvorder
150 | 	superfamily
151 | 		family
152 | 			subfamily
153 | 			tribe
154 | 				subtribe
155 | 		genus
156 | 			subgenus
157 | 	species group
158 | 		species
159 | 		species subgroup
160 | 			subspecies
161 | 				varietas
162 | 					forma
163 | 
164 | # More Information 
165 | ## Structure of \*.dmp files
166 | As per NCBI's taxdump_readme.txt:
167 | Each of the files store one record in the single line that are delimited by "\t|\n" (tab, vertical bar, and newline) characters. Each record consists of one or more fields delimited by "\t|\t" (tab, vertical bar, and tab) characters. The brief description of field position and meaning for each file follows.
168 | 
169 | ## nodes.dmp
170 | This file represents taxonomy nodes. The description for each node includes the following fields:
171 | 
172 | 	tax_id					-- node id in GenBank taxonomy database
173 |  	parent tax_id				-- parent node id in GenBank taxonomy database
174 |  	rank					-- rank of this node (superkingdom, kingdom, ...) 
175 |  	embl code				-- locus-name prefix; not unique
176 |  	division id				-- see division.dmp file
177 |  	inherited div flag  (1 or 0)		-- 1 if node inherits division from parent
178 |  	genetic code id				-- see gencode.dmp file
179 |  	inherited GC  flag  (1 or 0)		-- 1 if node inherits genetic code from parent
180 |  	mitochondrial genetic code id		-- see gencode.dmp file
181 |  	inherited MGC flag  (1 or 0)		-- 1 if node inherits mitochondrial gencode from parent
182 |  	GenBank hidden flag (1 or 0)            -- 1 if name is suppressed in GenBank entry lineage
183 |  	hidden subtree root flag (1 or 0)       -- 1 if this subtree has no sequence data yet
184 |  	comments				-- free-text comments and citations
185 | 
186 | ## names.dmp
187 | Taxonomy names file has these fields:
188 | 
189 | 	tax_id					-- the id of node associated with this name
190 | 	name_txt				-- name itself
191 | 	unique name				-- the unique variant of this name if name not unique
192 | 	name class				-- (synonym, common name, ...)
193 | 
194 | ## Taxdump Files
195 | ```
196 | wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
197 | tar zxvf taxdump.tar.gz
198 | ```
199 | 
200 | ## Taxdump Readme
201 | ```
202 | wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump_readme.txt
203 | ```
204 | 
205 | ## Citation
206 | [![DOI](https://zenodo.org/badge/145837876.svg)](https://zenodo.org/badge/latestdoi/145837876)
207 | 
208 | 


--------------------------------------------------------------------------------
/taxdump_edit.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | use strict;
  3 | use warnings;
  4 | 
  5 | use File::Basename;
  6 | use File::Copy;
  7 | use Getopt::Long;
  8 | 
  9 | our $VERSION = 0.1;
 10 | my $version = "taxdump_edit.pl v$VERSION";
 11 | 
 12 | # user inputs: paths to nodes.dmp / names.dmp and an optional fixed TaxID (undefined until set by GetOptions)
 13 | my $nodes;
 14 | my $names;
 15 | my $override;
 16 | 
 17 | ## names.dmp specific: fields of the record that gets appended to names.dmp
 18 | my $new_name        = '';                   # the name itself
 19 | my $new_name_unique = '';                   # the unique variant of this name if name not unique
 20 | my $new_name_class  = 'scientific name';    # see name class
 21 | 
 22 | ## nodes.dmp specific: fields of the record that gets appended to nodes.dmp
 23 | my $parent_tax_id            = '';          # parent node id in GenBank taxonomy database
 24 | my $rank                     = '';          # rank of this node (superkingdom, kingdom, ...)
 25 | my $embl_code                = '';          # locus-name prefix; not unique
 26 | my $division_id              = '';          # see division.dmp file
 27 | my $inherited_div_flag       = '1';         # (1 or 0) 1 if node inherits division from parent
 28 | my $genetic_code_id          = '1';         # see gencode.dmp file
 29 | my $inherited_GC_flag        = '1';         # (1 or 0) 1 if node inherits genetic code from parent
 30 | my $mito_gen_code_id         = '1';         # see gencode.dmp file
 31 | my $inherited_MGC_flag       = '1';         # (1 or 0) 1 if node inherits mitochondrial gencode from parent
 32 | my $gb_hidden_flag           = '1';         # (1 or 0) 1 if name is suppressed in GenBank entry lineage
 33 | my $hidden_subtree_root_flag = '1';         # (1 or 0) 1 if this subtree has no sequence data yet
 34 | my $comments                 = '';          # free-text comments and citations
 35 | 
 36 | # Command-line options. GetOptions returns false on an unknown or malformed
 37 | # option, in which case the short help is printed and the script exits.
 38 | GetOptions(
 39 |     # required
 40 |     'nodes=s'    => \$nodes,
 41 |     'names=s'    => \$names,
 42 |     'taxa=s'     => \$new_name,
 43 |     'parent=i'   => \$parent_tax_id,
 44 |     'rank=s'     => \$rank,
 45 |     'division=i' => \$division_id,
 46 | 
 47 |     # optional
 48 |     'override=i' => \$override,
 49 | 
 50 |     # other
 51 |     'version|v' => sub { print "$version\n"; exit 0 },    # stop here instead of falling through to the required-option errors
 52 |     'h'         => sub { help_message( "Welcome to $version", 0 ) },
 53 |     'help'      => sub { help_message( "Welcome to $version", 1 ) }
 54 | ) or help_message( "Hello :) Something is missing...", 0 );
 55 | # $nodes/$names start undefined; the other required values start as '', so test their length ('0' is a valid division)
 56 | help_message( "The nodes.dmp location must be specified.", 0 )
 57 |   unless defined $nodes;
 58 | help_message( "The names.dmp location must be specified.", 0 )
 59 |   unless defined $names;
 60 | help_message( "The taxon name or group name must be specified.", 0 )
 61 |   unless length $new_name;
 62 | help_message( "The parent taxa ID must be specified.", 0 )
 63 |   unless length $parent_tax_id;
 64 | help_message( "The rank must be specified.", 0 ) unless length $rank;
 65 | help_message( "The division must be specified.", 0 )
 66 |   unless length $division_id;
 67 | 
 68 | # TaxID reported by get_largest_tax_id for the user-supplied nodes.dmp
 69 | my $largest_taxid = get_largest_tax_id($nodes);
 70 | 
 71 | # An explicit -override value is used exactly as given. Otherwise the new
 72 | # TaxID is that ID plus 10^(number of digits - 1), e.g. 2304349 + 10^6 =
 73 | # 3304349, a large jump meant to avoid conflicts with later taxdump updates.
 74 | my $new_taxid =
 75 |   defined $override
 76 |   ? $override
 77 |   : $largest_taxid + ( 10**( length($largest_taxid) - 1 ) );
 78 | 
 79 | # report the ID so it can be passed on to makeblastdb
 80 | print "Your calculated TaxID = $new_taxid. Please use this with makeblastdb and your fasta sequences.\n";
 81 | 
 82 | ## Edit names.dmp
 83 | # Back up the original file beside it as <name>_backup.dmp before touching it
 84 | my ( $file, $dir, $ext ) = fileparse $names, '\.dmp';
 85 | my $names_backup = "$dir$file\_backup$ext";    # fileparse's directory part already ends in a separator
 86 | print "Backing up orginal names.dmp\n";
 87 | copy( $names, $names_backup ) or die "Copy failed: $!";
 88 | 
 89 | # Append the new record: fields joined by "\t|\t", record terminated by "\t|\n"
 90 | print "Appending new line\n";
 91 | open( my $names_edit_fh, '>>', $names ) or die "Can't open $names for appending: $!\n";
 92 | print $names_edit_fh "$new_taxid\t\|\t$new_name\t\|\t$new_name_unique\t\|\t$new_name_class\t\|\n";
 93 | close($names_edit_fh) or die "Can't close $names: $!\n";    # buffered write errors surface here
 94 | print "Done.\n";
 95 | 
 96 | ## Edit nodes.dmp
 97 | # Back up the original file beside it as <name>_backup.dmp before touching it
 98 | ( $file, $dir, $ext ) = fileparse $nodes, '\.dmp';
 99 | my $nodes_backup = "$dir$file\_backup$ext";    # fileparse's directory part already ends in a separator
100 | print "Backing up orginal nodes.dmp\n";
101 | copy( $nodes, $nodes_backup ) or die "Copy failed: $!";
102 | 
103 | # Append the new record: 13 fields joined by "\t|\t"; the closing "\t|\n" is the record terminator the dump format requires
104 | print "Appending new line\n";
105 | open( my $nodes_edit_fh, '>>', $nodes ) or die "Can't open $nodes for appending: $!\n";
106 | print {$nodes_edit_fh} join( "\t|\t", $new_taxid, $parent_tax_id, $rank, $embl_code, $division_id, $inherited_div_flag, $genetic_code_id, $inherited_GC_flag, $mito_gen_code_id, $inherited_MGC_flag, $gb_hidden_flag, $hidden_subtree_root_flag, $comments ), "\t|\n";
107 | close($nodes_edit_fh) or die "Can't close $nodes: $!\n";    # buffered write errors surface here
108 | print "Finished.\n";
109 | 
110 | #############
111 | # subroutines
112 | #############
113 | 
114 | # Scan a nodes.dmp or names.dmp file and return the largest TaxID found in
115 | # the first tab-delimited column. Every line is compared, so the file need
116 | # not be sorted. Dies if the file cannot be opened or contains no TaxID.
117 | sub get_largest_tax_id {
118 |     my $filename = shift;
119 |     open my $fh, '<', $filename or die "Can't open: $filename $!\n";
120 |     my $largest;
121 |     while ( my $line = <$fh> ) {
122 |         next unless $line =~ /^(\d+)\t/;    # ignore lines without a leading numeric id
123 |         $largest = $1 if !defined $largest || $1 > $largest;
124 |     }
125 |     die "No TaxID found in: $filename\n" unless defined $largest;
126 |     return $largest;    # $fh is closed automatically when it goes out of scope
127 | }
128 | 
129 | # Print a message followed by the usage text, then exit with status 1.
130 | #   message ($_[0]) - text shown first; a newline is appended if it lacks one
131 | #   verbose ($_[1]) - when 1, the division, genetic code, name class and
132 | #                     taxonomic rank listings are printed as well
133 | # This sub always exits, so it never returns to its caller.
134 | sub help_message {
135 |     my ( $message, $verbose ) = @_;
136 | 
137 |     # make sure a non-empty message ends in a newline before it is printed
138 |     $message .= "\n"
139 |       if defined $message && length $message && $message !~ /\n$/;
140 | 
141 |     ( my $command = $0 ) =~ s#^.*/##;    # script name without its directory
142 |     print "$message\n";
143 |     print "usage: $command -names names.dmp -nodes nodes.dmp -taxa NAME -parent XXX -rank NAME -division X\n";
144 |     print <<"HELP";
145 | Required Input:
146 | \t-names names.dmp location
147 | \t-nodes nodes.dmp location
148 | \t-taxa new taxa/group name
149 | \t-parent parent TaxID
150 | \t-rank rank name (see -help)
151 | \t-division division ID (see -help)
152 | Optional Input
153 | \t-override TaxID from previous
154 | Optional Input (names.dmp):
155 | \tunique name
156 | Default Values (names.dmp):
157 | \tname class (scientific name) (see -help)
158 | Optional Input (nodes.dmp):
159 | \tembl code
160 | \tgenetic code (1) (see -help)
161 | \tmitochondria genetic code (1) (see -help)
162 | \tcomments
163 | Default Values (nodes.dmp):
164 | \tinherited div flag = 1
165 | \tinherited GC flag = 1
166 | \tinherited MGC flag = 1
167 | \tGenBank hidden flag = 1
168 | \thidden subtree root flag = 1
169 | HELP
170 |     print "Use -help for a more verbose help message.\n";
171 | 
172 |     # lookup tables that are only printed in verbose mode
173 |     my %divisions = (
174 |         0  => 'Bacteria',
175 |         1  => 'Invertebrates',
176 |         2  => 'Mammals',
177 |         3  => 'Phages',
178 |         4  => 'Plants and Fungi',
179 |         5  => 'Primates',
180 |         6  => 'Rodents',
181 |         7  => 'Synthetic and Chimeric',
182 |         8  => 'Unassigned - Do Not Use',
183 |         9  => 'Viruses',
184 |         10 => 'Vertebrates',
185 |         11 => 'Environmental Samples'
186 |     );
187 | 
188 |     my %genetic_code = (
189 |         0  => 'Unspecified',
190 |         1  => 'Standard',
191 |         2  => 'Vertebrate Mitochondrial',
192 |         3  => 'Yeast Mitochondrial',
193 |         4  => 'Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma',
194 |         5  => 'Invertebrate Mitochondrial',
195 |         6  => 'Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear',
196 |         9  => 'Echinoderm Mitochondrial; Flatworm Mitochondrial',
197 |         10 => 'Euplotid Nuclear',
198 |         11 => 'Bacterial, Archaeal and Plant Plastid',
199 |         12 => 'Alternative Yeast Nuclear',
200 |         13 => 'Ascidian Mitochondrial',
201 |         14 => 'Alternative Flatworm Mitochondrial',
202 |         15 => 'Blepharisma Macronuclear',
203 |         16 => 'Chlorophycean Mitochondrial',
204 |         21 => 'Trematode Mitochondrial',
205 |         22 => 'Scenedesmus obliquus mitochondrial',
206 |         23 => 'Thraustochytrium mitochondrial code',
207 |         24 => 'Pterobranchia Mitochondrial',
208 |         25 => 'Candidate Division SR 1 and Gracilibacteria',
209 |         26 => 'Pachysolen tannophilus Nuclear',
210 |         27 => 'Karyorelict Nuclear',
211 |         28 => 'Condylostoma Nuclear',
212 |         29 => 'Mesodinium Nuclear',
213 |         30 => 'Peritrich Nuclear',
214 |         31 => 'Blastocrithidia Nuclear'
215 |     );
216 | 
217 |     my $name_class =
218 | "Acronym\nAnamorph\nAuthority\nBlast Name\nCommon Name\nEquivalent Name\nGenbank Acronym\nGenbank Anamorph\nGenbank Common Name\nGenbank Synonym\nIncludes\nIn-part\nMisnomer\nMisspelling\nScientific Name\nSynonym\nTeleomorph\nType Material";
219 | 
220 |     my $taxonomic_rank =
221 | "no rank\nsuperkingdom\n\tkingdom\n\t\tsubkingdom\nsuperphylum\n\tphylum\n\t\tsubphylum\nsuperclass\n\tclass\n\t\tsubclass\n\t\t\tinfraclass\ncohort\nsuperorder\n\torder\n\t\tsuborder\n\t\t\tinfraorder\n\t\t\t\tparvorder\nsuperfamily\n\tfamily\n\t\tsubfamily\n\t\ttribe\n\t\t\tsubtribe\n\tgenus\n\t\tsubgenus\nspecies group\n\tspecies\n\tspecies subgroup\n\t\tsubspecies\n\t\t\tvarietas\n\t\t\t\tforma\n";
222 | 
223 |     exit(1) unless $verbose == 1;    # the short form stops here
224 | 
225 |     print "\nDivisions (use number code):\n";
226 |     print map { "$_: $divisions{$_}\n" } sort { $a <=> $b } keys %divisions;
227 | 
228 |     print "\nGenetic Codes (use number code):\n";
229 |     print map { "$_: $genetic_code{$_}\n" } sort { $a <=> $b } keys %genetic_code;
230 | 
231 |     print "\nName Class (use name):\n$name_class\n";
232 | 
233 |     print "\nTaxonomic Rank (use name):\n$taxonomic_rank\n";
234 | 
235 |     exit(1);
236 | }
237 | 


--------------------------------------------------------------------------------