├── md ├── logo_digs.png ├── CONTRIBUTING.md └── code_of_conduct.md ├── modules ├── GUIDE.md ├── DIGS │ ├── CrossMatch.pm │ ├── Extract.pm │ ├── Classify.pm │ ├── Consolidate.pm │ ├── Initialise.pm │ ├── Utility.pm │ ├── TargetDB.pm │ └── DIGS.pm ├── Base │ ├── DevTools.pm │ ├── Console.pm │ └── FileIO.pm └── Interface │ ├── BLAST.pm │ └── MySQLtable.pm ├── digs_tool.pl └── README.md /md/logo_digs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/giffordlabcvr/DIGS-tool/HEAD/md/logo_digs.png -------------------------------------------------------------------------------- /modules/GUIDE.md: -------------------------------------------------------------------------------- 1 | **Organisation of the DIGS code: PERL module library** 2 | ------------------------------------------------------------------------------------ 3 | 4 | ### **Modules in the DIGS directory** 5 | 6 | #### Main modules 7 | 8 | ``` 9 | - DIGS.pm # Database-integrated genome screening - main functions 10 | - TargetDB.pm # Managing the screening directory 11 | - Utility.pm # Utility functions 12 | - Nomenclature.pm # Locus ID creation and management 13 | - Test.pm # Tests 14 | ``` 15 | 16 | #### Modules for setting up 17 | 18 | ``` 19 | - Initialise.pm # General set-up (loading the database etc) 20 | - ScreenBuilder.pm # Setting up a screen 21 | ``` 22 | 23 | #### Modules used for running screens and merging loci 24 | 25 | ``` 26 | - Classify.pm # Classify sequences using BLAST 27 | - CrossMatch.pm # Capture information about cross-matching during DIGS 28 | - Defragment.pm # Functions for clustering, defragmenting, consolidating loci 29 | - Extract.pm # Functions for extracting sequences from FASTA files 30 | ``` 31 | 32 | ### **Modules in the Interface directory** 33 | 34 | ``` 35 | - BLAST.pm # A Perl interface to the BLAST executables 36 | - MySQLtable.pm # A Perl interface to a MySQL table 37 | ``` 38 | 39 | ### **Modules in the Base directory** 40 | 41 | ``` 42 | - Console.pm # Basic console functions 43 | - FileIO.pm # Basic file IO etc 44 | - DevTools.pm # Tools used for debugging 45 | ``` 46 | -------------------------------------------------------------------------------- /md/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # DIGS tool Contributor Guidelines 2 | 3 | Welcome to the DIGS tool ! We appreciate your interest in contributing to our project. By contributing, you help make DIGS tool a better resource for the community. 4 | 5 | ## Table of Contents 6 | 7 | - [How to Contribute](#how-to-contribute) 8 | - [Code of Conduct](#code-of-conduct) 9 | - [Reporting Bugs](#reporting-bugs) 10 | - [Suggesting Enhancements](#suggesting-enhancements) 11 | - [Pull Requests](#pull-requests) 12 | - [Style Guide](#style-guide) 13 | - [License](#license) 14 | 15 | ## How to Contribute 16 | 17 | ### Reporting Bugs 18 | 19 | If you encounter a bug, please help us by providing detailed information. You can report bugs by: 20 | 21 | 1. Checking the existing issues to see if the bug has already been reported. 22 | 2. Creating a new issue, including a clear and descriptive title, steps to reproduce the bug, and any relevant error messages or screenshots. 23 | 24 | ### Suggesting Enhancements 25 | 26 | We welcome suggestions for new features or enhancements. To suggest an enhancement: 27 | 28 | 1. Check the existing issues to make sure your idea hasn't been suggested. 29 | 2. Create a new issue, explaining the enhancement you'd like to see and why it would be valuable. 30 | 31 | ### Pull Requests 32 | 33 | If you want to contribute directly to the codebase: 34 | 35 | 1. Fork the repository and create a new branch for your feature or bug fix. 36 | 2. Make your changes and ensure the existing tests pass. 37 | 3. Add tests for your changes if applicable. 38 | 4. Update the documentation to reflect your changes. 39 | 5. Submit a pull request, providing a clear description of your changes and reference any related issues. 40 | 41 | ## Code of Conduct 42 | 43 | Please review and adhere to our [Code of Conduct](./code_of_conduct.md). By participating, you are expected to uphold this code. Report any unacceptable behavior to [robert.gifford@glasgow.ac.uk](mailto:robert.gifford@glasgow.ac.uk). 44 | 45 | ## Style Guide 46 | 47 | Please follow the existing code style within the DIGS tool project to maintain consistency across the codebase. 48 | 49 | ## License 50 | 51 | By contributing to DIGS tool, you agree that your contributions will be licensed under the [GNU Affero General Public License v. 3.0](https://www.gnu.org/licenses/agpl-3.0.en.html). 52 | 53 | Thank you for contributing to DIGS tool ! 54 | -------------------------------------------------------------------------------- /modules/DIGS/CrossMatch.pm: -------------------------------------------------------------------------------- 1 | #!usr/bin/perl -w 2 | ############################################################################ 3 | # Module: CrossMatch.pm 4 | # Description: Capture information about cross-matching during DIGS 5 | # History: May 2017: Created by Robert Gifford 6 | ############################################################################ 7 | package CrossMatch; 8 | 9 | ############################################################################ 10 | # Import statements/packages (externally developed packages) 11 | ############################################################################ 12 | use strict; 13 | 14 | ############################################################################ 15 | # Import statements/packages (internally developed packages) 16 | ############################################################################ 17 | 18 | # Base classes 19 | use Base::FileIO; 20 | use Base::Console; 21 | use Base::DevTools; 22 | 23 | ############################################################################ 24 | # Globals 25 | ############################################################################ 26 | 27 | # Base objects 28 | my $fileio = FileIO->new(); 29 | my $console = Console->new(); 30 | my $devtools = DevTools->new(); 31 | 1; 32 | 33 | ############################################################################ 34 | # LIFECYCLE 35 | ############################################################################ 36 | 37 | #*************************************************************************** 38 | # Subroutine: new 39 | # Description: create new CrossMatch 'object' 40 | #*************************************************************************** 41 | sub new { 42 | 43 | my ($invocant, $parameter_ref) = @_; 44 | my $class = ref($invocant) || $invocant; 45 | 46 | # Declare empty data structures 47 | my %crossmatching; 48 | 49 | # Set member variables 50 | my $self = { 51 | 52 | # Global settings 53 | process_id => $parameter_ref->{process_id}, 54 | program_version => $parameter_ref->{program_version}, 55 | 56 | # Flags 57 | verbose => $parameter_ref->{verbose}, 58 | force => $parameter_ref->{force}, 59 | 60 | # Data structures 61 | crossmatching => \%crossmatching, 62 | 63 | }; 64 | 65 | bless ($self, $class); 66 | return $self; 67 | } 68 | 69 | ############################################################################ 70 | # INTERNAL FUNCTIONS: recording cross-matching 71 | ########################################################################### 72 | 73 | #*************************************************************************** 74 | # Subroutine: update_cross_matching 75 | # Description: update a hash to record cross-matches 76 | #*************************************************************************** 77 | sub update_cross_matching { 78 | 79 | my ($self, $probe_key, $assigned) = @_; 80 | 81 | my $crossmatch_ref = $self->{crossmatching}; 82 | 83 | if ($crossmatch_ref->{$probe_key}) { 84 | my $cross_matches_ref = $crossmatch_ref->{$probe_key}; 85 | if ($cross_matches_ref->{$assigned}) { 86 | $cross_matches_ref->{$assigned}++; 87 | } 88 | else { 89 | $cross_matches_ref->{$assigned} = 1; 90 | } 91 | } 92 | else { 93 | my %crossmatch; 94 | $crossmatch{$assigned} = 1; 95 | $crossmatch_ref->{$probe_key} = \%crossmatch; 96 | } 97 | } 98 | 99 | #*************************************************************************** 100 | # Subroutine: show_cross_matching 101 | # Description: show contents of hash that records cross-matches 102 | #*************************************************************************** 103 | sub show_cross_matching { 104 | 105 | my ($self) = @_; 106 | 107 | print "\n\n\t Summary of cross-matching"; 108 | my $crossmatch_ref = $self->{crossmatching}; 109 | my @probe_names = keys %$crossmatch_ref; 110 | foreach my $probe_name (@probe_names) { 111 | 112 | my $cross_matches_ref = $crossmatch_ref->{$probe_name}; 113 | my @cross_matches = keys %$cross_matches_ref; 114 | foreach my $cross_match (@cross_matches) { 115 | my $count = $cross_matches_ref->{$cross_match}; 116 | print "\n\t\t # $count x $probe_name to $cross_match"; 117 | } 118 | } 119 | } 120 | 121 | ############################################################################ 122 | # EOF 123 | ############################################################################ 124 | -------------------------------------------------------------------------------- /modules/DIGS/Extract.pm: -------------------------------------------------------------------------------- 1 | #!usr/bin/perl -w 2 | ############################################################################ 3 | # Module: Extract.pm 4 | # Description: Functions for extracting sequences from FASTA files 5 | # History: May 2017: Created by Robert Gifford 6 | ############################################################################ 7 | package Extract; 8 | 9 | ############################################################################ 10 | # Import statements/packages (externally developed packages) 11 | ############################################################################ 12 | use strict; 13 | 14 | ############################################################################ 15 | # Import statements/packages (internally developed packages) 16 | ############################################################################ 17 | 18 | # Base classes 19 | use Base::FileIO; 20 | use Base::Console; 21 | use Base::DevTools; 22 | 23 | ############################################################################ 24 | # Globals 25 | ############################################################################ 26 | 27 | # Base objects 28 | my $fileio = FileIO->new(); 29 | my $console = Console->new(); 30 | my $devtools = DevTools->new(); 31 | 32 | my $maximum_extract_length = 30000; 33 | 34 | 1; 35 | 36 | ############################################################################ 37 | # MAIN LOOP 38 | ############################################################################ 39 | 40 | #*************************************************************************** 41 | # Subroutine: new 42 | # Description: create new Extract 'object' 43 | #*************************************************************************** 44 | sub new { 45 | 46 | my ($invocant, $parameter_ref) = @_; 47 | my $class = ref($invocant) || $invocant; 48 | 49 | # Set member variables 50 | my $self = { 51 | 52 | # Global settings 53 | process_id => $parameter_ref->{process_id}, 54 | program_version => $parameter_ref->{program_version}, 55 | 56 | # Flags 57 | verbose => $parameter_ref->{verbose}, 58 | force => $parameter_ref->{force}, 59 | 60 | # Paths 61 | blast_bin_path => $parameter_ref->{blast_bin_path}, 62 | 63 | # Parameters for DIGS 64 | extract_buffer => '', # Obtained from control file 65 | 66 | }; 67 | 68 | bless ($self, $class); 69 | return $self; 70 | } 71 | 72 | ############################################################################ 73 | # INTERNAL FUNCTIONS: EXTRACT 74 | ############################################################################ 75 | 76 | #*************************************************************************** 77 | # Subroutine: extract_locus_sequence_using_blast 78 | # Description: extract sequence from a BLAST-indexed FASTA file using BLAST 79 | #*************************************************************************** 80 | sub extract_locus_sequence_using_blast { 81 | 82 | my ($self, $locus_ref) = @_; 83 | 84 | # Interface to BLAST 85 | my %blast_params; 86 | $blast_params{blast_bin_path} = $self->{blast_bin_path}; 87 | my $blast_obj = BLAST->new(\%blast_params); 88 | 89 | # Get paths, obhects and variables from self 90 | my $verbose = $self->{verbose}; 91 | my $buffer = $self->{extract_buffer}; 92 | 93 | # Add any buffer 94 | if ($buffer) { 95 | my $orientation = $locus_ref->{orientation}; 96 | $self->add_buffer_to_sequence($locus_ref, $orientation); 97 | } 98 | 99 | my $seq_length = ($locus_ref->{end} - $locus_ref->{start}) + 1; 100 | #$seq_length = length $sequence; # Set sequence length 101 | if ($seq_length > $maximum_extract_length) { 102 | 103 | print "\n\t # Sequence too long - skipping!!"; 104 | sleep 1; 105 | return; 106 | } 107 | 108 | # Extract the sequence 109 | my $target_path = $locus_ref->{target_path}; 110 | my $sequence = $blast_obj->extract_sequence($target_path, $locus_ref); 111 | if ($sequence) { 112 | 113 | # If we extracted a sequence, update the data for this locus 114 | if ($verbose) { print "\n\t\t\t # Extracted sequence: $seq_length nucleotides "; } 115 | $locus_ref->{extract_start} = $locus_ref->{start}; 116 | $locus_ref->{extract_end} = $locus_ref->{end}; 117 | $locus_ref->{sequence} = $sequence; 118 | $locus_ref->{sequence_length} = $seq_length; 119 | } 120 | } 121 | 122 | #*************************************************************************** 123 | # Subroutine: add_buffer_to_sequence 124 | # Description: eadd leading-and-trailing buffer to extract coordinates 125 | #*************************************************************************** 126 | sub add_buffer_to_sequence { 127 | 128 | my ($self, $locus_ref, $orientation) = @_; 129 | 130 | my $buffer = $self->{extract_buffer}; 131 | 132 | if ($orientation eq '-') { 133 | $locus_ref->{start} = $locus_ref->{start} + $buffer; 134 | $locus_ref->{end} = $locus_ref->{end} - $buffer; 135 | if ($locus_ref->{end} < 1) { # Don't allow negative coordinates 136 | $locus_ref->{end} = 1; 137 | } 138 | } 139 | else { 140 | $locus_ref->{start} = $locus_ref->{start} - $buffer; 141 | if ($locus_ref->{start} < 1) { # Don't allow negative coordinates 142 | $locus_ref->{start} = 1; 143 | } 144 | $locus_ref->{end} = $locus_ref->{end} + $buffer; 145 | } 146 | } 147 | 148 | ############################################################################ 149 | # EOF 150 | ############################################################################ 151 | -------------------------------------------------------------------------------- /modules/Base/DevTools.pm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | ############################################################################ 3 | # Module: DevTools.pm 4 | # Description: Functions for viewing the contents of PERL data structures 5 | # History: Rob Gifford, Novemeber 2006: Creation 6 | ############################################################################ 7 | package DevTools; 8 | 9 | ############################################################################ 10 | # Import statements/packages (externally developed packages) 11 | ############################################################################ 12 | use strict; 13 | 14 | ############################################################################ 15 | # Import statements/packages (internally developed packages) 16 | ############################################################################ 17 | 18 | # Base classes 19 | use Base::FileIO; 20 | 21 | ############################################################################ 22 | # Globals 23 | ############################################################################ 24 | 25 | # Create base objects 26 | 1; 27 | 28 | ############################################################################ 29 | # LIFECYCLE 30 | ############################################################################ 31 | 32 | #*************************************************************************** 33 | # Subroutine: new 34 | # Description: Parameters 35 | #*************************************************************************** 36 | sub new { 37 | 38 | my ($invocant, $parameter_ref) = @_; 39 | my $class = ref($invocant) || $invocant; 40 | 41 | # Member variables 42 | my $self = { 43 | }; 44 | 45 | bless ($self, $class); 46 | return $self; 47 | } 48 | 49 | ############################################################################ 50 | # Public Member Functions 51 | ############################################################################ 52 | 53 | #*************************************************************************** 54 | # Subroutine: print_hash 55 | # Description: prints the contents of a hash, including other arrays and 56 | # hashes. 57 | # Arguments: $hash_ref: reference to the hash being printed 58 | # $formatting: additional formatting for recursive fxn call 59 | #*************************************************************************** 60 | sub print_hash { 61 | 62 | my ($self, $hash_ref, $formatting) = @_; 63 | 64 | # don't show this line if it's a recursive call 65 | unless ($formatting) { 66 | print "\n\t#~o~ ~o~ ~o~ ~o~ ~o~ ~o~# Showing Hash Contents \n"; 67 | } 68 | 69 | while ( my ($key, $value) = each(%$hash_ref) ) { 70 | 71 | unless ($value) { 72 | $value = "\t\t\t\t>>> UNDEFINED <<<"; 73 | } 74 | 75 | if (ref($value) eq 'HASH' ) { 76 | print "\n\t Hash '$key' in hash:"; 77 | $self->print_hash($value, "\t"); 78 | } 79 | elsif (ref($value) eq 'ARRAY') { 80 | print "\n\t Array '$key' in hash:"; 81 | $self->print_array($value, "\t"); 82 | } 83 | else { 84 | if ($formatting) { print "\n\t$formatting $key => $value"; } 85 | else { print "\n\t$key => $value"; } 86 | } 87 | } 88 | print "\n"; 89 | } 90 | 91 | #*************************************************************************** 92 | # Subroutine: print_array 93 | # Description: prints the contents of an array, including other arrays and 94 | # hashes. 95 | # Arguments: $array_ref: reference to the hash being printed 96 | # $formatting: additional formatting for recursive fxn call 97 | #*************************************************************************** 98 | sub print_array { 99 | 100 | my ($self, $array_ref, $formatting) = @_; 101 | 102 | # don't show this line if it's a recursive call 103 | unless ($formatting) { 104 | print "\n\t#~o~ ~o~ ~o~ ~o~ ~o~ ~o~# Showing Array Contents \n"; 105 | } 106 | 107 | foreach my $item(@$array_ref) { 108 | 109 | if (ref($item) eq 'ARRAY') { 110 | print "\n\t Array in array:"; 111 | $self->print_array($item, "\t"); 112 | } 113 | elsif (ref($item) eq 'HASH' ) { 114 | print "\n\t Hash in array:"; 115 | $self->print_hash($item, "\t"); 116 | } 117 | else { 118 | chomp $item; 119 | if ($formatting) { print "\n\t$formatting $item"; } 120 | else { print "\n\t$item"; } 121 | } 122 | } 123 | print "\n"; 124 | } 125 | #*************************************************************************** 126 | # Subroutine: print_web_hash 127 | # Description: prints the contents of a hash, including other arrays and 128 | # hashes. 129 | # Arguments: $hash_ref: reference to the hash being printed 130 | # $formatting: additional formatting for recursive fxn call 131 | #*************************************************************************** 132 | sub print_web_hash { 133 | 134 | my ($self, $hash_ref, $formatting) = @_; 135 | 136 | # don't show this line if it's a recursive call 137 | unless ($formatting) { 138 | print "
 #~o~ ~o~ ~o~ ~o~ ~o~ ~o~# Showing Hash Contents \n"; 139 | } 140 | 141 | while ( my ($key, $value) = each(%$hash_ref) ) { 142 | 143 | unless ($value) { 144 | $value = "   >>> UNDEFINED <<<"; 145 | } 146 | 147 | if (ref($value) eq 'HASH' ) { 148 | print "
  Hash '$key' in hash:"; 149 | $self->print_web_hash($value, " "); 150 | } 151 | elsif (ref($value) eq 'ARRAY') { 152 | print "
  Array '$key' in hash:"; 153 | $self->print_web_array($value, " "); 154 | } 155 | else { 156 | if ($formatting) { print "
 $formatting $key => $value"; } 157 | else { print "
 $key => $value"; } 158 | } 159 | } 160 | print "
"; 161 | } 162 | 163 | -------------------------------------------------------------------------------- /md/code_of_conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | [INSERT CONTACT METHOD]. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0]. 120 | 121 | Community Impact Guidelines were inspired by 122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available 126 | at [https://www.contributor-covenant.org/translations][translations]. 127 | 128 | [homepage]: https://www.contributor-covenant.org 129 | [v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html 130 | [Mozilla CoC]: https://github.com/mozilla/diversity 131 | [FAQ]: https://www.contributor-covenant.org/faq 132 | [translations]: https://www.contributor-covenant.org/translations 133 | 134 | -------------------------------------------------------------------------------- /digs_tool.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | ############################################################################ 3 | # Script: digs_tool.pl database-integrated genome screening (DIGS) tool 4 | # Description: A tool for exploring genomes 'in silico' using BLAST and 5 | # a relational database 6 | # History: Updated: February 2017 7 | ############################################################################ 8 | 9 | # Capture variables set in the environment 10 | unless ($ENV{'DIGS_GENOMES'}) { 11 | print "\n\n\t Required environment variable '\$DIGS_GENOMES' is undefined\n"; 12 | exit; 13 | } 14 | unless ($ENV{'DIGS_HOME'}) { 15 | print "\n\n\t Required environment variable '\$DIGS_HOME' is undefined\n"; 16 | exit; 17 | } 18 | unless ($ENV{'DIGS_MYSQL_USER'}) { 19 | print "\n\n\t Required environment variable '\$DIGS_MYSQL_USER' is undefined\n"; 20 | exit; 21 | } 22 | unless ($ENV{'DIGS_MYSQL_PASSWORD'}) { 23 | print "\n\n\t Required environment variable '\$DIGS_MYSQL_PASSWORD' is undefined\n"; 24 | exit; 25 | } 26 | 27 | # Include the PERL module library for DIGS 28 | use lib ($ENV{DIGS_HOME}) . '/modules/'; 29 | 30 | ############################################################################ 31 | # Import statements/packages (externally developed packages) 32 | ############################################################################ 33 | use strict; 34 | use Getopt::Long; 35 | use Getopt::Long; 36 | #use Carp; 37 | 38 | ############################################################################ 39 | # Import statements/packages (internally developed packages) 40 | ############################################################################ 41 | 42 | # Base modules 43 | use Base::Console; 44 | use Base::FileIO; 45 | 46 | # Third party program interface modules 47 | use Interface::BLAST; # Interface to BLAST 48 | use Interface::MySQLtable; # Interface to BLAST 49 | 50 | # DIGS framework modules 51 | use DIGS::DIGS; 52 | use DIGS::ScreenBuilder; 53 | use DIGS::TargetDB; 54 | use DIGS::ScreeningDB; 55 | use DIGS::Utility; 56 | 57 | ############################################################################ 58 | # Paths & Globals 59 | ############################################################################ 60 | 61 | # Paths and database connection details from environment variables 62 | my $mysql_username = ($ENV{DIGS_MYSQL_USER}); 63 | my $mysql_password = ($ENV{DIGS_MYSQL_PASSWORD}); 64 | my $genome_use_path = $ENV{DIGS_GENOMES} . '/'; 65 | my $blast_bin_path = ''; # left empty if BLAST+ programs are in your path 66 | my $tmp_path = './tmp'; 67 | 68 | # Version number 69 | my $program_version = '1.13.2'; 70 | 71 | # Create a unique process ID for this DIGS screening process 72 | my $pid = $$; 73 | my $time = time; 74 | my $process_id = $pid . '_' . $time; 75 | 76 | ############################################################################ 77 | # Instantiations 78 | ############################################################################ 79 | 80 | # Base utilites 81 | my $fileio = FileIO->new(); 82 | my $console = Console->new(); 83 | my $devtools = DevTools->new(); 84 | 85 | # Instantiate main program classes using global settings 86 | my %params; 87 | $params{program_version} = $program_version; 88 | $params{process_id} = $process_id; 89 | $params{blast_bin_path} = $blast_bin_path; 90 | $params{genome_use_path} = $genome_use_path; 91 | $params{mysql_username} = $mysql_username ; 92 | $params{mysql_password} = $mysql_password; 93 | $params{tmp_path} = $tmp_path; 94 | my $digs_tool_obj = DIGS->new(\%params); 95 | #$devtools->print_hash(\%params); die; 96 | 97 | ############################################################################ 98 | # Set up USAGE statement 99 | ############################################################################ 100 | 101 | # Initialise usage statement to print if usage is incorrect 102 | my $USAGE = "\n\t ### DIGS version $program_version"; 103 | $USAGE .= "\n\t ### usage: $0 m=[option] -i=[control file] -h=[help]\n\n"; 104 | 105 | ############################################################################ 106 | # Main program 107 | ############################################################################ 108 | 109 | # Run script 110 | main(); 111 | exit; 112 | 113 | ############################################################################ 114 | # Subroutines 115 | ############################################################################ 116 | 117 | #*************************************************************************** 118 | # Subroutine: main 119 | # Description: top level handler fxn 120 | #*************************************************************************** 121 | sub main { 122 | 123 | # Options that require a file path 124 | my $infile = undef; 125 | 126 | # Options that require a numerical value 127 | my $mode = undef; 128 | my $database = undef; 129 | my $utility = undef; 130 | my $genomes = undef; 131 | 132 | # Options that don't require a value 133 | my $help = undef; 134 | my $extra_help = undef; 135 | my $verbose = undef; 136 | my $force = undef; 137 | my $test = undef; 138 | my $create_ids = undef; 139 | 140 | # Read in options using GetOpt::Long 141 | GetOptions ('infile|i=s' => \$infile, 142 | 143 | 'mode|m=i' => \$mode, 144 | 'database|d=i' => \$database, 145 | 'utility=i' => \$utility, 146 | 'genomes=i' => \$genomes, 147 | 'create_ids' => \$create_ids, 148 | 'verbose' => \$verbose, 149 | 'force' => \$force, 150 | 'help' => \$help, 151 | 'test' => \$test, 152 | 153 | ) or die $USAGE; 154 | 155 | # Set flags based on options received 156 | if ($verbose) { $digs_tool_obj->{verbose} = 'true'; } 157 | if ($force) { $digs_tool_obj->{force} = 'true'; } 158 | 159 | # Hand off to functions based on options received 160 | if ($help) { # Show help page 161 | $digs_tool_obj->show_help_page(); 162 | exit; 163 | } 164 | elsif ($mode) { # Main DIGS tool functions 165 | $digs_tool_obj->run_digs_process($infile, $mode); 166 | } 167 | elsif ($database or $utility or $genomes or $utility) { # Utility functions 168 | my $utility_obj = Utility->new($digs_tool_obj); 169 | $utility_obj->run_utility_process($infile, $database, $genomes, $utility); 170 | } 171 | elsif ($test) { # Run inbuilt tests 172 | my $test_obj = Test->new($digs_tool_obj); 173 | $test_obj->show_test_validation_options(); 174 | } 175 | else { die $USAGE; } 176 | 177 | # Exit script 178 | print "\n\n\t # Exit\n\n"; 179 | } 180 | 181 | 182 | ############################################################################ 183 | # End of file 184 | ############################################################################ 185 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Database-Integrated Genome Screening (DIGS) Tool 2 | 3 | 4 | 5 | 6 | Welcome to the GitHub repository for the **DIGS Tool**! 7 | 8 | **Systematic, sequence similarity search-based genome screening** is a powerful approach for identifying and characterising genome features in silico. This approach extends the basic [sequence similarity search](https://blast.ncbi.nlm.nih.gov/) by: 9 | 10 | 1. Performing multiple searches systematically, involving various **query sequences** and/or **target databases**. 11 | 2. Classifying “**hits**” (matching sequences) via comparison to a **reference sequence library** curated by the investigator. 12 | 13 | **Database-integrated genome screening (DIGS)** is a form of systematic genome screening in which a similarity search-based screening pipeline is linked to a **relational database management system** ([RDBMS](https://www.w3schools.com/mysql/mysql_rdbms.asp)). This provides a robust foundation for implementing large-scale, automated screens, and enables a 'database querying' approach to investigating screening output. 14 | 15 | **The DIGS Tool is a software framework for implementing DIGS on UNIX/LINUX and Macintosh OSX platforms**. The program is accessible through a text-based console interface. It uses the [BLAST+ ](https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/) program suite to perform similarity search-based screening, and the [MySQL](https://dev.mysql.com/downloads/mysql/) RDBMS to capture screen output. 16 | 17 | Please see the [User Guide](https://github.com/giffordlabcvr/DIGS-tool/wiki) for more details. 18 | 19 | ## Installation and Setup 20 | 21 | To run the DIGS tool requires [PERL](https://www.perl.org/), [BLAST](https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/) and [MySQL](https://dev.mysql.com/downloads/mysql/) (or a supported fork of MySQL such as MariaDB). 22 | 23 | Steps involved in installing the DIGS tool and using it to perform DIGS are as follows: 24 | 25 | 1. Install and configure DIGS 26 | - [Download](https://github.com/giffordlabcvr/DIGS-tool/zipball/master) the DIGS tool 27 | - Install [PERL](https://www.perl.org/), [BLAST](https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/) and [MySQL](https://dev.mysql.com/downloads/mysql/) 28 | - Install Perl `DBI` and `DBD::MySQL` packages (if they are not already installed) 29 | - Set `$DIGS_HOME` and `$DIGS_GENOMES` environment variables 30 | - `$DIGS_HOME` = path to DIGS tool directory 31 | - `$DIGS_GENOMES` = path to the top level of the target database (tDB) directory (see below) 32 | - Create a MySQL user for DIGS 33 | - Set `$DIGS_MYSQL_USER` and `$DIGS_MYSQL_PASSWORD` environment variables 34 | 35 | 3. Set up the target database (tDB), create a reference sequence library (RSL) and select query (probe) sequences 36 | 37 | 4. Create a [control file](https://github.com/giffordlabcvr/DIGS-tool/wiki/Control-File) for a DIGS project 38 | 39 | 5. Run the DIGS screen based on the control file 40 | 41 | 6. Interrogate the output of DIGS 42 | 43 | 7. Update reference libraries and repeat steps 4+5 using updated information 44 | 45 | **Step 1** and its sub-components are one-offs associated with initial set-up of the DIGS tool. 46 | 47 | **Steps 2-3** refer to the set-up of individual DIGS projects, and will need to be repeated for each distinct screen. 48 | 49 | **Steps 4-6** encapsulate the actual DIGS process. **Step 5** can entail analysis within the screening database (i.e. using [SQL](https://github.com/giffordlabcvr/DIGS-tool/wiki/Example-SQL), but may also entail the analysis of DIGS output in external programs (e.g. phylogeny packages, statistical analysis programs). Iterating on a DIGS project (**Step 6**) is optional. However, it is anticipated that many DIGS projects will be heuristic in nature, and these will commonly require iteration. 50 | 51 | Please see the [User Guide](https://github.com/giffordlabcvr/DIGS-tool/wiki) for more details. 52 | 53 | ## Quick Start 54 | 55 | To see options for screening: 56 | 57 | `./digs_tool.pl -h` 58 | 59 | To run DIGS, the following input data components are required: 60 | 61 | 1. **Target Database ([TDb](https://github.com/giffordlabcvr/DIGS-tool/wiki/Target-Database-%28tDB%29)):** A collection of whole genome sequence or transcriptome assemblies serving as the target for similarity searches. 62 | 2. **Reference Sequence Library ([RSL](https://github.com/giffordlabcvr/DIGS-tool/wiki/Reference-Sequence-Library-%28RSL%29)):** Represents the genetic diversity associated with the genome feature(s) under investigation. 63 | 3. **Query Sequences ([Probes](https://github.com/giffordlabcvr/DIGS-tool/wiki/Query-Sequences-(Probes))):** Input sequences for similarity searches of the Target Database. 64 | 4. **[Control File](https://github.com/giffordlabcvr/DIGS-tool/wiki/Control-File):** Defines parameters and paths for screening. 65 | 66 | 67 | Before running a screen for the first time, you will need to index the TDb for BLAST searching: 68 | 69 | `./digs_tool.pl –m=1 –i=[path to control file]` 70 | 71 | ### Running a sceen 72 | 73 | Once the target database has been indexed, a screen can be executed as follows: 74 | 75 | `./digs_tool.pl –m=2 –i=[path to control file]` 76 | 77 | Progress is written to the terminal, and can also be monitored by issuing SQL queries against the relevant screening database. A screen can be stopped at any time. The next time the tool is restarted, it will initiate screening at the point it left off. 78 | 79 | Please see the [User Guide](https://github.com/giffordlabcvr/DIGS-tool/wiki) for more details. 80 | 81 | ## Citation 82 | 83 | A [paper describing the DIGS tool](https://doi.org/10.1186/s13059-024-03258-y) has been published in the journal Genome Biology: 84 | 85 | Blanco-Melo D, Campbell MA, Zhu H, Dennis TPW, Modha S, Lytras S, Hughes J, Gatseva A, and Gifford RJ (**2024**) 86 | A novel approach to exploring the dark genome and its application to mapping of the vertebrate virus fossil record. 87 | _Genome Biology_ May 13;25(1):120 88 | 89 | ## Contributing 90 | 91 | The DIGS tool team is very open to further development of this software by the open source bioinformatics community. It is probably worth raising any ideas you have with the team before embarking on development. 92 | 93 | If contributing to the DIGS tool, please review our [Contribution Guidelines](./md/CONTRIBUTING.md). 94 | 95 | [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg)](./md/code_of_conduct.md) 96 | 97 | ## Contact 98 | 99 | For questions, issues, or feedback, please contact us at [digstool@gmail.com](mailto:digstool@gmail.com) or open an [issue](https://github.com/giffordlabcvr/DIGS-tool/issues). 100 | 101 | ## Credits 102 | 103 | The DIGS tool was written by Robert J. Gifford. 104 | 105 | ## License 106 | 107 | The project is licensed under the [GNU Affero General Public License v. 3.0](https://www.gnu.org/licenses/agpl-3.0.en.html) 108 | -------------------------------------------------------------------------------- /modules/Interface/BLAST.pm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | ############################################################################ 3 | # Module: BLAST.pm 4 | # Description: A Perl interface to the BLAST executables 5 | # History: Rob Gifford January 2007: Creation 6 | ############################################################################ 7 | package BLAST; 8 | 9 | ############################################################################ 10 | # Import statements/packages (externally developed packages) 11 | ############################################################################ 12 | use strict; 13 | 14 | ############################################################################ 15 | # Import statements/packages (internally developed packages) 16 | ############################################################################ 17 | 18 | # Base classes 19 | use Base::FileIO; 20 | use Base::DevTools; 21 | 22 | ############################################################################ 23 | # Globals 24 | ############################################################################ 25 | 26 | my $fileio = FileIO->new(); 27 | my $devtools = DevTools->new(); 28 | 1; 29 | 30 | ############################################################################ 31 | # LIFECYCLE 32 | ############################################################################ 33 | 34 | #*************************************************************************** 35 | # Subroutine: new 36 | # Description: Create a new BLAST.pm 'object' 37 | #*************************************************************************** 38 | sub new { 39 | 40 | my ($invocant, $parameter_ref) = @_; 41 | my $class = ref($invocant) || $invocant; 42 | 43 | # Member variables 44 | my $self = { 45 | 46 | verbose => $parameter_ref->{verbose}, 47 | blast_bin_path => $parameter_ref->{blast_bin_path}, 48 | 49 | }; 50 | 51 | bless ($self, $class); 52 | return $self; 53 | } 54 | 55 | ############################################################################ 56 | # Top level fxns 57 | ############################################################################ 58 | 59 | #*************************************************************************** 60 | # Subroutine: BLAST 61 | # Description: Execute BLAST a search 62 | #*************************************************************************** 63 | sub blast { 64 | 65 | my ($self, $method, $target_path, $probe_path, $result_path, $options_ref) = @_; 66 | 67 | # Get paths and flags from self 68 | my $blast_path = $self->{blast_bin_path}; 69 | 70 | # Get blast options if supplied 71 | my $num_threads = $options_ref->{num_threads}; 72 | my $word_size = $options_ref->{word_size}; 73 | my $evalue = $options_ref->{evalue}; 74 | my $penalty = $options_ref->{penalty}; 75 | my $reward = $options_ref->{reward}; 76 | my $gapopen = $options_ref->{gapopen}; 77 | my $gapextend = $options_ref->{gapextend}; 78 | my $dust = $options_ref->{dust}; 79 | my $softmasking = $options_ref->{softmasking}; 80 | my $seg = $options_ref->{seg}; 81 | my $outfmt = $options_ref->{outfmt}; 82 | unless ($outfmt) { $outfmt = 7; } # default output format is tab-delimited ( -outfmt 7) 83 | #$devtools->print_hash($options_ref); die; 84 | 85 | # Create BLAST command 86 | $blast_path .= $method; 87 | my $blast_type; 88 | my $set_params; 89 | if ($options_ref) { 90 | $blast_type = $options_ref->{blast_type}; 91 | } 92 | my $command; 93 | if ($blast_type) { 94 | $command = "$blast_path -query $probe_path -subject $target_path "; 95 | $command .= "-out $result_path "; 96 | } 97 | else { 98 | $command = "$blast_path -query $probe_path -db $target_path "; 99 | $command .= "-out $result_path "; 100 | } 101 | 102 | if ($num_threads) { 103 | $command .= "-num_threads $num_threads "; 104 | } 105 | 106 | if ($word_size) { $command .= " -word_size $word_size "; } 107 | if ($evalue) { $command .= " -evalue $evalue "; } 108 | if ($penalty and $reward and $gapopen and $gapextend) { 109 | $command .= " -penalty $penalty "; 110 | $command .= " -reward $reward "; 111 | $command .= " -gapopen $gapopen "; 112 | } 113 | if ($method eq 'blastn') { 114 | 115 | if ($softmasking) { 116 | $command .= " -soft_masking $softmasking "; 117 | } 118 | if ($dust) { 119 | $command .= " -dust $dust "; 120 | } 121 | } 122 | elsif ($method eq 'tblastn') { 123 | if ($seg) { 124 | $command .= " -seg $seg "; 125 | } 126 | } 127 | 128 | # Set the output format for BLAST 129 | $command .= "-outfmt $outfmt"; 130 | 131 | #if ($self->{verbose}) { 132 | # print "\n\n\t ### BLAST COMMAND $command\n"; 133 | #} 134 | 135 | # Execute the command 136 | system $command; 137 | 138 | my $exit_status = system($command); 139 | 140 | if ($exit_status == 0) { 141 | # BLAST search was successful 142 | return 1; 143 | 144 | } else { 145 | # BLAST search failed 146 | print "\n\n\t ### BLAST search failed with error code: $exit_status\n\n"; 147 | return 0; 148 | } 149 | 150 | } 151 | 152 | #*************************************************************************** 153 | # Subroutine: extract_sequence 154 | # Description: Interface to the BLAST+ sequence extraction functions 155 | # see: http://www.ncbi.nlm.nih.gov/books/NBK1763/ 156 | #*************************************************************************** 157 | sub extract_sequence { 158 | 159 | my ($self, $target_path, $data_ref) = @_; 160 | 161 | # Get path to BLAST binary (note: may be NULL value if BLAST programs are in the users path) 162 | my $blast_path = $self->{blast_bin_path}; 163 | 164 | # Get extraction parameters 165 | my $start = $data_ref->{start}; 166 | my $end = $data_ref->{end}; 167 | my $orientation = $data_ref->{orientation}; 168 | my $scaffold = $data_ref->{scaffold}; 169 | unless ($start and $end and $orientation and $scaffold and $target_path) { 170 | $devtools->print_hash($data_ref); die; 171 | } 172 | 173 | # Parsing for blastdbcmd 174 | my @gi = split(/\|/,$scaffold); 175 | if (scalar(@gi) > 1) { 176 | $scaffold = $gi[1]; 177 | } 178 | 179 | # Create the command 180 | # Command example: 181 | # /bin/blast/blastdbcmd -db hs_alt_HuRef_chrX.fa -entry 157734237 182 | # -range 10-60 -strand minus 183 | $blast_path .= 'blastdbcmd'; 184 | my $command = $blast_path . " -db $target_path"; 185 | $command .= " -entry $scaffold "; 186 | $command .= " -range $start-$end "; 187 | if ($orientation eq '-') { $command .= ' -strand minus '; } 188 | 189 | # Execute the command 190 | my @sequence = `$command`; 191 | shift @sequence; # Remove header 192 | my $sequence = join ('', @sequence); 193 | $sequence =~ s/\n//g; 194 | 195 | return $sequence; 196 | } 197 | 198 | ############################################################################ 199 | # Lower level parsing fxns 200 | ############################################################################ 201 | 202 | #*************************************************************************** 203 | # Subroutine: parse_tab_format_results 204 | # Description: parse BLAST results 205 | #*************************************************************************** 206 | sub parse_tab_format_results { 207 | 208 | my ($self, $file, $result_ref) = @_; 209 | 210 | # Read the file into an array 211 | my $fileio = FileIO->new(); 212 | my @file; 213 | $fileio->read_file($file, \@file); 214 | 215 | # Process the file 216 | my @matches; 217 | foreach my $line (@file) { 218 | 219 | if ($line =~ /^#/) { next; } # skip comments 220 | my @data = split("\t", $line); 221 | 222 | # GET THE DATA FOR THIS HIT 223 | my %match; 224 | $match{probe} = $data[0]; 225 | $match{scaffold} = $data[1]; 226 | $match{identity} = $data[2]; 227 | $match{align_len} = $data[3]; 228 | $match{mismatches} = $data[4]; 229 | my $gap_openings = $data[5]; 230 | unless ($gap_openings) { 231 | $gap_openings = '0'; 232 | } 233 | $match{gap_openings} = $gap_openings; 234 | $match{query_start} = $data[6]; 235 | $match{query_stop} = $data[7]; 236 | my $aln_start = $data[8]; 237 | my $aln_stop = $data[9]; 238 | my $e_value = $data[10]; 239 | my $bitscore = $data[11]; 240 | chomp $bitscore; 241 | 242 | # Convert e value 243 | $self->convert_evalue($e_value, \%match); 244 | 245 | # Set orientaton 246 | my $orientation; 247 | if ($aln_stop < $aln_start) { 248 | $orientation = '-'; 249 | 250 | # switch the start and stop around if in -ve orientation 251 | my $switch_start = $aln_stop; 252 | my $switch_stop = $aln_start; 253 | $match{aln_start} = $switch_start; 254 | $match{aln_stop} = $switch_stop; 255 | } 256 | else { 257 | $orientation = '+'; 258 | $match{aln_start} = $aln_start; 259 | $match{aln_stop} = $aln_stop; 260 | } 261 | 262 | $match{orientation} = $orientation; 263 | $match{bitscore} = $bitscore; 264 | push(@$result_ref, \%match); 265 | } 266 | } 267 | 268 | #*************************************************************************** 269 | # Subroutine: convert_evalue 270 | # Description: split evalue string into number and exponential 271 | #*************************************************************************** 272 | sub convert_evalue { 273 | 274 | my ($self, $e_value, $data_ref) = @_; 275 | 276 | my $evalue_num = 0; 277 | my $evalue_exp = 0; 278 | if ($e_value =~ /e/) { 279 | my @evalue_bits = split ("e-", $e_value); 280 | $evalue_num = $evalue_bits[0]; 281 | $evalue_exp = $evalue_bits[1]; 282 | } 283 | else { 284 | $evalue_num = $e_value; 285 | $evalue_exp = 1; 286 | } 287 | $data_ref->{evalue_num} = $evalue_num; 288 | $data_ref->{evalue_exp} = $evalue_exp; 289 | } 290 | 291 | ############################################################################ 292 | # EOF 293 | ############################################################################ 294 | -------------------------------------------------------------------------------- /modules/Interface/MySQLtable.pm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | ############################################################################ 3 | # Module: MySQLtable.pm 4 | # Description: A Perl interface to a MySQL table 5 | # History: March 2011: Creation 6 | ############################################################################ 7 | package MySQLtable; 8 | 9 | ############################################################################ 10 | # Import statements/packages (externally developed packages) 11 | ############################################################################ 12 | use strict; 13 | 14 | ############################################################################ 15 | # Import statements/packages (internally developed packages) 16 | ############################################################################ 17 | 18 | ############################################################################ 19 | # Globals 20 | ############################################################################ 21 | 1; 22 | 23 | ############################################################################ 24 | # LIFECYCLE 25 | ############################################################################ 26 | 27 | #*************************************************************************** 28 | # Subroutine: new 29 | # Description: Create a new MySQLtable.pm object 30 | #*************************************************************************** 31 | sub new { 32 | 33 | my ($invocant, $name, $dbh, $fields_ref) = @_; 34 | 35 | my $class = ref($invocant) || $invocant; 36 | 37 | # Member variables 38 | my $self = { 39 | name => $name, 40 | dbh => $dbh, 41 | fields => $fields_ref, 42 | }; 43 | 44 | bless ($self, $class); 45 | 46 | return $self; 47 | } 48 | 49 | ############################################################################ 50 | # Member Functions 51 | ############################################################################ 52 | 53 | #*************************************************************************** 54 | # Subroutine: insert_row 55 | # Description: generic fxn to submit a row of data to the table 56 | # Arguments: $sample_data_ref: hash reference containing field values 57 | #*************************************************************************** 58 | sub insert_row { 59 | 60 | my ($self, $data_ref) = @_; 61 | 62 | my $dbh = $self->{dbh}; 63 | my $field_ref = $self->{fields}; 64 | my @fields = keys %$field_ref; 65 | my $fields = join(',', @fields); 66 | my $insert_clause = "INSERT INTO $self->{name} \( $fields\)"; 67 | my @values; 68 | foreach my $field (@fields) { 69 | 70 | my $value = $data_ref->{$field}; 71 | unless ($value) { 72 | #print "\n\t WARNING: NO VALUE FOR FIELD '$field' in table $self->{name}"; 73 | $value = '0'; 74 | } 75 | chomp $value; # remove newline 76 | 77 | $value =~ s/\s+$//; # remove trailing whitespace 78 | my $type = $field_ref->{$field}; 79 | 80 | # format the value according to type 81 | my $type_value = $value; 82 | if ($type eq 'varchar' or $type eq 'text' or $type eq 'date') { 83 | $type_value = "'$value'"; # string values should be in quotes 84 | } 85 | push (@values, $type_value); 86 | } 87 | 88 | my $values = join(',', @values); 89 | 90 | my $value_clause = " VALUES \($values\)"; 91 | my $insert = "$insert_clause $value_clause"; 92 | #print "\n\t INSERT: $insert_clause $value_clause\n\n\n"; 93 | 94 | my $sth = $dbh->prepare($insert); 95 | unless ($sth->execute()) { print $insert; exit; } 96 | 97 | # Get the sample ID (generated via autoincrement) 98 | my $db_id = $sth->{mysql_insertid}; 99 | 100 | return $db_id; 101 | } 102 | 103 | #*************************************************************************** 104 | # Subroutine: select row 105 | # Description: Generic select fxn for a selecting a single row and storing 106 | # data in a hash 107 | # Arguments: $field_ref: reference to an array with fields to fetch 108 | # $data_ref: reference to a hash to store the data 109 | # $where: the where clause of the select as a string 110 | #*************************************************************************** 111 | sub select_row { 112 | 113 | my ($self, $field_ref, $data_ref, $where) = @_; 114 | 115 | my $dbh = $self->{dbh}; 116 | my $fields = join(',', @$field_ref); 117 | my $query = "SELECT $fields FROM $self->{name} $where"; 118 | my $sth = $dbh->prepare($query); 119 | ##print "\n\t QUERY $query\n\n\n"; 120 | $sth->execute(); 121 | 122 | # get the values into a hash 123 | my $i = 0; 124 | my @row = $sth->fetchrow_array; 125 | foreach my $field (@$field_ref) { 126 | 127 | $data_ref->{$field} = $row[$i]; 128 | $i++; 129 | } 130 | } 131 | 132 | #*************************************************************************** 133 | # Subroutine: select rows 134 | # Description: Generic select fxn for a one or more rows and storing the 135 | # data in an array of hashes (one row per hash) 136 | #*************************************************************************** 137 | sub select_rows { 138 | 139 | my ($self, $field_ref, $data_ref, $where) = @_; 140 | 141 | my $dbh = $self->{dbh}; 142 | my $fields = join(',', @$field_ref); 143 | 144 | # remove any quotations 145 | $fields =~ s/"//g; 146 | my $query = "SELECT $fields FROM $self->{name}"; 147 | if ($where) { $query .= " $where"; } 148 | #print "\n\n\t QUERY: $query \n\n\n"; 149 | my $sth = $dbh->prepare($query); 150 | unless ($sth->execute()) { print $query; exit; } 151 | my $row_count = 0; 152 | while (my $row = $sth->fetchrow_arrayref) { 153 | 154 | $row_count++; 155 | my $i = 0; 156 | my %row; 157 | foreach my $field (@$field_ref) { 158 | 159 | # Deal with an aliased field 160 | if ($field =~ m/ AS /) { 161 | my @field = split(/\'/,$field); 162 | my $alias = pop @field; 163 | $field = $alias; 164 | } 165 | 166 | my $value = @$row[$i]; 167 | $row{$field} = $value; 168 | $i++; 169 | } 170 | push (@$data_ref, \%row); 171 | } 172 | } 173 | 174 | #*************************************************************************** 175 | # Subroutine: select distinct 176 | # Description: Generic select distinct 177 | # Arguments: $field_ref: list of fields to constrain the 'select' statement 178 | # $data_ref: reference to an array to store the rows 179 | # $where: the where clause of the select as a string 180 | #*************************************************************************** 181 | sub select_distinct { 182 | 183 | my ($self, $field_ref, $data_ref, $where) = @_; 184 | 185 | unless ($where) { $where = ''; } 186 | 187 | my $dbh = $self->{dbh}; 188 | my $fields = join(',', @$field_ref); 189 | 190 | my $query = "SELECT DISTINCT $fields FROM $self->{name}"; 191 | if ($where) { $query .= " $where"; } 192 | my $sth = $dbh->prepare($query); 193 | $sth->execute(); 194 | while (my $row = $sth->fetchrow_arrayref) { 195 | my $i = 0; 196 | my %row; 197 | foreach my $field (@$field_ref) { 198 | my $value = @$row[$i]; 199 | $row{$field} = $value; 200 | $i++; 201 | } 202 | push (@$data_ref, \%row); 203 | } 204 | } 205 | 206 | #*************************************************************************** 207 | # Subroutine: update 208 | # Description: generic update fxn using hashes 209 | # Arguments: $set: reference to hash with the new values 210 | # $where: reference to hash with the where relationships 211 | #*************************************************************************** 212 | sub update { 213 | 214 | my ($self, $set, $where) = @_; 215 | 216 | my $dbh = $self->{dbh}; 217 | my $fields_ref = $self->{fields}; 218 | 219 | my @set_clause; 220 | my @fields = keys %$set; 221 | foreach my $field (@fields) { 222 | my $value = $set->{$field}; 223 | my $type = $fields_ref->{$field}; 224 | unless ($type) { die "no type for field '$field'"; } 225 | 226 | my $f_value = $value; 227 | if ($type eq 'varchar' or $type eq 'text') { 228 | $f_value = "'$value'"; 229 | } 230 | 231 | my $subclause = "$field = $f_value"; 232 | push (@set_clause, $subclause); 233 | } 234 | my $set_clause = join (',', @set_clause); 235 | my $query = "UPDATE $self->{name} 236 | SET $set_clause 237 | $where"; 238 | #print "\n\t\t $query\n\n"; 239 | my $sth = $dbh->prepare($query); 240 | $sth->execute(); 241 | } 242 | 243 | #*************************************************************************** 244 | # Subroutine: delete_rows 245 | # Description: generic fxn to delete rows (safer than flush because requires 246 | # a 'WHERE" statement) 247 | #*************************************************************************** 248 | sub delete_rows { 249 | 250 | my ($self, $where) = @_; 251 | 252 | my $dbh = $self->{dbh}; 253 | unless ($where) { die; } 254 | my $query = "DELETE from $self->{name} $where"; 255 | #print "\n\t\t $query\n\n"; 256 | my $sth = $dbh->prepare($query); 257 | $sth->execute(); 258 | } 259 | 260 | #*************************************************************************** 261 | # Subroutine: flush 262 | # Description: Empty table of all data 263 | #*************************************************************************** 264 | sub flush { 265 | 266 | my ($self, $where) = @_; 267 | 268 | my $dbh = $self->{dbh}; 269 | my $query = "DELETE from $self->{name}"; 270 | if ($where) { 271 | $query .= $where; 272 | } 273 | my $sth = $dbh->prepare($query); 274 | $sth->execute(); 275 | } 276 | 277 | #*************************************************************************** 278 | # Subroutine: reset_primary_key 279 | # Description: Reset primary key of table to start counting at 1 280 | #*************************************************************************** 281 | sub reset_primary_keys { 282 | 283 | my ($self) = @_; 284 | 285 | my $dbh = $self->{dbh}; 286 | my $alter = "ALTER TABLE $self->{name} AUTO_INCREMENT=1"; 287 | my $sth = $dbh->prepare($alter); 288 | $sth->execute(); 289 | } 290 | 291 | ############################################################################ 292 | # EOF 293 | ############################################################################ 294 | -------------------------------------------------------------------------------- /modules/DIGS/Classify.pm: -------------------------------------------------------------------------------- 1 | #!usr/bin/perl -w 2 | ############################################################################ 3 | # Module: Classify.pm 4 | # Description: Capture information about cross-matching during DIGS 5 | # History: May 2017: Created by Robert Gifford 6 | ############################################################################ 7 | package Classify; 8 | 9 | ############################################################################ 10 | # Import statements/packages (externally developed packages) 11 | ############################################################################ 12 | use strict; 13 | 14 | ############################################################################ 15 | # Import statements/packages (internally developed packages) 16 | ############################################################################ 17 | 18 | # Base classes 19 | use Base::FileIO; 20 | use Base::Console; 21 | use Base::DevTools; 22 | 23 | ############################################################################ 24 | # Globals 25 | ############################################################################ 26 | 27 | # Base objects 28 | my $fileio = FileIO->new(); 29 | my $console = Console->new(); 30 | my $devtools = DevTools->new(); 31 | 1; 32 | 33 | ############################################################################ 34 | # LIFECYCLE 35 | ############################################################################ 36 | 37 | #*************************************************************************** 38 | # Subroutine: new 39 | # Description: create new Classify 'object' 40 | #*************************************************************************** 41 | sub new { 42 | 43 | my ($invocant, $parameter_ref) = @_; 44 | my $class = ref($invocant) || $invocant; 45 | 46 | # Set member variables 47 | my $self = { 48 | 49 | # Flags 50 | verbose => $parameter_ref->{verbose}, 51 | 52 | # Parameters for reverse BLAST (hits versus reference sequence library) 53 | num_threads => $parameter_ref->{rev_num_threads}, 54 | word_size => $parameter_ref->{rev_word_size}, 55 | evalue => $parameter_ref->{rev_evalue}, 56 | penalty => $parameter_ref->{rev_penalty}, 57 | reward => $parameter_ref->{rev_reward}, 58 | gapopen => $parameter_ref->{rev_gapopen}, 59 | gapextend => $parameter_ref->{rev_gapextend}, 60 | dust => $parameter_ref->{rev_dust}, 61 | softmasking => $parameter_ref->{rev_softmasking}, 62 | seg => $parameter_ref->{rev_seg}, 63 | 64 | # Paths used in DIGS process 65 | tmp_path => $parameter_ref->{tmp_path}, 66 | blast_bin_path => $parameter_ref->{blast_bin_path}, 67 | # TODO: check why both these are neccessary 68 | aa_reference_library => $parameter_ref->{aa_reference_library}, 69 | na_reference_library => $parameter_ref->{na_reference_library}, 70 | blast_orf_lib_path => $parameter_ref->{blast_orf_lib_path}, 71 | blast_utr_lib_path => $parameter_ref->{blast_utr_lib_path}, 72 | 73 | }; 74 | 75 | bless ($self, $class); 76 | return $self; 77 | } 78 | 79 | ############################################################################ 80 | # INTERNAL FUNCTIONS: CLASSIFY 81 | ############################################################################ 82 | 83 | #*************************************************************************** 84 | # Subroutine: classify_sequence_using_blast 85 | # Description: classify a nucleotide sequence using blast 86 | #*************************************************************************** 87 | sub classify_sequence_using_blast { 88 | 89 | my ($self, $locus_ref) = @_; 90 | 91 | # Set up BLAST object with parameters for the reverse BLAST (from $self) 92 | my $blast_obj = BLAST->new($self); 93 | my $result_path = $self->{tmp_path}; 94 | unless ($result_path) { die; } # Sanity checking 95 | 96 | # Get data about this probe sequence 97 | my $sequence = $locus_ref->{sequence}; 98 | my $probe_type = $locus_ref->{probe_type}; 99 | unless ($probe_type) { die; } # Sanity checking 100 | unless ($sequence) { die; } # Sanity checking 101 | 102 | # Make a FASTA query file 103 | $sequence =~ s/-//g; # Remove any gaps that might happen to be there 104 | $sequence =~ s/~//g; # Remove any gaps that might happen to be there 105 | $sequence =~ s/\s+//g; # Remove any gaps that might happen to be there 106 | my $fasta = ">TEMPORARY\n$sequence"; 107 | my $query_file = $result_path . '/TEMPORARY.fas'; 108 | $fileio->write_text_to_file($query_file, $fasta); 109 | my $result_file = $result_path . '/TEMPORARY.blast_result'; 110 | 111 | # Do the BLAST according to the type of sequence (AA or NA) 112 | my $blast_alg = $self->get_blast_algorithm($probe_type); 113 | my $lib_path = $self->get_blast_library_path($probe_type); 114 | my $lib_file; 115 | if ($probe_type eq 'ORF') { $lib_file = $self->{aa_reference_library}; } 116 | elsif ($probe_type eq 'UTR') { $lib_file = $self->{na_reference_library}; } 117 | else { die; } 118 | unless ($lib_file) { die; } 119 | 120 | # Set parameters for the reverse BLAST 121 | #$devtools->print_hash(\%blast_run_params); 122 | #$devtools->print_hash($self); die; 123 | # Do reverse BLAST search (hits versus reference sequence library) 124 | $blast_obj->blast($blast_alg, $lib_path, $query_file, $result_file, $self); 125 | 126 | # Parse the results 127 | my @results; 128 | $blast_obj->parse_tab_format_results($result_file, \@results); 129 | 130 | # Define some variables for capturing the result 131 | my $top_match = shift @results; 132 | my $query_start = $top_match->{query_start}; 133 | my $query_end = $top_match->{query_stop}; 134 | my $subject_start = $top_match->{aln_start}; 135 | my $subject_end = $top_match->{aln_stop}; 136 | my $assigned_key = $top_match->{scaffold}; 137 | my $assigned; 138 | my $success = 1; 139 | 140 | # Deal with a query that matched nothing in the 2nd BLAST search 141 | unless ($assigned_key) { 142 | $self->set_default_values_for_unassigned_locus($locus_ref); 143 | $assigned = undef; 144 | $success = undef; 145 | } 146 | else { # Assign the extracted sequence based on matches from 2nd BLAST search 147 | 148 | # Split assigned to into (i) refseq match (ii) refseq description (e.g. gene) 149 | my @assigned_key = split('_', $assigned_key); 150 | my $assigned_gene = pop @assigned_key; 151 | my $assigned_name = join ('_', @assigned_key); 152 | #$assigned_name = join ('_', @assigned_name); 153 | $locus_ref->{assigned_name} = $assigned_name; 154 | $locus_ref->{assigned_gene} = $assigned_gene; 155 | $locus_ref->{identity} = $top_match->{identity}; 156 | $locus_ref->{bitscore} = $top_match->{bitscore}; 157 | $locus_ref->{evalue_exp} = $top_match->{evalue_exp}; 158 | $locus_ref->{evalue_num} = $top_match->{evalue_num}; 159 | $locus_ref->{mismatches} = $top_match->{mismatches}; 160 | $locus_ref->{align_len} = $top_match->{align_len}; 161 | $locus_ref->{gap_openings} = $top_match->{gap_openings}; 162 | $locus_ref->{query_end} = $query_end; 163 | $locus_ref->{query_start} = $query_start; 164 | $locus_ref->{subject_end} = $subject_end; 165 | $locus_ref->{subject_start} = $subject_start; 166 | #$devtools->print_hash($locus_ref); die; 167 | 168 | my $id = $locus_ref->{record_id}; 169 | print "\n\t\t# Assigned as '$assigned_name ($assigned_gene)'"; 170 | print " via $blast_alg comparison to $lib_file"; 171 | $assigned = $assigned_name . '_' . $assigned_gene; 172 | } 173 | 174 | # Clean up 175 | my $command1 = "rm $query_file"; 176 | my $command2 = "rm $result_file"; 177 | system $command1; 178 | system $command2; 179 | 180 | return $success; 181 | } 182 | 183 | #*************************************************************************** 184 | # Subroutine: set_default_values_for_unassigned_locus 185 | # Description: set default values for an unassigned extracted sequence 186 | #*************************************************************************** 187 | sub set_default_values_for_unassigned_locus { 188 | 189 | my ($self, $hit_ref) = @_; 190 | 191 | $hit_ref->{assigned_name} = 'Unassigned'; 192 | $hit_ref->{assigned_gene} = 'Unassigned'; 193 | $hit_ref->{identity} = 0; 194 | $hit_ref->{bitscore} = 0; 195 | $hit_ref->{evalue_exp} = 0; 196 | $hit_ref->{evalue_num} = 0; 197 | $hit_ref->{mismatches} = 0; 198 | $hit_ref->{align_len} = 0; 199 | $hit_ref->{gap_openings} = 0; 200 | $hit_ref->{query_end} = 0; 201 | $hit_ref->{query_start} = 0; 202 | $hit_ref->{subject_end} = 0; 203 | $hit_ref->{subject_start} = 0; 204 | 205 | } 206 | 207 | #*************************************************************************** 208 | # Subroutine: get_blast_algorithm 209 | # Description: determine which blast algorithm to use based on settings 210 | #*************************************************************************** 211 | sub get_blast_algorithm { 212 | 213 | my ($self, $probe_type) = @_; 214 | 215 | my $blast_alg; 216 | if ($probe_type eq 'UTR') { $blast_alg = 'blastn'; } 217 | elsif ($probe_type eq 'ORF') { $blast_alg = 'blastx'; } 218 | else { die "\n\t Unknown probe type '$probe_type '\n\n"; } 219 | 220 | return $blast_alg; 221 | } 222 | 223 | #*************************************************************************** 224 | # Subroutine: get_blast_library_path 225 | # Description: get path to a reference library, based on settings 226 | #*************************************************************************** 227 | sub get_blast_library_path { 228 | 229 | my ($self, $probe_type) = @_; 230 | my $lib_path; 231 | 232 | if ($probe_type eq 'UTR') { 233 | $lib_path = $self->{blast_utr_lib_path}; 234 | unless ($lib_path) { 235 | $devtools->print_hash($self); 236 | die "\n\t NO UTR LIBRARY defined"; 237 | } 238 | } 239 | elsif ($probe_type eq 'ORF') { 240 | $lib_path = $self->{blast_orf_lib_path}; 241 | unless ($lib_path) { 242 | $devtools->print_hash($self); 243 | die "\n\t NO ORF LIBRARY defined"; 244 | } 245 | } 246 | return $lib_path; 247 | } 248 | 249 | ############################################################################ 250 | # EOF 251 | ############################################################################ 252 | -------------------------------------------------------------------------------- /modules/Base/Console.pm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | ############################################################################ 3 | # Module: Console.pm 4 | # Description: Functions for text console programs 5 | # History: Rob Gifford, November 2006: Creation 6 | ############################################################################ 7 | package Console; 8 | 9 | ############################################################################ 10 | # Import statements/packages (externally developed packages) 11 | ############################################################################ 12 | use strict; 13 | 14 | ############################################################################ 15 | # Import statements/packages (internally developed packages) 16 | ############################################################################ 17 | 18 | # Base classes 19 | use Base::FileIO; 20 | 21 | ############################################################################ 22 | # Globals 23 | ############################################################################ 24 | 25 | # Create base objects 26 | my $fileio = FileIO->new(); 27 | my $console_width = 70; # assumed width of the console 28 | 1; 29 | 30 | ############################################################################ 31 | # LIFECYCLE 32 | ############################################################################ 33 | 34 | #*************************************************************************** 35 | # Subroutine: new 36 | # Description: create new Console.pm object 37 | #*************************************************************************** 38 | sub new { 39 | 40 | my ($invocant, $parameter_ref) = @_; 41 | my $class = ref($invocant) || $invocant; 42 | 43 | # Member variables 44 | my $self = { 45 | 46 | }; 47 | 48 | bless ($self, $class); 49 | return $self; 50 | } 51 | 52 | ############################################################################ 53 | # Public Member Functions 54 | ############################################################################ 55 | 56 | #*************************************************************************** 57 | # Subroutine: refresh_console 58 | # Description: Clear the screen by sending the 'clear' command via 'system' 59 | #*************************************************************************** 60 | sub refresh { 61 | 62 | my ($self) = @_; 63 | my $command = 'clear'; 64 | system $command; 65 | } 66 | 67 | #*************************************************************************** 68 | # Subroutine: show_about_box 69 | # Description: show a formatted title box for a console application 70 | # Arguments: the program description as a series of strings: 71 | # - $title, $version, $description, $author, $contact 72 | #*************************************************************************** 73 | sub show_about_box { 74 | 75 | my ($self, $title, $version, $description, $author, $contact) = @_; 76 | 77 | my $solid_line = "\n\t" . '#' x $console_width; 78 | my $border_line = "\n\t" . '#' . (' ' x ($console_width - 2)) . "#"; 79 | 80 | # Format the text 81 | my $title_version = $title . ' ' . $version; 82 | 83 | my $f_title_version = enclose_box_text($title_version); 84 | my $f_description = enclose_box_text($description); 85 | my $f_author = enclose_box_text($author); 86 | my $f_contact = enclose_box_text($contact); 87 | 88 | # Print the box 89 | print "\n\n"; 90 | print $solid_line; 91 | print $border_line; 92 | print $f_title_version; 93 | print $f_description; 94 | print $f_author; 95 | print $f_contact; 96 | print $border_line; 97 | print $solid_line; 98 | print "\n\n"; 99 | } 100 | 101 | ############################################################################ 102 | # Getting user input via the console 103 | ############################################################################ 104 | 105 | #*************************************************************************** 106 | # Subroutine: ask_question 107 | # Description: just ask a question and return the input 108 | # Arguments: $question: the question to ask 109 | # Returns: $answer: the response of the user 110 | #*************************************************************************** 111 | sub ask_question { 112 | 113 | my ($self, $question) = @_; 114 | 115 | print "$question : "; 116 | my $answer = ; 117 | chomp $answer; 118 | return $answer; 119 | } 120 | 121 | #*************************************************************************** 122 | # Subroutine: ask_yes_no_question 123 | # Description: ask a question and accept only 'y' or 'n' as an answer 124 | # Arguments: $question: the question to ask 125 | # Returns: $answer: the integer value entered by the user 126 | #*************************************************************************** 127 | sub ask_yes_no_question { 128 | 129 | my ($self, $question) = @_; 130 | 131 | my $answer; 132 | do { 133 | print "$question " . "\(y\/n\): "; 134 | $answer = ; 135 | chomp $answer; 136 | } until ($answer eq 'n' or $answer eq 'y'); 137 | 138 | return $answer; 139 | } 140 | 141 | #*************************************************************************** 142 | # Subroutine: ask_simple_choice_question 143 | # Description: ask a question and accept only a range of possible answers 144 | # Arguments: $question: the question to ask 145 | # $choice_ref: reference to an array with the possible choices 146 | # Returns: $answer: the integer value entered by the user 147 | #*************************************************************************** 148 | sub ask_simple_choice_question { 149 | 150 | my ($self, $question, $choice_ref) = @_; 151 | 152 | # convert the choices array to a scalar 153 | my $choice_string = join('/', @$choice_ref); 154 | my $answer; 155 | my $answer_in_set = undef; 156 | do { 157 | print "$question " . "\($choice_string\): "; 158 | $answer = ; 159 | chomp $answer; 160 | foreach my $choice (@$choice_ref) { 161 | #print "\n\t '$choice' '$answer'"; 162 | if ($answer eq $choice) { 163 | $answer_in_set = 'true'; 164 | } 165 | } 166 | 167 | } until ($answer_in_set); 168 | 169 | return $answer; 170 | } 171 | 172 | #*************************************************************************** 173 | # Subroutine: ask_float_question 174 | # Description: ask a question and accept only float or int as a response 175 | # Arguments: $question: the question to ask 176 | # Returns: $answer: the float value entered by the user 177 | #*************************************************************************** 178 | sub ask_float_question { 179 | 180 | my ($self, $question) = @_; 181 | 182 | my $answer; 183 | do { 184 | print "$question : "; 185 | $answer = ; 186 | chomp $answer; 187 | } until ($answer =~ /^-?\d+\.?\d*$/); 188 | return $answer; 189 | } 190 | 191 | #*************************************************************************** 192 | # Subroutine: ask_float_with_bounds_question 193 | # Description: ask a question and accept only a float or int that falls 194 | # within a defined range as a response 195 | # Arguments: $question: the question to ask 196 | # $lower_bound, $upper_bound: the specified bounds 197 | # Returns: $answer: the value entered by the user 198 | #*************************************************************************** 199 | sub ask_float_with_bounds_question { 200 | 201 | my ($self, $question, $lower_bound, $upper_bound) = @_; 202 | 203 | my $answer; 204 | do { 205 | print "$question \($lower_bound-$upper_bound\): "; 206 | $answer = ; 207 | chomp $answer; 208 | } until ($answer >= $lower_bound 209 | and $answer <= $upper_bound 210 | and $answer =~ /^-?\d+\.?\d*$/ 211 | ); 212 | return $answer; 213 | } 214 | 215 | #*************************************************************************** 216 | # Subroutine: ask_int_question 217 | # Description: ask a question and accept only an integer as a response 218 | # Arguments: $question: the question to ask 219 | # Returns: $answer: the integer value entered by the user 220 | #*************************************************************************** 221 | sub ask_int_question { 222 | 223 | my ($self, $question) = @_; 224 | 225 | my $answer; 226 | do { 227 | print "$question : "; 228 | $answer = ; 229 | chomp $answer; 230 | } until ($answer =~ /\d/); # TODO: this isn't strict enough 231 | return $answer; 232 | } 233 | 234 | #*************************************************************************** 235 | # Subroutine: ask_int_with_bounds_question 236 | # Description: ask a question and accept only an integer that falls within 237 | # a defined range as a response 238 | # Arguments: $question: the question to ask 239 | # $lower_bound, $upper_bound: the specified bounds 240 | # Returns: $answer: the integer value entered by the user 241 | # TODO: doesn't discriminate ints and floats 242 | #*************************************************************************** 243 | sub ask_int_with_bounds_question { 244 | 245 | my ($self, $question, $lower_bound, $upper_bound) = @_; 246 | 247 | my $answer; 248 | do { 249 | print "$question \($lower_bound-$upper_bound\): "; 250 | $answer = ; 251 | chomp $answer; 252 | } until ($answer >= $lower_bound and $answer <= $upper_bound); 253 | return $answer; 254 | } 255 | 256 | #*************************************************************************** 257 | # Subroutine: ask_list_question 258 | # Description: ask user to choose an option from a numbered list 259 | # Arguments: $question: the question to ask 260 | # $list_length: the number of options in the list 261 | # Returns: $answer: the integer value entered by the user 262 | #*************************************************************************** 263 | sub ask_list_question { 264 | 265 | my ($self, $question, $list_length) = @_; 266 | 267 | my $answer; 268 | my $return = 0; # Initialize $return to a defined value 269 | do { 270 | print "$question (1-$list_length): "; 271 | $answer = ; 272 | chomp $answer; 273 | if ($answer =~ /^\d+$/) { # Check if the answer is a positive integer 274 | if ($answer >= 1 && $answer <= $list_length) { 275 | $return = 1; 276 | } else { 277 | print "Please enter a number between 1 and $list_length.\n"; 278 | } 279 | } else { 280 | print "\n\t Please enter a valid number."; 281 | } 282 | } until ($return == 1); 283 | 284 | return $answer; 285 | } 286 | 287 | 288 | #*************************************************************************** 289 | # Subroutine: do_read_tabdelim_dialogue 290 | # Description: read in a tab delimited file, capture column headers as field names 291 | #*************************************************************************** 292 | sub do_read_tabdelim_dialogue { 293 | 294 | my ($self, $path, $data_ref, $fields_array, $fields_hash) = @_; 295 | 296 | # Get database handle, die if we can't 297 | my $digs_obj = $self->{digs_obj}; 298 | 299 | my @infile; 300 | $fileio->read_file($path, \@infile); 301 | 302 | my $line_number = 0; 303 | foreach my $line (@infile) { 304 | $line_number++; 305 | if ($line =~ /^\s*$/) { next; } # discard blank line 306 | elsif ($line =~ /^\s*#/) { next; } # discard comment line 307 | unless ($line =~ /\t/) { print "\n\t Incorrect formatting at line '$line_number'"; die; } 308 | push (@$data_ref, $line); 309 | } 310 | my $data = scalar @$data_ref; 311 | unless ($data) { 312 | die "\n\t Couldn't read input file\n\n"; 313 | } 314 | 315 | # DISPLAY the column headers read from the tab-delimited file 316 | my $header_row = shift @$data_ref; 317 | my @header_row = split ("\t", $header_row); 318 | print "\n\n\t The following cleaned column headers (i.e. table fields) were obtained\n"; 319 | my $i; 320 | foreach my $element (@header_row) { 321 | chomp $element; 322 | $i++; 323 | $element =~ s/\s+/_/g; # Remove whitespace 324 | $element =~ s/-/_/g; # Remove hyphens (avoid in mysql field names) 325 | if ($element eq '') { $element = 'EMPTY_COLUMN_' . $i; } 326 | print "\n\t\t Column $i: '$element'"; 327 | push (@$fields_array, $element); 328 | $fields_hash->{$element} = "varchar"; 329 | } 330 | 331 | # Prompt user - did we read the file correctly? 332 | my $question3 = "\n\n\t Is this correct?"; 333 | my $answer3 = $self->ask_yes_no_question($question3); 334 | if ($answer3 eq 'n') { # Exit if theres a problem with the infile 335 | print "\n\t\t Aborted!\n\n\n"; exit; 336 | } 337 | } 338 | 339 | ############################################################################ 340 | # Private Member Functions 341 | ############################################################################ 342 | 343 | #*************************************************************************** 344 | # Subroutine: enclose_box_text 345 | # Description: Format text for an about box by centering it within a box 346 | #*************************************************************************** 347 | sub enclose_box_text { 348 | 349 | my ($text) = @_; 350 | 351 | my $f_text; 352 | my $left_spacing; 353 | my $right_spacing; 354 | my $text_length = length $text; 355 | 356 | if ($text_length > ($console_width - 4)) { 357 | die ("\n\t Title field was more than max length"); 358 | 359 | } 360 | else { 361 | # calculate total white space 362 | my $space = ($console_width - ($text_length + 2)); 363 | 364 | # use this value to centre text 365 | $left_spacing = $space / 2; 366 | my $adjust_for_uneven = $space % 2; 367 | $right_spacing = ($space / 2) + $adjust_for_uneven; 368 | } 369 | 370 | $f_text = "\n\t#" . (' ' x $left_spacing); 371 | $f_text .= $text; 372 | $f_text .= (' ' x $right_spacing) . "#"; 373 | 374 | return $f_text; 375 | } 376 | 377 | ############################################################################ 378 | # EOF 379 | ############################################################################ 380 | -------------------------------------------------------------------------------- /modules/Base/FileIO.pm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | ############################################################################ 3 | # Module: FileIO.pm 4 | # Description: Functions for working with ASCII text files 5 | # History: Rob Gifford, Novemeber 2006: Creation 6 | ############################################################################ 7 | package FileIO; 8 | 9 | ############################################################################ 10 | # Import statements/packages (externally developed packages) 11 | ############################################################################ 12 | use strict; 13 | 14 | ############################################################################ 15 | # Import statements/packages (internally developed packages) 16 | ############################################################################ 17 | 18 | ############################################################################ 19 | # Globals 20 | ############################################################################ 21 | 1; 22 | 23 | ############################################################################ 24 | # LIFECYCLE 25 | ############################################################################ 26 | 27 | #*************************************************************************** 28 | # Subroutine: new 29 | # Description: create a new FileIO object 30 | #*************************************************************************** 31 | sub new { 32 | 33 | my ($invocant, $parameter_ref) = @_; 34 | my $class = ref($invocant) || $invocant; 35 | 36 | # Member variables 37 | my $self = { 38 | }; 39 | 40 | bless ($self, $class); 41 | return $self; 42 | } 43 | 44 | ############################################################################ 45 | # Public Member Functions 46 | ############################################################################ 47 | 48 | #*************************************************************************** 49 | # Subroutine: get_infile_type 50 | # Description: returns the extension of an infile as a 'file_type' 51 | #*************************************************************************** 52 | sub get_infile_type { 53 | 54 | my ($self, $file) = @_; 55 | my @file_bits = split (/\./, $file); 56 | my $file_type = pop @file_bits; 57 | return $file_type; 58 | } 59 | 60 | #*************************************************************************** 61 | # Subroutine: check_directory_exists 62 | # Description: routine to check if a directory path is valid 63 | #*************************************************************************** 64 | sub check_directory_exists { 65 | 66 | my ($self, $directory) = @_; 67 | unless (opendir(DIR, $directory)) { 68 | #die "\n\t Cannot open directory \"$directory\"\n\n"; 69 | return undef; 70 | } 71 | return 1; 72 | } 73 | 74 | #*************************************************************************** 75 | # Subroutine: read_file 76 | # Description: read an input file to an array 77 | # Arguments: $file: the name of the file to read 78 | # $array_ref: array to copy to 79 | #*************************************************************************** 80 | sub read_file { 81 | 82 | my ($self, $file, $array_ref) = @_; 83 | 84 | unless (-f $file) { 85 | if (-d $file) { 86 | print "\n\t Cannot open file \"$file\" - it is a directory\n\n"; 87 | return 0; 88 | } 89 | else { 90 | print "\n\t Cannot open file \"$file\"\n\n"; 91 | return 0; 92 | } 93 | 94 | } 95 | unless (open(INFILE, "$file")) { 96 | print "\n\t Cannot open file \"$file\"\n\n"; 97 | return 0; 98 | } 99 | @$array_ref = ; 100 | close INFILE; 101 | 102 | return 1; 103 | } 104 | 105 | #*************************************************************************** 106 | # Subroutine: write_file 107 | # Description: write an array to an ouput file 108 | # Arguments: $file: the name of the file to write to 109 | # $array_ref: array to copy 110 | #*************************************************************************** 111 | sub write_file { 112 | 113 | my ($self, $file, $array_ref) = @_; 114 | unless (open(OUTFILE, ">$file")) { 115 | print "\n\t Couldn't open file \"$file\" for writing\n\n"; 116 | return 0; 117 | } 118 | print OUTFILE @$array_ref; 119 | close OUTFILE; 120 | } 121 | 122 | #*************************************************************************** 123 | # Subroutine: write_text_to_file 124 | # Description: write a formatted text string to an ouput file 125 | # Arguments: $file: the name of the file to write to 126 | # $text; the string to write out 127 | #*************************************************************************** 128 | sub write_text_to_file { 129 | 130 | my ($self, $file, $text) = @_; 131 | 132 | unless (open(OUTFILE, ">$file")) { 133 | print "\n\t Couldn't open file \"$file\" for writing\n\n"; 134 | return 0; 135 | } 136 | print OUTFILE $text; 137 | close OUTFILE; 138 | } 139 | 140 | #*************************************************************************** 141 | # Subroutine: append_text_to_file 142 | # Description: append a text string to an ouput file 143 | # Arguments: $file: the name of the file to write to 144 | # $text; the string to append 145 | #*************************************************************************** 146 | sub append_text_to_file { 147 | 148 | my ($self, $file, $text) = @_; 149 | 150 | unless (open(OUTFILE, ">>$file")) { 151 | print "\n\t Couldn't open file \"$file\" for writing\n\n"; 152 | return; 153 | } 154 | print OUTFILE $text; 155 | close OUTFILE; 156 | #print "\n\t File \"$file\" created!\n\n"; 157 | } 158 | 159 | #*************************************************************************** 160 | # Subroutine: read_standard_field_value_block 161 | # Description: read a NEXUS style block containing [field]=[value] lines 162 | # Returns: 1 if block was found and lines were read 163 | #*************************************************************************** 164 | sub read_standard_field_value_block { 165 | 166 | my ($self, $file_data_ref, $start_mark, $end_mark, $extract_ref) = @_; 167 | 168 | # Extract the block 169 | my @block_data; 170 | $self->extract_text_block($file_data_ref, \@block_data, $start_mark, $end_mark); 171 | my $block_size = scalar @block_data; 172 | unless ($block_size) { # Nothing read 173 | return 0; 174 | } 175 | 176 | # Get the field-value pairs from the block 177 | my $captured = 0; 178 | foreach my $line (@block_data) { 179 | 180 | chomp $line; 181 | if ($line =~ /^\s*$/) { next; } # discard blank line 182 | elsif ($line =~ /^\s*#/) { next; } # discard comment line 183 | 184 | my @bits = split('=', $line); 185 | my $field = $bits[0]; 186 | $field =~ s/\s+//; 187 | my $value = $bits[1]; 188 | $value =~ s/\s+//; 189 | $value =~ s/;//; 190 | if ($field and $value) { 191 | $captured++; 192 | $extract_ref->{$field} = $value; 193 | } 194 | } 195 | return $captured; 196 | } 197 | 198 | ############################################################################ 199 | # Reading from directories 200 | ############################################################################ 201 | 202 | #*************************************************************************** 203 | # Subroutine: read_directory_to_array 204 | # Description: read a directory and copy the names of all the files in it 205 | # to an array 206 | # Arguments: $directory: the path to the directory we're reading 207 | # $array_ref: array to copy to 208 | #*************************************************************************** 209 | sub read_directory_to_array { 210 | 211 | my ($self, $directory, $array_ref) = @_; 212 | 213 | unless (opendir(DIR, $directory)) { 214 | print "\n\t Cannot open directory \"$directory\"\n\n"; 215 | return; 216 | } 217 | my $file; 218 | while( defined ($file = readdir(DIR))) { 219 | # don't copy anything that starts with a '.' 220 | unless ($file =~ /^\./) { 221 | push(@$array_ref, $file); 222 | } 223 | } 224 | } 225 | 226 | #*************************************************************************** 227 | # Subroutine: read_directory_tree_leaves_simple 228 | # Description: read a directory tree, and store each 'leaf' 229 | # Arguments: $path: path to directory 230 | # $leaves: array to store 'leaves' (files) as hashes 231 | #*************************************************************************** 232 | sub read_directory_tree_leaves_simple { 233 | 234 | my ($self, $path, $leaves_ref) = @_; 235 | 236 | unless ($path and $leaves_ref) { die; } 237 | 238 | # Read in the top level directory 239 | my @directory; 240 | $self->read_directory_to_array($path, \@directory); 241 | foreach my $file (@directory) { 242 | 243 | # Create the file path 244 | my $file_path = $path . '/' . $file; 245 | if (opendir(DIR, $file_path)) { 246 | $self->recursive_read2($file_path, $leaves_ref); 247 | } 248 | else { # If its not a directory store it 249 | my %file; 250 | $file{path} = $file_path; 251 | $file{file} = $file; 252 | push (@$leaves_ref, \%file); 253 | } 254 | } 255 | } 256 | 257 | #*************************************************************************** 258 | # Subroutine: read_directory_tree_leaves 259 | # Description: read a directory tree, and store each 'leaf' 260 | # Arguments: $path: path to directory 261 | # $leaves: array to store 'leaves' (files data) as hashes 262 | # [optional] $level_codes: hash with correspondence between directory levels and values/labels/classifiers etc 263 | #*************************************************************************** 264 | sub read_directory_tree_leaves { 265 | 266 | my ($self, $path, $leaves, $level_codes, $file_code) = @_; 267 | 268 | unless ($path and $leaves) { die; } 269 | 270 | # Set thE current state 271 | my $level = 1; 272 | my $value; 273 | if ($level_codes) { 274 | $value = $level_codes->{$level}; 275 | } 276 | 277 | # Read in the top level directory 278 | my @directory; 279 | $self->read_directory_to_array($path, \@directory); 280 | foreach my $file (@directory) { 281 | my %branch_data; 282 | $branch_data{$value} = $file; 283 | my $file_path = $path . '/' . $file; 284 | if (opendir(DIR, $file_path)) { 285 | $self->recursive_read($file_path, $leaves, \%branch_data, $level, $level_codes, $file_code); 286 | } 287 | } 288 | } 289 | 290 | ############################################################################ 291 | # FASTA IO 292 | ############################################################################ 293 | 294 | #*************************************************************************** 295 | # Subroutine: read_fasta 296 | # Description: read a fasta file into an array of hashes. 297 | #*************************************************************************** 298 | sub read_fasta { 299 | 300 | my ($self, $file, $array_ref, $length_only) = @_; 301 | 302 | # Read in the file or else return 303 | unless (open(INFILE, $file)) { 304 | print "\n\t Cannot open file \"$file\"\n\n"; 305 | return undef; 306 | } 307 | 308 | # Iterate through lines in the file 309 | my @raw_fasta = ; 310 | close INFILE; 311 | my $header; 312 | my $sequence; 313 | foreach my $line (@raw_fasta) { 314 | 315 | chomp $line; 316 | if ($line =~ /^\s*$/) { next; } # discard blank line 317 | elsif ($line =~ /^\s*#/) { next; } # discard comment line 318 | elsif ($line =~ /^>/) { 319 | 320 | $line =~ s/^>//g; 321 | 322 | # new header, store any sequence held in the buffer 323 | if ($header and $sequence) { 324 | $sequence = uc $sequence; 325 | my $length = length $sequence; 326 | my %seq; 327 | $seq{header} = $header; 328 | unless ($length_only) { 329 | $seq{sequence} = $sequence; 330 | } 331 | $seq{seq_length} = $length; 332 | push(@$array_ref, \%seq); 333 | } 334 | 335 | # reset the variables 336 | $line =~ s/^>//; 337 | $header = $line; 338 | $sequence = undef; 339 | } 340 | else { 341 | # keep line, add to sequence string 342 | $sequence .= $line; 343 | } 344 | } 345 | 346 | # Before exit, store any sequence held in the buffer 347 | if ($header and $sequence) { 348 | $sequence =~ s/\s+//g; # Remove whitespace 349 | $sequence = uc $sequence; 350 | my $length = length $sequence; 351 | my %seq; 352 | $seq{header} = $header; 353 | unless ($length_only) { 354 | $seq{sequence} = $sequence; 355 | } 356 | $seq{seq_length} = $length; 357 | push(@$array_ref, \%seq); 358 | 359 | } 360 | } 361 | 362 | #*************************************************************************** 363 | # Subroutine: recursive_read2 364 | # Description: read everything under a given path 365 | #*************************************************************************** 366 | sub recursive_read2 { 367 | 368 | my ($self, $path, $leaves_ref) = @_; 369 | 370 | # Read in the top level directory 371 | my @directory; 372 | $self->read_directory_to_array($path, \@directory); 373 | 374 | # Iterate through 375 | foreach my $file (@directory) { 376 | 377 | my $file_path = $path . '/' . $file; 378 | 379 | # Recurse if's a directory 380 | if (opendir(DIR, $file_path)) { 381 | $self->recursive_read2($file_path, $leaves_ref); 382 | } 383 | else { 384 | my %file; 385 | $file{path} = $file_path; 386 | $file{file} = $file; 387 | push (@$leaves_ref, \%file); 388 | } 389 | } 390 | } 391 | 392 | #*************************************************************************** 393 | # Subroutine: recursive_read 394 | # Description: read everything under a given directory path 395 | #*************************************************************************** 396 | sub recursive_read { 397 | 398 | my ($self, $path, $leaves, $branch_data, $level, $level_codes, $file_code) = @_; 399 | 400 | # Increment level 401 | $level++; 402 | 403 | # Read in the top level directory 404 | my @directory; 405 | $self->read_directory_to_array($path, \@directory); 406 | 407 | # Get the total number of levels 408 | my @levels = keys %$level_codes; 409 | my $levels = scalar @levels; 410 | 411 | # Iterate through 412 | foreach my $file (@directory) { 413 | my $file_path = $path . '/' . $file; 414 | # Recurse if's a directory 415 | if (opendir(DIR, $file_path)) { 416 | my $value = $level_codes->{$level}; 417 | $branch_data->{$value} = $file; 418 | $self->recursive_read($file_path, $leaves, $branch_data, $level, $level_codes, $file_code); 419 | # Reset branch data 420 | delete $branch_data->{$value}; 421 | 422 | } 423 | elsif ($level eq ($levels + 1)) { 424 | unless ($file_code) { $file_code = 'file'; } 425 | $branch_data->{$file_code} = $file; 426 | my %data = %$branch_data; 427 | $data{path} = $file_path; 428 | push (@$leaves, \%data); 429 | } 430 | else { 431 | print "\n\t File '$file' is located in internal node"; 432 | } 433 | } 434 | } 435 | 436 | #*************************************************************************** 437 | # Subroutine: extract_text_block 438 | # Description: extract a block of text denoted by start and stop tokens 439 | #*************************************************************************** 440 | sub extract_text_block { 441 | 442 | my ($self, $input_ref, $block_ref, $start_token, $stop_token) = @_; 443 | 444 | # Make start and stop tokens upper case so tokens are not case sensitive 445 | $start_token = uc $start_token; 446 | $stop_token = uc $stop_token; 447 | #print "\n\t ### Stop token: $stop_token"; 448 | 449 | my $extract = undef; 450 | foreach my $line (@$input_ref) { 451 | #print "\n\t ### Start token: $start_token"; 452 | chomp $line; 453 | my $uc_line = uc $line; 454 | #print "\n\t ### LINE $uc_line;"; 455 | if ($stop_token) { 456 | if ($uc_line =~ $stop_token) { 457 | $extract = undef; 458 | } 459 | } 460 | if ($extract) { 461 | push (@$block_ref, $line); 462 | } 463 | elsif ($uc_line =~ $start_token) { 464 | $extract = 'true'; 465 | } 466 | } 467 | } 468 | 469 | #*************************************************************************** 470 | # Subroutine: create_unique_directory 471 | # Description: create a unique directory 472 | #*************************************************************************** 473 | sub create_directory { 474 | 475 | my ($self, $unique_dir) = @_; 476 | 477 | my $mkdir_cmd = "mkdir $unique_dir"; 478 | my $result = system $mkdir_cmd; 479 | if ($result > 0) { 480 | print "\n\t ### Error: couldn't create output directory using command 'mkdir $unique_dir'\n\n"; 481 | die; 482 | } 483 | } 484 | 485 | 486 | ############################################################################ 487 | # EOF 488 | ############################################################################ -------------------------------------------------------------------------------- /modules/DIGS/Consolidate.pm: -------------------------------------------------------------------------------- 1 | #!usr/bin/perl -w 2 | ############################################################################ 3 | # Module: Consolidate.pm 4 | # Description: Functions for clustering merging matches to different probes 5 | # into higher order structures 6 | # History: April 2017: Created by Robert Gifford 7 | ############################################################################ 8 | package Consolidate; 9 | 10 | ############################################################################ 11 | # Import statements/packages (externally developed packages) 12 | ############################################################################ 13 | use strict; 14 | 15 | ############################################################################ 16 | # Import statements/packages (internally developed packages) 17 | ############################################################################ 18 | 19 | # Base classes 20 | use Base::FileIO; 21 | use Base::Console; 22 | use Base::DevTools; 23 | 24 | ############################################################################ 25 | # Globals 26 | ############################################################################ 27 | 28 | # Base objects 29 | my $fileio = FileIO->new(); 30 | my $console = Console->new(); 31 | my $devtools = DevTools->new(); 32 | 33 | # Maximum range for Consolidate 34 | my $max = 100000000; 35 | 1; 36 | 37 | ############################################################################ 38 | # LIFECYCLE 39 | ############################################################################ 40 | 41 | #*************************************************************************** 42 | # Subroutine: new 43 | # Description: create new Consolidate 'object' 44 | #*************************************************************************** 45 | sub new { 46 | 47 | my ($invocant, $parameter_ref) = @_; 48 | my $class = ref($invocant) || $invocant; 49 | 50 | # Declare empty data structures 51 | my %crossmatching; 52 | 53 | # Set member variables 54 | my $self = { 55 | 56 | # Set-up params 57 | consolidate_range => $parameter_ref->{consolidate_range}, 58 | consolidate_mode => $parameter_ref->{consolidate_mode}, 59 | consolidate_settings => $parameter_ref->{consolidate_settings}, 60 | na_reference_library => $parameter_ref->{na_reference_library}, 61 | aa_reference_library => $parameter_ref->{aa_reference_library}, 62 | blast_utr_lib_path => $parameter_ref->{blast_utr_lib_path}, 63 | blast_orf_lib_path => $parameter_ref->{blast_orf_lib_path}, 64 | blast_threads => $parameter_ref->{blast_threads}, 65 | program_version => $parameter_ref->{program_version}, 66 | 67 | # Member classes 68 | db => $parameter_ref->{db}, 69 | blast_obj => $parameter_ref->{blast_obj}, 70 | 71 | # Paths used in consolidate processes 72 | genome_use_path => $parameter_ref->{genome_use_path}, 73 | output_path => $parameter_ref->{output_path}, 74 | target_groups => $parameter_ref->{target_groups}, 75 | tmp_path => $parameter_ref->{tmp_path}, 76 | 77 | # Flags 78 | verbose => $parameter_ref->{verbose}, 79 | force => $parameter_ref->{force}, 80 | }; 81 | 82 | bless ($self, $class); 83 | return $self; 84 | } 85 | 86 | ############################################################################ 87 | # CONSOLIDATION FXNS 88 | ############################################################################ 89 | 90 | #*************************************************************************** 91 | # Subroutine: consolidate_loci 92 | # Description: assemble digs_results rows into higher-order loci 93 | #*************************************************************************** 94 | sub consolidate_loci { 95 | 96 | my ($self) = @_; 97 | 98 | # Get settings 99 | my $settings_ref = $self->{consolidate_settings}; 100 | unless ($settings_ref) { die; } 101 | my $range = $settings_ref->{range}; 102 | unless ($range) { die; } 103 | my $defragment_obj = Defragment->new($self); 104 | $defragment_obj->{defragment_mode} = 'consolidate'; 105 | $defragment_obj->{defragment_range} = $range; 106 | 107 | 108 | # Get the digs results sorted by scaffold and extract start 109 | my $db = $self->{db}; 110 | my $where = $settings_ref->{where_clause}; 111 | my @sorted; 112 | $db->get_sorted_digs_results(\@sorted, $where); 113 | 114 | # Set up for consolidation 115 | my $total_loci = scalar @sorted; 116 | print "\n\t Consolidating assigned extracted sequences into loci"; 117 | print "\n\t $total_loci loci in the digs_results table prior to consolidation'"; 118 | 119 | # Compose clusters of overlapping/adjacent BLAST hits and extracted loci 120 | my %consolidated; 121 | $defragment_obj->compose_clusters(\%consolidated, \@sorted, $settings_ref); 122 | #$devtools->print_hash(\%consolidated); 123 | 124 | 125 | # Check the output 126 | my @cluster_ids = keys %consolidated; 127 | my $num_clusters = scalar @cluster_ids; 128 | my $hit_count = 0; 129 | my $cluster_count; 130 | foreach my $cluster_id (@cluster_ids) { 131 | 132 | my $cluster_array_ref = $consolidated{$cluster_id}; 133 | #$devtools->print_array($cluster_array_ref); 134 | my $cluster_first_ref = @$cluster_array_ref[0]; 135 | my $organism = $cluster_first_ref->{organism}; 136 | my $scaffold = $cluster_first_ref->{scaffold}; 137 | my $num_hits = scalar @$cluster_array_ref; 138 | if ($num_hits > 1) { 139 | print "\n\t '$num_hits' hits'\t'$organism: $scaffold'"; 140 | } 141 | $hit_count = $hit_count + $num_hits; 142 | $cluster_count++; 143 | } 144 | #$devtools->print_hash(\%consolidated); exit; 145 | print "\n\n\t HIT COUNT: $hit_count, CLUSTER COUNT: $cluster_count"; 146 | 147 | if ($total_loci > $num_clusters) { 148 | print "\n\n\t $num_clusters clusters of loci within '$range' bp of one another "; 149 | } 150 | 151 | # Update locus data based on consolidated results 152 | $self->derive_locus_table_from_clustered_digs_results(\%consolidated); 153 | 154 | # Return the number of clusters 155 | return $num_clusters; 156 | } 157 | 158 | #*************************************************************************** 159 | # Subroutine: derive_locus_table_from_clustered_digs_results 160 | # Description: compile locus information and update the locus tables 161 | #*************************************************************************** 162 | sub derive_locus_table_from_clustered_digs_results { 163 | 164 | my ($self, $consolidated_ref) = @_; 165 | 166 | unless ($consolidated_ref) { die; } 167 | 168 | # Get parameters and data structures 169 | my $verbose = $self->{verbose}; 170 | my $db_ref = $self->{db}; 171 | my $loci_table = $db_ref->{loci_table}; 172 | my $loci_chains_table = $db_ref->{loci_chains_table}; 173 | my $classify_obj = Classify->new($self); 174 | 175 | 176 | # Flags for how to handle 177 | my $reextract = 'true'; 178 | #my $reextract = undef; 179 | my $annotate_ends = 'true'; 180 | 181 | # Iterate through the clusters 182 | my $assigned_count = 0; 183 | my @cluster_ids = keys %$consolidated_ref; 184 | foreach my $cluster_id (@cluster_ids) { 185 | 186 | # Get the loci in this cluster 187 | my $cluster_ref = $consolidated_ref->{$cluster_id}; 188 | 189 | # Turn this cluster into an annotated locus 190 | my %locus; 191 | $self->derive_locus_structure(\%locus, $cluster_ref); 192 | 193 | # Extract the consolidate locus if the flag is set 194 | if ($reextract) { 195 | $self->extract_consolidated_locus(\%locus); 196 | } 197 | 198 | # Annotate extracted sequences (identify truncated ends if they occur) 199 | if ($annotate_ends) { 200 | #$self->annotate_consolidated_locus_flanks(\%locus); 201 | } 202 | 203 | # Classify the extracted locus using BLAST 204 | if ($locus{sequence}) { 205 | $locus{probe_type} = 'ORF'; 206 | $classify_obj->classify_sequence_using_blast(\%locus); 207 | my $assigned = $locus{assigned_name}; 208 | unless ($assigned) { die; } 209 | if ($assigned) { $assigned_count++; } 210 | } 211 | else { 212 | print "\n\n\t No sequence retrieved for hit in '$locus{organism}', '$locus{scaffold}'\n"; 213 | $locus{assigned_name} = 'Not extracted'; 214 | #devtools->print_hash{\%locus}; 215 | } 216 | 217 | # Insert the consolidated locus information 218 | my $locus_array = $locus{locus_array}; 219 | my $locus_structure = join('-', @$locus_array); 220 | $locus{locus_structure} = $locus_structure; 221 | 222 | # Insert the data 223 | my $locus_id = $loci_table->insert_row(\%locus); 224 | 225 | # Create the links between the loci and digs_results tables 226 | foreach my $digs_result_ref (@$cluster_ref) { 227 | my $digs_result_id = $digs_result_ref->{record_id}; 228 | my %chain_data; 229 | $chain_data{digs_result_id} = $digs_result_id; 230 | $chain_data{locus_id} = $locus_id; 231 | $loci_chains_table->insert_row(\%chain_data); 232 | } 233 | } 234 | } 235 | 236 | #*************************************************************************** 237 | # Subroutine: derive_locus_structure 238 | # Description: derive locus structure based on clustered digs results 239 | #*************************************************************************** 240 | sub derive_locus_structure { 241 | 242 | my ($self, $consolidated_ref, $cluster_ref) = @_; 243 | 244 | my $annotate_flanks = undef; 245 | my $initialised = undef; 246 | my $organism; 247 | my $version; 248 | my $target_name; 249 | my $datatype; 250 | 251 | my $assigned_name; 252 | my $last_assigned_name; 253 | 254 | my $feature; 255 | my $lowest; 256 | my $highest; 257 | my $scaffold; 258 | my $orientation; 259 | my $target_datatype; 260 | my $target_version; 261 | my @locus_structure; 262 | my $target_id; 263 | my $multiple_orientations = undef; 264 | my $last_element = undef; 265 | foreach my $element_ref (@$cluster_ref) { 266 | 267 | # Capture values from the previous iterations 268 | my $last_feature = $feature; 269 | my $last_orientation = $orientation; 270 | #my $last_scaffold = $scaffold; 271 | 272 | # Get the data about this digs_results table row 273 | my $start = $element_ref->{extract_start}; 274 | my $end = $element_ref->{extract_end}; 275 | $feature = $element_ref->{assigned_gene}; 276 | $assigned_name = $element_ref->{assigned_name}; 277 | $version = $element_ref->{target_version}; 278 | $datatype = $element_ref->{target_datatype}; 279 | $orientation = $element_ref->{orientation}; 280 | $scaffold = $element_ref->{scaffold}; 281 | $organism = $element_ref->{organism}; 282 | $target_name = $element_ref->{target_name}; 283 | unless ($feature and $orientation) { die; } # Sanity checking 284 | my $record = "$feature\[$assigned_name($orientation)\]"; 285 | #print "\n\t RECORD $record"; 286 | 287 | # Create a target key so we can extract a sequence later 288 | $organism = $element_ref->{organism}; 289 | $target_name = $element_ref->{target_name}; 290 | $target_datatype = $element_ref->{target_datatype}; 291 | $target_version = $element_ref->{target_version}; 292 | my @genome = ( $organism , $target_datatype, $target_version ); 293 | my $this_target_id = join ('|', @genome); 294 | if ($target_id) { 295 | unless ($this_target_id eq $target_id) { 296 | print "\n\t Error Target '$target_id' NE '$this_target_id'\n\n"; 297 | #die; 298 | } 299 | } 300 | $target_id = $this_target_id; 301 | 302 | # Deal with first locus in a cluster 303 | unless ($initialised) { 304 | $highest = $end; 305 | $lowest = $start; 306 | $last_element = $element_ref; 307 | $initialised = 'true'; 308 | } 309 | else { 310 | 311 | if ($start < $lowest) { 312 | $lowest = $start; 313 | } 314 | if ($end > $highest) { 315 | $highest = $end; 316 | } 317 | 318 | } 319 | 320 | push(@locus_structure, $record); 321 | 322 | $last_element = $element_ref; 323 | } 324 | 325 | # Store the data 326 | $consolidated_ref->{organism} = $organism; 327 | $consolidated_ref->{target_version} = $version; 328 | $consolidated_ref->{target_name} = $target_name; 329 | $consolidated_ref->{target_datatype} = $datatype; 330 | $consolidated_ref->{scaffold} = $scaffold; 331 | $consolidated_ref->{target_id} = $target_id; 332 | $consolidated_ref->{orientation} = $orientation; 333 | $consolidated_ref->{start} = $lowest; 334 | $consolidated_ref->{end} = $highest; 335 | $consolidated_ref->{extract_start} = $lowest; 336 | $consolidated_ref->{extract_end} = $highest; 337 | $consolidated_ref->{assigned_name} = $assigned_name; 338 | $consolidated_ref->{assigned_name} = $assigned_name; 339 | $consolidated_ref->{locus_array} = \@locus_structure; 340 | 341 | } 342 | 343 | #*************************************************************************** 344 | # Subroutine: extract_consolidated_locus 345 | # Description: extract a sequence spanning a consolidated locus set 346 | #*************************************************************************** 347 | sub extract_consolidated_locus { 348 | 349 | my ($self, $consolidated_ref) = @_; 350 | 351 | my $db_ref = $self->{db}; 352 | my $verbose = $self->{verbose}; 353 | my $blast_obj = $self->{blast_obj}; 354 | unless ($blast_obj) { die; } 355 | my $seq_len = 0; 356 | 357 | my $genome_use_path = $self->{genome_use_path}; 358 | my $target_group_ref = $self->{target_groups}; 359 | 360 | my $organism = $consolidated_ref->{organism}; 361 | my $target_version = $consolidated_ref->{target_version}; 362 | my $target_datatype = $consolidated_ref->{target_datatype}; 363 | my $target_name = $consolidated_ref->{target_name}; 364 | my $target_id = $consolidated_ref->{target_id}; 365 | my $lowest = $consolidated_ref->{start}; 366 | my $highest = $consolidated_ref->{end}; 367 | 368 | my $full_id = $target_id . '|' . $target_name; 369 | my $target_group = $target_group_ref->{$full_id}; 370 | unless ($target_group) { 371 | print " \n\t Defrag set-up error: No target group found for TARGET ID $full_id\n\n"; 372 | #$devtools->print_hash($target_group_ref); 373 | sleep 1; 374 | return 0; 375 | } 376 | 377 | # Construct the path to this target file 378 | my @path; 379 | push (@path, $genome_use_path); 380 | push (@path, $target_group); 381 | push (@path, $organism); 382 | push (@path, $target_datatype); 383 | push (@path, $target_version); 384 | push (@path, $target_name); 385 | my $target_path = join ('/', @path); 386 | 387 | # Extract the sequence 388 | #print "\n\t\t # TARGET: '$target_path'"; 389 | my $sequence = $blast_obj->extract_sequence($target_path, $consolidated_ref); 390 | my $seq_length = length $sequence; # Set sequence length 391 | if ($sequence) { 392 | 393 | # If we extracted a sequence, update the data for this locus 394 | if ($verbose) { print "\n\t\t - Re-extracted sequence: $seq_length nucleotides "; } 395 | $consolidated_ref->{sequence} = $sequence; 396 | $consolidated_ref->{sequence_length} = $seq_length; 397 | } 398 | elsif ($verbose) { 399 | print "\n\t\t # Sequence extraction failed "; 400 | } 401 | } 402 | 403 | #*************************************************************************** 404 | # Subroutine: annotate_consolidated_locus_flanks 405 | # Description: 406 | #*************************************************************************** 407 | sub annotate_consolidated_locus_flanks { 408 | 409 | my ($self, $consolidated_ref) = @_; 410 | 411 | my $db_ref = $self->{db}; 412 | my $contigs_table = $db_ref->{contigs_table}; 413 | my $lowest = $consolidated_ref->{start}; 414 | my $highest = $consolidated_ref->{end}; 415 | my $scaffold = $consolidated_ref->{scaffold}; 416 | 417 | # Get the length of this contig 418 | my %data; 419 | my @fields = qw [ contig_id seq_length ]; 420 | my $where = " WHERE contig_id = '$scaffold'"; 421 | $contigs_table->select_row(\@fields, \%data, $where); 422 | my $contig_length = $data{seq_length}; 423 | unless ($contig_length) { die; } 424 | 425 | # Check the start of the match 426 | my @locus_structure; 427 | if ($lowest eq 1) { 428 | unshift(@locus_structure, 'T'); 429 | } 430 | else { 431 | unshift(@locus_structure, 'X'); 432 | } 433 | # Check the end of the match 434 | if ($highest eq $contig_length) { 435 | push(@locus_structure, 'T'); 436 | } 437 | else { 438 | push(@locus_structure, 'X'); 439 | } 440 | } 441 | 442 | ############################################################################ 443 | # EOF 444 | ############################################################################ 445 | 446 | -------------------------------------------------------------------------------- /modules/DIGS/Initialise.pm: -------------------------------------------------------------------------------- 1 | #!usr/bin/perl -w 2 | 3 | use lib ('./modules/'); 4 | ############################################################################ 5 | # Module: Initialise.pm 6 | # Description: Functions for carrying out basic initialisation in DIGS 7 | # History: April 2017: Created by Robert Gifford 8 | ############################################################################ 9 | package Initialise; 10 | 11 | ############################################################################ 12 | # Import statements/packages (externally developed packages) 13 | ############################################################################ 14 | use strict; 15 | 16 | ############################################################################ 17 | # Import statements/packages (internally developed packages) 18 | ############################################################################ 19 | 20 | # Base classes 21 | use Base::FileIO; 22 | use Base::Console; 23 | use Base::DevTools; 24 | 25 | ############################################################################ 26 | # Globals 27 | ############################################################################ 28 | 29 | # Base objects 30 | my $fileio = FileIO->new(); 31 | my $console = Console->new(); 32 | my $devtools = DevTools->new(); 33 | 34 | # Maximum range for defragment 35 | my $maximum = 100000000; 36 | 1; 37 | 38 | ############################################################################ 39 | # LIFECYCLE 40 | ############################################################################ 41 | 42 | #*************************************************************************** 43 | # Subroutine: new 44 | # Description: create new DIGS 'object' 45 | #*************************************************************************** 46 | sub new { 47 | 48 | my ($invocant, $parameter_ref) = @_; 49 | my $class = ref($invocant) || $invocant; 50 | 51 | # Set member variables 52 | my $self = { 53 | 54 | # Global settings 55 | process_id => $parameter_ref->{process_id}, 56 | program_version => $parameter_ref->{program_version}, 57 | 58 | }; 59 | 60 | bless ($self, $class); 61 | return $self; 62 | } 63 | 64 | ############################################################################ 65 | # Main initialisation functions 66 | ############################################################################ 67 | 68 | #*************************************************************************** 69 | # Subroutine: initialise 70 | # Description: do general set-up, then hand off to option-specific set-up fxns 71 | #*************************************************************************** 72 | sub initialise { 73 | 74 | my ($self, $digs_obj, $option, $ctl_file) = @_; 75 | 76 | 77 | # READ CONTROL FILE 78 | my $loader_obj = ScreenBuilder->new($digs_obj); 79 | my $db_name = $self->parse_digs_control_file($digs_obj, $loader_obj, $option, $ctl_file); 80 | # DEV $devtools->print_hash($digs_obj); die; 81 | 82 | # LOAD/CREATE THE DATABASE 83 | $self->initialise_screening_db($digs_obj, $db_name); 84 | 85 | # SET-UP OUTPUT DIRECTORIES 86 | $self->create_output_directories($digs_obj, $loader_obj, $option); 87 | 88 | # SET-UP FOR DIGS SCREENING 89 | if ($option eq 2) { 90 | my $valid = $self->setup_for_a_digs_run($digs_obj); 91 | unless ($valid) { return 0; } 92 | } 93 | 94 | # SET-UP FOR REASSIGN 95 | if ($option eq 3) { 96 | my $force = $digs_obj->{force}; 97 | $self->setup_for_reassign($digs_obj, $force); 98 | } 99 | #print "\n\nOPTION IS '$option'\n\n\n";die; 100 | 101 | # DO SET-UP NEEDED FOR BOTH DEFRAGMENT & CONSOLIDATE 102 | if ($option eq 4 or $option eq 5) { 103 | $self->setup_for_defrag_or_consolidate($digs_obj, $option); 104 | } 105 | 106 | return 1; 107 | } 108 | 109 | #*************************************************************************** 110 | # Subroutine: parse_digs_control_file 111 | # Description: parse control file and extract DIGS run parameters 112 | #*************************************************************************** 113 | sub parse_digs_control_file { 114 | 115 | my ($self, $digs_obj, $loader_obj, $option, $ctl_file) = @_; 116 | 117 | # Try opening control file 118 | my @ctl_file; 119 | my $valid = $fileio->read_file($ctl_file, \@ctl_file); 120 | unless ($valid) { # Exit if we can't open the file 121 | die "\n\t ### Couldn't open control file '$ctl_file'\n\n\n "; 122 | } 123 | 124 | # Parse the control file 125 | $digs_obj->{ctl_file} = $ctl_file; 126 | $loader_obj->parse_control_file($ctl_file, $digs_obj, $option); 127 | 128 | # Return the database name as obtained from the control file 129 | my $db_name = $loader_obj->{db_name}; 130 | unless ($db_name) { die "\n\t Error: no DB name defined \n\n\n"; } # Sanity check 131 | return $db_name; 132 | } 133 | 134 | 135 | #*************************************************************************** 136 | # Subroutine: create output directories 137 | # Description: create a unique 'report' directory for this process 138 | #*************************************************************************** 139 | sub create_output_directories { 140 | 141 | my ($self, $digs_obj, $loader_obj, $option) = @_; 142 | 143 | # Create the output directories if running a screen or re-assigning results table 144 | if ($option >= 2 and $option <=5) { # Need output directory for options 2-5 145 | 146 | # Create a unique ID and report directory for this run 147 | my $process_id = $digs_obj->{process_id}; 148 | my $output_path = $digs_obj->{output_path}; 149 | unless ($process_id) { die; } 150 | unless ($output_path) { die; } 151 | 152 | # $devtools->print_hash($digs_obj); die; 153 | my $report_dir = $output_path . 'result_set_' . $process_id; 154 | $fileio->create_directory($report_dir); 155 | $digs_obj->{report_dir} = $report_dir . '/'; 156 | print "\n\t Created report directory"; 157 | print "\n\t Path: '$report_dir'"; 158 | 159 | # Create the tmp directory inside the report directory 160 | my $tmp_path = $report_dir . '/tmp'; 161 | $fileio->create_directory($tmp_path); 162 | $digs_obj->{tmp_path} = $tmp_path; 163 | 164 | # Create log file 165 | my $log_file = $report_dir . "/log.txt"; 166 | $fileio->append_text_to_file($log_file, "DIGS process $process_id\n"); 167 | $digs_obj->{log_file} = $log_file; 168 | 169 | # Add tore the ScreenBuilder object 170 | $loader_obj->{report_dir} = $digs_obj->{report_dir}; 171 | $loader_obj->{tmp_path} = $digs_obj->{tmp_path}; 172 | $digs_obj->{loader_obj} = $loader_obj; 173 | 174 | } 175 | 176 | } 177 | 178 | #*************************************************************************** 179 | # Subroutine: initialise_screening_db 180 | # Description: load a DIGS screening database (create if doesn't exist) 181 | #*************************************************************************** 182 | sub initialise_screening_db { 183 | 184 | my ($self, $digs_obj, $db_name) = @_; 185 | 186 | # Create the screening DB object 187 | my $db_obj = ScreeningDB->new($digs_obj); 188 | 189 | # Check if this screening DB exists, if not then create it 190 | my $db_exists = $db_obj->does_db_exist($db_name); 191 | unless ($db_exists) { 192 | $db_obj->create_screening_db($db_name); 193 | } 194 | 195 | # Load map with table names into screening database 196 | print "\n\n\t Connecting to DB: $db_name"; 197 | $db_obj->load_screening_db($db_name); 198 | $digs_obj->{db} = $db_obj; # Store the database object reference 199 | } 200 | 201 | #*************************************************************************** 202 | # Subroutine: setup_for_a_digs_run 203 | # Description: prepare database and DIGS query list prior to screening 204 | #*************************************************************************** 205 | sub setup_for_a_digs_run { 206 | 207 | my ($self, $digs_obj) = @_; 208 | 209 | # Flush active set 210 | my $db = $digs_obj->{db}; 211 | my $loader_obj = $digs_obj->{loader_obj}; 212 | unless ($loader_obj) { die; } # Sanity checking 213 | unless ($db) { die "\n\t Error: no DB defined \n\n\n"; } 214 | 215 | #print "\n\t Flushing 'active_set' table\n"; 216 | my $active_set_table = $db->{active_set_table}; 217 | $active_set_table->flush(); 218 | 219 | # Index previously executed searches 220 | my %done; 221 | $self->index_previously_executed_searches($digs_obj, \%done); 222 | $loader_obj->{previously_executed_searches} = \%done; 223 | 224 | # Finally, set up the screen 225 | my %queries; 226 | my $total_queries = $loader_obj->setup_screen($digs_obj, \%queries); 227 | unless ($total_queries) { 228 | print "\n\t Exiting DIGS setup"; 229 | return 0 230 | } 231 | 232 | # Record queries 233 | $digs_obj->{queries} = \%queries; 234 | $digs_obj->{total_queries} = $total_queries; 235 | $digs_obj->{defragment_mode} = 'defragment'; 236 | 237 | return 1; 238 | } 239 | 240 | #*************************************************************************** 241 | # Subroutine: index_previously_executed_searches 242 | # Description: index BLAST searches that have previously been executed 243 | #*************************************************************************** 244 | sub index_previously_executed_searches { 245 | 246 | my ($self, $digs_obj, $done_ref) = @_; 247 | 248 | my $db = $digs_obj->{db}; 249 | my $searches_table = $db->{searches_table}; 250 | unless ($searches_table) { die "\n\t Searches_performed table not loaded\n\n"; } 251 | my @data; 252 | my @fields = qw [ record_id 253 | probe_name probe_gene 254 | organism target_datatype target_version target_name ]; 255 | my $where = " ORDER BY record_id "; 256 | $searches_table->select_rows(\@fields, \@data, $where); 257 | 258 | # Index the executed searches 259 | foreach my $data_ref (@data) { 260 | 261 | # Get the query parameters 262 | my $organism = $data_ref->{organism}; 263 | my $target_datatype = $data_ref->{target_datatype}; 264 | my $version = $data_ref->{target_version}; 265 | my $target_name = $data_ref->{target_name}; 266 | my $probe_name = $data_ref->{probe_name}; 267 | my $probe_gene = $data_ref->{probe_gene}; 268 | 269 | # Sanity checking 270 | unless ( $organism ) { die; } 271 | unless ( $target_datatype ) { die; } 272 | unless ( $version ) { die; } 273 | unless ( $target_name ) { die; } 274 | unless ( $probe_name ) { die; } 275 | unless ( $probe_gene ) { die; } 276 | 277 | # Create the unique key for this search 278 | my @genome = ( $organism , $target_datatype, $version ); 279 | my $target_id = join ('|', @genome); 280 | my $probe_id = $probe_name . '_' . $probe_gene; 281 | my @key = ( $target_id, $target_name, $probe_id ); 282 | my $key = join ('|', @key); 283 | 284 | # Record the query, indexed by it's unique key 285 | $done_ref->{$key} = $data_ref; 286 | } 287 | 288 | #$devtools->print_hash($done_ref); die; # DEBUG 289 | } 290 | 291 | #*************************************************************************** 292 | # Subroutine: setup_for_defrag_or_consolidate 293 | # Description: do general set up for a defragment or consolidate process 294 | #*************************************************************************** 295 | sub setup_for_defrag_or_consolidate { 296 | 297 | my ($self, $digs_obj, $option) = @_; 298 | 299 | #my $loader_obj = $self->{loader_obj}; 300 | #$devtools->print_hash($digs_obj); die; 301 | my $loader_obj = ScreenBuilder->new($digs_obj); 302 | 303 | if ($option eq 4 or $option eq 5) { 304 | 305 | # Set target sequence files for screening 306 | my %targets; 307 | $loader_obj->{target_paths} = $digs_obj->{target_paths}; 308 | my $num_targets = $loader_obj->set_targets(\%targets); 309 | 310 | # Show error and exit if no targets found 311 | unless ($num_targets) { 312 | $loader_obj->show_no_targets_found_error(); 313 | } 314 | 315 | # Get the 'group' field for each target file (part of the path to the file) 316 | my %target_groups; 317 | $loader_obj->set_target_groups(\%targets, \%target_groups); 318 | #$devtools->print_hash(\%target_groups); die; # DEBUG 319 | 320 | $digs_obj->{target_groups} = \%target_groups; 321 | } 322 | 323 | # DO SET-UP NEEDED FOR DEFRAGMENT ONLY 324 | if ($option eq 4) { 325 | 326 | $digs_obj->{defragment_mode} = 'defragment'; 327 | #$devtools->print_hash($digs_obj); die; 328 | 329 | # Get the target list 330 | my $db = $digs_obj->{db}; 331 | my $digs_results_table = $db->{digs_results_table}; 332 | my @fields = qw [ organism target_datatype target_version target_name ]; 333 | my @targets; 334 | $digs_results_table->select_distinct(\@fields, \@targets); 335 | 336 | # Settings for clustering 337 | my %settings; 338 | $settings{total_loci} = '0'; 339 | $settings{total_clusters} = '0'; 340 | $settings{range} = undef; 341 | $settings{reextract} = undef; 342 | $settings{start} = 'extract_start'; 343 | $settings{end} = 'extract_end'; 344 | $settings{targets} = \@targets; 345 | $digs_obj->{defragment_settings} = \%settings; 346 | 347 | my $force = 'true'; # Prevents console prompting for a WHERE clause 348 | $self->setup_for_reassign($digs_obj, $force); 349 | 350 | # Set up the reference library 351 | $loader_obj->setup_reference_libraries($digs_obj); 352 | 353 | } 354 | 355 | # DO SET-UP NEEDED FOR CONSOLIDATE ONLY 356 | elsif ($option eq 5) { 357 | $self->set_up_consolidate_tables($digs_obj); 358 | $digs_obj->{defragment_mode} = 'consolidate'; 359 | #$devtools->print_hash($digs_obj); die; 360 | 361 | # Get the parameters for consolidation 362 | my $c_range = $digs_obj->{consolidate_range}; 363 | my $d_range = $digs_obj->{defragment_range}; 364 | my $consolidate_refseq_library; 365 | unless ($d_range) { $d_range = '0'; } # Default defragment setting 366 | 367 | unless ($c_range) { 368 | print "\n\n\t # Control file parameter 'consolidate_range' is not set"; 369 | my $question1 = "\n\t # Please set the nucleotide length range for consolidating digs results"; 370 | $c_range = $console->ask_int_with_bounds_question($question1, $d_range, $maximum); 371 | } 372 | 373 | # Set the parameters for consolidation 374 | my %consolidate_settings; 375 | $consolidate_settings{range} = $c_range; 376 | $consolidate_settings{start} = 'extract_start'; 377 | $consolidate_settings{end} = 'extract_end'; 378 | $consolidate_settings{where_clause} = ''; 379 | $digs_obj->{consolidate_settings} = \%consolidate_settings; 380 | 381 | # Set up the reference library 382 | $loader_obj->setup_reference_libraries($digs_obj, 'consolidate'); 383 | 384 | } 385 | } 386 | 387 | #*************************************************************************** 388 | # Subroutine: setup_for_reassign 389 | # Description: do general set up for a reassign process 390 | #*************************************************************************** 391 | sub setup_for_reassign { 392 | 393 | my ($self, $digs_obj, $force) = @_; 394 | 395 | my $loader_obj = $digs_obj->{loader_obj}; 396 | 397 | # Set up the reference library 398 | $loader_obj->setup_reference_libraries($digs_obj); 399 | # DEV $devtools->print_hash($digs_obj); die; 400 | 401 | my $where = ''; 402 | unless ($force) { 403 | # Option to enter a WHERE statement 404 | my $question = "\n\n\t Enter a WHERE statement to limit reaasign (Optional)"; 405 | $where = $console->ask_question($question); 406 | } 407 | 408 | # Get database tables 409 | my @reassign_loci; 410 | my $db = $digs_obj->{db}; 411 | my $digs_results_table = $db->{digs_results_table}; 412 | 413 | # Set the fields to get values for 414 | my @fields = qw [ record_id assigned_name assigned_gene 415 | probe_type sequence ]; 416 | 417 | # Get the assigned digs_results 418 | $digs_results_table->select_rows(\@fields, \@reassign_loci, $where); 419 | $digs_obj->{reassign_loci} = \@reassign_loci; 420 | 421 | 422 | } 423 | 424 | 425 | #*************************************************************************** 426 | # Subroutine: set_up_consolidate_tables 427 | # Description: create the database tables to store consolidate results 428 | #*************************************************************************** 429 | sub set_up_consolidate_tables { 430 | 431 | my ($self, $digs_obj) = @_; 432 | 433 | # Create tables if they don't exist already 434 | my $db_ref = $digs_obj->{db}; 435 | my $dbh = $db_ref->{dbh}; 436 | my $loci_exists = $db_ref->does_table_exist('loci'); 437 | unless ($loci_exists) { 438 | $db_ref->create_loci_table($dbh); 439 | } 440 | my $loci_chains_exists = $db_ref->does_table_exist('loci_chains'); 441 | unless ($loci_chains_exists) { 442 | $db_ref->create_loci_chains_table($dbh); 443 | } 444 | 445 | # Load tables 446 | $db_ref->load_loci_table($dbh); 447 | $db_ref->load_loci_chains_table($dbh); 448 | 449 | # Get table references and set up for this consolidation process 450 | my $loci_table = $db_ref->{loci_table}; 451 | my $loci_chains_table = $db_ref->{loci_chains_table}; 452 | my $contigs_table = $db_ref->{contigs_table}; 453 | $loci_table->flush(); 454 | $loci_chains_table->flush(); 455 | } 456 | 457 | ############################################################################ 458 | # EOF 459 | ############################################################################ 460 | 461 | -------------------------------------------------------------------------------- /modules/DIGS/Utility.pm: -------------------------------------------------------------------------------- 1 | #!usr/bin/perl -w 2 | ############################################################################ 3 | # Module: Utility.pm 4 | # Description: DIGS tool utility functions 5 | # History: December 2017: Created by Robert Gifford 6 | ############################################################################ 7 | package Utility; 8 | 9 | ############################################################################ 10 | # Import statements/packages (externally developed packages) 11 | ############################################################################ 12 | use strict; 13 | 14 | ############################################################################ 15 | # Import statements/packages (internally developed packages) 16 | ############################################################################ 17 | 18 | # Base classes 19 | use Base::FileIO; 20 | use Base::Console; 21 | use Base::DevTools; 22 | 23 | # Program components 24 | use DIGS::ScreenBuilder; # Functions to set up screen 25 | 26 | ############################################################################ 27 | # Globals 28 | ############################################################################ 29 | 30 | # Base objects 31 | my $fileio = FileIO->new(); 32 | my $console = Console->new(); 33 | my $devtools = DevTools->new(); 34 | 35 | # Program components 36 | use DIGS::Initialise; # Initialises the DIGS tool 37 | 1; 38 | 39 | ############################################################################ 40 | # LIFECYCLE 41 | ############################################################################ 42 | 43 | #*************************************************************************** 44 | # Subroutine: new 45 | # Description: create new Utility 'object' 46 | #*************************************************************************** 47 | sub new { 48 | 49 | my ($invocant, $digs_obj) = @_; 50 | my $class = ref($invocant) || $invocant; 51 | 52 | # Set member variables 53 | my $self = { 54 | 55 | # Global settings 56 | process_id => $digs_obj->{process_id}, 57 | program_version => $digs_obj->{program_version}, 58 | 59 | # Member classes 60 | digs_obj => $digs_obj, 61 | blast_obj => $digs_obj->{blast_obj}, 62 | 63 | # MySQL database connection parameters 64 | mysql_username => $digs_obj->{mysql_username}, 65 | mysql_password => $digs_obj->{mysql_password}, 66 | genome_use_path => $digs_obj->{genome_use_path}, 67 | db_name => '', # Obtained from control file or user 68 | mysql_server => '', # Obtained from control file or user 69 | 70 | }; 71 | 72 | bless ($self, $class); 73 | return $self; 74 | } 75 | 76 | ############################################################################ 77 | # TOP LEVEL HANDLER 78 | ############################################################################ 79 | 80 | #*************************************************************************** 81 | # Subroutine: run_utility_process 82 | # Description: top-level handler for DIGS tool utility functions 83 | #*************************************************************************** 84 | sub run_utility_process { 85 | 86 | my ($self, $infile, $database, $genomes, $utility) = @_; 87 | 88 | # Show title 89 | my $digs_obj = $self->{digs_obj}; 90 | $digs_obj->show_title(); 91 | 92 | if ($genomes) { # Run a target database summary process 93 | $self->run_target_db_summary_process($genomes); 94 | } 95 | elsif ($database) { # Run a target database summary process 96 | 97 | if ($database eq 6) { 98 | $self->extract_track_sequences($infile); 99 | } 100 | else { 101 | 102 | # Use a control file to connect to database 103 | if ($infile) { 104 | $self->parse_ctl_file_and_connect_to_db($infile); 105 | } 106 | else { 107 | print "\n\t No infile was supplied: enter parameters below\n\n"; 108 | $self->do_load_db_dialogue(); 109 | } 110 | if ($database) { 111 | $self->run_screening_db_management_process($database); 112 | } 113 | } 114 | } 115 | else { 116 | die; 117 | } 118 | } 119 | 120 | ############################################################################ 121 | # SECOND LEVEL HANDLERS FOR UTILITY FUNCTIONS 122 | ############################################################################ 123 | 124 | #*************************************************************************** 125 | # Subroutine: run_target_db_summary_process 126 | # Description: handler for functions summarising target databases 127 | #*************************************************************************** 128 | sub run_target_db_summary_process { 129 | 130 | my ($self, $option) = @_; 131 | 132 | my $digs_obj = $self->{digs_obj}; 133 | 134 | if ($option eq 1) { # Summarise target genome directory (short) 135 | my $target_db_obj = TargetDB->new($digs_obj); 136 | $target_db_obj->summarise_targets_short(); 137 | } 138 | elsif ($option eq 2) { # Summarise target genome directory (long) 139 | my $target_db_obj = TargetDB->new($digs_obj); 140 | $target_db_obj->summarise_targets_long(); 141 | } 142 | else { 143 | print "\n\t Unrecognized option '-g=$option'\n"; 144 | } 145 | } 146 | 147 | #*************************************************************************** 148 | # Subroutine: run_screening_db_management_process 149 | # Description: handler for DIGS screening database utility fxns 150 | #*************************************************************************** 151 | sub run_screening_db_management_process { 152 | 153 | my ($self, $option, $ctl_file) = @_; 154 | 155 | my $digs_obj = $self->{digs_obj}; 156 | 157 | # Hand off to functions 158 | if ($option eq 1) { # Manage ancillary tables in a screening DB 159 | $self->import_data(); 160 | } 161 | elsif ($option eq 2) { # Flush screening DB 162 | my $db = $digs_obj->{db}; 163 | my $db_name = $db->{db_name}; 164 | my $question = "\n\n\t Are you sure you want to flush data in the $db_name database?"; 165 | my $answer1 = $console->ask_yes_no_question($question); # Ask to make sure 166 | if ($answer1 eq 'y') { $db->flush_screening_db(); } 167 | } 168 | elsif ($option eq 3) { # Drop tables 169 | my $db = $digs_obj->{db}; 170 | my $db_name = $db->{db_name}; 171 | $self->drop_db_table(); 172 | } 173 | elsif ($option eq 4) { # Drop screening DB 174 | my $db = $digs_obj->{db}; 175 | $db->drop_screening_db(); 176 | } 177 | elsif ($option eq 5) { 178 | $self->append_to_digs_results($ctl_file); 179 | } 180 | else { 181 | print "\n\t Unrecognized option '-d=$option'\n"; 182 | } 183 | } 184 | 185 | ############################################################################ 186 | # DATABASE MANAGEMENT FUNCTIONS 187 | ############################################################################ 188 | 189 | #*************************************************************************** 190 | # Subroutine: import_data 191 | # Description: import data to DIGS screening DB tables 192 | #*************************************************************************** 193 | sub import_data { 194 | 195 | my ($self) = @_; 196 | 197 | # Get database handle, die if we can't 198 | my $digs_obj = $self->{digs_obj}; 199 | 200 | my $db = $digs_obj->{db}; 201 | unless ($db) { die; } 202 | my $dbh = $db->{dbh}; 203 | unless ($dbh) { die "\n\t Couldn't retrieve database handle \n\n"; } 204 | 205 | # Get the file path 206 | print "\n\n\t #### WARNING: This function expects a tab-delimited data table with column headers!"; 207 | my $question = "\n\n\t Please enter the path to the file with the table data and column headings\n\n\t"; 208 | my $infile = $console->ask_question($question); 209 | unless ($infile) { die; } 210 | 211 | # Read in the data from a tab delimited file 212 | my @data; 213 | my %fields; 214 | my @fields; 215 | $console->do_read_tabdelim_dialogue($infile, \@data, \@fields, \%fields); 216 | #$devtools->print_array(\@data); die; 217 | 218 | # Get a reference to a table object for the ancillary table 219 | my $anc_table = $db->do_ancillary_table_dialogue(\@fields, \%fields); 220 | #$devtools->print_hash(\%fields); die; 221 | #$devtools->print_array(\@fields); die; 222 | 223 | # Insert the data 224 | print "\n\n\t #### IMPORTING to table '$anc_table'"; 225 | my $verbose = $self->{verbose}; # Get 'verbose' flag setting 226 | my $row_count = $db->import_data_to_ancillary_table($anc_table, \@data, \@fields, \%fields, $verbose); 227 | 228 | } 229 | 230 | #*************************************************************************** 231 | # Subroutine: drop_db_table 232 | # Description: drop a DIGS screening DB table 233 | #*************************************************************************** 234 | sub drop_db_table { 235 | 236 | my ($self) = @_; 237 | 238 | # Get database handle, die if we can't 239 | my $digs_obj = $self->{digs_obj}; 240 | 241 | my $db = $digs_obj->{db}; 242 | my $db_name = $db->{db_name}; 243 | 244 | unless ($db) { die; } 245 | my $dbh = $db->{dbh}; 246 | unless ($dbh) { die "\n\t Couldn't retrieve database handle \n\n"; } 247 | 248 | # Get the ancillary tables in this DB 249 | my @tables; 250 | my %tables; 251 | $db->get_ancillary_table_names(\@tables); 252 | my $num_choices = scalar @tables; 253 | if ($num_choices) { 254 | 255 | print "\n\n\t # Drop ancillary tables in DIGS screening DB '$db_name'\n"; 256 | my $table_num = 0; 257 | foreach my $table_name (@tables) { 258 | $table_num++; 259 | $tables{$table_num} = $table_name; 260 | print "\n\t\t Table $table_num: '$table_name'"; 261 | } 262 | my @table_choices = sort keys %tables; 263 | 264 | my $question = "\n\n\t Apply to which of the above tables?"; 265 | my $answer = $console->ask_list_question($question, $num_choices); 266 | my $table_to_drop = $tables{$answer}; 267 | unless ($table_to_drop) { die; } 268 | 269 | $db->drop_ancillary_table($table_to_drop); 270 | } 271 | else { 272 | print "\n\n\t # There are no ancillary tables in DIGS screening DB '$db_name'\n"; 273 | } 274 | 275 | } 276 | 277 | #*************************************************************************** 278 | # Subroutine: append_to_digs_results 279 | # Description: append to digs_result table 280 | #*************************************************************************** 281 | sub append_to_digs_results { 282 | 283 | my ($self) = @_; 284 | 285 | # Get database handle, die if we can't 286 | my $digs_obj = $self->{digs_obj}; 287 | 288 | my $db = $digs_obj->{db}; 289 | unless ($db) { die; } 290 | 291 | # Get the file path 292 | print "\n\n\t #### WARNING: This function expects a tab-delimited data table with column headers!"; 293 | my $question = "\n\n\t Please enter the path to the file with the table data and column headings\n\n\t"; 294 | my $infile = $console->ask_question($question); 295 | unless ($infile) { die; } 296 | 297 | # Insert the data 298 | print "\n\n\t #### IMPORTING '$infile' to digs_results table"; 299 | $db->import_data_to_digs_results($infile) 300 | 301 | } 302 | 303 | ############################################################################ 304 | # DATA FUNCTIONS 305 | ############################################################################ 306 | 307 | #*************************************************************************** 308 | # Subroutine: extract_track_sequences 309 | # Description: extract FASTA nucs from a genome assembly using an input track 310 | #*************************************************************************** 311 | sub extract_track_sequences { 312 | 313 | my ($self, $infile) = @_; 314 | 315 | # Index genomes by key ( organism | type | version ) 316 | my $digs_obj = $self->{digs_obj}; 317 | my $genome_use_path = $digs_obj->{genome_use_path}; 318 | unless ($genome_use_path) { die "\n\t Path to genomes is not set\n\n\n"; } 319 | my $target_db_obj = TargetDB->new($digs_obj); 320 | my %server_data; 321 | $target_db_obj->read_target_directory(\%server_data); 322 | #$devtools->print_hash(\%server_data); 323 | #die; 324 | 325 | # Read infile 326 | my @infile; 327 | $fileio->read_file($infile, \@infile); 328 | 329 | # Get the header row 330 | my $header_row = shift @infile; 331 | my %fields; 332 | my %params; 333 | $self->check_input_and_set_flank_params($header_row, \%params, \%fields); 334 | 335 | # Iterate through the tracks extracting 336 | my @fasta; 337 | foreach my $line (@infile) { 338 | 339 | chomp $line; # remove newline 340 | #print "\n\t ## LINE: $line"; 341 | 342 | # Index the line properties, using field_names as keys 343 | my %indexed; 344 | $self->create_indexed_line($line, $header_row, \%fields, \%indexed); 345 | 346 | # Get the correct path to the indexed target file 347 | my $target_path = $self->create_target_path(\%server_data, \%indexed); 348 | $params{target_path} = $target_path; 349 | 350 | # Now do the extraction 351 | if ($target_path) { 352 | my $digs_fasta = $self->extract_seq_and_flanks(\%params, \%indexed); 353 | push (@fasta, $digs_fasta); 354 | } 355 | } 356 | 357 | my $outfile = 'extracted.DIGS.fna'; 358 | $fileio->write_file($outfile, \@fasta); 359 | 360 | } 361 | 362 | #*************************************************************************** 363 | # Subroutine: check_input_and_set_flank_params 364 | # Description: 365 | #*************************************************************************** 366 | sub check_input_and_set_flank_params { 367 | 368 | my ($self, $header_row, $params_ref, $fields_ref) = @_; 369 | 370 | my @header_row = split ("\t", $header_row); 371 | print "\n\n\t The following column headers (i.e. table fields) were obtained\n"; 372 | my $i = '0'; 373 | foreach my $element (@header_row) { 374 | chomp $element; 375 | $element =~ s/\s+/_/g; 376 | print "\n\t\t Column $i: '$element'"; 377 | $fields_ref->{$element} = $i; 378 | $i++; 379 | } 380 | print "\n\n\t\t CHECK COLUMN HEADINGS LOOK CORRECT!\n"; sleep 1; 381 | 382 | my $question3 = "\n\t Set 3' flank"; 383 | my $question5 = "\n\t Set 5' flank"; 384 | my $flank3 = $console->ask_int_with_bounds_question($question3, '0', 100000); 385 | my $flank5 = $console->ask_int_with_bounds_question($question5, '0', 100000); 386 | $params_ref->{flank3} = $flank3; 387 | $params_ref->{flank5} = $flank5; 388 | 389 | } 390 | 391 | #*************************************************************************** 392 | # Subroutine: extract_seq_and_flanks 393 | # Description: 394 | #*************************************************************************** 395 | sub extract_seq_and_flanks { 396 | 397 | my ($self, $params_ref, $indexed_ref) = @_; 398 | 399 | my $blast_obj = $self->{blast_obj}; 400 | my $target_path = $params_ref->{'target_path'}; 401 | my $flank3 = $params_ref->{'flank3'}; 402 | my $flank5 = $params_ref->{'flank5'}; 403 | my $organism = $indexed_ref->{'organism'}; 404 | my $scaffold = $indexed_ref->{'scaffold'}; 405 | my $orientation = $indexed_ref->{'orientation'}; 406 | my $start = $indexed_ref->{'extract_start'}; 407 | my $end = $indexed_ref->{'extract_end'}; 408 | my $record_id = $indexed_ref->{'record_ID'}; 409 | my $virus_genus = $indexed_ref->{'virus_genus'}; 410 | 411 | my $truncated3; 412 | my $truncated5; 413 | my $extract_start; 414 | my $extract_end; 415 | if ($orientation eq '+') { 416 | my $adjust_start = $start - $flank5; 417 | if ($adjust_start < 1) { 418 | print "\n\t\t\t 5' TRUNCATED!!!"; 419 | $extract_start = 1; 420 | $truncated5 = abs($adjust_start); 421 | } 422 | else { 423 | $extract_start = $adjust_start; 424 | } 425 | $extract_end = $end + $flank3; 426 | } 427 | elsif ($orientation eq '-') { 428 | my $adjust_start = $start - $flank3; 429 | if ($adjust_start < 1) { 430 | print "\n\t\t\t 5' TRUNCATED!!!"; 431 | $extract_start = 1; 432 | $truncated3 = abs($adjust_start); 433 | } 434 | else { 435 | $extract_start = $adjust_start; 436 | } 437 | $extract_end = $end + $flank5; 438 | } 439 | 440 | # Adjust the start coordinates according to NYT 441 | my %data; 442 | $data{scaffold} = $scaffold; 443 | $data{orientation} = $orientation; # $orientation; 444 | $data{start} = $extract_start; 445 | $data{end} = $extract_end; 446 | my $digs_fasta = ''; 447 | 448 | my $expected_length = ($extract_end - $extract_start) + 1; 449 | my $sequence = $blast_obj->extract_sequence($target_path, \%data); 450 | my $extracted_length = length $sequence; 451 | if ($expected_length > $extracted_length) { 452 | 453 | print "\n\t\t\t 3' TRUNCATED!!!"; 454 | #print "\n\t\t\t Expected length: '$expected_length', extracted length: '$extracted_length'"; 455 | my $length_missing = ($expected_length - $extracted_length) + 1; 456 | $truncated5 = abs($length_missing); 457 | } 458 | 459 | unless ($sequence) { 460 | print "\n\t Sequence extraction failed for record"; 461 | sleep 1; 462 | } 463 | else { 464 | my $header = $organism . ',' . $scaffold . ',' . $extract_start . ',' . $extracted_length; 465 | if ($truncated5) { 466 | $header .= "-T5($truncated5)"; 467 | } 468 | if ($truncated3) { 469 | $header .= "-T5(0)-T3($truncated3)" . $extracted_length; 470 | } 471 | print "\n\t\t Got sequence for $header"; 472 | $digs_fasta = ">$header" . "\n$sequence\n"; 473 | } 474 | return $digs_fasta; 475 | 476 | } 477 | 478 | #*************************************************************************** 479 | # Subroutine: create_indexed_line 480 | # Description: 481 | #*************************************************************************** 482 | sub create_indexed_line { 483 | 484 | my ($self, $property_row, $field_names_row, $field_name_index, $indexed_ref) = @_; 485 | 486 | my @field_names_row = split ("\t", $field_names_row); 487 | my $field_i = '0'; 488 | my @line = split("\t", $property_row); 489 | foreach my $field (@field_names_row) { 490 | chomp $field; 491 | $field =~ s/\s+/_/g; 492 | my $property_i = $field_name_index->{$field}; 493 | my $property = $line[$property_i]; 494 | $indexed_ref->{$field} = $property; 495 | $field_i++; 496 | #print "\n\t ### Index: $field_i | $property_i: FIELD '$field' = '$property'"; 497 | } 498 | } 499 | 500 | #*************************************************************************** 501 | # Subroutine: create_target_path 502 | # Description: 503 | #*************************************************************************** 504 | sub create_target_path { 505 | 506 | my ($self, $server_data, $indexed_ref) = @_; 507 | 508 | my $digs_obj = $self->{digs_obj}; 509 | my $genome_use_path = $digs_obj->{genome_use_path}; 510 | unless ($genome_use_path) { die "\n\t Path to genomes is not set\n\n\n"; } 511 | my $target_path = ''; 512 | 513 | #$devtools->print_hash($genome_data); 514 | my $organism = $indexed_ref->{'organism'}; 515 | my $type = $indexed_ref->{'target_datatype'}; 516 | my $version = $indexed_ref->{'target_version'}; 517 | my $target_name = $indexed_ref->{'target_name'}; 518 | my $scaffold = $indexed_ref->{'scaffold'}; 519 | my $orientation = $indexed_ref->{'orientation'}; 520 | 521 | # Get the top level categorisation 522 | my $key = $organism . '|' . $type . '|' . $version; # Create the key 523 | my $genome_data = $server_data->{$key}; 524 | unless ($genome_data) { 525 | print "\n\t\t\t Skipping row '$key': missing link data"; 526 | sleep 1; 527 | return $target_path; 528 | } 529 | my $group = $genome_data->{grouping}; 530 | unless ($group) { 531 | print "\n\t\t\t Skipping row - missing genome path data (group)"; 532 | return $target_path; 533 | } 534 | #print "\n\t TARGET $organism - grouping $group"; 535 | my @target_path; 536 | push (@target_path, $genome_use_path); 537 | push (@target_path, $group); 538 | push (@target_path, $organism); 539 | push (@target_path, $type); 540 | push (@target_path, $version); 541 | push (@target_path, $target_name); 542 | $target_path = join('/', @target_path); 543 | 544 | return $target_path; 545 | 546 | } 547 | 548 | ############################################################################ 549 | # INITIALISATION 550 | ############################################################################ 551 | 552 | #*************************************************************************** 553 | # Subroutine: parse_ctl_file_and_connect_to_db 554 | # Description: connect to a DIGS screening DB by parsing a DIGS control file 555 | #*************************************************************************** 556 | sub parse_ctl_file_and_connect_to_db { 557 | 558 | my ($self, $infile) = @_; 559 | 560 | my $digs_obj = $self->{digs_obj}; 561 | 562 | # Try opening control file 563 | my @ctl_file; 564 | my $valid = $fileio->read_file($infile, \@ctl_file); 565 | unless ($valid) { # Exit if we can't open the file 566 | die "\n\t ### Couldn't open control file '$infile'\n\n\n "; 567 | } 568 | 569 | # If control file looks OK, store the path and parse the file 570 | $self->{ctl_file} = $infile; 571 | my $loader_obj = ScreenBuilder->new($digs_obj); 572 | $loader_obj->parse_control_file($infile, $digs_obj); 573 | 574 | # Store the ScreenBuilder object (used later) 575 | $self->{loader_obj} = $loader_obj; 576 | 577 | # Load/create the screening database 578 | my $db_name = $loader_obj->{db_name}; 579 | unless ($db_name) { die "\n\t Error: no DB name defined \n\n\n"; } 580 | 581 | my $initialise_obj = Initialise->new($digs_obj); 582 | $initialise_obj->initialise_screening_db($digs_obj, $db_name); 583 | } 584 | 585 | #*************************************************************************** 586 | # Subroutine: do_load_db_dialogue 587 | # Description: connect to a DIGS screening DB 588 | #*************************************************************************** 589 | sub do_load_db_dialogue { 590 | 591 | my ($self, $infile) = @_; 592 | 593 | my $digs_obj = $self->{digs_obj}; 594 | 595 | # Load/create the screening database 596 | my $question = "\t Enter the name of a DIGS screening database"; 597 | my $db_name = $console->ask_question($question); 598 | unless ($db_name) { die "\n\t Error: no DB name defined \n\n\n"; } 599 | $digs_obj->{mysql_server} = 'localhost'; 600 | 601 | # Create the screening DB object 602 | my $db_obj = ScreeningDB->new($digs_obj); 603 | 604 | # Check if this screening DB exists, if not then create it 605 | my $db_exists = $db_obj->does_db_exist($db_name); 606 | unless ($db_exists) { 607 | print "\n\t Could not connect to screening DB '$db_name'"; 608 | print "\n\t Exiting.\n\n\n"; exit; 609 | } 610 | 611 | # Load the database 612 | print "\n\t Connecting to DB: $db_name"; 613 | $db_obj->load_screening_db($db_name); 614 | $digs_obj->{db} = $db_obj; # Store the database object reference 615 | 616 | } 617 | 618 | ############################################################################ 619 | # EOF 620 | ############################################################################ 621 | -------------------------------------------------------------------------------- /modules/DIGS/TargetDB.pm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | ############################################################################ 3 | # Module: TargetDB.pm 4 | # Description: Genome/organism sequence data management functions 5 | # History: December 2009: Created by Robert Gifford 6 | ############################################################################ 7 | package TargetDB; 8 | 9 | ############################################################################ 10 | # Import statements/packages (externally developed packages) 11 | ############################################################################ 12 | use strict; 13 | 14 | ############################################################################ 15 | # Import statements/packages (internally developed packages) 16 | ############################################################################ 17 | 18 | # Base classes 19 | use Base::FileIO; 20 | use Base::Console; 21 | 22 | ############################################################################ 23 | # Globals 24 | ############################################################################ 25 | 26 | my $s_length = 70; # Standard line length (for FASTA) 27 | my $line_limit = 10000000000; # Maximum number lines in file 28 | my $blast_program = 'makeblastdb'; 29 | #my $blast_program = 'formatdb'; 30 | 31 | # Create base objects 32 | my $fileio = FileIO->new(); 33 | my $console = Console->new(); 34 | my $devtools = DevTools->new(); 35 | 36 | 1; 37 | 38 | ############################################################################ 39 | # LIFECYCLE 40 | ############################################################################ 41 | 42 | #*************************************************************************** 43 | # Subroutine: new 44 | # Description: Create a new TargetDB.pm 'object' 45 | #*************************************************************************** 46 | sub new { 47 | 48 | my ($invocant, $parameter_ref) = @_; 49 | my $class = ref($invocant) || $invocant; 50 | 51 | my %levels; 52 | $levels{1} = 'grouping'; 53 | $levels{2} = 'organism'; 54 | $levels{3} = 'source_type'; 55 | $levels{4} = 'version'; 56 | 57 | # Set member variables 58 | my $self = { 59 | 60 | # Paths 61 | directory_levels => \%levels, 62 | line_limit => $line_limit, 63 | genome_use_path => $parameter_ref->{genome_use_path}, 64 | skipindexing_paths => $parameter_ref->{skipindexing_paths}, 65 | 66 | }; 67 | 68 | bless ($self, $class); 69 | return $self; 70 | } 71 | 72 | ############################################################################ 73 | # Functions 74 | ############################################################################ 75 | 76 | #*************************************************************************** 77 | # Subroutine: format_targets_for_blast 78 | # Description: index unformatted FASTA files for BLAST searching 79 | #*************************************************************************** 80 | sub format_targets_for_blast { 81 | 82 | my ($self) = @_; 83 | 84 | # Get variables from self 85 | my $genome_path = $self->{genome_use_path}; 86 | unless ($genome_path) { die "\n\t Path to target files is not set\n\n\n"; } 87 | 88 | # Show warning 89 | print "\n\t ### WARNING: This function requires standard file extensions for target FASTA files.\n"; 90 | print "\n\t ### i.e. '.fas', '.fa', '.fasta', '.faa', '.fna'\n"; 91 | print "\n\t ### Note: large files may generate split index files (01.nin, 02.nin ... etc)"; 92 | print "\n\t ### When this happens this utility will prompt for formatting, even "; 93 | print "if index files already exist.\n"; 94 | 95 | # Ask user how to handle the process 96 | my $question = "\n\t Prompt before formatting each target file? "; 97 | my $prompt = $console->ask_yes_no_question($question); 98 | if ($prompt eq 'n') { $prompt = undef; } 99 | 100 | # Index genomes by key ( organism | type | version ) 101 | my %server_data; 102 | print "\n\n\t Refreshing target data under path '$genome_path'\n"; 103 | $self->read_target_directory(\%server_data); 104 | 105 | my $skip_ref = $self->{skipindexing_paths}; 106 | #$devtools->print_hash($skip_ref); #die; 107 | 108 | # Iterate through and check formatting in each genome 109 | print "\n\t #~#~# Loading target data\n"; 110 | my $skipped = '0'; 111 | my @keys = sort keys %server_data; 112 | foreach my $key (@keys) { 113 | 114 | # Get genome data 115 | my $genome_ref = $server_data{$key}; 116 | my $organism = $genome_ref->{organism}; 117 | my $type = $genome_ref->{source_type}; 118 | my $version = $genome_ref->{version}; 119 | my $path = $genome_ref->{version_path}; 120 | my $group = $genome_ref->{grouping}; 121 | 122 | my @target = ( $group, $organism , $type, $version ); 123 | my $target_id = join ('/', @target); 124 | if ($skip_ref->{$target_id}) { 125 | print "\n\t Skipping '$organism': '$type' '$version'"; 126 | $skipped++; 127 | next; 128 | } 129 | 130 | print "\n\t #~#~# Checking: $group;\t'$organism'\t'$type'\t'$version'"; 131 | $self->check_genome_formatting($genome_ref); 132 | my $unformatted_ref = $genome_ref->{unformatted}; 133 | my $num_unformatted = scalar @$unformatted_ref; 134 | 135 | if ($num_unformatted) { # Do formatting 136 | print "\n\n\t #~#~# Format target file: $organism, $type, $version"; 137 | foreach my $file (@$unformatted_ref) { 138 | print "\n\t file '$file'"; 139 | } 140 | if ($prompt) { 141 | my $question = "\n\n\t Do you want to format the above files?"; 142 | my $answer = $console->ask_yes_no_question($question); 143 | if ($answer eq 'y') { 144 | $self->format_target_for_blast($genome_ref); 145 | } 146 | } 147 | else { $self->format_target_for_blast($genome_ref); } 148 | } 149 | } 150 | print "\n\n\t #~#~# Skipped '$skipped' files"; 151 | } 152 | 153 | #*************************************************************************** 154 | # Subroutine: summarise_targets_long 155 | # Description: summarise data in the 'target genomes' directory 156 | #*************************************************************************** 157 | sub summarise_targets_long { 158 | 159 | my ($self) = @_; 160 | 161 | # Get variables from self 162 | my $genome_path = $self->{genome_use_path}; 163 | unless ($genome_path) { die "\n\t Path to genomes is not set\n\n\n"; } 164 | 165 | # Index genomes by key ( organism | type | version ) 166 | my %server_data; 167 | $self->read_target_directory(\%server_data); 168 | 169 | # add header row 170 | my @summary; 171 | my @header = ('File', 'Organism', 'Group', 'Data-type', 'Version', 'Scaffolds', 172 | '# bases', '# lines'); 173 | my $header = join ("\t", @header); 174 | $header .= "\n"; 175 | push (@summary, $header); 176 | 177 | # Iterate through, summarising target files 178 | print "\n\n\t ### Generating detailed summary of target files under '\$DIGS_GENOMES'"; 179 | my @keys = keys %server_data; 180 | foreach my $key (@keys) { 181 | 182 | # Get genome data 183 | my $genome_ref = $server_data{$key}; 184 | my $group = $genome_ref->{grouping}; 185 | my $organism = $genome_ref->{organism}; 186 | my $type = $genome_ref->{source_type}; 187 | my $version = $genome_ref->{version}; 188 | my $path = $genome_ref->{version_path}; 189 | #$devtools->print_hash($genome_ref); die; 190 | print "\n\t ### Summarising target files for '$organism'"; 191 | 192 | # Iterate through indexing by file stems 193 | my $files_ref = $genome_ref->{files}; 194 | my @files; 195 | foreach my $file (@$files_ref) { 196 | my @file_bits = split (/\./, $file); 197 | my $type = pop @file_bits; 198 | if ($type eq 'fa') { 199 | push (@files, $file); 200 | } 201 | } 202 | my @sorted = sort @files; 203 | 204 | # Get statistics for each file 205 | foreach my $file (@sorted) { 206 | 207 | # Get path to file 208 | my $chunk_path = $genome_path . "/$path/$file"; 209 | my %data; 210 | $self->get_target_file_statistics(\%data, $chunk_path); 211 | my $total_bases = $data{total_bases}; 212 | my $line_count = $data{total_lines}; 213 | my $num_scaffolds = $data{number_scaffolds}; 214 | #print "\n\t FILE $file: $chunk_path"; 215 | 216 | my @line; 217 | push (@line, $file); 218 | push (@line, $organism); 219 | push (@line, $group); 220 | push (@line, $type); 221 | push (@line, $version); 222 | push (@line, $num_scaffolds); 223 | push (@line, $total_bases); 224 | push (@line, $line_count); 225 | my $line = join("\t", @line); 226 | push (@summary, "$line\n"); 227 | } 228 | } 229 | 230 | # Write results to file 231 | my $summary = "digs-target-dbs-detailed-summary.txt"; 232 | print "\n\n\t ### Writing detailed summary to '$summary'\n"; 233 | $fileio->write_file($summary, \@summary); 234 | } 235 | 236 | #*************************************************************************** 237 | # Subroutine: summarise_targets_short 238 | # Description: summarise data in the 'targets' directory 239 | #*************************************************************************** 240 | sub summarise_targets_short { 241 | 242 | my ($self) = @_; 243 | 244 | # Get member data structure that describes expected directory structure 245 | my $genome_path = $self->{genome_use_path}; 246 | unless ($genome_path) { die "\n\t Path not set\n\n\n"; } 247 | my $levels_ref = $self->{directory_levels}; 248 | unless ($levels_ref) { die "\n\t Levels not set\n\n\n"; } 249 | my @levels = sort by_number keys %$levels_ref; 250 | 251 | # Index current, locally-held genome data 252 | my @genome_files; 253 | $fileio->read_directory_tree_leaves($genome_path, \@genome_files, $levels_ref); 254 | 255 | # Iterate through, summarising target files 256 | print "\n\n\t ### Generating brief summary of target files under '\$DIGS_GENOMES'"; 257 | my %target_keys; 258 | foreach my $file_ref (@genome_files) { 259 | 260 | my $grouping = $file_ref->{grouping}; 261 | my $organism = $file_ref->{organism}; 262 | my $type = $file_ref->{source_type}; 263 | my $version = $file_ref->{version}; 264 | my $key = $grouping . '|' .$organism . '|' . $type . '|' . $version; 265 | $target_keys{$key} = 1; 266 | } 267 | 268 | # Add header row 269 | my @summary; 270 | my @header = ('Grouping', 'Organism', 'Data type', 'Version'); 271 | my $header = join("\t", @header); 272 | push (@summary, "$header\n"); 273 | 274 | # Add the summary information rows 275 | my @keys = sort keys %target_keys; 276 | foreach my $key (@keys) { 277 | $key =~ s/\|/\t/g; 278 | push (@summary, "$key\n"); 279 | } 280 | 281 | # Write results to file 282 | my $summary = "digs-target-dbs-brief-summary.txt"; 283 | print "\n\n\t ### Writing brief summary to '$summary'\n"; 284 | $fileio->write_file($summary, \@summary); 285 | 286 | } 287 | 288 | #*************************************************************************** 289 | # Subroutine: get_target_file_statistics 290 | # Description: get metrics on a target genome sequence file 291 | #*************************************************************************** 292 | sub get_target_file_statistics { 293 | 294 | my ($self, $data_ref, $chunk_path) = @_; 295 | 296 | # open the file for reading, or die on failure 297 | open FILE, "<$chunk_path" 298 | or die "\n\tCan't open '$chunk_path'\n"; 299 | 300 | # Open filehandle 301 | my $num_lines; 302 | my $num_scaffolds = 0; 303 | my $line_count = 0; 304 | my $total_bases = 0; 305 | 306 | # Count the number of lines in the file 307 | while ( ) { 308 | 309 | my $line = $_; 310 | if ($line =~ /^\s*$/) { next; } # discard blank line 311 | $line_count++; 312 | chomp $line; 313 | if ($line =~ /^>/) { $num_scaffolds++; } 314 | else { 315 | $line =~ s/\s+//g; # remove whitespace 316 | my $line_length = length $line; 317 | $total_bases = $total_bases + $line_length; 318 | } 319 | } 320 | close FILE; 321 | 322 | #print "\n\t # Paths: $chunk_path: "; 323 | #print "\n\t # Line count: $line_count"; 324 | #print "\n\t # Base count: $total_bases"; 325 | #print "\n\t # Scaffolds: $num_scaffolds"; 326 | $data_ref->{total_bases} = $total_bases; 327 | $data_ref->{total_lines} = $line_count; 328 | $data_ref->{number_scaffolds} = $num_scaffolds; 329 | } 330 | 331 | ############################################################################ 332 | # Internals 333 | ############################################################################ 334 | 335 | #*************************************************************************** 336 | # Subroutine: read_target_directory 337 | # Description: read the contents of a 'target DB' directory containing 338 | #*************************************************************************** 339 | sub read_target_directory { 340 | 341 | my ($self, $data_ref) = @_; 342 | 343 | # Get member data structure that describes expected directory structure 344 | my $genome_path = $self->{genome_use_path}; 345 | unless ($genome_path) { die "\n\t Path not set\n\n\n"; } 346 | my $levels_ref = $self->{directory_levels}; 347 | unless ($levels_ref) { die "\n\t Levels not set\n\n\n"; } 348 | my @levels = sort by_number keys %$levels_ref; 349 | 350 | # Index current, locally-held target data 351 | my @genome_files; 352 | $fileio->read_directory_tree_leaves($genome_path, \@genome_files, $levels_ref); 353 | 354 | # Iterate through the files 355 | foreach my $file_ref (@genome_files) { 356 | 357 | my $file = $file_ref->{file}; 358 | my $type = $file_ref->{source_type}; 359 | my $organism = $file_ref->{organism}; 360 | my $version = $file_ref->{version}; 361 | 362 | # Create the path to file 363 | my $path; 364 | foreach my $level (@levels) { 365 | my $field = $levels_ref->{$level}; 366 | my $value = $file_ref->{$field}; 367 | #print "\n\t LEVEL $level\tFIELD $field\tVALUE $value";; 368 | $path .= "$value/"; 369 | } 370 | $file_ref->{version_path} = $path; 371 | 372 | # Store in a hash 373 | my $key = $organism . '|' . $type . '|' . $version; 374 | if ($data_ref->{$key}) { 375 | my $genome_ref = $data_ref->{$key}; 376 | my $files_ref = $genome_ref->{files}; 377 | push (@$files_ref, $file); 378 | } 379 | else { 380 | my @files; 381 | push (@files, $file); 382 | $file_ref->{files} = \@files; 383 | $data_ref->{$key} = $file_ref; 384 | } 385 | } 386 | } 387 | 388 | #*************************************************************************** 389 | # Subroutine: check_genome_formatting 390 | # Description: look for pre-existing BLAST indexes in a genome directory 391 | #*************************************************************************** 392 | sub check_genome_formatting { 393 | 394 | my ($self, $genome_ref) = @_; 395 | 396 | # Iterate through indexing by file stems 397 | my $files_ref = $genome_ref->{files}; 398 | my %stems; 399 | my @formatted; 400 | my @unformatted; 401 | foreach my $file (@$files_ref) { 402 | 403 | my @file_bits = split (/\./, $file); 404 | my $type = pop @file_bits; 405 | my $stem = join ('.', @file_bits); 406 | #print "\n\t NAME $file:\t\t $stem"; 407 | unless ($stem and $type) { 408 | die "\n\n\t filename structure error for '$file'\n\n"; 409 | } 410 | 411 | # Parsing the file extension of the target sequence file 412 | if ($type eq 'fasta' or $type eq 'fas' or $type eq 'fa' 413 | or $type eq 'fna' or $type eq 'faa' 414 | ) { 415 | $stem .= ".$type"; 416 | $type = 'fa'; # standardise type for FASTA files 417 | } 418 | 419 | # Check if the file looks like a BLAST sequence database file 420 | my $result = $self->check_filetype($type); 421 | unless ($result) { 422 | print "\n\t File '$file' is not a recognized BLAST-associated filetype"; 423 | } 424 | 425 | # Add this filestem to the list of filestems we have seen the stem 426 | if ($stems{$stem}) { 427 | my $files_ref = $stems{$stem}; 428 | if ($files_ref->{$type}) { 429 | print "\n\t Showing hash of file extensions for this file stem\n"; 430 | $devtools->print_hash($files_ref); 431 | print "\n\t NAME $file, TYPE '$type': $stem"; 432 | print "\n\t This file appears to be a duplicate, exiting....\n\n"; 433 | exit; 434 | } 435 | $files_ref->{$type} = 1; 436 | if ($type eq 'fa') { 437 | $files_ref->{stem_target} = $file; 438 | } 439 | } 440 | else { 441 | my %stem_files; 442 | $stem_files{$type} = 1; 443 | $stem_files{stemfile_name} = $file; 444 | if ($type eq 'fa') { 445 | $stem_files{stem_target} = $file; 446 | } 447 | $stems{$stem} = \%stem_files; 448 | } 449 | 450 | } 451 | 452 | my @stems = keys %stems; 453 | foreach my $stem (@stems) { 454 | 455 | my $types_ref = $stems{$stem}; 456 | my $nin = $types_ref->{nin}; 457 | my $nsq = $types_ref->{nsq}; 458 | my $nhr = $types_ref->{nhr}; 459 | #my $nsi = $chunk_data->{nsi}; 460 | #my $nsd = $chunk_data->{nsd}; 461 | my $target = $types_ref->{stem_target}; 462 | unless ($nhr and $nin and $nsq) { 463 | 464 | my $stemfile_data = $stems{$stem}; 465 | my $stemfile_name = $stemfile_data->{stemfile_name}; 466 | if ($target) { 467 | print "\n\t adding unformatted target $target for '$stemfile_name'"; 468 | push (@unformatted, $target); 469 | } 470 | else { 471 | print "\n\t No target found for:"; 472 | print "\n\t filestem '$stem'"; 473 | print "\n\t from file '$stemfile_name' (ignoring)"; 474 | } 475 | } 476 | else { 477 | push (@formatted, $target); 478 | } 479 | } 480 | 481 | $genome_ref->{formatted} = \@formatted; 482 | $genome_ref->{unformatted} = \@unformatted; 483 | } 484 | 485 | #*************************************************************************** 486 | # Subroutine: check_filetype 487 | # Description: Check if string $type is a known file extension of BLAST database 488 | #*************************************************************************** 489 | sub check_filetype { 490 | 491 | my ($self, $type) = @_; 492 | 493 | my $result = undef; 494 | 495 | if( $type eq 'fa' or $type eq 'ol' or $type eq 'gz' or 496 | 497 | $type eq 'nnd' or $type eq 'nni' or 498 | $type eq 'nsd' or $type eq 'nsi' or 499 | $type eq 'nsq' or $type eq 'ntm' or 500 | $type eq 'nin' or $type eq 'nhr' or 501 | $type eq 'nin' or $type eq 'nhr' or 502 | $type eq 'nhd' or $type eq 'nhi' or 503 | $type eq 'nog' or $type eq 'nal' or 504 | $type eq 'nog' or $type eq 'nos' or 505 | $type eq 'nog' or $type eq 'ntf' or 506 | $type eq 'nog' or $type eq 'nto' or 507 | $type eq 'nog' or $type eq 'not' or 508 | $type eq 'fai' or $type eq 'ndb' or 509 | $type eq 'txt') { 510 | 511 | $result = 1; 512 | 513 | } 514 | 515 | return $result; 516 | 517 | } 518 | 519 | #*************************************************************************** 520 | # Subroutine: format_target_for_blast 521 | # Description: format sequence files in a directory for BLAST 522 | #*************************************************************************** 523 | sub format_target_for_blast { 524 | 525 | my ($self, $genome_ref) = @_; 526 | 527 | # Get data 528 | my $genome_path = $self->{genome_use_path}; 529 | my $line_limit = $self->{line_limit}; 530 | my $organism = $genome_ref->{organism}; 531 | my $type = $genome_ref->{source_type}; 532 | my $version = $genome_ref->{version}; 533 | my $version_path = $genome_ref->{version_path}; 534 | my $path = $genome_path . $version_path; 535 | my $bin_path = $blast_program; 536 | 537 | # Iterate through the files 538 | my $formatted_ref = $genome_ref->{formatted}; 539 | my $unformatted_ref = $genome_ref->{unformatted}; 540 | foreach my $file (@$unformatted_ref) { 541 | 542 | # Get path to file 543 | my $chunk_path = $path . "/$file"; 544 | my %data; 545 | #print "\n\t FILE $file: $chunk_path"; 546 | $self->get_target_file_statistics(\%data, $chunk_path); 547 | my $total_bases = $data{total_bases}; 548 | my $line_count = $data{total_lines}; 549 | my $num_scaffolds = $data{number_scaffolds}; 550 | print "\n\t Target file line count: $line_count"; 551 | 552 | if ($num_scaffolds > 50 and $line_count > $line_limit) { 553 | 554 | # Work out file splitting params 555 | my $split_num = int ($line_count / $line_limit); 556 | if ($split_num < 1) { $split_num++; } 557 | my @split_chunks; 558 | my $done_split = undef; 559 | print "\n\t Splitting $file to approximately $split_num files"; 560 | unless ($split_num eq 1) { 561 | $done_split = 1; 562 | print "\n\t Splitting $file to approximately $split_num files"; 563 | $self->split_genome_chunk($path, $file, \@split_chunks); 564 | } 565 | else { 566 | my $file_path = $path . "/$file"; 567 | my $command = "makeblastdb -in $file_path -dbtype nucl -parse_seqids > /dev/null"; 568 | #my $command = "formatdb -i $file_path -p F -o T > /dev/null"; 569 | print "\n\t$command\n"; 570 | system $command; 571 | } 572 | 573 | if ($done_split) { 574 | foreach my $new_chunk_path (@split_chunks) { 575 | my $command = "makeblastdb -in $new_chunk_path -dbtype nucl -parse_seqids > /dev/null"; 576 | #my $command = "formatdb -i $new_chunk_path -p F -o T > /dev/null"; 577 | print "\n\t$command\n"; 578 | system $command; 579 | } 580 | my $file_path = $path . "/$file"; 581 | my $command = "mv $file_path ./"; 582 | #print "\n\tMoving original file '$file_path' to ./\n"; 583 | #system $command; 584 | } 585 | } 586 | else { 587 | my $command = "makeblastdb -in $chunk_path -dbtype nucl -parse_seqids > /dev/null"; 588 | #my $command = "formatdb -i $chunk_path -p F -o T > /dev/null"; 589 | print "\n\t$command\n"; 590 | system $command; 591 | push (@$formatted_ref, $file); 592 | } 593 | } 594 | 595 | } 596 | 597 | ############################################################################ 598 | # Splitting large files 599 | ############################################################################ 600 | 601 | #*************************************************************************** 602 | # Subroutine: split_genome_chunk 603 | # Description: split a large sequence file into several files 604 | #*************************************************************************** 605 | sub split_genome_chunk { 606 | 607 | my ($self, $path, $file, $split_chunks_ref) = @_; 608 | 609 | # Get data 610 | my $line_limit = $self->{line_limit}; 611 | 612 | # Split the file 613 | my @scaffold_chunk; 614 | my $line_count = 0; 615 | my $chunk_count = 0; 616 | my $scaffold_count = 0; 617 | my $approx_divider; 618 | my $chunk_path = $path . $file; 619 | open GENOME, "<$chunk_path" or die "\n\tCan't open $chunk_path\n"; 620 | while ( ) { 621 | 622 | my $line = $_; 623 | if ($line =~ /^>/) { 624 | $scaffold_count++; 625 | 626 | if ($line_count > $line_limit) { 627 | $chunk_count++; 628 | my $new_path = $path . $file . '_' . $chunk_count . '.fa'; 629 | $fileio->write_file($new_path, \@scaffold_chunk); 630 | push (@$split_chunks_ref, $new_path); 631 | 632 | $scaffold_count = 1; # reset scaffold count 633 | $line_count = 0; # reset line_count 634 | @scaffold_chunk = (); # reset chunk 635 | 636 | # Store the header line in the new chunk 637 | push (@scaffold_chunk, $line); 638 | } 639 | else { 640 | push (@scaffold_chunk, $line); 641 | } 642 | } 643 | else { 644 | $line_count++; 645 | push (@scaffold_chunk, $line); 646 | } 647 | } 648 | 649 | # Store last chunk 650 | $chunk_count++; 651 | my $new_path = $path . $file . '_' . $chunk_count . '.fa'; 652 | $fileio->write_file($new_path, \@scaffold_chunk); 653 | push (@$split_chunks_ref, $new_path); 654 | 655 | # Remove the original file 656 | my $command = "rm $chunk_path"; 657 | print "\n\t REMOVED file:\n\t $command\n\n"; 658 | system $command; 659 | 660 | } 661 | 662 | #*************************************************************************** 663 | # Subroutine: split_longline_contig 664 | # Description: split single line of sequence into several lines 665 | #*************************************************************************** 666 | sub split_longline_contig { 667 | 668 | my ($self, $chunk_path) = @_; 669 | 670 | # Create path to chunk, and open filehandle 671 | print "\n\t $chunk_path"; 672 | open TARGET, "<$chunk_path" or die "\n\tCan't open $chunk_path\n"; 673 | my @file_bits = split(/\./, $chunk_path); 674 | pop @file_bits; 675 | my $chunk_file_stem = join ('', @file_bits); 676 | my $reformat_path = $chunk_file_stem . '_rf.fa'; 677 | print "\n\t $reformat_path"; 678 | open REFORMATTED_CHUNK, ">$reformat_path" or die "\n\tCan't open $reformat_path\n"; 679 | 680 | # Run through chunk, extracting matches as we encounter them 681 | my $i; 682 | while ( ) { 683 | 684 | $i++; 685 | print "\n\t line $i"; 686 | my $line = $_; 687 | if ($line =~ /^\s*$/) { next; } # discard blank line 688 | chomp $line; 689 | 690 | if ($line =~ /^>/) { 691 | # write line 692 | print REFORMATTED_CHUNK "$line\n"; 693 | } 694 | else { 695 | 696 | # Work out coordinates in this line 697 | $line =~ s/\s+//g; # remove whitespace 698 | my $line_length = length $line; 699 | if ($line_length > $s_length) { 700 | 701 | my @line = split ('', $line); 702 | my $nt_counter = 0; 703 | my $short_line = ''; 704 | foreach my $char (@line) { 705 | $nt_counter++; 706 | if ($nt_counter eq $s_length) { 707 | $nt_counter = 0; 708 | # write line 709 | print REFORMATTED_CHUNK "$short_line\n"; 710 | $short_line = ''; 711 | } 712 | else { 713 | $short_line .= $char; 714 | } 715 | } 716 | if ($short_line) { 717 | print REFORMATTED_CHUNK "$short_line\n"; 718 | } 719 | } 720 | else { 721 | # write line 722 | print REFORMATTED_CHUNK "$line\n"; 723 | } 724 | } 725 | } 726 | close TARGET; 727 | close REFORMATTED_CHUNK; 728 | } 729 | 730 | ############################################################################ 731 | # Basic 732 | ############################################################################ 733 | 734 | #*************************************************************************** 735 | # Subroutine: by number 736 | # Description: sort an array of integers by ascending numerical order 737 | #*************************************************************************** 738 | sub by_number { $a <=> $b } 739 | 740 | ############################################################################ 741 | # EOF 742 | ############################################################################ 743 | -------------------------------------------------------------------------------- /modules/DIGS/DIGS.pm: -------------------------------------------------------------------------------- 1 | #!usr/bin/perl -w 2 | ############################################################################ 3 | # Module: DIGS.pm database-integrated genome screening (DIGS) 4 | # Description: Functions for implementing DIGS 5 | # History: December 2013: Created by Robert Gifford 6 | ############################################################################ 7 | package DIGS; 8 | 9 | ############################################################################ 10 | # Import statements/packages (externally developed packages) 11 | ############################################################################ 12 | use strict; 13 | 14 | ############################################################################ 15 | # Import statements/packages (internally developed packages) 16 | ############################################################################ 17 | 18 | # Base classes 19 | use Base::FileIO; 20 | use Base::Console; 21 | use Base::DevTools; 22 | 23 | # Interface 24 | use Interface::BLAST; 25 | 26 | # Program components 27 | use DIGS::Initialise; # Initialises the DIGS tool 28 | use DIGS::ScreenBuilder; # To set up a DIGS run 29 | use DIGS::Defragment; # Defragment tools 30 | use DIGS::Consolidate; # Consolidate locus functions 31 | use DIGS::Extract; # Extracting sequences for FASTA files using BLAST 32 | use DIGS::Classify; # Classifying sequences using BLAST 33 | use DIGS::CrossMatch; # Recording cross-matching during DIGS 34 | 35 | 36 | 37 | ############################################################################ 38 | # Globals 39 | ############################################################################ 40 | 41 | # Base objects 42 | my $fileio = FileIO->new(); 43 | my $console = Console->new(); 44 | my $devtools = DevTools->new(); 45 | 1; 46 | 47 | ############################################################################ 48 | # LIFECYCLE 49 | ############################################################################ 50 | 51 | #*************************************************************************** 52 | # Subroutine: new 53 | # Description: create new DIGS 'object' 54 | #*************************************************************************** 55 | sub new { 56 | 57 | my ($invocant, $parameter_ref) = @_; 58 | my $class = ref($invocant) || $invocant; 59 | 60 | # Create objects for screening 61 | my $crossmatch_obj = CrossMatch->new($parameter_ref); 62 | 63 | # Set member variables 64 | my $self = { 65 | 66 | # Global settings 67 | process_id => $parameter_ref->{process_id}, 68 | program_version => $parameter_ref->{program_version}, 69 | 70 | # Flags 71 | verbose => $parameter_ref->{verbose}, 72 | force => $parameter_ref->{force}, 73 | 74 | # Member classes 75 | crossmatch_obj => $crossmatch_obj, 76 | 77 | # MySQL database connection parameters 78 | mysql_username => $parameter_ref->{mysql_username}, 79 | mysql_password => $parameter_ref->{mysql_password}, 80 | db_name => '', # Obtained from control file or console 81 | mysql_server => '', # Obtained from control file or console 82 | 83 | # Paths to probe and reference files 84 | query_na_fasta => '', # Path to nuceotide seq probes 85 | query_aa_fasta => '', # Path to AA seq probes 86 | reference_na_fasta => '', # Path to an nucleotide reference seq library 87 | reference_aa_fasta => '', # Path to an AA reference seq library 88 | aa_reference_library => '', # Obtained from control file 89 | na_reference_library => '', # Obtained from control file 90 | 91 | # Parameters for DIGS 92 | bitscore_min_tblastn => '', # Minimum bitscore for extracting hits (tBLASTn) 93 | bitscore_min_blastn => '', # Minimum bitscore for extracting hits (BLASTn) 94 | seq_length_minimum => '', # Minimum seq length for extracting hits 95 | extract_buffer => '', # Size of lead/trailer sequence to extract upstream & downstream of a hit 96 | defragment_mode => '', # Determined by input options 97 | 98 | # Paths used in DIGS process 99 | blast_bin_path => $parameter_ref->{blast_bin_path}, 100 | genome_use_path => $parameter_ref->{genome_use_path}, 101 | output_path => $parameter_ref->{output_path}, 102 | tmp_path => $parameter_ref->{tmp_path}, 103 | 104 | }; 105 | 106 | bless ($self, $class); 107 | return $self; 108 | } 109 | 110 | ############################################################################ 111 | # MAIN LOOP 112 | ############################################################################ 113 | 114 | #*************************************************************************** 115 | # Subroutine: run_digs_process 116 | # Description: handler for main DIGS functions 117 | #*************************************************************************** 118 | sub run_digs_process { 119 | 120 | my ($self, $ctl_file, $option) = @_; 121 | 122 | $self->show_title(); 123 | 124 | # Initialise the screen based on (i) options given and (ii) control file 125 | my $valid = $self->initialise($ctl_file, $option); 126 | 127 | if ($valid) { 128 | 129 | # Hand off to the appropriate function, depending on the option received 130 | $self->hand_off_to_digs_fxns($option); 131 | 132 | # Show final summary and exit message 133 | $self->wrap_up($option); 134 | } 135 | } 136 | 137 | ############################################################################ 138 | # SETTING UP - Processing and validating input, handing off to subroutines 139 | ############################################################################ 140 | 141 | #*************************************************************************** 142 | # Subroutine: initialise 143 | # Description: check input file and options received are valid, & initialise 144 | #*************************************************************************** 145 | sub initialise { 146 | 147 | my ($self, $ctl_file, $option) = @_; 148 | 149 | # Check tmp directory is set up 150 | $self->check_tmp_directory(); 151 | 152 | my $valid = undef; 153 | my $initialise_obj = Initialise->new($self); 154 | 155 | if ($ctl_file) { # Try to initialise using the input file 156 | $valid = $initialise_obj->initialise($self, $option, $ctl_file); 157 | } 158 | elsif ($option eq 1) { 159 | $valid = 1; # An infile is optional for option -m=1 160 | } 161 | elsif ($option > 1 and $option <= 5) { # Show error if no infile 162 | print "\n\t Option '-m=$option' requires an infile\n\n"; 163 | } 164 | 165 | 166 | if ($option > 5) { # # Unavailable option error 167 | print "\n\t Unrecognized option '-m=$option'\n\n"; 168 | } 169 | 170 | 171 | return $valid; 172 | } 173 | 174 | #*************************************************************************** 175 | # Subroutine: hand_off_to_digs_fxns 176 | # Description: hand off to the appropriate function, depending on options 177 | #*************************************************************************** 178 | sub hand_off_to_digs_fxns { 179 | 180 | my ($self, $option) = @_; 181 | 182 | # Main screening functions 183 | if ($option eq 1) { # Check the target sequences are formatted for BLAST 184 | $self->index_target_files_for_blast(); 185 | } 186 | elsif ($option eq 2) { # Screen 187 | $self->do_digs(); 188 | } 189 | elsif ($option eq 3) { # Reassign data in digs_results table 190 | $self->reassign(); 191 | } 192 | 193 | # Defragmenting screening results 194 | elsif ($option eq 4 or $option eq 5) { 195 | 196 | if ($option eq 4) { 197 | # Create a defragmenter module 198 | my $defragment_obj = Defragment->new($self); 199 | # Interactively defragment contiguous hits to the same gene 200 | $defragment_obj->interactive_defragment(); 201 | } 202 | elsif ($option eq 5) { 203 | # Create a consolidate module 204 | my $consolidate_obj = Consolidate->new($self); 205 | # Combine hits to different genes into higher order locus structures 206 | $consolidate_obj->consolidate_loci(); 207 | } 208 | } 209 | else { 210 | print "\n\n\t # Unrecognised option m='$option'\n\n\n\n"; 211 | exit; 212 | } # Shouldn't get here 213 | 214 | } 215 | 216 | ############################################################################ 217 | # MAIN DIGS FUNCTIONS 218 | ############################################################################ 219 | 220 | #*************************************************************************** 221 | # Subroutine: index_target_files_for_blast 222 | # Description: create index files for all target databases 223 | #*************************************************************************** 224 | sub index_target_files_for_blast { 225 | 226 | my ($self) = @_; 227 | 228 | # Format targets files for BLAST searching 229 | my $target_db_obj = TargetDB->new($self); 230 | $target_db_obj->format_targets_for_blast(); 231 | 232 | } 233 | 234 | #*************************************************************************** 235 | # Subroutine: do_digs 236 | # Description: do the core database-integrated genome screening processes 237 | #*************************************************************************** 238 | sub do_digs { 239 | 240 | my ($self, $mode) = @_; 241 | 242 | # Get the DB tables we will use in this fxn 243 | my $db_ref = $self->{db}; 244 | my $searches_table = $db_ref->{searches_table}; 245 | 246 | # Iterate through the list of DIGS queries, dealing each in turn 247 | # Each DIGS query constitutes a probe sequence and a target FASTA file 248 | my $current_query_num = 0; 249 | my $queries_ref = $self->{queries}; 250 | unless ($queries_ref) { die; } # Sanity checking 251 | my @probes = keys %$queries_ref; # Get the list of queries 252 | print "\n\t ### Starting database-integrated genome screening"; 253 | foreach my $probe_name (@probes) { 254 | 255 | # Get the array of queries for this target file 256 | my $probe_queries = $queries_ref->{$probe_name}; 257 | foreach my $query_ref (@$probe_queries) { 258 | 259 | # Increment query count 260 | $current_query_num++; 261 | $self->{current_query_num} = $current_query_num; 262 | 263 | # DIGS round one 264 | my @to_extract; 265 | my @to_delete; 266 | $self->run_digs_search_phase($query_ref, \@to_extract, \@to_delete); 267 | 268 | # DIGS round two 269 | my $extract_count = scalar @to_extract; 270 | if ($extract_count) { 271 | $self->run_digs_classify_phase($query_ref, \@to_extract, \@to_delete); 272 | } 273 | 274 | # Update the searches_performed table 275 | $searches_table->insert_row($query_ref); 276 | 277 | # Show a status update in the console 278 | $self->show_digs_progress(); 279 | } 280 | } 281 | } 282 | 283 | 284 | #*************************************************************************** 285 | # Subroutine: run_digs_search_phase 286 | # Description: execute a BLAST query and get non-redundant results 287 | #*************************************************************************** 288 | sub run_digs_search_phase { 289 | 290 | my ($self, $query_ref, $to_extract_ref, $to_delete_ref) = @_; 291 | 292 | # Do the 1st BLAST (probe vs target) 293 | $self->search_targetdb_file($query_ref); 294 | 295 | # Create a non-redundant result set for this query 296 | my @combined; 297 | $self->created_combined_locus_set($query_ref, \@combined); 298 | 299 | # Defragment the combined set of new and previously identified loci 300 | $self->create_nonredundant_hit_set(\@combined, $to_extract_ref, $to_delete_ref); 301 | 302 | } 303 | 304 | #*************************************************************************** 305 | # Subroutine: run_digs_classify_phase 306 | # Description: extract hits and classify 307 | #*************************************************************************** 308 | sub run_digs_classify_phase { 309 | 310 | my ($self, $query_ref, $to_extract_ref, $to_delete_ref) = @_; 311 | 312 | # Get/create the various objects we will use here 313 | my $db_ref = $self->{db}; 314 | my $classify_obj = Classify->new($self); 315 | my $extract_obj = Extract->new($self); 316 | 317 | my $num_to_extract = scalar @$to_extract_ref; 318 | print "\n\t\t# Extracting $num_to_extract hits from target database"; 319 | 320 | # Extract newly identified or extended sequences 321 | my @extracted; 322 | foreach my $locus_ref (@$to_extract_ref) { 323 | $locus_ref->{target_path} = $query_ref->{target_path}; 324 | $extract_obj->extract_locus_sequence_using_blast($locus_ref); 325 | my %copy_locus = %$locus_ref; 326 | push (@extracted, \%copy_locus); 327 | } 328 | if ($self->{verbose}) { print "\n\t\t # Extraction step DONE for this results set\n"; } 329 | 330 | # Do BLAST-based classification (new/extended hits from 1st search vs reference library) 331 | $self->classify_hits(\@extracted, $query_ref); 332 | 333 | # Update the digs_results table 334 | $db_ref->update_db($to_delete_ref, \@extracted, 'digs_results_table'); 335 | 336 | if ($self->{verbose}) { print "\n\t\t # RESULTS recorded for this probe-target pair"; } 337 | 338 | } 339 | 340 | #*************************************************************************** 341 | # Subroutine: reassign 342 | # Description: re-classify sequences in the digs_results_table 343 | #*************************************************************************** 344 | sub reassign { 345 | 346 | my ($self) = @_; 347 | 348 | # Interface to BLAST 349 | my %blast_params; 350 | $blast_params{blast_bin_path} = $self->{blast_bin_path}; 351 | my $blast_obj = BLAST->new(\%blast_params); 352 | 353 | # Get data structures, paths and flags from self 354 | my $crossmatch_obj = $self->{crossmatch_obj}; 355 | my $result_path = $self->{report_dir}; 356 | my $verbose = $self->{verbose}; 357 | my $classifier = Classify->new($self); 358 | 359 | # Get the connection to the digs_results table (so we can update it) 360 | my $db = $self->{db}; 361 | my $digs_results_table = $db->{digs_results_table}; 362 | unless ($digs_results_table) { die; } 363 | 364 | # Get the sequences to reassign 365 | my $reassign_loci = $self->{reassign_loci}; 366 | unless ($reassign_loci) { die; } 367 | my $num_to_reassign = scalar @$reassign_loci; 368 | print "\n\n\t ### Reassigning $num_to_reassign hits in the digs_results table\n"; 369 | 370 | # Iterate through the loci, doing the reassign process for each 371 | my $count = 0; 372 | foreach my $locus_ref (@$reassign_loci) { 373 | 374 | # Set the linking to the BLAST result table 375 | my $record_id = $locus_ref->{record_id}; 376 | my $extract_start = $locus_ref->{extract_start}; 377 | my $extract_end = $locus_ref->{extract_end}; 378 | $locus_ref->{subject_start} = $extract_start; 379 | $locus_ref->{subject_end} = $extract_end; 380 | delete $locus_ref->{extract_start}; 381 | delete $locus_ref->{extract_end}; 382 | 383 | # Execute the 'reverse' BLAST (2nd BLAST in a round of paired BLAST) 384 | my $previous_assign = $locus_ref->{assigned_name}; 385 | my $previous_gene = $locus_ref->{assigned_gene}; 386 | $classifier->classify_sequence_using_blast($locus_ref); 387 | 388 | $count++; 389 | if (($count % 100) eq 0) { print "\n\t Checked $count rows"; } 390 | 391 | my $assigned_name = $locus_ref->{assigned_name}; 392 | my $assigned_gene = $locus_ref->{assigned_gene}; 393 | if ($assigned_name ne $previous_assign or $assigned_gene ne $previous_gene) { 394 | 395 | if ($verbose) { # Report the outcome 396 | print "\n\t\t - reassigned: was previously '$previous_assign ($previous_gene)'"; 397 | } 398 | 399 | # Update the matrix 400 | my $previous_key = $previous_assign . '_' . $previous_gene; 401 | my $assigned_key = $assigned_name . '_' . $assigned_gene; 402 | $crossmatch_obj->update_cross_matching($previous_key, $assigned_key); 403 | 404 | # Insert the data 405 | my $where = " WHERE record_id = $record_id "; 406 | delete $locus_ref->{record_id}; # Required to remove this 407 | delete $locus_ref->{organism}; # Update not required for this field 408 | $digs_results_table->update($locus_ref, $where); 409 | } 410 | } 411 | 412 | # Cleanup 413 | my $output_dir = $self->{report_dir}; 414 | my $command1 = "rm -rf $output_dir"; 415 | system $command1; 416 | } 417 | 418 | ############################################################################ 419 | # INTERNALS - search, classify, defragment, and updated screening database 420 | ############################################################################ 421 | 422 | #*************************************************************************** 423 | # Subroutine: search_targetdb_file 424 | # Description: execute a similarity search and parse the results 425 | #*************************************************************************** 426 | sub search_targetdb_file { 427 | 428 | my ($self, $query_ref) = @_; 429 | 430 | # Get settings from self 431 | my $verbose = $self->{verbose}; 432 | 433 | # Set parameters for the forward BLAST 434 | my %blast_params; 435 | $blast_params{verbose} = $verbose; 436 | $blast_params{blast_bin_path} = $self->{blast_bin_path}; 437 | $blast_params{num_threads} = $self->{fwd_num_threads}; 438 | $blast_params{word_size} = $self->{fwd_word_size}; 439 | $blast_params{evalue} = $self->{fwd_evalue}; 440 | $blast_params{penalty} = $self->{fwd_penalty}; 441 | $blast_params{reward} = $self->{fwd_reward}; 442 | $blast_params{gapopen} = $self->{fwd_gapopen}; 443 | $blast_params{gapextend} = $self->{fwd_gapextend}; 444 | $blast_params{dust} = $self->{fwd_dust}; 445 | $blast_params{softmasking} = $self->{fwd_softmasking}; 446 | $blast_params{seg} = $self->{fwd_seg}; 447 | my $blast_obj = BLAST->new(\%blast_params); 448 | #$devtools->print_hash(\%blast_params); die; 449 | #unless ($blast_obj) { die; } 450 | #$devtools->print_hash($self); die; 451 | 452 | # Get relevant member variables and objects 453 | my $tmp_path = $self->{tmp_path}; 454 | my $min_length = $self->{seq_length_minimum}; 455 | my $min_score = $self->{bitscore_minimum}; 456 | my $db_ref = $self->{db}; 457 | #print "\n\tMinimum score $min_score"; die; 458 | 459 | # Sanity checking 460 | unless ($min_length) { die; } 461 | unless ($min_score) { die; } 462 | unless ($tmp_path) { $devtools->print_hash($self); die; } 463 | 464 | # Get query details 465 | my $probe_id = $query_ref->{probe_id}; 466 | my $blast_alg = $query_ref->{blast_alg}; 467 | my $probe_name = $query_ref->{probe_name}; 468 | my $probe_gene = $query_ref->{probe_gene}; 469 | my $probe_type = $query_ref->{probe_type}; 470 | my $probe_path = $query_ref->{probe_path}; 471 | my $organism = $query_ref->{organism}; 472 | my $version = $query_ref->{target_version}; 473 | my $datatype = $query_ref->{target_datatype}; 474 | my $target_name = $query_ref->{target_name}; 475 | my $target_path = $query_ref->{target_path}; 476 | my $result_file = $tmp_path . "/$probe_id" . "_$target_name.blast_result.tmp"; 477 | unless ($probe_id and $blast_alg) { die; } 478 | 479 | # Do forward BLAST search (probe versus target genome) 480 | my $completed = $self->{current_query_num}; 481 | print "\n\n\t # $blast_alg: $completed: '$organism' ($version, $datatype)"; 482 | print "\n\t # target: '$target_name'"; 483 | print "\n\t # probe: '$probe_id'"; 484 | my $success = $blast_obj->blast($blast_alg, $target_path, $probe_path, $result_file, \%blast_params); 485 | 486 | if ($success) { 487 | 488 | # Extract the results from tabular format BLAST output 489 | my @hits; 490 | $blast_obj->parse_tab_format_results($result_file, \@hits); 491 | my $rm_command = "rm $result_file"; 492 | system $rm_command; # Remove the result file 493 | 494 | # Summarise raw results of BLAST search 495 | my $num_hits = scalar @hits; 496 | if ($num_hits > 0) { 497 | print "\n\n\t\t# $num_hits matches to probe: $probe_name, $probe_gene"; 498 | } 499 | 500 | # Apply filters & store results 501 | my $num_retained_hits = 0; 502 | my $score_exclude_count = '0'; 503 | my $length_exclude_count = '0'; 504 | foreach my $hit_ref (@hits) { 505 | 506 | my $skip = undef; 507 | # Apply length cutoff 508 | if ($min_length) { # Skip sequences that are too short 509 | my $start = $hit_ref->{aln_start}; 510 | my $end = $hit_ref->{aln_stop}; 511 | if ($end - $start < $min_length) { 512 | $skip = 'true'; 513 | $length_exclude_count++; 514 | } 515 | } 516 | # Apply bitscore cutoff if one has been set 517 | if ($min_score) { 518 | # Skip hits below bitscore threshold 519 | my $query_score = $hit_ref->{bitscore}; 520 | if ($query_score < $min_score) { 521 | unless ($skip) { # Don't count as a bit_score exclusion if already exclude via length 522 | $skip = 'true'; 523 | $score_exclude_count++; 524 | if ($verbose) { 525 | print "\n\t\t# Excluding hit below bitscore threshold (threshold = $min_score, bitscore = $query_score)"; 526 | } 527 | } 528 | } 529 | } 530 | unless ($skip) { 531 | # Insert values into 'active_set' table 532 | $db_ref->insert_row_in_active_set_table($query_ref, $hit_ref); 533 | $num_retained_hits++; 534 | } 535 | } 536 | 537 | # Show summary of BLAST results after filtering 538 | if ($score_exclude_count or $length_exclude_count) { 539 | print "\n\t\t# $num_retained_hits matches above threshold "; 540 | print "(excluded: $length_exclude_count < min. length; $score_exclude_count < min. bitscore)"; 541 | } 542 | } 543 | } 544 | 545 | #*************************************************************************** 546 | # Subroutine: classify_hits 547 | # Description: classify a set of sequences using blast 548 | #*************************************************************************** 549 | sub classify_hits { 550 | 551 | my ($self, $extracted_ref, $query_ref) = @_; 552 | 553 | my $classifier = Classify->new($self); 554 | 555 | my $assigned_count = 0; 556 | my $crossmatch_count = 0; 557 | my $num_to_assign = scalar @$extracted_ref; 558 | unless ($query_ref) { die; } 559 | 560 | if ($num_to_assign) { 561 | print "\n\t\t# Assigning $num_to_assign hits by BLAST comparison to reference library"; 562 | } 563 | 564 | foreach my $locus_ref (@$extracted_ref) { # Iterate through the matches 565 | 566 | # Classify using BLAST 567 | my $success = $classifier->classify_sequence_using_blast($locus_ref); 568 | 569 | if ($success) { 570 | 571 | my $assigned = $locus_ref->{assigned_name}; 572 | $assigned_count++; 573 | 574 | # Get the unique key for this probe 575 | my $probe_name = $query_ref->{probe_name}; 576 | my $probe_gene = $query_ref->{probe_gene}; 577 | my $probe_key = $probe_name . '_' . $probe_gene; 578 | 579 | # Record cross-matching 580 | if ($probe_key ne $assigned) { 581 | $crossmatch_count++; 582 | my $crossmatch_obj = $self->{crossmatch_obj}; 583 | $crossmatch_obj->update_cross_matching($probe_key, $assigned); 584 | } 585 | } 586 | } 587 | 588 | print "\n\t\t# $assigned_count of $num_to_assign extracted sequences successfully classified"; 589 | 590 | if ($self->{verbose}) { 591 | print "\n\t\t# $crossmatch_count cross-matched to something other than the probe"; 592 | } 593 | } 594 | 595 | #*************************************************************************** 596 | # Subroutine: created_combined_locus_set 597 | # Description: combine a new set of hits with existing hits for a given target file 598 | #*************************************************************************** 599 | sub created_combined_locus_set { 600 | 601 | my ($self, $query_ref, $combined_ref) = @_; 602 | 603 | # Get flags and objects 604 | my $verbose = $self->{verbose}; 605 | my $db_ref = $self->{db}; 606 | 607 | # Compose SQL WHERE statement to retrieve relevant set of loci 608 | my $target_name = $query_ref->{target_name}; 609 | my $organism = $query_ref->{organism}; 610 | my $probe_name = $query_ref->{probe_name}; 611 | my $probe_gene = $query_ref->{probe_gene}; 612 | my $probe_type = $query_ref->{probe_type}; 613 | my $where = " WHERE organism = '$organism' "; 614 | $where .= " AND target_name = '$target_name' "; # Always limit by target 615 | $where .= " AND probe_type = '$probe_type' "; # WE MAY WANT EITHER non-coding OR coding sequence, BUT NEVER BOTH 616 | 617 | # Get the relevant set of DIGS results 618 | my @digs_results; 619 | $db_ref->get_sorted_digs_results(\@digs_results, $where); 620 | my $num_loci = scalar @digs_results; 621 | if ($verbose) { print "\n\t\t # $num_loci previously extracted $probe_type loci"; } 622 | 623 | # Add the digs results from the main results table to the BLAST hits in the active_set table 624 | # We want everything in one table so we can use MySQL to sort (Order) the results 625 | $db_ref->add_digs_results_to_active_set(\@digs_results); 626 | 627 | # Get sorted list of digs results and BLAST hits from active_set table 628 | $db_ref->get_sorted_active_set($combined_ref, $where); 629 | 630 | # Show output 631 | my $total_loci = scalar @$combined_ref; 632 | if ($verbose) { 633 | if ($total_loci > 0) { 634 | print "\n\t\t # $total_loci rows in active set (including $num_loci previously extracted) "; 635 | } 636 | } 637 | } 638 | 639 | #*************************************************************************** 640 | # Subroutine: create_nonredundant_hit_set 641 | # Description: combine most recent set of hits with previous 642 | # and derive a non-redundant set 643 | #*************************************************************************** 644 | sub create_nonredundant_hit_set { 645 | 646 | my ($self, $combined_ref, $to_extract_ref, $to_delete_ref) = @_; 647 | 648 | # Create the defragment object 649 | my $defragment_obj = Defragment->new($self); 650 | 651 | # Compose clusters of overlapping/adjacent loci 652 | my %clusters; 653 | $defragment_obj->compose_clusters(\%clusters, $combined_ref); 654 | # DEBUG $devtools->print_hash(\%clusters); die; 655 | 656 | # Derive a non-redundant locus set (i.e. one merged locus for each cluster) 657 | my %merged; 658 | my %singletons; 659 | $defragment_obj->merge_clustered_loci(\%clusters, \%merged, \%singletons, $to_delete_ref); 660 | 661 | # Work out what we need to extract and classify 662 | $defragment_obj->get_loci_to_extract(\%merged, \%singletons, $to_extract_ref); 663 | 664 | } 665 | 666 | ############################################################################ 667 | # CONSOLE OUTPUT 668 | ############################################################################ 669 | 670 | #*************************************************************************** 671 | # Subroutine: show_title 672 | # Description: show command line title blurb 673 | #*************************************************************************** 674 | sub show_title { 675 | 676 | my ($self) = @_; 677 | 678 | my $version_num = $self->{program_version}; 679 | unless ($version_num) { 680 | $version_num = 'version undefined (use with caution)'; 681 | } 682 | 683 | #$console->refresh(); 684 | my $title = "DIGS (version: $version_num)"; 685 | my $description = 'Database-Integrated Genome Screening'; 686 | my $author = 'Robert J. Gifford'; 687 | my $contact = ''; 688 | $console->show_about_box($title, $version_num, $description, $author, $contact); 689 | } 690 | 691 | #*************************************************************************** 692 | # Subroutine: show_help_page 693 | # Description: show help page information 694 | #*************************************************************************** 695 | sub show_help_page { 696 | 697 | my ($self) = @_; 698 | 699 | # Create help menu 700 | $console->refresh(); 701 | my $program_version = $self->{program_version}; 702 | 703 | my $HELP = "\n\n\t ### DIGS version $program_version"; 704 | $HELP .= "\n\t ### usage: $0 m=[option] -i=[control file] -h=[help]\n"; 705 | 706 | $HELP .= "\n\t ### Main functions\n"; 707 | $HELP .= "\n\t -m=1 Prepare target files (index files for BLAST)"; 708 | $HELP .= "\n\t -m=2 Do DIGS"; 709 | $HELP .= "\n\t -m=3 Reassign loci"; 710 | $HELP .= "\n\t -m=4 Defragment loci"; 711 | $HELP .= "\n\t -m=5 Consolidate loci"; 712 | 713 | $HELP .= "\n\n\t ### Summarising target databases\n"; 714 | $HELP .= "\n\t -g=1 Summarise targets (brief summary, by species)"; 715 | $HELP .= "\n\t -g=2 Summarise targets (long, by individual target file)\n"; 716 | 717 | $HELP .= "\n\t ### Managing DIGS screening DBs\n"; 718 | $HELP .= "\n\t -d=1 Import tab-delimited data"; 719 | $HELP .= "\n\t -d=2 Flush core tables"; 720 | $HELP .= "\n\t -d=3 Drop tables"; 721 | $HELP .= "\n\t -d=4 Drop a screening DB"; 722 | $HELP .= "\n\t -d=5 Append data to 'digs_results' table"; 723 | $HELP .= "\n\t -d=6 Extract sequences using tabular file"; 724 | 725 | $HELP .= "\n\n\t Target path variable '\$DIGS_GENOMES' is set to '$ENV{DIGS_GENOMES}'"; 726 | $HELP .= "\n\n"; 727 | 728 | print $HELP; 729 | } 730 | 731 | #*************************************************************************** 732 | # Subroutine: show_digs_progress 733 | # Description: show progress in DIGS screening 734 | #*************************************************************************** 735 | sub show_digs_progress { 736 | 737 | my ($self) = @_; 738 | 739 | # Get the counts 740 | my $total_queries = $self->{total_queries}; 741 | my $completed = $self->{current_query_num}; 742 | unless ($completed and $total_queries) { die; } # Sanity checking 743 | 744 | # Calculate percentage progress 745 | my $percent_prog = ($completed / $total_queries) * 100; 746 | my $f_percent_prog = sprintf("%.2f", $percent_prog); 747 | #print "\n\t\t "; 748 | print "\n\n\t### done $completed of $total_queries queries (%$f_percent_prog)"; 749 | } 750 | 751 | #*************************************************************************** 752 | # Subroutine: wrap_up 753 | # Description: clean-up functions etc prior to exiting program 754 | #*************************************************************************** 755 | sub wrap_up { 756 | 757 | my ($self, $option) = @_; 758 | 759 | # Remove the output directory 760 | my $output_dir = $self->{report_dir}; 761 | if ($output_dir) { 762 | my $command1 = "rm -rf $output_dir"; 763 | system $command1; 764 | } 765 | 766 | # Show cross matching at end if verbose output setting is on 767 | my $crossmatch_obj = $self->{crossmatch_obj}; 768 | my $verbose = $self->{verbose}; 769 | if ($verbose and $option eq 2 770 | or $verbose and $option eq 3) { 771 | $crossmatch_obj->show_cross_matching(); 772 | } 773 | 774 | # Print finished message 775 | print "\n\n\t ### Process completed ~ + ~ + ~"; 776 | 777 | } 778 | 779 | #*************************************************************************** 780 | # Subroutine: check_tmp_directory 781 | # Description: 782 | #*************************************************************************** 783 | sub check_tmp_directory { 784 | 785 | my ($self) = @_; 786 | 787 | my $tmp_path = $self->{tmp_path}; 788 | 789 | unless (-e $tmp_path && -d _) { 790 | print "\t\t The directory '$tmp_path' doesn't exist.\n"; 791 | print "\t\t Do you want to create it? (y/n): "; 792 | my $response = ; 793 | chomp $response; 794 | if (lc($response) eq 'y') { 795 | mkdir $tmp_path or die "Failed to create directory '$tmp_path': $!"; 796 | print "\t ### Directory '$tmp_path' created successfully.\n"; 797 | } else { 798 | print "\t ### Directory creation aborted.\n"; 799 | } 800 | } else { 801 | 802 | my $total_size = 0; 803 | opendir(my $dh, $tmp_path) or die "Unable to open directory '$tmp_path': $!"; 804 | while (my $file = readdir($dh)) { 805 | next if $file =~ /^\.\.?$/; # Skip "." and ".." entries 806 | my $file_path = "$tmp_path/$file"; 807 | $total_size += -s $file_path if -f $file_path; 808 | } 809 | closedir($dh); 810 | 811 | my $threshold = 100 * 1024 * 1024; # 100 megabytes in bytes 812 | if ($total_size > $threshold) { 813 | my $excess_size_mb = sprintf("%.2f", $total_size / (1024 * 1024)); 814 | print "\t ### WARNING: Directory '$tmp_path' size is $excess_size_mb megabytes, which exceeds the threshold of 100 megabytes. Please consider cleaning it up.\n"; 815 | } 816 | } 817 | } 818 | 819 | ############################################################################ 820 | # EOF 821 | ############################################################################ 822 | --------------------------------------------------------------------------------