├── README ├── bin ├── orthomclLoadBlast.sql ├── orthomclMclToGroups ├── orthomclAdjustFasta ├── orthomclInstallSchema.sql ├── orthomclLoadBlast ├── orthomclFilterFasta ├── orthomclDumpPairsFiles ├── orthomclInstallSchema ├── orthomclBlastParser └── orthomclPairs ├── doc └── OrthoMCLEngine │ └── Main │ ├── oracleConfigurationGuide.txt │ ├── orthomcl.config.template │ ├── mysql.cnf │ ├── mysqlConfigurationGuide.txt │ ├── mysqlInstallGuide.txt │ └── UserGuide.txt ├── config ├── .build.info └── gus.config ├── SoftwareLicense.txt └── lib └── perl └── OrthoMCLEngine └── Main └── Base.pm /README: -------------------------------------------------------------------------------- 1 | This is the OrthoMCL software. See http://www.orthomcl.org for the official version. 2 | -------------------------------------------------------------------------------- /bin/orthomclLoadBlast.sql: -------------------------------------------------------------------------------- 1 | USE orthomcl; 2 | 3 | LOAD DATA 4 | LOCAL INFILE "~/orthoPackage/data/orthomcl_all_3.lst" 5 | REPLACE INTO TABLE orthomcl.SimilarSequences 6 | FIELDS TERMINATED BY '\t' 7 | -------------------------------------------------------------------------------- /doc/OrthoMCLEngine/Main/oracleConfigurationGuide.txt: -------------------------------------------------------------------------------- 1 | Whatever tablespace is used for the orthomcl data may need to be very large. 2 | 3 | A good estimate is 5x size of the file produced by the orthomclBlastParser program. 4 | 5 | If the DBA desires indexes to be in a separate tablespace, use the oracleIndexTblSpc property in the orthomcl.config file 6 | -------------------------------------------------------------------------------- /doc/OrthoMCLEngine/Main/orthomcl.config.template: -------------------------------------------------------------------------------- 1 | # this config assumes a mysql database named 'orthomcl'. adjust according 2 | # to your situation. 
3 | dbVendor=mysql 4 | dbConnectString=dbi:mysql:orthomcl:3307 5 | dbLogin= 6 | dbPassword= 7 | similarSequencesTable=SimilarSequences 8 | orthologTable=Ortholog 9 | inParalogTable=InParalog 10 | coOrthologTable=CoOrtholog 11 | interTaxonMatchView=InterTaxonMatch 12 | percentMatchCutoff=50 13 | evalueExponentCutoff=-5 14 | oracleIndexTblSpc=NONE -------------------------------------------------------------------------------- /config/.build.info: -------------------------------------------------------------------------------- 1 | #Build Information 2 | #Wed Jan 27 11:41:13 EST 2010 3 | OrthoMCLEngine.svn.status= 4 | \!Last.build=OrthoMCLEngine @ 2010/01/27 11\:41\:13 5 | \!Last.build.component=OrthoMCLEngine 6 | OrthoMCLEngine.svn.info=URL\: https\://www.cbil.upenn.edu/svn/apidb/OrthoMCLEngine/trunk\nRevision\: 33071\nLast Changed Rev\: 32454\nLast Changed Date\: 2009-12-04 13\:29\:53 -0500 (Fri, 04 Dec 2009) 7 | \!Last.build.initialTarget=install 8 | \!Last.build.timestamp=2010/01/27 11\:41\:13 9 | OrthoMCLEngine.Main.buildtime=2010/01/27 11\:41\:13 10 | -------------------------------------------------------------------------------- /SoftwareLicense.txt: -------------------------------------------------------------------------------- 1 | The OrthoMCL Software is Copyright 2010 by the EuPathDB Bioinformatics Resource Center. 2 | 3 | This program is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 7 | 8 | This program is distributed in the hope that it will be useful, 9 | but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | GNU General Public License for more details. 12 | 13 | You should have received a copy of the GNU General Public License 14 | along with this program. If not, see <http://www.gnu.org/licenses/>. 
#!/usr/bin/perl

# orthomclMclToGroups
#
# Reads mcl output (label mode) on standard input and writes an OrthoMCL
# groups file on standard output: every input line becomes one group, with a
# generated group id ("<prefix><number>: ") prepended and tabs turned into
# single spaces.
#
# Arguments:
#   prefix           text placed in front of every generated id, e.g. OG2_
#   starting_id_num  number used for the first group; incremented per line

use strict;
use warnings;

my ($prefix, $startId) = @ARGV;

# Both arguments are required. The starting id must be a whole number because
# it is incremented once per group; an unanchored match would also accept
# values such as "abc1".
usage() unless (defined($prefix) && length($prefix)
                && defined($startId) && $startId =~ /^\d+$/);

# Read STDIN explicitly: the bare <> operator would try to open the two
# command line arguments as input files.
while (my $line = <STDIN>) {
    $line =~ s/\t/ /g;
    print "$prefix$startId: $line";
    $startId++;
}

# Print the help text and exit with a non-zero status.
sub usage {
    print "
orthomclMclToGroups prefix starting_id_num

create an orthomcl groups file from an mcl output file. just generate a group ID for each group, and prepend it to that group's line.

where:
  prefix           a prefix to use when generating group ids.  For example OG2_
  starting_id_num  a number to start the id generating with.  For example 1000

std input:  mcl output file (label mode)
std output: orthomcl groups file

an orthomcl group file has one line per group and looks like this:

OG2_1009: osa|ENS1222992 pfa|PF11_0844


";
    exit(1);
}
Your hard disk may fill up! 31 | myisam_max_sort_file_size=200G 32 | 33 | #[OPTIMIZATION] 34 | #Our default of 2G is probably fine for this value. Change this value only if you are using a machine with little resources available. 35 | read_buffer_size=2G 36 | -------------------------------------------------------------------------------- /doc/OrthoMCLEngine/Main/mysqlConfigurationGuide.txt: -------------------------------------------------------------------------------- 1 | There are three key configuration properties that must be set so MySQL can handle the size of data you are using. 2 | 3 | They are: 4 | myisam_max_sort_file_size 5 | myisam_sort_buffer_size 6 | read_buffer_size 7 | 8 | * If you are using an existing MySQL database, login to it and run these commands to see the current values of these properties: 9 | 10 | mysql> show variables LIKE 'myisam_max_sort_file_size'; 11 | 12 | mysql> show variables LIKE 'myisam_sort_buffer_size'; 13 | 14 | mysql> show variables LIKE 'read_buffer_size'; 15 | 16 | 17 | NOTE: These values will display in bytes, i.e. a value of 18 | 2147479552 for read_buffer_size is equivalent to 2 Gigabytes. 19 | 20 | If these values are less than what you will need, as described below, contact your mysql administrator (provide this file for reference). 21 | 22 | 23 | * If you are installing your own mysql, edit these properties in the mysql.cnf file. 24 | 25 | 26 | * Suggested settings 27 | 28 | a) set myisam_sort_buffer_size= to 50% of available ram 29 | - to find out how much ram you have: 30 | dmesg | grep Memory 31 | - it gives you a report in kilobytes. divide by 1000000 to get a number of Gs. 32 | 33 | b) set myisam_max_sort_file_size= to 5 x the size of the file made by orthomclBlastParser. 34 | - (revisit this after you have run orthomclBlastParser) 35 | 36 | c) set read_buffer_size= to ??? 
package OrthoMCLEngine::Main::Base;

# Shared base object for the OrthoMCL command line programs: it parses the
# key=value configuration file and hands out a lazily created DBI handle.

use strict;
use warnings;
use DBI;

# Constructor.
#   $configFile  path of the key=value configuration file (required)
#   $loghandle   optional open file handle; when given, every property read
#                from the config file is echoed to it (password masked)
# Dies if the config file cannot be read or contains an illegal line.
sub new {
    my ($class, $configFile, $loghandle) = @_;

    my $self = bless({}, $class);
    $self->parseConfigFile($configFile, $loghandle);
    return $self;
}

# Read $configFile into $self->{config}.
# Blank lines and lines starting with '#' are skipped. Every other line must
# look like key=value, where key consists of word characters. The value may be
# empty (the shipped template contains "dbLogin=" and "dbPassword=").
sub parseConfigFile {
    my ($self, $configFile, $loghandle) = @_;

    die "No config file given\n" unless defined($configFile);
    open(my $fh, '<', $configFile)
        or die "Can't open config file '$configFile'\n";

    $self->{configFile} = $configFile;
    $self->{config} ||= {};

    while (my $line = <$fh>) {
        chomp($line);
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;              # also removes a trailing \r
        next if $line eq '';
        next if $line =~ /^\#/;
        $line =~ /^(\w+)\=(.*)/
            or die "illegal line in config file '$line'\n";
        my ($key, $val) = ($1, $2);
        $self->{config}->{$key} = $val;
        if ($loghandle) {
            # never write the real password to the log
            my $shown = ($key eq 'dbPassword') ? '********' : $val;
            print $loghandle scalar(localtime()) . " configuration: $key=$shown\n";
        }
    }
    close($fh);
}

# Return the value of property $prop.
# Dies when the property is not present in the config file. Values that are
# false in Perl ("0", "") are legal and are returned as-is.
sub getConfig {
    my ($self, $prop) = @_;
    die "can't find property $prop in config file"
        unless exists($self->{config}->{$prop});
    return $self->{config}->{$prop};
}

# Return the database handle, connecting on first use.
# The driver is chosen by the dbVendor property ('oracle' or 'mysql'); any
# other value is a fatal error. Dies with the DBI error text if the
# connection cannot be made.
sub getDbh {
    my ($self) = @_;

    if (!$self->{dbh}) {
        my $dbVendor = $self->getConfig("dbVendor");
        if ($dbVendor eq 'oracle') {
            require DBD::Oracle;
        }
        elsif ($dbVendor eq 'mysql') {
            require DBD::mysql;
        }
        else {
            die "config file '$self->{configFile}' has invalid value '$dbVendor' for dbVendor property\n";
        }

        $self->{dbh} = DBI->connect($self->getConfig("dbConnectString"),
                                    $self->getConfig("dbLogin"),
                                    $self->getConfig("dbPassword"))
            or die $DBI::errstr;
    }
    return $self->{dbh};
}

1;
#!/usr/bin/perl

# orthomclAdjustFasta
#
# Rewrites the definition lines of a fasta file into the form
# ">taxoncode|protein_id" and writes the result to <taxoncode>.fasta in the
# current directory. Sequence lines are copied unchanged.

use strict;
use warnings;

usage() unless scalar(@ARGV) == 3;

my ($taxoncode, $inputfile, $idField) = @ARGV;

# id_field is a 1-based position. Zero or a non-number would silently select
# the wrong field (index -1 is the last one), so reject it up front.
die "id_field must be a positive whole number, but got '$idField'\n"
    unless $idField =~ /^[1-9]\d*$/;

my $outputfile = "$taxoncode.fasta";

open(my $in, '<', $inputfile)
    or die "Can't open input file '$inputfile'\n";
open(my $out, '>', $outputfile)
    or die "Can't open output file '$outputfile'\n";

while (my $line = <$in>) {
    if ($line =~ /^>/) {
        # normalise the definition line, then split it into fields on
        # whitespace or '|'
        $line =~ s/^>\s*//;
        $line =~ s/\s+/ /g;
        $line =~ s/\s*\|\s*/|/g;
        my @fields = split(/[\s|]/, $line);

        my $id = $fields[$idField - 1];
        die "No id found in field $idField of definition line $. of '$inputfile'\n"
            unless defined($id) && length($id);

        print $out ">$taxoncode|$id\n";
    }
    else {
        print $out $line;
    }
}

close($in);
# buffered write errors (e.g. disk full) only surface at close
close($out) or die "Error writing output file '$outputfile': $!\n";


# Print the help text and exit with a non-zero status.
sub usage {
    print "
Create an OrthoMCL compliant .fasta file, by adjusting definition lines.

Usage:
  orthomclAdjustFasta taxon_code fasta_file id_field

where:
  taxon_code:  a three or four letter unique abbreviation for the taxon
  fasta_file:  the input fasta file
  id_field:    a number indicating what field in the definition line contains
               the protein ID.  Fields are separated by either ' ' or '|'. Any
               spaces immediately following the '>' are ignored.  The first
               field is 1. For example, in the following definition line, the
               ID (AP_000668.1) is in field 4:  >gi|89106888|ref|AP_000668.1|

Input file requirements:
  (1) .fasta format
  (2) a unique id is provided for each sequence, and is in the field specified
      by id_field

Output file format:
  (1) .fasta format
  (2) definition line is of the form:
         >taxoncode|unique_protein_id

The output file is named taxoncode.fasta

Note: if your input files do not meet the requirements, you can do some simple perl or awk processing of them to create the required input files to this program, or the required output files.  This program is provided as a convenience, but OrthoMCL users are expected to have the scripting skills to provide OrthoMCL compliant .fasta files.

EXAMPLE: orthomclSoftware/bin/orthomclAdjustFasta hsa Homo_sapiens.NCBI36.53.pep.all.fa 1

";
    exit(1);
}
-- orthomclInstallSchema.sql
--
-- MySQL-only installer for the OrthoMCL schema in a database named
-- 'orthomcl'. Column types follow the tables created by the
-- bin/orthomclInstallSchema program, so both installers produce columns wide
-- enough for "taxon|protein_id" identifiers and a fractional e-value
-- mantissa.
--
-- Every comment starts with "-- " (dash dash space): MySQL does not treat
-- "--" as a comment unless it is followed by whitespace.

USE orthomcl;

-- Remove schema if it already exists

DROP TABLE IF EXISTS orthomcl.SimilarSequences;
DROP TABLE IF EXISTS orthomcl.InParalog;
DROP TABLE IF EXISTS orthomcl.Ortholog;
DROP TABLE IF EXISTS orthomcl.CoOrtholog;
DROP TABLE IF EXISTS orthomcl.BestInterTaxonScore;
DROP TABLE IF EXISTS orthomcl.BestQueryTaxonScore;
DROP TABLE IF EXISTS orthomcl.InterTaxonMatch;

-- Create schema

CREATE TABLE orthomcl.SimilarSequences (
 QUERY_ID                 VARCHAR(60),
 SUBJECT_ID               VARCHAR(60),
 QUERY_TAXON_ID           VARCHAR(40),
 SUBJECT_TAXON_ID         VARCHAR(40),
 EVALUE_MANT              FLOAT,
 EVALUE_EXP               INT,
 PERCENT_IDENTITY         FLOAT,
 PERCENT_MATCH            FLOAT
);

-- Index names are not schema-qualified in MySQL; only the table name is.
CREATE INDEX ss_qtaxexp_ix ON orthomcl.SimilarSequences(query_id, subject_taxon_id, evalue_exp, evalue_mant, query_taxon_id, subject_id);
CREATE INDEX ss_seqs_ix ON orthomcl.SimilarSequences(query_id, subject_id, evalue_exp, evalue_mant);


-- ---------------------------------------------------------

CREATE TABLE orthomcl.InParalog (
 SEQUENCE_ID_A           VARCHAR(60),
 SEQUENCE_ID_B           VARCHAR(60),
 TAXON_ID                VARCHAR(40),
 UNNORMALIZED_SCORE      DOUBLE,
 NORMALIZED_SCORE        DOUBLE
);


-- ---------------------------------------------------------

CREATE TABLE orthomcl.Ortholog (
 SEQUENCE_ID_A           VARCHAR(60),
 SEQUENCE_ID_B           VARCHAR(60),
 TAXON_ID_A              VARCHAR(40),
 TAXON_ID_B              VARCHAR(40),
 UNNORMALIZED_SCORE      DOUBLE,
 NORMALIZED_SCORE        DOUBLE
);

-- The table name is spelled exactly as created above: table names are
-- case sensitive on case-sensitive file systems.
CREATE INDEX ortholog_seq_a_ix ON orthomcl.Ortholog(sequence_id_a);
CREATE INDEX ortholog_seq_b_ix ON orthomcl.Ortholog(sequence_id_b);


-- ---------------------------------------------------------

CREATE TABLE orthomcl.CoOrtholog (
 SEQUENCE_ID_A           VARCHAR(60),
 SEQUENCE_ID_B           VARCHAR(60),
 TAXON_ID_A              VARCHAR(40),
 TAXON_ID_B              VARCHAR(40),
 UNNORMALIZED_SCORE      DOUBLE,
 NORMALIZED_SCORE        DOUBLE
);


-- Matches whose query and subject belong to different taxa.
CREATE OR REPLACE VIEW orthomcl.InterTaxonMatch
 AS SELECT ss.query_id, ss.subject_id, ss.subject_taxon_id,
 ss.evalue_mant, ss.evalue_exp
 FROM orthomcl.SimilarSequences ss
 WHERE ss.subject_taxon_id != ss.query_taxon_id;


-- exit;
jdbc:oracle:thin:@HOSTNAME:PORT:NAME_OF_DATABASE ) 17 | dbVendor=Oracle 18 | 19 | #dbiDsn=dbi:Oracle:plas550n 20 | #databaseLogin=sfischer 21 | #databasePassword=heathersofia 22 | #group=plasmoDB 23 | #project=plasmoDB:5.4 24 | #databaseLogin=apidb 25 | #databasePassword=po34weep 26 | 27 | dbiDsn=dbi:Oracle:trypdev 28 | jdbcDsn=jdbc:oracle:oci:@trypdev 29 | databaseLogin=sfischer 30 | databasePassword=heathersofia 31 | group=OrthoMCL 32 | project=OrthoMCL:2.0 33 | 34 | 35 | # Username, group, and project info from the relevant Core tables 36 | 37 | userName=dba 38 | 39 | tablespace=GUS 40 | 41 | ####### install.prop 42 | 43 | # Path to Perl Executable 44 | perl=/usr/bin/perl 45 | 46 | ####### GUS-PluginMgr.prop 47 | 48 | # Path to MD5 Executable 49 | md5sum=/usr/bin/md5sum 50 | 51 | ################################################################################ 52 | ### Warning: Do not change items below here unless you know what you're doing 53 | ################################################################################ 54 | 55 | gusSchemas=Core,App,RAD,DoTS,SRes,TESS,Prot,Study,PlasmoDB 56 | coreSchemaName=CORE 57 | 58 | sequenceStart=1 59 | 60 | ### Common delimited list of housekeeping columns. 
#!/usr/bin/perl

# orthomclLoadBlast
#
# Loads the tab-delimited similar-sequences file into the table named by the
# similarSequencesTable config property. MySQL is loaded with LOAD DATA LOCAL
# INFILE; Oracle is loaded by running the sqlldr command line program.

use DBI;
use FindBin;
use lib "$FindBin::Bin/../lib/perl";
use OrthoMCLEngine::Main::Base;
use strict;
use warnings;

usage() unless (@ARGV >= 2);
my ($configFile, $blastFile) = @ARGV;

my $base = OrthoMCLEngine::Main::Base->new($configFile);

# Connect before doing anything else so that a bad connect string, login or
# password is reported immediately, for both vendors.
$base->getDbh();

my $dbVendor = $base->getConfig("dbVendor");

if ($dbVendor eq 'mysql') {
    loadBlastMySQL($base, $blastFile);
}
elsif ($dbVendor eq 'oracle') {
    loadBlastOracle($base, $blastFile);
}
else {
    die "Config file '$configFile' contains invalid value '$dbVendor' for dbVendor\n";
}

# Load $blastFile into MySQL through the DBI connection.
#   $base       OrthoMCLEngine::Main::Base object (config + db handle)
#   $blastFile  path of the tab-delimited file to load
# Dies with the DBI error text if the statement fails.
sub loadBlastMySQL {
    my ($base, $blastFile) = @_;
    require DBD::mysql;
    my $dbh = $base->getDbh();
    my $sst = $base->getConfig("similarSequencesTable");

    # Let the driver quote the file name so that quotes or backslashes in the
    # path cannot break (or alter) the statement.
    my $quotedFile = $dbh->quote($blastFile);
    my $sql = "
LOAD DATA
LOCAL INFILE $quotedFile
REPLACE INTO TABLE $sst
FIELDS TERMINATED BY '\\t'
";
    my $stmt = $dbh->prepare($sql) or die $DBI::errstr;
    $stmt->execute() or die $DBI::errstr;
}


# Load $blastFile into Oracle by running sqlldr.
# Writes two temporary files into the current directory (a parameter file
# holding the login and a control file describing the columns), runs sqlldr
# and removes both files again.
#   $base       OrthoMCLEngine::Main::Base object (config)
#   $blastFile  path of the tab-delimited file to load
# Dies if a temporary file cannot be written or sqlldr reports a failure.
sub loadBlastOracle {
    my ($base, $blastFile) = @_;

    my $dbLogin = $base->getConfig("dbLogin");
    my $dbPassword = $base->getConfig("dbPassword");
    my $dbString = $base->getConfig("dbConnectString");
    my @database = split(/:/, $dbString);
    my $dbInstance = $database[2];      # dbi:Oracle:<instance>

    my $parFile = "orthomclPar.tmp";
    my $ctlFile = "orthomclCtl.tmp";

    # The parameter file contains the database password: create it readable
    # by the current user only.
    my $oldUmask = umask(0077);
    open(my $par, '>', $parFile) or die "Can't open '$parFile' for writing: $!\n";
    umask($oldUmask) if defined($oldUmask);
    print $par "userid=$dbLogin/$dbPassword\@$dbInstance\n";
    close($par) or die "Error writing '$parFile': $!\n";

    my $sst = $base->getConfig("similarSequencesTable");

    my $sqlHeader = "
LOAD DATA
INFILE '$blastFile'
INTO TABLE $sst
FIELDS TERMINATED BY \"\\t\" OPTIONALLY ENCLOSED BY '\"'
TRAILING NULLCOLS
(  query_id,
   subject_id,
   query_taxon_id,
   subject_taxon_id,
   evalue_mant,
   evalue_exp,
   percent_identity,
   percent_match
)
";

    my $ctl;
    if (!open($ctl, '>', $ctlFile)) {
        my $err = $!;
        unlink($parFile);               # do not leave the password behind
        die "Can't open '$ctlFile' for writing: $err\n";
    }
    print $ctl $sqlHeader;
    close($ctl);

    my $output = `sqlldr parfile=$parFile control=$ctlFile`;
    my $status = $?;
    my $runErr = $!;
    unlink($ctlFile, $parFile);

    die "Can't run sqlldr: $runErr\n" if $status == -1;
    my $exitCode = $status >> 8;
    # sqlldr on Unix: 0 success, 1 failure, 2 warning (e.g. rejected rows),
    # 3 fatal error
    if ($exitCode == 2) {
        warn "sqlldr finished with warnings; check the sqlldr log file\n";
    }
    elsif ($status != 0) {
        die "sqlldr failed (exit code $exitCode); check the sqlldr log file\n";
    }
}

# Print the help text and exit with a non-zero status.
sub usage {
    print "
Load Blast results into an Oracle or Mysql database.

usage: orthomclLoadBlast config_file similar_seqs_file

where:
  config_file :       see below
  similar_seqs_file : output from orthomclBlastParser

EXAMPLE: orthomclSoftware/bin/orthomclLoadBlast my_orthomcl_dir/orthomcl.config my_orthomcl_dir/similarSequences.txt

NOTE: the database login in the config file must have update/insert/truncate privileges on the tables specified in the config file.

Sample Config File:

dbVendor=oracle  (or mysql)
dbConnectString=dbi:Oracle:orthomcl
dbLogin=my_db_login
dbPassword=my_db_password
similarSequencesTable=SimilarSequences
";
    exit(1);
}
#!/usr/bin/perl

# orthomclFilterFasta
#
# Reads every fasta file in input_dir and splits the proteins into
# goodProteins.fasta and poorProteins.fasta (both written to the current
# directory). A protein is "poor" when it is shorter than min_length or when
# more than max_percent_stops percent of its characters are not letters.
# Files in which more than 10% of the proteins are poor are listed on
# standard output at the end.

use strict;
use warnings;

usage() unless scalar(@ARGV) == 3;

my ($inputDir, $minLength, $maxStopPercent) = @ARGV;

opendir(my $dirh, $inputDir) or die "Can't open input directory '$inputDir'\n";
# Drop dot entries here: readdir always returns "." and "..", so without the
# filter the emptiness check below could never fire.
my @files = grep { !/^\./ } readdir($dirh);
closedir($dirh);

die "Input directory $inputDir does not contain any files" unless scalar(@files);

my @rejectRates;    # [file name, percent poor] for files above 10%
open(my $good, '>', "goodProteins.fasta")
    or die "Can't open 'goodProteins.fasta' for writing: $!\n";
open(my $bad, '>', "poorProteins.fasta")
    or die "Can't open 'poorProteins.fasta' for writing: $!\n";

foreach my $file (@files) {
    open(my $in, '<', "$inputDir/$file") or die "Can't open input file '$file'\n";
    print STDERR "processing file $file\n";
    my $seqCount = 0;
    my $rejectSeqCount = 0;
    my $currentSeq = "";        # definition line + sequence lines so far
    my $currentLen = 0;         # number of sequence characters so far
    my $currentStopCnt = 0;     # number of non-letter sequence characters

    # process lines of one file
    while (my $line = <$in>) {
        chomp($line);
        if ($line =~ /^>/) {
            # a new definition line ends the previous record
            if ($currentSeq ne "") {
                $seqCount++;
                $rejectSeqCount += handleSeq($currentSeq, $currentLen, $currentStopCnt);
                $currentSeq = "";
                $currentLen = 0;
                $currentStopCnt = 0;
            }
        }
        else {
            $currentLen += length($line);
            # tr with /c and an empty replacement only counts the characters
            # outside A-Za-z; the line itself is left unchanged
            $currentStopCnt += ($line =~ tr/A-Za-z//c);
        }
        $currentSeq .= "$line\n";
    }
    close($in);

    # last record of the file (nothing to do for an empty file)
    if ($currentSeq ne "") {
        $seqCount++;
        $rejectSeqCount += handleSeq($currentSeq, $currentLen, $currentStopCnt);
    }

    # add file stats to reject count if it qualifies
    if ($rejectSeqCount) {
        my $pct = $rejectSeqCount / $seqCount * 100;
        push(@rejectRates, [$file, $pct]) if $pct > 10;
    }
}

close($good) or die "Error writing 'goodProteins.fasta': $!\n";
close($bad) or die "Error writing 'poorProteins.fasta': $!\n";

if (scalar(@rejectRates)) {
    print "\nProteomes with > 10% poor proteins:\n";
    my @sortedRR = sort { $b->[1] <=> $a->[1] } @rejectRates;
    foreach my $reject (@sortedRR) {
        my $intPct = int($reject->[1]);
        print "  $reject->[0]\t$intPct%\n";
    }
}

# Write one fasta record to the good or the poor file.
#   $seq      the complete record (definition line and sequence lines)
#   $len      number of sequence characters in the record
#   $stopCnt  number of those characters that are not letters
# Returns 1 if the record was written to the poor file, otherwise 0.
sub handleSeq {
    my ($seq, $len, $stopCnt) = @_;

    # a record without any sequence has no stop percentage; it is rejected
    # by the length test whenever min_length is at least 1
    my $stopPercent = $len ? ($stopCnt / $len) * 100 : 0;

    if ($len < $minLength || $stopPercent > $maxStopPercent) {
        print $bad $seq;
        return 1;
    }
    print $good $seq;
    return 0;
}

# Print the help text and exit with a non-zero status.
sub usage {
    print "
Create goodProteins.fasta containing all good proteins and poorProteins.fasta containing all rejects.  Input is a directory containing a set of compliant input .fasta files (as produced by orthomclAdjustFasta).

Usage:
  orthomclFilterFasta input_dir min_length max_percent_stops

where:
  input_dir:           a directory containing a set of .fasta files
  min_length:          minimum allowed length of proteins.  (suggested: 10)
  max_percent_stops:   maximum percent stop codons.  (suggested 20)

EXAMPLE: orthomclSoftware/bin/orthomclFilterFasta my_orthomcl_dir/compliantFasta 10 20

";
    exit(1);
}
#!/usr/bin/perl

# orthomclDumpPairsFiles
#
# Dumps the Ortholog, InParalog and CoOrtholog tables (names taken from the
# config file) into tab-delimited text files in the current directory:
#   pairs/orthologs.txt, pairs/inparalogs.txt, pairs/coorthologs.txt
#   mclInput   (union of the three tables)
# Every output line is "sequence_id_a <tab> sequence_id_b <tab> score", with
# the normalized score rounded to three decimals.

use DBI;
use FindBin;
use lib "$FindBin::Bin/../lib/perl";
use OrthoMCLEngine::Main::Base;
use strict;
use warnings;

my $configFile = $ARGV[0];

usage() unless $configFile;

my $base = OrthoMCLEngine::Main::Base->new($configFile);
my $dbh = $base->getDbh();

my $orthologTable = $base->getConfig("orthologTable");
my $inParalogTable = $base->getConfig("inParalogTable");
my $coOrthologTable = $base->getConfig("coOrthologTable");

my $dir = "pairs";

die "dir '$dir' already exists" if -e $dir;

mkdir($dir) or die "Can't create dir '$dir': $!\n";

printOrthologsFile($dbh, $orthologTable, "$dir/orthologs.txt");

printInparalogsFile($dbh, $inParalogTable, "$dir/inparalogs.txt");

printOrthologsFile($dbh, $coOrthologTable, "$dir/coorthologs.txt");

printMclAbcFile($dbh, $orthologTable, $inParalogTable, $coOrthologTable,
                "mclInput");


################# subroutines #########################

# Run $sql and write one line per row to $fileName.
# The last three columns of every row must be sequence id A, sequence id B
# and the score; any leading columns are only there for ordering.
# Dies if the statement fails or the file cannot be written.
sub writePairsFile {
    my ($dbh, $sql, $fileName) = @_;

    my $stmt = $dbh->prepare($sql) or die $DBI::errstr;
    $stmt->execute() or die $DBI::errstr;
    open(my $out, '>', $fileName) or die "Can't open '$fileName' for writing";
    while (my @row = $stmt->fetchrow_array()) {
        my ($idA, $idB, $score) = @row[-3, -2, -1];
        $score = int($score * 1000 + .5)/1000;      # round to 3 decimals
        print $out "$idA\t$idB\t$score\n";
    }
    close($out) or die "Error writing '$fileName': $!\n";
}

# Dump the in-paralog table $inparalogTable to $fileName, ordered by taxon
# and sequence ids.
sub printInparalogsFile {
    my ($dbh, $inparalogTable, $fileName) = @_;

    my $sql = "
select taxon_id, sequence_id_a, sequence_id_b, normalized_score
from $inparalogTable
order by taxon_id, sequence_id_a, sequence_id_b asc
";
    writePairsFile($dbh, $sql, $fileName);
}

# Dump an ortholog-shaped table (Ortholog or CoOrtholog) to $fileName,
# ordered by taxa and sequence ids.
sub printOrthologsFile {
    my ($dbh, $table, $fileName) = @_;

    my $sql = "
select taxon_id_a, taxon_id_b, sequence_id_a, sequence_id_b, normalized_score
from $table
order by taxon_id_a, taxon_id_b, sequence_id_a, sequence_id_b asc
";
    writePairsFile($dbh, $sql, $fileName);
}

# Write the union of the three tables to $fileName.
sub printMclAbcFile {
    my ($dbh, $orthologTable, $inParalogTable, $coOrthologTable, $fileName) = @_;

    my $sql = "
select sequence_id_a, sequence_id_b, normalized_score
from $inParalogTable
union
select sequence_id_a, sequence_id_b, normalized_score
from $orthologTable
union
select sequence_id_a, sequence_id_b, normalized_score
from $coOrthologTable
";
    writePairsFile($dbh, $sql, $fileName);
}

# Print the help text and exit with a non-zero status.
sub usage {
    print "
Dump files from the database produced by the orthomclPairs program.

usage: orthomclDumpPairsFiles config_file

where:
  config_file : see below (you can use the same file given to orthomclPairs)

Database Input:
  - InParalog, Ortholog, CoOrtholog tables - populated by orthomclPairs

Output files (written to the current directory):
  mclInput                  - file required by the mcl program
  pairs/                    - dir holding relationship files
    orthologs.txt           - ortholog relationships
    inparalogs.txt          - inparalog relationships
    coorthologs.txt         - coortholog relationships

The pairs/ files contain the pairs found by the orthomclPairs program, and their
average normalized scores.  This is the same information as in the
mclInput file, but segregated by relationship type.  These are
candidate relationships (edges) that will subsequently be grouped (clustered)
by the mcl program to form the OrthoMCL ortholog groups.  These files contain
more sensitive and less selective relationships than the final ortholog groups.

Standard Error:
  - logging info

EXAMPLE: orthomclSoftware/bin/orthomclDumpPairsFiles my_orthomcl_dir/orthomcl.config

Sample Config File:

dbVendor=oracle  (or mysql)
dbConnectString=dbi:Oracle:orthomcl
dbLogin=my_db_login
dbPassword=my_db_password
orthologTable=Ortholog
inParalogTable=InParalog
coOrthologTable=CoOrtholog
";
    exit(1);
}
#!/usr/bin/perl

# orthomclInstallSchema
#
# Creates the OrthoMCL tables, indexes and view in the database described by
# the config file. Table and view names are taken from the config file. When
# a second argument is given, every SQL statement is also written to that
# file before it is executed.

use DBI;
use FindBin;
use lib "$FindBin::Bin/../lib/perl";
use OrthoMCLEngine::Main::Base;
use strict;
use warnings;


usage() unless (@ARGV >= 1);
my $configFile = $ARGV[0];
my $sqlLog = $ARGV[1];

my $base = OrthoMCLEngine::Main::Base->new($configFile);
my $dbh = $base->getDbh();

my $logFh;
if ($sqlLog) {
    open($logFh, '>', $sqlLog)
        or die "Can't open sql log file '$sqlLog' for writing: $!\n";
}

# The vendor decides the integer column type and the Oracle-only clauses.
my $dbVendor = $base->getConfig("dbVendor");
my $isOracle = ($dbVendor eq 'oracle');
my $intType = $isOracle ? 'NUMBER' : 'INT';
my $oracleNoLogging = $isOracle ? " NOLOGGING" : "";

# oracleIndexTblSpc is optional; a missing property or the value NONE means
# "create indexes in the default tablespace".
my $oracleIndexTblSpc = eval { $base->getConfig("oracleIndexTblSpc") };
$oracleIndexTblSpc = 'NONE' unless defined($oracleIndexTblSpc);
$oracleIndexTblSpc =~ s/\s//g;
my $indexTblSpcClause =
    ($isOracle && $oracleIndexTblSpc ne '' && $oracleIndexTblSpc ne 'NONE')
    ? " TABLESPACE $oracleIndexTblSpc"
    : "";

createSimilarSequencesTable();
createInParalogTable();
createOrthologTable();
createCoOrthologTable();
createInterTaxonMatchView();

if ($logFh) {
    close($logFh) or die "Error writing sql log file '$sqlLog': $!\n";
}

##############################################################

# Create the similar sequences table and its two unique indexes.
sub createSimilarSequencesTable {
    my $sst = $base->getConfig("similarSequencesTable");

    my $sql = "
CREATE TABLE $sst (
 QUERY_ID                 VARCHAR(60),
 SUBJECT_ID               VARCHAR(60),
 QUERY_TAXON_ID           VARCHAR(40),
 SUBJECT_TAXON_ID         VARCHAR(40),
 EVALUE_MANT              FLOAT,
 EVALUE_EXP               $intType,
 PERCENT_IDENTITY         FLOAT,
 PERCENT_MATCH            FLOAT
) $oracleNoLogging
";
    runSql($sql);

    $sql = "
CREATE UNIQUE INDEX ss_qtaxexp_ix
ON $sst(query_id, subject_taxon_id,
evalue_exp, evalue_mant,
query_taxon_id, subject_id) $oracleNoLogging$indexTblSpcClause
";
    runSql($sql);

    $sql = "
CREATE UNIQUE INDEX ss_seqs_ix
ON $sst(query_id, subject_id,
evalue_exp, evalue_mant, percent_match) $oracleNoLogging$indexTblSpcClause
";
    runSql($sql);
}


# Create the in-paralog table.
sub createInParalogTable {
    my $ipt = $base->getConfig("inParalogTable");
    my $sql = "
CREATE TABLE $ipt (
 SEQUENCE_ID_A           VARCHAR(60),
 SEQUENCE_ID_B           VARCHAR(60),
 TAXON_ID                VARCHAR(40),
 UNNORMALIZED_SCORE      FLOAT,
 NORMALIZED_SCORE        FLOAT
)
";
    runSql($sql);
}


# Create the ortholog table and one index per sequence id column.
sub createOrthologTable {
    my $olt = $base->getConfig("orthologTable");
    my $sql = "
CREATE TABLE $olt (
 SEQUENCE_ID_A           VARCHAR(60),
 SEQUENCE_ID_B           VARCHAR(60),
 TAXON_ID_A              VARCHAR(40),
 TAXON_ID_B              VARCHAR(40),
 UNNORMALIZED_SCORE      FLOAT,
 NORMALIZED_SCORE        FLOAT
)
";
    runSql($sql);

    $sql = "
CREATE INDEX ortholog_seq_a_ix
ON $olt(sequence_id_a)$indexTblSpcClause
";
    runSql($sql);

    $sql = "
CREATE INDEX ortholog_seq_b_ix
ON $olt(sequence_id_b)$indexTblSpcClause
";
    runSql($sql);
}


# Create the co-ortholog table.
sub createCoOrthologTable {
    my $cot = $base->getConfig("coOrthologTable");
    my $sql = "
CREATE TABLE $cot (
 SEQUENCE_ID_A           VARCHAR(60),
 SEQUENCE_ID_B           VARCHAR(60),
 TAXON_ID_A              VARCHAR(40),
 TAXON_ID_B              VARCHAR(40),
 UNNORMALIZED_SCORE      FLOAT,
 NORMALIZED_SCORE        FLOAT
)
";
    runSql($sql);
}

# Create the view of similar-sequence rows whose query and subject belong to
# different taxa.
sub createInterTaxonMatchView {
    my $sst = $base->getConfig("similarSequencesTable");
    my $itv = $base->getConfig("interTaxonMatchView");
    my $sql = "
CREATE OR REPLACE VIEW $itv
 AS SELECT ss.query_id, ss.subject_id, ss.subject_taxon_id,
 ss.evalue_mant, ss.evalue_exp
 FROM $sst ss
 WHERE ss.subject_taxon_id != ss.query_taxon_id
";
    runSql($sql);
}

# Log (when a log file was requested) and execute one SQL statement.
# Dies with the DBI error text if the statement fails.
sub runSql {
    my $sql = $_[0];
    if ($sqlLog) {
        logSql($sql);
    }
    my $stmt = $dbh->prepare($sql) or die $DBI::errstr;
    $stmt->execute() or die $DBI::errstr;
}


# Append one SQL statement to the log file.
sub logSql {
    my $sql = $_[0];
    print $logFh "\n$sql";
}

# Print the help text and exit with a non-zero status.
sub usage {
    print "
Create OrthoMCL schema in an Oracle or Mysql database.

usage: orthomclInstallSchema config_file sql_log_file

where:
  config_file :  see below
  sql_log_file : optional log of sql executed

EXAMPLE: orthomclSoftware/bin/orthomclInstallSchema my_orthomcl_dir/orthomcl.config my_orthomcl_dir/install_schema.log

NOTE: the database login in the config file must have update/insert/truncate privileges on the tables specified in the config file.

Sample Config File:

dbVendor=oracle  (or mysql)
dbConnectString=dbi:Oracle:orthomcl
dbLogin=my_db_login
dbPassword=my_db_password
similarSequencesTable=SimilarSequences
orthologTable=Ortholog
inParalogTable=InParalog
coOrthologTable=CoOrtholog
interTaxonMatchView=InterTaxonMatch
oracleIndexTblSpc=NONE  (optional; oracle only)

";
    exit(1);
}
168 | 169 | Sample Config File: 170 | 171 | dbVendor=oracle (or mysql) 172 | dbConnectString=dbi:Oracle:orthomcl 173 | dbLogin=my_db_login 174 | dbPassword=my_db_password 175 | blastResultsTable=BlastResults 176 | orthologTable=Ortholog 177 | inParalogTable=InParalog 178 | coOrthologTable=CoOrtholog 179 | interTaxonMatchView=InterTaxonMatch 180 | 181 | "; 182 | exit(1); 183 | } 184 | 185 | -------------------------------------------------------------------------------- /bin/orthomclBlastParser: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | 5 | my $blastFile = shift(@ARGV); 6 | my $fastaFilesDir = shift(@ARGV); 7 | 8 | usage() unless $blastFile; 9 | 10 | usage() unless $fastaFilesDir; 11 | 12 | opendir(DIR, $fastaFilesDir) || die "Can't open fasta directory '$fastaFilesDir'\n"; 13 | my @fastaFiles = readdir(DIR); 14 | closedir(DIR); 15 | 16 | my $genes = getGenesFromFasta($fastaFilesDir, @fastaFiles); 17 | 18 | open(F,$blastFile) || die "can't open BLAST file '$blastFile'\n"; 19 | 20 | =pod 21 | query_name, hitname, 22 | $pcid, len, 23 | mismatches, ngaps, 24 | start('query'), end('query'), 25 | start('hit'), end('hit'), 26 | evalue, bits 27 | =cut 28 | 29 | my $prevSubjectId = 'blah'; 30 | my $subject; # hash to hold subject info 31 | my $queryShorter; 32 | 33 | while() { 34 | chomp; 35 | my ($queryId, $subjectId, $percentIdentity, $length, $mismatches, $ngaps, $queryStart, $queryEnd, $subjectStart, $subjectEnd, $evalue, $bits) = split; 36 | 37 | if ($subjectId ne $prevSubjectId) { 38 | 39 | # print previous subject 40 | printPreviousSubject($subject) if $subject; 41 | 42 | # initialize new one from first HSP 43 | $prevSubjectId = $subjectId; 44 | 45 | $subject = {}; 46 | $subject->{queryId} = $queryId; 47 | $subject->{subjectId} = $subjectId; 48 | $subject->{queryShorter} = getTaxonAndLength($subject, $genes); 49 | 50 | ($subject->{evalueMant}, $subject->{evalueExp}) 51 | = 
formatEvalue($evalue); # from first hsp 52 | } 53 | 54 | # get additional info from subsequent HSPs 55 | my $hspspan = [$subjectStart, $subjectEnd]; 56 | $hspspan = [$queryStart, $queryEnd] if $subject->{queryShorter}; 57 | push(@{$subject->{hspspans}}, $hspspan); 58 | $subject->{totalIdentities} += $percentIdentity * $length; 59 | $subject->{totalLength} += $length; 60 | } 61 | printPreviousSubject($subject); 62 | 63 | ######################################################################################## 64 | 65 | sub getGenesFromFasta { 66 | my $fastaFilesDir = shift(@_); 67 | my (@fastaFiles) = @_; 68 | 69 | my $genes; 70 | foreach my $fastaFile (@fastaFiles) { 71 | next if $fastaFile =~ /^\./; 72 | print STDERR "acquiring genes from $fastaFile\n"; 73 | $fastaFile =~ /(\w+).fasta/ || die "'$fastaFile' is not in 'taxon.fasta' format\n"; 74 | my $taxon = $1; 75 | open(FF,"$fastaFilesDir/$fastaFile") || die "can't open fasta file '$fastaFilesDir/$fastaFile'"; 76 | my $gene; 77 | my $length; 78 | while () { 79 | chomp; 80 | next if /^\s*$/; 81 | if (/\>(\S+)/) { 82 | $genes->{$gene}->{length} = $length if $gene; 83 | $genes->{$gene}->{taxon} = $taxon if $gene; 84 | $gene = $1; 85 | $length = 0; 86 | } else { 87 | $length += length($_); 88 | } 89 | } 90 | $genes->{$gene}->{length} = $length if $gene; 91 | $genes->{$gene}->{taxon} = $taxon if $gene; 92 | close(FF); 93 | } 94 | return $genes; 95 | } 96 | 97 | sub getTaxonAndLength { 98 | my ($subject, $genes) = @_; 99 | $subject->{queryTaxon} = $genes->{$subject->{queryId}}->{taxon}; 100 | $subject->{subjectTaxon} = $genes->{$subject->{subjectId}}->{taxon}; 101 | $subject->{queryLength} = $genes->{$subject->{queryId}}->{length}; 102 | $subject->{subjectLength} = $genes->{$subject->{subjectId}}->{length}; 103 | die "couldn't find taxon for gene '$subject->{subjectId}'" unless $subject->{subjectTaxon}; 104 | die "couldn't find taxon for gene '$subject->{queryId}'" unless $subject->{queryTaxon}; 105 | return 
$subject->{queryLength} < $subject->{subjectLength}; 106 | } 107 | 108 | sub printPreviousSubject { 109 | my ($subject) = @_; 110 | 111 | my $nonOverlapMatchLen = computeNonOverlappingMatchLength($subject); 112 | 113 | my $percentIdent = 114 | int($subject->{totalIdentities} / $subject->{totalLength} * 10 + .5)/10; 115 | my $shorterLength = $subject->{queryShorter}? $subject->{queryLength} : $subject->{subjectLength}; 116 | my $percentMatch = int($nonOverlapMatchLen / $shorterLength * 1000 + .5) / 10; 117 | print "$subject->{queryId}\t$subject->{subjectId}\t$subject->{queryTaxon}\t$subject->{subjectTaxon}\t$subject->{evalueMant}\t$subject->{evalueExp}\t$percentIdent\t$percentMatch\n"; 118 | } 119 | 120 | sub formatEvalue { 121 | my ($evalue) = @_; 122 | 123 | my ($evalue_mant, $evalue_exp); 124 | if ($evalue =~ /e/) { 125 | $evalue = '1' . $evalue if ($evalue =~ /^e/); 126 | ($evalue_mant, $evalue_exp) = split(/e\-/, $evalue); 127 | } else { 128 | $evalue_mant = int($evalue); 129 | $evalue_exp = 0; 130 | } 131 | return ($evalue_mant, -$evalue_exp); 132 | } 133 | 134 | sub computeNonOverlappingMatchLength { 135 | my ($subject) = @_; 136 | 137 | my @hsps = sort {$a->[0] <=> $b->[0]} @{$subject->{hspspans}}; 138 | my $first = shift @hsps; 139 | return 0 unless $first; 140 | my ($start, $end) = getStartEnd($first); 141 | my $len = 0; 142 | foreach my $h (@hsps){ 143 | my ($hspStart,$hspEnd) = getStartEnd($h); 144 | 145 | next if $hspEnd <= $end; ##does not extend 146 | if ($hspStart <= $end) { ##overlaps 147 | $end = $hspEnd; #extend end ... already dealt with if new end is less 148 | } else { ##there is a gap in between .. 149 | $len += $end - $start + 1; 150 | $start = $hspStart; 151 | $end = $hspEnd; 152 | } 153 | } 154 | $len += $end - $start + 1; # deal with the last one 155 | return $len 156 | } 157 | 158 | # flip orientation if nec. 
159 | sub getStartEnd { 160 | my ($h) = @_; 161 | my $hspStart = $h->[0]; 162 | my $hspEnd = $h->[1]; 163 | if ($hspStart > $hspEnd) { 164 | $hspEnd = $h->[0]; 165 | $hspStart = $h->[1]; 166 | } 167 | return($hspStart,$hspEnd); 168 | } 169 | 170 | sub usage { 171 | print " 172 | 173 | orthomclBlastParser blast_file fasta_files_dir 174 | 175 | where: 176 | blast_file: BLAST output in m8 format. 177 | fasta_files_dir: a directory of compliant fasta files as produced by 178 | orthomclAdjustFasta 179 | 180 | 181 | m8 format has these columns: 182 | query_name, hitname, pcid, len, mismatches, ngaps, start('query'), 183 | end('query'), start('hit'), end('hit'), evalue, bits 184 | 185 | output: 186 | tab delimited text file, with one row per query-subject match. the columns are: 187 | query_id, subject_id, query_taxon, subject_taxon, 188 | evalue_mant, evalue_exp, percent_ident, percent_match 189 | 190 | (percent_match is computed by counting the number of bases or amino acids in the shorter sequence that are matched in any hsp, and dividing by the length of that shorter sequence) 191 | 192 | EXAMPLE: orthomclSoftware/bin/orthomclBlastParser my_blast_results my_orthomcl_dir/compliantFasta >> my_orthomcl_dir/similarSequences.txt 193 | 194 | 195 | "; 196 | 197 | 198 | exit(1); 199 | } 200 | -------------------------------------------------------------------------------- /doc/OrthoMCLEngine/Main/mysqlInstallGuide.txt: -------------------------------------------------------------------------------- 1 | THIS FILE IS UNDER CONSTRUCTION. please mail stevef@pcbi.upenn.edu with questions.... 2 | 3 | This file is a guide to installing a mysql server by a regular user (not an administrator). 4 | It installs mysql in the user's space. 
5 | 6 | It is intended to serve users who: 7 | - do not already have a mysql available, and cannot get an admin to install one 8 | - do have one already installed, but will not be able to use it or reconfigure it 9 | 10 | 11 | ----INSTALLATION OVERVIEW--------------------------- 12 | 13 | I. General Requirements 14 | II. Installing from General Linux/Unix Binary Packages 15 | III. Creating a Database and User Account 16 | IV. Installing Required PERL modules for MySQL 17 | V. Optimizing MySQL 18 | VI. Troubleshooting Installation Issues 19 | VII. Removing Your MySQL Installation 20 | 21 | ---------------------------------------------------- 22 | ---------------------------------------------------- 23 | I. General Requirements 24 | 25 | - MySQL server 5.1 or greater 26 | - MySQL DBI and DBD driver modules for PERL (available at CPAN) 27 | - Unix/Linux or MacOS 10.0.1 or greater 28 | 29 | 30 | ---------------------------------------------------- 31 | ---------------------------------------------------- 32 | II. Installing MySQL From Linux/Unix Binary Packages (Root access not required) 33 | 34 | 1. Go to http://www.mysql.com/downloads/ 35 | a) click on "Download" under MySQL Community Server 36 | 37 | b) for linux: click on "Linux (non RPM packages)" 38 | 39 | c) otherwise, find the .tar package for your OS 40 | 41 | d) choose the download for your platform 42 | - use the 'uname -a' command if you don't know your platform 43 | - 'cat /proc/cpuinfo | grep model' will list your processor type (ie. 44 | AMD or Intel, 32 or 64 bit) if you are running Linux. 45 | 46 | e) either login/register or click "No thanks, just take me to the downloads!" 47 | 48 | f) choose a mirror near you, and download 49 | 50 | 51 | 2. Create a directory for your installation, and move the file to it. 52 | a) This directory needs to be on a volume that has *adequate space* for all the data. 53 | - Please see the UserGuide.txt to estimate how much disk space you will need. 
54 | 55 | b) (See step 6 below if you want to relocate the mysql data to a separate volume.) 56 | 57 | 58 | 3. Unzip the downloaded file into the directory: 59 | 60 | tar -xzvf mysql-standard-5.1.34-linux-i686-glibc23.tar.gz 61 | rm mysql-standard-5.1.34-linux-i686-glibc23.tar.gz (to save disk space) 62 | 63 | 4. Give the new directory a shortcut name 64 | 65 | ln -s mysql-standard-5.1.34-linux-i686-glibc23 mysql 66 | 67 | 68 | 5. Change to the mysql directory and configure mysql using the sample config file provided in orthomcl 69 | download: 70 | 71 | cd mysql 72 | cp my_orthomcl_dir/doc/OrthoMCLEngine/Main/mysql.cnf . 73 | 74 | a) set basedir= to the full path of your new mysql directory 75 | 76 | b) set datadir= to the full path of the /data subdir in your new mysql 77 | directory 78 | - this is the directory that will hold all the data. 79 | - use the df -h command to see how much space you have 80 | - you will need at least 5x the size of the file made by 81 | orthomclBlastParser 82 | - (revisit this after you have run orthomclBlastParser) 83 | 84 | c) now see the mysqlConfigurationGuide.txt for important optimization 85 | configuration items 86 | 87 | 6. Set up the default MySQL databases: 88 | 89 | 90 | ./scripts/mysql_install_db --defaults-file=mysql.cnf 91 | 92 | NOTE: The script will inform you that you need to set a root password. 93 | Don't worry about this for now; you will perform this task in another step. 94 | 95 | 96 | 7. You are now ready to start your MySQL server as a background process. To do so, from within your mysql directory, run: 97 | 98 | ./bin/mysqld_safe --defaults-file=mysql.cnf & 99 | 100 | NOTE: You *must* run this command from the mysql directory. 101 | You should see something similar to the following: 102 | 103 | [1] 67786% Starting mysqld daemon with databases from home/youraccountname/mysql/data 104 | 105 | 8. At this point your MySQL password is still blank. 
Use the following command to set a new 106 | root password: 107 | 108 | ./bin/mysqladmin --defaults-file=mysql.cnf -u root password "yourpasswordhere" 109 | 110 | NOTE: DO NOT FORGET THIS PASSWORD. write it down someplace that you won't forget. 111 | 112 | -------------------------------------------------- 113 | -------------------------------------------------- 114 | III. Create a New Database and User Account 115 | 116 | 1. Log in to your mysql server as root. If you are logging in to an existing MySQL server, 117 | use any existing account that can create a user and grant privileges: 118 | 119 | ./bin/mysql --defaults-file=mysql.cnf -u root -p 120 | 121 | Enter the root password you set in Step II.8 when prompted. 122 | 123 | 2. Once logged in as root, create the database and user (schema) that you will use for OrthoMCL 124 | (we use orthomcl as an example here), and grant the user account the necessary privileges: 125 | 126 | mysql> CREATE DATABASE orthomcl; 127 | 128 | mysql> GRANT SELECT,INSERT,UPDATE,DELETE,CREATE VIEW,CREATE, INDEX, DROP on orthomcl.* TO orthomcl@localhost; 129 | 130 | mysql> set password for orthomcl@localhost = password('yourpassword'); 131 | 132 | NOTE: DO NOT FORGET THIS PASSWORD. write it down someplace that you won't forget 133 | 134 | NOTE: if you want to play with the data in the database, you can get into it like this: 135 | ./bin/mysql --defaults-file=mysql.cnf -u orthomcl 136 | 137 | ------------------------------------------------- 138 | ------------------------------------------------- 139 | IV. Installing the Required PERL Modules for MySQL 140 | 141 | 1. Check to see if the DBI and DBD::mysql PERL modules 142 | are installed: 143 | 144 | $ perl -MDBI -e 1 145 | $ perl -MDBD::mysql -e 1 146 | 147 | - If you receive no output, then the module *is* installed and you 148 | can continue to section V. However, if you receive an error message for 149 | either, continue to step 2 and install the missing module(s). 
150 | 151 | - It is often the case that the DBI module is installed, but DBD::mysql 152 | is not. 153 | 154 | 155 | 2. If you have root access, the easiest way to install Perl modules on Unix/Linux 156 | is to perform a system-wide install using CPAN: 157 | 158 | $ perl -MCPAN -e shell 159 | cpan> o conf makepl_arg "mysql_config=/path_to_your_mysql_dir/bin/mysql_config" 160 | cpan> install Data::Dumper 161 | cpan> install DBI 162 | cpan> force install DBD::mysql 163 | 164 | 165 | 3. Installing modules as a standard user 166 | 167 | - Follow the steps below to install modules as a non-root user. We 168 | assume /myperl in your home directory is your custom perl directory. 169 | 170 | 1. In your home directory, create the PERL and CPAN directories, and 171 | a blank CPAN config module: 172 | 173 | $ mkdir myperl 174 | $ mkdir .cpan 175 | $ mkdir .cpan/CPAN 176 | $ echo "\$CPAN::Config = {}"> ~/.cpan/CPAN/MyConfig.pm 177 | 178 | 2. Configure your environment by adding the following to your 179 | .bash_profile file in your home directory: 180 | 181 | ###################################### 182 | if [ -z "$PERL5LIB" ] 183 | then 184 | # If PERL5LIB wasn't previously defined, set it... 185 | PERL5LIB=~/myperl/lib 186 | else 187 | # ...otherwise, extend it. 188 | PERL5LIB=$PERL5LIB:~/myperl/lib 189 | fi 190 | 191 | MANPATH=$MANPATH:~/myperl/man 192 | 193 | export PERL5LIB MANPATH 194 | ###################################### 195 | 196 | 3. Create the necessary directories and process your .bash_profile: 197 | 198 | $ mkdir -p ~/myperl/lib 199 | $ mkdir -p ~/myperl/man/man{1,3} 200 | $ source ~/.bash_profile 201 | 202 | 4. Confirm that your custom PERL5LIB paths have been set: 203 | 204 | $ perl -wle'print for grep /myperl/, @INC' 205 | 206 | - You should see pathing relative to your home directory. If not, 207 | repeat steps 1-3. 208 | 209 | 5. 
Invoke the CPAN shell and complete CPAN configuration: 210 | 211 | $ perl -MCPAN -we shell 212 | 213 | - CPAN will request that you set your config. Accepting the 214 | default (type install Data::Dumper 243 | cpan> install DBI 244 | cpan> force install DBD::mysql 245 | 246 | ------------------------------------------------- 247 | ------------------------------------------------- 248 | V. MySQL Server Optimization 249 | 250 | - please see the mysqlConfigurationGuide.txt document provided in the orthomcl download. 251 | 252 | 253 | ------------------------------------------------- 254 | ------------------------------------------------- 255 | VI. Troubleshooting Installation Issues 256 | 257 | The MySQL server logs all status and error messages in a file called 258 | yourhost.err, where yourhost is the name of your machine. The file is located 259 | in the mysql/data directory and contains useful information for debugging problems 260 | with your MySQL server. 261 | 262 | Below are some common installation issues and resolutions: 263 | 264 | (1) 265 | ISSUE: Your MySQL installation is conflicting with another install 266 | 267 | - You may be conflicting with an existing MySQL install if you see an 268 | error in your log similar to the following when running mysql_install_db, 269 | mysqladmin, or mysql: 270 | 271 | Installing MySQL system tables... 272 | 090515 13:32:49 [Warning] Can't create test file 273 | /var/lib/mysql/localhost.lower-test 274 | 090515 13:32:49 [Warning] Can't create test file 275 | /var/lib/mysql/localhost.lower-test 276 | ERROR: 1005 Can't create table 'db' (errno: 13) 277 | 090515 13:32:49 [ERROR] Aborting 278 | 279 | 280 | RESOLUTION: 281 | 282 | - A path is set incorrectly in your mysql.cnf if you see a reference to 283 | /var/lib in your error log. Check to ensure that you have correctly 284 | set the mysql_sock and port parameters in mysql.cnf. 
285 | 286 | - - to see if 3307 is in use, type this command: 287 | netstat -a | grep tcp | grep 3307 288 | - if so, set port=3308 (or 3309, etc., if 3308 is already used) 289 | 290 | (2) 291 | ISSUE: Unspecified or Misconfigured mysql.cnf file 292 | 293 | - If only the first numeric line appears (you do not see a "Starting 294 | mysqld daemon..." message) when you execute ./bin/mysqld_safe, you 295 | probably entered at least one incorrect path in your mysql.cnf file, or 296 | you did not specify --defaults-file=mysql.cnf when starting MySQL. 297 | 298 | 299 | RESOLUTION: 300 | 301 | - Check to ensure that: 302 | 303 | - You specified the --defaults-file=mysql.cnf when running mysql or 304 | a mysqladmin command. 305 | 306 | - Your mysql.cnf parameters correctly reflect your MySQL 307 | installation path. 308 | 309 | 310 | 311 | ------------------------------------------------- 312 | ------------------------------------------------- 313 | VII. Removing Your MySQL Installation 314 | 315 | - If you wish to remove your MySQL installation, this can be performed in 316 | two simple steps: 317 | 318 | (1) Shutdown your MySQL server: 319 | 320 | cd mysql 321 | ./bin/mysqladmin --defaults-file=mysql.cnf -u root -p shutdown 322 | 323 | 324 | (2) Change to the parent directory of your MySQL installation, and remove 325 | the mysql directory and symbolic link: 326 | 327 | 328 | rm -r -f mysql-standard-5.1.34-linux-i686-glibc23 329 | rm mysql 330 | 331 | 332 | NOTE: Always use caution when using the rm -r -f command! This 333 | command deletes an entire directory structure with no request 334 | for confirmation, so be sure the correct directory is specified. 
335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | -------------------------------------------------------------------------------- /doc/OrthoMCLEngine/Main/UserGuide.txt: -------------------------------------------------------------------------------- 1 | OrthoMCL User's Guide 2 | Version 2.0 3 | 4 | UNDER CONSTRUCTION: 5 | - mysql install documentation 6 | - mcl documentation 7 | 8 | =================================================== 9 | =========== Introduction ========================== 10 | =================================================== 11 | 12 | For details on the orthomcl algorithm, please read the OrthoMCL Algorithm Document available at: http://docs.google.com/Doc?id=dd996jxg_1gsqsp6 13 | 14 | The input to OrthoMCL is a set of proteomes. 15 | 16 | The output is a set of files: 17 | pairs/ 18 | potentialOrthologs.txt 19 | potentialCoorthologs.txt 20 | potentialInparalogs.txt 21 | groups.txt 22 | 23 | The files in the pairs/ directory contain pairwise relationships between proteins, and their scores. They are categorized into potential orthologs, co-orthologs and inparalogs as described in the OrthoMCL Algorithm Document (docs.google.com/Doc?id=dd996jxg_1gsqsp6). The groups.txt file contains the groups created by clustering the pairs with the MCL program. 24 | 25 | There are three overall stages: 26 | - all-v-all BLAST 27 | - the OrthoMCL Pairs program -- makes the pairs/ directory 28 | - the MCL program -- clusters the pairs to make the groups.txt file 29 | 30 | These stages are executed in a series of thirteen steps detailed below. Most simply involve running a provided program. They are broken into steps for ease of backtracking and recoverability. Most are *very simple* to run, so don't be discouraged. 
31 | 32 | 33 | =================================================== 34 | ========= Benchmark Dataset ====================== 35 | =================================================== 36 | 37 | In the documentation for Orthomcl we refer to a Benchmark Dataset. We tested this set extensively. It had: 38 | - 100 proteomes (across the tree of life) 39 | - 1M proteins 40 | - 500M significant similarities (BLAST hits) 41 | 42 | We base hardware requirements and time estimates on this benchmark dataset. The most significant predictor of resource/time requirements is the number of significant similarities. Even so, as this number changes, resource requirements will change non-linearly. 43 | 44 | 45 | =================================================== 46 | ========= Requirements ============================ 47 | =================================================== 48 | 49 | (1) UNIX 50 | - The OrthoMCL Pairs program has only been tested on UNIX 51 | - The MCL program is UNIX compatible only 52 | 53 | (2) BLAST 54 | - we recommend NCBI BLAST for two reasons 55 | a) theoretically: XXXXXXX 56 | b) practically: NCBI BLAST supports a tab delimited output which we provide parsers for. See Step 7 below. 57 | - for large datasets (e.g. 1M proteins) all-v-all BLAST will likely require a compute cluster. Otherwise, it could run for weeks or months. 58 | 59 | (3) Relational Database 60 | - The OrthoMCL Pairs program runs in a relational database. Supported vendors are: 61 | - Oracle 62 | - MySql 63 | - If you don't already have one of these installed, install MySql, which can be done for free and without significant systems administration support. (Follow the instructions we provide below.) 64 | 65 | We realize that it is a little inconvenient to require a relational database. However, using a relational database as the core technology for orthomclPairs provides speed, robustness and scalability that would have been very hard to achieve without it. 
66 | 67 | (4) Hardware 68 | - the hardware requirements vary dramatically with the size of your dataset. 69 | - for the Benchmark Dataset, we recommend: 70 | - memory: at least 4G 71 | - disk: 100G free space. 72 | - you can estimate your disk space needs more accurately when you have completed Step 8 below. You will need at least 5 times the size of the blast results file produced in that step. 90% of the disk space required is to hold that file, load it into the database and index it in the database. 73 | 74 | (5) Perl 75 | - standard perl 76 | - DBI libraries 77 | 78 | (6) MCL program 79 | - more details here... 80 | 81 | (7) Time 82 | - The Benchmark Dataset took: 83 | - 3 days to run all-v-all BLAST on a 500 cpu compute cluster. 84 | - 16 hours for the orthomclPairs processing to find pairs 85 | - 2 hours for MCL to find the groups 86 | 87 | 88 | 89 | =================================================== 90 | =========== Overview of steps ===================== 91 | =================================================== 92 | 93 | This is an overview of the thirteen steps to run orthomcl. Details are in the next sections. 94 | 95 | All programs except mysql and mcl are provided as part of the OrthoMCL download. The provided programs all begin with 'orthomcl' and will print help if called with no arguments. 96 | 97 | (1) install or get access to a supported relational database. If using MySql, certain configurations are required, so it may involve working with your MySql administrator or installing your own MySql. See the mysqlInstallGuide.txt document provided with the orthomcl software. 98 | 99 | (2) download and install the mcl program according to provided instructions. 100 | 101 | (3) install and configure the OrthoMCL suite of programs. 102 | 103 | (4) run orthomclInstallSchema to install the required schema into the database. 104 | 105 | (5) run orthomclAdjustFasta (or your own simple script) to generate protein fasta files in the required format. 
106 | 107 | (6) run orthomclFilterFasta to filter away poor quality proteins, and optionally remove alternative proteins. Creates a single large goodProteins.fasta file (and a poorProteins.fasta file) 108 | 109 | (7) run all-v-all NCBI BLAST on goodProteins.fasta (output format is tab delimited text). We do not provide documentation or support for this step. 110 | 111 | (8) run orthomclBlastParser on the NCBI BLAST tab output to create a file of similarities in the required format 112 | 113 | (9) run orthomclLoadBlast to load the output of orthomclBlastParser into the database. 114 | 115 | (10) run the orthomclPairs program to compute pairwise relationships. 116 | 117 | (11) run the orthomclDumpPairsFiles program to dump the pairs/ directory from the database 118 | 119 | (12) run the mcl program on the mcl_input.txt file created in Step 11. 120 | 121 | (13) run orthomclMclToGroups to convert mcl output to groups.txt 122 | 123 | We recommend you save the output of each step so that you can easily redo it if things go wrong. 124 | 125 | 126 | =================================================== 127 | ============ Steps in detail ====================== 128 | =================================================== 129 | 130 | ========== Step 1: Install and configure the relational database ============ 131 | If you are using Oracle, please see the included oracleConfigurationGuide.txt 132 | 133 | If you are using MySQL, please see the included mysqlConfigurationGuide.txt 134 | 135 | If you do not have either, please see the mysqlInstallGuide.txt to install your own mysql. 136 | 137 | 138 | ========== Step 2: install mcl ========== 139 | Get the latest software from http://www.micans.org/mcl/src/mcl-latest.tar.gz 140 | 141 | Follow the install instructions. 142 | 143 | MORE HERE SOON... 
144 | 145 | 146 | ========== Step 3: install and configure OrthoMCL programs ======== 147 | Input: 148 | - orthomclSoftware.tar 149 | Output: 150 | - directory of executable programs 151 | - home directory for your run of orthomcl 152 | - orthomcl.config file 153 | 154 | Use this command to unpack the software: 155 | tar -xf orthomclSoftware.tar 156 | 157 | The result will be this: 158 | orthomclSoftware/ 159 | bin/ 160 | ... 161 | doc/ 162 | UserGuide.txt 163 | orthomcl.config.template 164 | lib/ 165 | 166 | The bin/ directory has a set of programs. To run the programs you will need to either: 167 | a) include the orthomclSoftware/bin directory in your PATH 168 | b) call the programs using their full directory path 169 | 170 | Make a directory to hold the data and results for your run of orthomcl. In this document we will call that directory "my_orthomcl_dir". 171 | 172 | In the orthomclSoftware/doc/OrthoMCLEngine/Main directory is a file called orthomcl.config.template. Copy that file to my_orthomcl_dir/orthomcl.config, and edit the new file according to the following instructions. 173 | 174 | In the examples below, it is assumed that your MySql server has a database called 'orthomcl'. You can either create one (go into the server and run 'create database orthomcl') or use an existing database, and change the dbConnectString accordingly. 175 | 176 | dbVendor= 177 | - either 'oracle' or 'mysql' 178 | - used by orthomclInstallSchema, orthomclLoadBlast, orthomclPairs 179 | dbConnectString= 180 | - the string required by Perl DBI to find the database. 
181 | - examples are: 182 | dbi:Oracle:orthomcl (for an oracle database with service name 'orthomcl') 183 | dbi:mysql:orthomcl (for a centrally installed mysql server with a database called 'orthomcl') 184 | dbi:mysql:orthomcl:localhost:3307 (for a user installed mysql server on port 3307 with a database called 'orthomcl') 185 | - used by orthomclInstallSchema, orthomclLoadBlast, orthomclPairs, orthomclDumpPairsFiles 186 | dbLogin= 187 | - your database login name 188 | - used by orthomclInstallSchema, orthomclLoadBlast, orthomclPairs, orthomclDumpPairsFiles 189 | dbPassword= 190 | - your database password 191 | - used by orthomclInstallSchema, orthomclLoadBlast, orthomclPairs, orthomclDumpPairsFiles 192 | similarSequencesTable= 193 | - the name to give the table that will be loaded with blast results by orthomclLoadBlast. This is configurable for your flexibility. It doesn't matter what you call it. 194 | - used by orthomclInstallSchema, orthomclLoadBlast, orthomclPairs 195 | orthologTable= 196 | - the name of the table that will hold potential ortholog pairs. This is configurable so that you can run orthomclPairs multiple times, and compare results. 197 | - used by orthomclInstallSchema, orthomclPairs, orthomclDumpPairsFiles 198 | inParalogTable=InParalog 199 | - the name of the table that will hold potential inparalog pairs. This is configurable so that you can run orthomclPairs multiple times, and compare results. 200 | - used by orthomclInstallSchema, orthomclPairs, orthomclDumpPairsFiles 201 | coOrthologTable=CoOrtholog 202 | - the name of the table that will hold potential coortholog pairs. This is configurable so that you can run orthomclPairs multiple times, and compare results. 203 | - used by orthomclInstallSchema, orthomclPairs, orthomclDumpPairsFiles 204 | interTaxonMatchView=InterTaxonMatch 205 | percentMatchCutoff= 206 | - blast similarities with percent match less than this value are ignored. 
207 | - used by orthomclPairs 208 | evalueExponentCutoff= 209 | - blast similarities with evalue Exponents greater than this value are ignored. 210 | - used by orthomclPairs 211 | oracleIndexTblSpc= 212 | - optional table space to house all oracle indexes, if required by your oracle server. default is blank. 213 | 214 | 215 | ========== Step 4: orthomclInstallSchema ======== 216 | Input: 217 | - database 218 | Output: 219 | - database with schema installed 220 | 221 | Run the orthomclInstallSchema program to install the schema. (Run the program with no arguments to get help. This is true of all following orthomcl programs.) 222 | 223 | Benchmark time: < 1 min 224 | 225 | 226 | ========== Step 5: orthomclAdjustFasta ========== 227 | Input: 228 | - fasta files as acquired from the genome resource. 229 | Output: 230 | - the my_orthomcl_dir/compliantFasta/ directory of orthomcl-compliant fasta files (see Step 6) 231 | 232 | Use orthomclAdjustFasta to produce a compliant file from any input file that conforms to the following pattern (for other files, provide your own script to produce compliant fasta files): 233 | - has one or more fields that are separated by white space or the '|' character (optionally surrounded by white space) 234 | - has the unique ID in the same field of every protein. 235 | 236 | First, for any organism that has multiple protein fasta files, combine them all into one single proteome fasta file 237 | 238 | Then, create an empty my_orthomcl_dir/compliantFasta/ directory, and change to that directory. Run orthomclAdjustFasta once for each input proteome fasta file. It will produce a compliant file in the new directory. Check each file to ensure that the proteins all have proper IDs. 
239 | 240 | Benchmark time: < 1 min per genome 241 | 242 | 243 | ======== Step 6: orthomclFilterFasta =========== 244 | Input: 245 | - my_orthomcl_dir/compliantFasta/ 246 | - optionally a gene->protein mapping file 247 | Output: 248 | - my_orthomcl_dir/goodProteins.fasta 249 | - my_orthomcl_dir/poorProteins.fasta 250 | - report of suspicious proteomes (> 10% poor proteins) 251 | 252 | This step produces a single goodProteins.fasta file to run BLAST on. It filters away poor-quality sequences (placing them in poorProteins.fasta). The filter is based on length and percent stop codons. You can adjust these values. 253 | 254 | The input requirements are: 255 | 1) a compliantFasta/ directory which contains all and only the proteome .fasta files, one file per proteome. 256 | 1) each .fasta file must have a name in the form 'xxxx.fasta' where xxxx is a three or four letter unique taxon code. For example: hsa.fasta or eco.fasta 257 | 2) each protein in those files must have a definition line in the following format: 258 | >xxxx|yyyyyyyy 259 | where xxxx is the three or four letter taxon code and yyyyyyy is a sequence identifier unique within that taxon. 260 | 261 | Change dir to my_orthomcl_dir/ and run orthomclFilterFasta. 262 | 263 | Benchmark time: 5 min 264 | 265 | 266 | ========= Step 7: All-v-all BLAST ========= 267 | Input: 268 | - goodProteins.fasta 269 | Output: 270 | - your_blast_results_in_tab_format 271 | 272 | You must run your own BLAST. For large datasets you should consider gaining access to a compute cluster. 
273 | 274 | We expect you to: 275 | - use NCBI BLAST 276 | - run with the -m 8 option to provide tab delimited output required by Step 8 277 | - see the OrthoMCL Algorithm Document (docs.google.com/Doc?id=dd996jxg_1gsqsp6) for important details about other BLAST arguments 278 | 279 | If you are a power user you can deviate from this, so long as you can ultimately provide output in exactly the format provided by NCBI BLAST using the -m 8 option, and expected by Step 8. 280 | 281 | If you are a super-power user you can deviate from that, and also skip Step 8. But you must be able to provide the exact format file created by that step as expected by Step 9. The tricky part is computing percent match. 282 | 283 | Time estimate: highly dependent on your data and hardware 284 | 285 | ========= Step 8: orthomclBlastParser ======== 286 | Input: 287 | - your_blast_results_in_tab_format 288 | - my_orthomcl_dir/compliantFasta/ 289 | Output: 290 | - my_orthomcl_dir/similarSequences.txt 291 | 292 | This step parses NCBI BLAST -m 8 output into the format that can be loaded into the orthomcl database. 293 | 294 | Use the orthomclBlastParser program for this. In addition to formatting, it computes the percent match of each hit, which is tricky (see the perl code if you are a super-power user). 295 | 296 | orthomclBlastParser my_blast_results compliantFasta >> similarSequences.txt 297 | 298 | IMPORTANT NOTE: the size of this file determines the disk space required by the relational database. You will need 5x the size of this file. Please see oracleConfigurationGuide.txt or mysqlConfigurationGuide.txt (in doc/OrthoMCLEngine/Main/) now that you know the size of this file. 299 | 300 | Benchmark time: 10 min 301 | 302 | 303 | ========= Step 9: orthomclLoadBlast =========== 304 | Input: 305 | - similarSequences.txt 306 | Output: 307 | - SimilarSequences table in the database 308 | 309 | This step loads the BLAST results into the orthomcl database. 310 | 311 | Use the orthomclLoadBlast program for this.
312 | 313 | Benchmark time: 4 hours 314 | 315 | 316 | ========= Step 10: orthomclPairs ========= 317 | Input: 318 | - SimilarSequences table in the database 319 | Output: 320 | - PotentialOrthologs table 321 | - PotentialInParalogs table 322 | - PotentialCoOrthologs table 323 | 324 | This is a computationally major step that finds protein pairs. It executes the algorithm described in the OrthoMCL Algorithm Document (docs.google.com/Doc?id=dd996jxg_1gsqsp6), using a relational database. The program proceeds through a series of internal steps, each creating an intermediate database table or index. There are about 20 such tables created. Finally, it populates the output tables. 325 | 326 | The cleanup= option allows you to control the cleaning up of the intermediary tables. The 'yes' option drops the intermediary tables once they are no longer needed. The 'no' option keeps the intermediary tables in the database. In total, they are expected to be about 50 percent of the SimilarSequences table. They are useful mostly for power users or developers who would like to query them. They can be removed afterwards with the 'only' or 'all' options. The latter also removes the final tables, and should only be done after Step 11 below has dumped them to files. 327 | 328 | The startAfter= option allows you to pick up where the program left off, if it stops for any reason. Look in the log to find the last completed step, and use its tag as the value for startAfter= 329 | 330 | Because this program will run for many hours, we recommend you run it using the UNIX 'screen' program, so that it does not abort in the middle. (If it does, use startAfter=). 
331 | 332 | Benchmark time: 16 hours 333 | 334 | 335 | ========== Step 11: orthomclDumpPairsFiles ======== 336 | Input: 337 | - database with populated pairs tables 338 | Output 339 | - pairs/ directory 340 | - mclInput file 341 | 342 | Run the orthomclDumpPairsFiles 343 | 344 | Benchmark time: 5 min 345 | 346 | 347 | ========== Step 12: mcl ======== 348 | Input: 349 | - mclInput file 350 | Output: 351 | - mclOutput file 352 | 353 | mcl my_orthomcl_dir/mclInput --abc -I 1.5 -o my_orthomcl_dir/mclOutput 354 | 355 | Benchmark time: 3 hours 356 | 357 | 358 | ========== Step 13: orthomclMclToGroups ========== 359 | Input: 360 | - mclOutput file 361 | Output: 362 | - groups.txt 363 | 364 | Change to my_orthomcl_dir and run: 365 | orthomclMclToGroups my_prefix 1000 < mclOutput > groups.txt 366 | 367 | my_prefix is an arbitrary string to use as a prefix for your group IDs. 368 | 369 | Benchmark time: 1 min 370 | 371 | 372 | 373 | 374 | 375 | -------------------------------------------------------------------------------- /bin/orthomclPairs: --------------------------------------------------------------------------------
#!/usr/bin/perl

# orthomclPairs: find ortholog, in-paralog and co-ortholog protein pairs from
# the all-v-all BLAST hits stored in the SimilarSequences table.
#
# The program is a fixed pipeline of SQL statements ("steps").  Every step has
# a tag (see @steps).  Tags are used to restart the pipeline (startAfter=TAG)
# and to decide which intermediate tables may be dropped once a step is done.

use strict;
use warnings;

use DBI;
use FindBin;
use lib "$FindBin::Bin/../lib/perl";
use OrthoMCLEngine::Main::Base;

# set to 1 to echo every SQL statement into the log file
my $debug = 0;

my $configFile  = $ARGV[0];
my $logFile     = $ARGV[1];
my $clean       = $ARGV[2];    # cleanup=yes|no|only|all
my $restart     = $ARGV[3];    # optional: startAfter=TAG (or taxonFilter=TAXON)
my $taxonFilter = $ARGV[4];    # optional: taxonFilter=TAXON

# Ordered list of pipeline steps: [ tag, [ clean-up SQL run after the step ] ].
# A table may only be dropped by the LAST step that reads it.
my @steps = (
    # Common
    ['updateMinimumEvalueExponent'],
    ['bestQueryTaxonScore'],
    ['qtscore_ix'],
    # Ortholog
    ['bestHit'],
    ['best_hit_ix'],
    ['ortholog', ['drop table BestHit']],
    ['orthologTaxon'],
    ['orthologAvg'],
    ['orthologAvgIndex'],
    ['orthologsNormalization', ['drop table OrthologAvgScore', 'drop table OrthologTaxon', 'drop table OrthologTemp']],
    # InParalog
    # BestQueryTaxonScore is still read by the bestInterTaxonScore step, so it
    # is dropped here and not right after bestHit.
    ['bestInterTaxonScore', ['drop table BestQueryTaxonScore']],
    ['bis_uids_ix'],
    ['uniqueSimSeqsQueryId'],
    ['ust_qids_ix'],
    ['betterHit', ['drop table BestInterTaxonScore', 'drop table UniqueSimSeqsQueryId']],
    ['better_hit_ix'],
    ['inParalog', ['drop table BetterHit']],
    ['inParalogTaxonAvg'],
    ['orthologUniqueId'],
    ['orthologUniqueIdIndex'],
    ['inParalogOrthologTaxonAvg', ['drop table OrthologUniqueId']],
    ['inParalogAvg', ['drop table InParalogTaxonAvg', 'drop table InParalogOrthologTaxonAvg']],
    ['inParalogAvgIndex'],
    ['inParalogsNormalization', ['drop table InParalogAvgScore', 'drop table InParalogTemp']],
    # CoOrtholog
    ['inParalog2Way'],
    ['in2a_ix'],
    ['in2b_ix'],
    ['ortholog2Way'],
    ['ortholog2WayIndex'],
    ['inParalogOrthologInParalog'],
    ['inParalogOrtholog'],
    ['coOrthologCandidate', ['drop table Ortholog2Way', 'drop table InParalog2Way', 'drop table InParalogOrthologInParalog', 'drop table InParalogOrtholog']],
    ['coOrthologNotOrtholog', ['drop table CoOrthologCandidate']],
    ['coOrthologNotOrthologIndex'],
    ['coOrtholog', ['drop table CoOrthologNotOrtholog']],
    ['coOrthologTaxon'],
    ['coOrthologAvg'],
    ['coOrthologAvgIndex'],
    ['coOrthologsNormalization', ['drop table CoOrthologAvgScore', 'drop table CoOrthologTaxon', 'drop table CoOrthologTemp']],
    # pseudo step, only used by cleanup=all; the names are mapped to the
    # configured table names in clean()
    ['cleanall', ['truncate table InParalog', 'truncate table Ortholog', 'truncate table CoOrtholog']],
);

# tag -> 1-based position in @steps, and tag -> list of clean-up statements
my %stepNumberOf;
my %cleanSqlsFor;
foreach my $i (0 .. $#steps) {
    my ($tag, $cleanSqls) = @{ $steps[$i] };
    $stepNumberOf{$tag} = $i + 1;
    $cleanSqlsFor{$tag} = $cleanSqls if $cleanSqls;
}

usage() unless $configFile;
usage() unless $logFile;
# anchored, so that e.g. 'cleanup=nope' is not silently accepted as 'no'
usage() unless defined $clean && $clean =~ /^cleanup=(yes|no|only|all)$/;

$clean = $1;

# number of the last step to skip (0 = run everything)
my $skipPast = 0;
if ($restart) {
    if ($restart =~ /^taxonFilter=/) {
        # the 4th argument may be the taxon filter when startAfter= is absent
        $taxonFilter = $restart;
    }
    else {
        usage() unless $restart =~ /^startAfter=(.*)$/;
        $skipPast = $stepNumberOf{$1};
        die "invalid restart arg $restart" unless $skipPast;
    }
}

my $andTaxonFilter   = "";
my $whereTaxonFilter = "";
my $taxonFilterTaxon;
if ($taxonFilter) {
    # The taxon code is pasted into SQL text below, so only accept word
    # characters (taxon codes are short alphanumeric abbreviations).
    die "illegal argument '$taxonFilter'\n" unless $taxonFilter =~ /^taxonFilter=(\w+)$/;
    $taxonFilterTaxon = $1;
    my $subjFilter = "and s.subject_taxon_id != '$taxonFilterTaxon'";
    $andTaxonFilter   = "and s.query_taxon_id != '$taxonFilterTaxon' $subjFilter";
    $whereTaxonFilter = "where s.query_taxon_id != '$taxonFilterTaxon' $subjFilter";
}

# LOGFILE stays a bareword handle because its glob is handed to Base->new below
open(LOGFILE, '>>', $logFile) or die "Can't open log file '$logFile': $!\n";
my $oldfh = select(LOGFILE); $| = 1; select($oldfh);    # flush print buffer

print LOGFILE "\n\n============================================================================================\n";
print LOGFILE localtime() . " orthomclPairs " . join(' ', @ARGV) . "\n";
print LOGFILE "=============================================================================================\n\n";

my $base = OrthoMCLEngine::Main::Base->new($configFile, *LOGFILE);
my $dbh  = $base->getDbh();

my $sst = $base->getConfig("similarSequencesTable");

my $oracleNoLogging = $base->getConfig("dbVendor") eq 'oracle' ? " NOLOGGING" : "";

commonTempTables();

orthologs();

inparalogs();

coorthologs();

clean('cleanall') if $clean eq 'all';

print LOGFILE "\nDone\n";
close(LOGFILE) or die "Can't close log file '$logFile': $!\n";


# True when the run must only clean up (cleanup=only or cleanup=all), i.e. no
# pipeline SQL is to be executed.
sub cleaningOnly {
    return $clean eq 'only' || $clean eq 'all';
}

################################################################################
############################### Common tables #################################
################################################################################

# Prepare SimilarSequences (replace zero e-values by an underflow exponent) and
# build BestQueryTaxonScore, the best score of each query against each taxon.
sub commonTempTables {
    print LOGFILE localtime() . " Constructing common temp tables\n"
        unless cleaningOnly();

    my $interTaxonMatch = $base->getConfig("interTaxonMatchView");

    # a little bit of a hack here.  mysql can't tolerate finding the
    # minEvalueExp in the sql that updates the table
    # so, we do it as a preprocess.
    # must explicitly avoid the preprocess if just cleaning or if skipping
    my $sql = "
select min(evalue_exp)
from $sst
where evalue_mant != 0
";
    my $minEvalueExp;
    if (!cleaningOnly() && !$skipPast) {
        print LOGFILE localtime() . " Find min evalue exp  (OrthoMCL-DB V2 took ??? for this step)\n";
        my $stmt = $dbh->prepare($sql) or die $dbh->errstr;
        $stmt->execute() or die $stmt->errstr;
        ($minEvalueExp) = $stmt->fetchrow_array();
        $stmt->finish();
        print LOGFILE localtime() . " done\n";
    }
    # undef when the query was not run, or when no row has a non-zero mantissa
    $minEvalueExp = 0 unless defined $minEvalueExp;

    # Only true zero e-values (mantissa 0 and exponent 0) are underflows.  A hit
    # such as 2.5e0 also has exponent 0 and must not be turned into a best hit.
    $sql = "
update $sst
set evalue_exp = ${minEvalueExp}-1
where evalue_exp = 0 and evalue_mant = 0
";
    runSql($sql, "updating $sst, setting 0 evalue_exp to underflow value (${minEvalueExp} - 1)",
           'updateMinimumEvalueExponent', '25 min', undef);

    ##########################################################################

    $sql = "
create table BestQueryTaxonScore $oracleNoLogging as
select im.query_id, im.subject_taxon_id, low_exp.evalue_exp, min(im.evalue_mant) as evalue_mant
from $interTaxonMatch im,
     (select query_id, subject_taxon_id, min(evalue_exp) as evalue_exp
      from $interTaxonMatch
      group by query_id, subject_taxon_id) low_exp
where im.query_id = low_exp.query_id
  and im.subject_taxon_id = low_exp.subject_taxon_id
  and im.evalue_exp = low_exp.evalue_exp
group by im.query_id, im.subject_taxon_id, low_exp.evalue_exp
";

    runSql($sql, "create BestQueryTaxonScore", 'bestQueryTaxonScore', '1.5 hours', undef);

    ##########################################################################

    $sql = "
create unique index qtscore_ix on BestQueryTaxonScore(query_id, subject_taxon_id, evalue_exp, evalue_mant)
";

    runSql($sql, "create qtscore_ix index on BestQueryTaxonScore", 'qtscore_ix', '15 min', 'BestQueryTaxonScore');
}


################################################################################
############################### Orthologs  #####################################
################################################################################

# Build the ortholog pairs (reciprocal best inter-taxon hits) and load them,
# with normalized scores, into the configured ortholog table.
sub orthologs {
    print LOGFILE localtime() . " Constructing ortholog tables\n"
        unless cleaningOnly();

    my $evalueExpThreshold    = $base->getConfig("evalueExponentCutoff");
    my $percentMatchThreshold = $base->getConfig("percentMatchCutoff");

    my $sql = "
create table BestHit $oracleNoLogging as
select s.query_id, s.subject_id,
       s.query_taxon_id, s.subject_taxon_id,
       s.evalue_exp, s.evalue_mant
from $sst s, BestQueryTaxonScore cutoff
where s.query_id = cutoff.query_id
  and s.subject_taxon_id = cutoff.subject_taxon_id
  and s.query_taxon_id != s.subject_taxon_id
  and s.evalue_exp <= $evalueExpThreshold $andTaxonFilter
  and s.percent_match >= $percentMatchThreshold
  and (s.evalue_mant < 0.01
       or s.evalue_exp = cutoff.evalue_exp
          and s.evalue_mant = cutoff.evalue_mant)
";

    runSql($sql, "create BestHit", 'bestHit', '1.5 hours', undef);

    ######################################################################

    $sql = "
create unique index best_hit_ix on BestHit(query_id,subject_id)
";

    runSql($sql, "create best_hit_ix index on BestHit", 'best_hit_ix', '15 min', 'BestHit');

    ######################################################################

    $sql = "
create table OrthologTemp $oracleNoLogging as
select bh1.query_id as sequence_id_a, bh1.subject_id as sequence_id_b,
       bh1.query_taxon_id as taxon_id_a, bh1.subject_taxon_id as taxon_id_b,
       case -- don't try to calculate log(0) -- use rigged exponents of SimSeq
         when bh1.evalue_mant < 0.01 or bh2.evalue_mant < 0.01
           then (bh1.evalue_exp + bh2.evalue_exp) / -2
         else -- score = ( -log10(evalue1) - log10(evalue2) ) / 2
           (log(10, bh1.evalue_mant * bh2.evalue_mant)
            + bh1.evalue_exp + bh2.evalue_exp) / -2
       end as unnormalized_score
from BestHit bh1, BestHit bh2
where bh1.query_id < bh1.subject_id
  and bh1.query_id = bh2.subject_id
  and bh1.subject_id = bh2.query_id
";

    runSql($sql, "create OrthologTemp table", 'ortholog', '5 min', 'OrthologTemp');

    ######################################################################

    orthologTaxonSub('');

    ######################################################################

    normalizeOrthologsSub('', $base->getConfig("orthologTable"));
}


################################################################################
############################### InParalogs  ####################################
################################################################################

# Build the in-paralog pairs (reciprocal within-taxon hits that are at least as
# good as the best inter-taxon hit) and load them into the in-paralog table.
sub inparalogs {
    print LOGFILE localtime() . " Constructing inParalog tables\n"
        unless cleaningOnly();

    my $inParalogTable        = $base->getConfig("inParalogTable");
    my $orthologTable         = $base->getConfig("orthologTable");
    my $evalueExpThreshold    = $base->getConfig("evalueExponentCutoff");
    my $percentMatchThreshold = $base->getConfig("percentMatchCutoff");

    my $sql = "
create table BestInterTaxonScore $oracleNoLogging as
select im.query_id, low_exp.evalue_exp, min(im.evalue_mant) as evalue_mant
from BestQueryTaxonScore im,
     (select query_id, min(evalue_exp) as evalue_exp
      from BestQueryTaxonScore
      group by query_id) low_exp
where im.query_id = low_exp.query_id
  and im.evalue_exp = low_exp.evalue_exp
group by im.query_id, low_exp.evalue_exp
";

    runSql($sql, "create BestInterTaxonScore", 'bestInterTaxonScore', '5 min', undef);

    ###########################################################################

    $sql = "
create unique index bis_uids_ix on BestInterTaxonScore(query_id)
";

    # the index lives on BestInterTaxonScore, so that is the table to analyze
    runSql($sql, "create bis_uids_ix index on BestInterTaxonScore", 'bis_uids_ix', '1 min', 'BestInterTaxonScore');

    ###########################################################################

    $sql = "
create table UniqueSimSeqsQueryId $oracleNoLogging as
select distinct s.query_id from $sst s $whereTaxonFilter
";

    runSql($sql, "create UniqueSimSeqsQueryId", 'uniqueSimSeqsQueryId', '25 min', undef);

    ###########################################################################

    $sql = "
create unique index ust_qids_ix on UniqueSimSeqsQueryId(query_id)
";

    runSql($sql, "create ust_qids_ix index on UniqueSimSeqsQueryId", 'ust_qids_ix', '1 min', 'UniqueSimSeqsQueryId');

    ###########################################################################

    $sql = "
create table BetterHit $oracleNoLogging as
select s.query_id, s.subject_id,
       s.query_taxon_id as taxon_id,
       s.evalue_exp, s.evalue_mant
from $sst s, BestInterTaxonScore bis
where s.query_id != s.subject_id $andTaxonFilter
  and s.query_taxon_id = s.subject_taxon_id
  and s.query_id = bis.query_id
  and s.evalue_exp <= $evalueExpThreshold
  and s.percent_match >= $percentMatchThreshold
  and (s.evalue_mant < 0.001
       or s.evalue_exp < bis.evalue_exp
       or (s.evalue_exp = bis.evalue_exp and s.evalue_mant <= bis.evalue_mant))
-- . . . or Similarity for a protein with no BestInterTaxonScore
--       (i.e. an intrataxon match for a protein with no intertaxon
--        match in the database)
union
select s.query_id, s.subject_id, s.query_taxon_id as taxon_id, s.evalue_exp, s.evalue_mant
from $sst s
where s.query_taxon_id = s.subject_taxon_id $andTaxonFilter
  and s.evalue_exp <= $evalueExpThreshold
  and s.percent_match >= $percentMatchThreshold
  and s.query_id in
     (SELECT distinct ust.query_id
      from UniqueSimSeqsQueryId ust
      LEFT OUTER JOIN BestInterTaxonScore bis ON bis.query_id = ust.query_id
      WHERE bis.query_id IS NULL)
";

    runSql($sql, "create BetterHit table", 'betterHit', '3 hours', undef);

    ###########################################################################

    $sql = "
create unique index better_hit_ix on BetterHit(query_id,subject_id)
";

    runSql($sql, "create better_hit_ix index on BetterHit", 'better_hit_ix', '25 min', 'BetterHit');

    ###########################################################################

    $sql = "
create table InParalogTemp $oracleNoLogging as
select bh1.query_id as sequence_id_a, bh1.subject_id as sequence_id_b,
       bh1.taxon_id,
       case -- don't try to calculate log(0) -- use rigged exponents of SimSeq
         when bh1.evalue_mant < 0.01 or bh2.evalue_mant < 0.01
           then (bh1.evalue_exp + bh2.evalue_exp) / -2
         else -- score = ( -log10(evalue1) - log10(evalue2) ) / 2
           (log(10, bh1.evalue_mant * bh2.evalue_mant)
            + bh1.evalue_exp + bh2.evalue_exp) / -2
       end as unnormalized_score
from BetterHit bh1, BetterHit bh2
where bh1.query_id < bh1.subject_id
  and bh1.query_id = bh2.subject_id
  and bh1.subject_id = bh2.query_id
";

    runSql($sql, "create InParalogTemp table", 'inParalog', '15 min', undef);

    ################################################################

    $sql = "
create table InParalogTaxonAvg $oracleNoLogging as
select avg(i.unnormalized_score) average, i.taxon_id
from InParalogTemp i
group by i.taxon_id
";

    runSql($sql, "create InParalogTaxonAvg table", 'inParalogTaxonAvg', '1 min', undef);

    ################################################################

    $sql = "
create table OrthologUniqueId $oracleNoLogging as
select distinct(sequence_id) from (
select sequence_id_a as sequence_id from $orthologTable
union
select sequence_id_b as sequence_id from $orthologTable) i
";

    runSql($sql, "create OrthologUniqueId table", 'orthologUniqueId', '5 min', undef);

    ################################################################

    $sql = "create unique index ortholog_unique_id_ix on OrthologUniqueId(sequence_id)";

    runSql($sql, "create unique ortholog_unique_id_ix index", 'orthologUniqueIdIndex', '1 min', 'OrthologUniqueId');

    ################################################################

    $sql = "
create table InParalogOrthologTaxonAvg $oracleNoLogging as

select avg(i.unnormalized_score) average, i.taxon_id
from InParalogTemp i
where i.sequence_id_a in
       (select sequence_id from OrthologUniqueId)
   or i.sequence_id_b in
       (select sequence_id from OrthologUniqueId)
group by i.taxon_id
";

    runSql($sql, "create InParalogOrthologTaxonAvg table", 'inParalogOrthologTaxonAvg', '10 min', undef);

    ################################################################

    $sql = "
create table InParalogAvgScore $oracleNoLogging as
select case
         when orth_i.average is NULL
           then all_i.average
         else orth_i.average
       end as avg_score,
       all_i.taxon_id
from InParalogTaxonAvg all_i LEFT OUTER JOIN InParalogOrthologTaxonAvg orth_i
  ON all_i.taxon_id = orth_i.taxon_id
";

    runSql($sql, "create InParalogAvgScore table", 'inParalogAvg', '1 min', undef);

    ################################################################

    $sql = "create unique index inparalog_avg_ix on InParalogAvgScore(taxon_id,avg_score)";

    runSql($sql, "create InParalogAvgScore index", 'inParalogAvgIndex', '1 min', 'InParalogAvgScore');

    ################################################################

    $sql = "
insert into $inParalogTable (sequence_id_a, sequence_id_b, taxon_id, unnormalized_score, normalized_score)
select it.sequence_id_a, it.sequence_id_b, it.taxon_id, it.unnormalized_score, it.unnormalized_score/a.avg_score
from InParalogTemp it, InParalogAvgScore a
where it.taxon_id = a.taxon_id
";

    runSql($sql, "populate $inParalogTable table, including normalized_score", 'inParalogsNormalization', '3 min', "$inParalogTable");
}


################################################################################
############################### CoOrthologs  ###################################
################################################################################

# Build the co-ortholog pairs (pairs connected through ortholog and in-paralog
# relations that are not orthologs themselves) and load them into the
# configured co-ortholog table.
sub coorthologs {
    print LOGFILE localtime() . " Constructing coOrtholog tables\n"
        unless cleaningOnly();

    my $inParalogTable        = $base->getConfig("inParalogTable");
    my $orthologTable         = $base->getConfig("orthologTable");
    my $evalueExpThreshold    = $base->getConfig("evalueExponentCutoff");
    my $percentMatchThreshold = $base->getConfig("percentMatchCutoff");

    my $sql = "
create table InParalog2Way $oracleNoLogging as
select sequence_id_a, sequence_id_b from $inParalogTable
union
select sequence_id_b as sequence_id_a, sequence_id_a as sequence_id_b from $inParalogTable
";

    runSql($sql, "create InParalog2Way", 'inParalog2Way', '1.5 hours', undef);

    ######################################################################

    $sql = "
create unique index in2a_ix on InParalog2Way(sequence_id_a, sequence_id_b)
";

    runSql($sql, "index in2a_ix", 'in2a_ix', '45 min', undef);

    ######################################################################

    $sql = "
create unique index in2b_ix on InParalog2Way(sequence_id_b, sequence_id_a)
";

    runSql($sql, "index in2b_ix", 'in2b_ix', '45 min', 'InParalog2Way');

    ######################################################################

    $sql = "
create table Ortholog2Way $oracleNoLogging as
-- symmetric closure of Ortholog
select sequence_id_a, sequence_id_b from $orthologTable
union
select sequence_id_b as sequence_id_a, sequence_id_a as sequence_id_b from $orthologTable
";

    runSql($sql, "create Ortholog2Way", 'ortholog2Way', '1 hours', undef);

    ######################################################################

    $sql = "
create unique index ortholog2way_ix on Ortholog2Way(sequence_id_a, sequence_id_b)
";

    runSql($sql, "index ortholog2way_ix", 'ortholog2WayIndex', '5 min', 'Ortholog2Way');

    ######################################################################

    $sql = "
create table InParalogOrthologInParalog $oracleNoLogging as
select ip1.sequence_id_a, ip2.sequence_id_b
from  Ortholog2Way o, InParalog2Way ip2, InParalog2Way ip1
where ip1.sequence_id_b = o.sequence_id_a
  and o.sequence_id_b = ip2.sequence_id_a
";

    runSql($sql, "create InParalogOrthologInParalog", 'inParalogOrthologInParalog', '20 min', undef);

    ##################################################################

    $sql = "
create table InParalogOrtholog $oracleNoLogging as
select ip.sequence_id_a, o.sequence_id_b
from InParalog2Way ip, Ortholog2Way o
where ip.sequence_id_b = o.sequence_id_a
";

    runSql($sql, "create InParalogOrtholog", 'inParalogOrtholog', '15 min', undef);

    ##################################################################

    $sql = "
create table CoOrthologCandidate $oracleNoLogging as
select distinct
       least(sequence_id_a, sequence_id_b) as sequence_id_a,
       greatest(sequence_id_a, sequence_id_b) as sequence_id_b
from (select sequence_id_a, sequence_id_b from InParalogOrthologInParalog
      union
      select sequence_id_a, sequence_id_b from InParalogOrtholog) t
";

    runSql($sql, "create CoOrthologCandidate", 'coOrthologCandidate', '1 hour', undef);

    ######################################################################

    $sql = "
create table CoOrthologNotOrtholog $oracleNoLogging as
SELECT cc.sequence_id_a, cc.sequence_id_b
FROM CoOrthologCandidate cc
LEFT OUTER JOIN $orthologTable o
  ON cc.sequence_id_a = o.sequence_id_a
 AND cc.sequence_id_b = o.sequence_id_b
WHERE o.sequence_id_a IS NULL
";

    runSql($sql, "create CoOrthologNotOrtholog table", 'coOrthologNotOrtholog', '10 min', undef);

    #####################################################################

    $sql = "
create index coortholog_not_ortholog_ix on CoOrthologNotOrtholog(sequence_id_a,sequence_id_b)
";

    runSql($sql, "index coortholog_not_ortholog_ix", 'coOrthologNotOrthologIndex', '1 min', 'CoOrthologNotOrtholog');

    ######################################################################

    # optional taxon filter, applied to both directions of the hit pair
    my $tf = "";
    if ($taxonFilterTaxon) {
        $tf = "and ab.query_taxon_id != '$taxonFilterTaxon' and ab.subject_taxon_id != '$taxonFilterTaxon' and ba.query_taxon_id != '$taxonFilterTaxon' and ba.subject_taxon_id != '$taxonFilterTaxon'";
    }

    $sql = "
create table CoOrthologTemp $oracleNoLogging as
select candidate.sequence_id_a, candidate.sequence_id_b,
       ab.query_taxon_id as taxon_id_a, ab.subject_taxon_id as taxon_id_b,
       case  -- in case of 0 evalue, use rigged exponent
         when ab.evalue_mant < 0.00001 or ba.evalue_mant < 0.00001
           then (ab.evalue_exp + ba.evalue_exp) / -2
         else -- score = ( -log10(evalue1) - log10(evalue2) ) / 2
           (log(10, ab.evalue_mant * ba.evalue_mant)
            + ab.evalue_exp + ba.evalue_exp) / -2
       end as unnormalized_score
from $sst ab, $sst ba, CoOrthologNotOrtholog candidate
where ab.query_id = candidate.sequence_id_a $tf
  and ab.subject_id = candidate.sequence_id_b
  and ab.evalue_exp <= $evalueExpThreshold
  and ab.percent_match >= $percentMatchThreshold
  and ba.query_id = candidate.sequence_id_b
  and ba.subject_id = candidate.sequence_id_a
  and ba.evalue_exp <= $evalueExpThreshold
  and ba.percent_match >= $percentMatchThreshold
";

    runSql($sql, "create CoOrthologTemp table", 'coOrtholog', '2 hours', undef);

    ######################################################################

    orthologTaxonSub('co');

    ######################################################################

    normalizeOrthologsSub("Co", $base->getConfig("coOrthologTable"));
}


# Create (Co)OrthologTaxon: the unnormalized score of every pair together with
# its two taxa in a canonical (smaller, bigger) order.
#   $co : true for the co-ortholog flavour, false for the ortholog flavour
sub orthologTaxonSub {
    my ($co) = @_;

    my $coCaps = $co ? "Co" : "";
    $co = $co ? "coO" : "o";    # prefix of the step tags: (coO|o)rtholog...

    my $sql = "create table ${coCaps}OrthologTaxon $oracleNoLogging as
select case
         when taxon_id_a < taxon_id_b
         then taxon_id_a
         else taxon_id_b
       end as smaller_tax_id,
       case
         when taxon_id_a < taxon_id_b
         then taxon_id_b
         else taxon_id_a
       end as bigger_tax_id,
       unnormalized_score
from ${coCaps}OrthologTemp";

    runSql($sql, "create ${coCaps}OrthologTaxon table", "${co}rthologTaxon", '1 min', undef);
}

# Average the scores per taxon pair and insert the pairs, with their score
# divided by that average, into the final table.
#   $co            : true for the co-ortholog flavour
#   $orthologTable : name of the final (co)ortholog table to populate
sub normalizeOrthologsSub {
    my ($co, $orthologTable) = @_;

    my $coCaps = $co ? "Co" : "";
    $co = $co ? "coO" : "o";    # prefix of the step tags: (coO|o)rtholog...

    my $sql = "
create table ${coCaps}OrthologAvgScore $oracleNoLogging as
select smaller_tax_id, bigger_tax_id, avg(unnormalized_score) avg_score
from ${coCaps}OrthologTaxon
group by smaller_tax_id, bigger_tax_id
";

    runSql($sql, "create ${coCaps}OrthologAvgScore table", "${co}rthologAvg", '1 min', undef);

    ################################################################

    $sql = "create unique index ${co}orthoAvg_ix on ${coCaps}OrthologAvgScore(smaller_tax_id,bigger_tax_id,avg_score)";

    runSql($sql, "create ${coCaps}OrthologAvgScore index", "${co}rthologAvgIndex", '1 min', "${coCaps}OrthologAvgScore");

    ################################################################

    $sql = "
insert into $orthologTable (sequence_id_a, sequence_id_b, taxon_id_a, taxon_id_b, unnormalized_score, normalized_score)
select ot.sequence_id_a, ot.sequence_id_b, ot.taxon_id_a, ot.taxon_id_b, ot.unnormalized_score, ot.unnormalized_score/a.avg_score
from ${coCaps}OrthologTemp ot, ${coCaps}OrthologAvgScore a
where least(ot.taxon_id_a, ot.taxon_id_b) = a.smaller_tax_id
  and greatest(ot.taxon_id_a, ot.taxon_id_b) = a.bigger_tax_id
";

    runSql($sql, "populate $orthologTable table, including normalized_score", "${co}rthologsNormalization", '2 min', "$orthologTable");
}

# Run one pipeline step.
#   $sql            : statement to execute
#   $msg            : log message describing the step
#   $tag            : step tag; must be listed in @steps
#   $sampleTime     : run time on the benchmark dataset, for the log only
#   $tableToAnalyze : table whose statistics are refreshed afterwards, or undef
# The step is skipped when startAfter= points at it or at a later step.  In
# cleanup=only|all mode nothing is executed; the clean-up statements of the
# step are run unless cleanup=no.
sub runSql {
    my ($sql, $msg, $tag, $sampleTime, $tableToAnalyze) = @_;

    print LOGFILE "$sql\n\n" if $debug;

    my $stepNumber = $stepNumberOf{$tag};
    die "invalid tag '$tag'" unless $stepNumber;

    if ($skipPast >= $stepNumber) {
        print LOGFILE "... skipping '$tag'...\n\n";
        return;
    }

    unless (cleaningOnly()) {
        my $startTime = time();

        print LOGFILE localtime() . "   $msg  (Benchmark dataset took $sampleTime for this step)\n";

        my $stmt = $dbh->prepare($sql) or die $dbh->errstr;
        $stmt->execute() or die $stmt->errstr;

        analyzeStats($tableToAnalyze) if $tableToAnalyze;

        my $elapsed = time() - $startTime;
        my $hours   = int($elapsed / 3600);
        my $mins    = int($elapsed / 60) % 60;
        $mins = 1 if $hours == 0 && $mins == 0;    # never report "0 mins"
        my $hoursStr = $hours ? "$hours hours and " : "";
        print LOGFILE localtime() . "   step '$tag' done ($hoursStr$mins mins)\n\n";
    }

    clean($tag) unless $clean eq 'no';
}

# Refresh the optimizer statistics of a table (and, on oracle, of its indexes).
sub analyzeStats {
    my ($tableToAnalyze) = @_;

    my @sqls;
    if ($base->getConfig("dbVendor") eq 'oracle') {
        my $sql = "analyze table $tableToAnalyze compute statistics";
        @sqls = ($sql, "$sql for all indexes");
    }
    else {
        @sqls = ("analyze table $tableToAnalyze");
    }

    foreach my $sql (@sqls) {
        my $stmt = $dbh->prepare($sql) or die $dbh->errstr;
        $stmt->execute() or die $stmt->errstr;
    }
}

# Map the generic names Ortholog / CoOrtholog / InParalog used in @steps to the
# table names given in the config file.  Other names are returned unchanged.
sub configuredTableName {
    my ($table) = @_;

    my %configKeyFor = (
        Ortholog   => 'orthologTable',
        CoOrtholog => 'coOrthologTable',
        InParalog  => 'inParalogTable',
    );
    return $table unless exists $configKeyFor{$table};
    return $base->getConfig($configKeyFor{$table});
}

# Run the clean-up statements registered for a step tag.  Tables that are
# already gone are not dropped again, which makes cleanup=only|all safe to
# repeat.
sub clean {
    my ($tag) = @_;

    foreach my $cleanSql (@{ $cleanSqlsFor{$tag} || [] }) {
        next unless $cleanSql;
        my ($action, $table) = $cleanSql =~ /(\w+) table (\w+)/i
            or die "invalid clean sql '$cleanSql'";
        next if $action eq 'drop' && tableAlreadyDropped($table);

        # act on the configured table, not on the generic name in @steps
        my $sql  = "$action table " . configuredTableName($table);
        my $stmt = $dbh->prepare($sql) or die $dbh->errstr;
        print LOGFILE localtime() . "   cleaning: $sql\n";
        $stmt->execute() or die $stmt->errstr;
        print LOGFILE localtime() . "   done\n";
    }
}

# Return 1 when the table does not exist in the database, 0 when it does.
sub tableAlreadyDropped {
    my ($table) = @_;

    $table = configuredTableName($table);

    my $sql;
    if ($base->getConfig("dbVendor") eq 'oracle') {
        $table = uc($table);
        $sql   = "select table_name from all_tables where table_name = '$table'";
    }
    else {
        $sql = "show tables like '$table'";
    }
    my $stmt = $dbh->prepare($sql) or die $dbh->errstr;
    $stmt->execute() or die $stmt->errstr;
    my @row = $stmt->fetchrow_array();
    $stmt->finish();
    return @row ? 0 : 1;
}

# Run the clean-up statements of every step.
sub cleanall {
    foreach my $tag (keys %cleanSqlsFor) {
        clean($tag);
    }
}

# Print the help text, including the list of step tags, and exit with status 1.
sub usage {
    my $stepsString = join('', map { "  $_->[0]\n" } @steps);

    print "
Find pairs for OrthoMCL.

usage: orthomclPairs config_file log_file cleanup=[yes|no|only|all] [startAfter=TAG] [taxonFilter=TAXON]

where:
  config_file : see below
  log_file    : file to which logging info is appended
  cleanup     : clean up temp tables?
                   yes=clean as we go;
                   no=don't clean as we go;
                   only=just clean, do nothing else;
                   all=just clean, plus clean InParalog, Ortholog and CoOrtholog tables.
  startAfter  : optionally start after a previously completed step. see below for TAGs
  taxonFilter : optionally ignore all similarities whose query or subject belongs to this taxon

Database Input:
  - SimilarSequences table containing all-v-all BLAST hits
  - InParalog, Ortholog, CoOrtholog tables - created but empty

Database Output:
  - Populated InParalog, Ortholog and CoOrtholog tables

Log File:
  - logging info

NOTE: the database login in the config file must have update/insert/truncate privileges on the tables specified in the config file.

EXAMPLE: orthomclSoftware/bin/orthomclPairs my_orthomcl_dir/orthomcl.config my_orthomcl_dir/orthomcl_pairs.log cleanup=no


Sample Config File:

dbVendor=oracle  (or mysql)
dbConnectString=dbi:Oracle:orthomcl
dbLogin=my_db_login
dbPassword=my_db_password
similarSequencesTable=SimilarSequences
orthologTable=Ortholog
inParalogTable=InParalog
coOrthologTable=CoOrtholog
interTaxonMatchView=InterTaxonMatch
percentMatchCutoff=50
evalueExponentCutoff=-5

Names of TAGs to use in startAfter (look in log file to see last one run)
$stepsString
";
    exit(1);
}

--------------------------------------------------------------------------------